diff --git a/docs/build/html/_sources/dev/custom_metal_kernels.rst b/docs/build/html/_sources/dev/custom_metal_kernels.rst
index c4c1b0aff..3e92f2814 100644
--- a/docs/build/html/_sources/dev/custom_metal_kernels.rst
+++ b/docs/build/html/_sources/dev/custom_metal_kernels.rst
@@ -1,3 +1,5 @@
+.. _custom_metal_kernels:
+
 Custom Metal Kernels
 ====================
 
@@ -76,6 +78,10 @@ Putting this all together, the generated function signature for ``myexp`` is as
 template [[host_name("custom_kernel_myexp_float")]] [[kernel]] decltype(custom_kernel_myexp_float) custom_kernel_myexp_float;
 
+Note: ``grid`` and ``threadgroup`` are parameters to the Metal `dispatchThreads `_ function.
+This means we will launch ``mx.prod(grid)`` threads, subdivided into ``threadgroup`` size threadgroups.
+For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.
+
 Passing ``verbose=True`` to ``mx.fast.metal_kernel.__call__`` will print the
 generated code for debugging purposes.
 
 Using Shape/Strides
diff --git a/docs/build/html/_sources/usage/function_transforms.rst b/docs/build/html/_sources/usage/function_transforms.rst
index 9a15bbf1c..9769fceaa 100644
--- a/docs/build/html/_sources/usage/function_transforms.rst
+++ b/docs/build/html/_sources/usage/function_transforms.rst
@@ -161,7 +161,7 @@ A naive way to add the elements from two sets of vectors is with a loop:
   ys = mx.random.uniform(shape=(100, 4096))
 
   def naive_add(xs, ys):
-      return [xs[i] + ys[:, i] for i in range(xs.shape[1])]
+      return [xs[i] + ys[:, i] for i in range(xs.shape[0])]
 
 Instead you can use :func:`vmap` to automatically vectorize the addition:
 
@@ -169,7 +169,7 @@ Instead you can use :func:`vmap` to automatically vectorize the addition:
 
   # Vectorize over the second dimension of x and the
   # first dimension of y
-  vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(1, 0))
+  vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(0, 1))
 
 The ``in_axes`` parameter can be used to specify which dimensions of the
 corresponding input to vectorize over. Similarly, use ``out_axes`` to specify
diff --git a/docs/build/html/_sources/usage/indexing.rst b/docs/build/html/_sources/usage/indexing.rst
index 62994a0fb..c74e357fa 100644
--- a/docs/build/html/_sources/usage/indexing.rst
+++ b/docs/build/html/_sources/usage/indexing.rst
@@ -77,7 +77,7 @@ from the GPU. Performing bounds checking for array indices before launching the
 kernel would be extremely inefficient.
 
 Indexing with boolean masks is something that MLX may support in the future. In
-general, MLX has limited support for operations for which outputs
+general, MLX has limited support for operations for which output
 *shapes* are dependent on input *data*. Other examples of these types of
 operations which MLX does not yet support include :func:`numpy.nonzero` and the
 single input version of :func:`numpy.where`.
diff --git a/docs/build/html/_sources/usage/lazy_evaluation.rst b/docs/build/html/_sources/usage/lazy_evaluation.rst
index 466edaaed..8fd855efa 100644
--- a/docs/build/html/_sources/usage/lazy_evaluation.rst
+++ b/docs/build/html/_sources/usage/lazy_evaluation.rst
@@ -109,7 +109,7 @@ Here is a concrete example:
 
 An important behavior to be aware of is when the graph will be implicitly
 evaluated. Anytime you ``print`` an array, convert it to an
-:obj:`numpy.ndarray`, or otherwise access it's memory via :obj:`memoryview`,
+:obj:`numpy.ndarray`, or otherwise access its memory via :obj:`memoryview`,
 the graph will be evaluated.
 Saving arrays via :func:`save` (or any other MLX saving functions) will also
 evaluate the array.
diff --git a/docs/build/html/backend_2metal_2device_8h_source.html b/docs/build/html/backend_2metal_2device_8h_source.html
index 8b936bd40..bb0b3537b 100644
--- a/docs/build/html/backend_2metal_2device_8h_source.html
+++ b/docs/build/html/backend_2metal_2device_8h_source.html
@@ -149,7 +149,7 @@ $(function(){ initResizable(false); });
51 enc.concurrent_ = false;
-
52 enc.outputs_.insert(
+
52 enc.prev_outputs_.insert(
53 enc.concurrent_outputs_.begin(), enc.concurrent_outputs_.end());
54 enc.concurrent_outputs_.clear();
55 }
@@ -170,212 +170,215 @@ $(function(){ initResizable(false); });
66 void set_output_array(array& a, int idx, int64_t offset = 0);
67 void dispatchThreadgroups(MTL::Size grid_dims, MTL::Size group_dims);
68 void dispatchThreads(MTL::Size grid_dims, MTL::Size group_dims);
-
69
-
- -
71 return ConcurrentContext(*this);
-
72 }
+ +
70
+
+ +
72 return ConcurrentContext(*this);
+
73 }
- -
74
-
75 // Inputs to all kernels in the encoder including temporaries
-
-
76 std::unordered_set<const void*>& inputs() {
-
77 return all_inputs_;
-
78 };
+ +
75
+
76 // Inputs to all kernels in the encoder including temporaries
+
+
77 std::unordered_set<const void*>& inputs() {
+
78 return all_inputs_;
+
79 };
-
79
-
80 // Outputs of all kernels in the encoder including temporaries
-
-
81 std::unordered_set<const void*> outputs() {
-
82 return all_outputs_;
-
83 };
+
80
+
81 // Outputs of all kernels in the encoder including temporaries
+
+
82 std::unordered_set<const void*> outputs() {
+
83 return all_outputs_;
+
84 };
-
84
-
85 private:
-
86 MTL::ComputeCommandEncoder* enc_;
-
87 bool concurrent_{false};
-
88 std::unordered_set<MTL::Resource*> outputs_;
-
89 std::unordered_set<MTL::Resource*> concurrent_outputs_;
-
90 std::unordered_set<const void*> all_inputs_;
-
91 std::unordered_set<const void*> all_outputs_;
-
92};
+
85
+
86 private:
+
87 MTL::ComputeCommandEncoder* enc_;
+
88 bool needs_barrier_{false};
+
89 bool concurrent_{false};
+
90 std::unordered_set<MTL::Resource*> prev_outputs_;
+
91 std::unordered_set<MTL::Resource*> next_outputs_;
+
92 std::unordered_set<MTL::Resource*> concurrent_outputs_;
+
93 std::unordered_set<const void*> all_inputs_;
+
94 std::unordered_set<const void*> all_outputs_;
+
95};
-
93
-
-
94struct Fence {
-
95 Fence(MTL::Fence* fence) : fence(fence) {}
-
- -
97 fence->release();
-
98 }
+
96
+
+
97struct Fence {
+
98 Fence(MTL::Fence* fence) : fence(fence) {}
+
+ +
100 fence->release();
+
101 }
-
99 MTL::Fence* fence;
-
100};
+
102 MTL::Fence* fence;
+
103};
-
101
-
- -
103 DeviceStream(MTL::CommandQueue* queue) : queue(queue) {};
-
- -
105 queue->release();
-
106 if (buffer != nullptr) {
-
107 buffer->release();
-
108 }
-
109 };
+
104
+
+ +
106 DeviceStream(MTL::CommandQueue* queue) : queue(queue) {};
+
+ +
108 queue->release();
+
109 if (buffer != nullptr) {
+
110 buffer->release();
+
111 }
+
112 };
-
110 MTL::CommandQueue* queue;
-
111 // A map of prior command encoder outputs to their corresponding fence
-
112 std::unordered_map<const void*, std::shared_ptr<Fence>> outputs;
-
113 // Used to allow thread-safe access to the outputs map
-
114 std::mutex fence_mtx;
-
115
-
116 // The buffer and buffer op count are updated
-
117 // between command buffers
-
118 MTL::CommandBuffer* buffer{nullptr};
- -
120
-
121 // The command encoder, fence, and temporaries are updated between command
-
122 // encoders
-
123 std::unique_ptr<CommandEncoder> encoder{nullptr};
-
124 std::shared_ptr<Fence> fence;
-
125 std::vector<array> temporaries;
-
126};
+
113 MTL::CommandQueue* queue;
+
114 // A map of prior command encoder outputs to their corresponding fence
+
115 std::unordered_map<const void*, std::shared_ptr<Fence>> outputs;
+
116 // Used to allow thread-safe access to the outputs map
+
117 std::mutex fence_mtx;
+
118
+
119 // The buffer and buffer op count are updated
+
120 // between command buffers
+
121 MTL::CommandBuffer* buffer{nullptr};
+ +
123
+
124 // The command encoder, fence, and temporaries are updated between command
+
125 // encoders
+
126 std::unique_ptr<CommandEncoder> encoder{nullptr};
+
127 std::shared_ptr<Fence> fence;
+
128 std::vector<array> temporaries;
+
129};
-
127
-
-
128class Device {
-
129 public:
- -
131 Device(const Device&) = delete;
-
132 Device& operator=(const Device&) = delete;
- -
134
-
-
135 MTL::Device* mtl_device() {
-
136 return device_;
-
137 };
+
130
+
+
131class Device {
+
132 public:
+ +
134 Device(const Device&) = delete;
+
135 Device& operator=(const Device&) = delete;
+ +
137
+
+
138 MTL::Device* mtl_device() {
+
139 return device_;
+
140 };
-
138
-
-
139 const std::string& get_architecture() {
-
140 return arch_;
-
141 }
+
141
+
+
142 const std::string& get_architecture() {
+
143 return arch_;
+
144 }
-
142
-
143 void new_queue(int index);
-
144 MTL::CommandBuffer* get_command_buffer(int index);
- - -
147 void commit_command_buffer(int index);
- -
149 void end_encoding(int index);
-
150
- -
152 const std::string& lib_name,
-
153 const std::string& lib_path);
-
154
-
155 // Note, this should remain in the header so that it is not dynamically
-
156 // linked
-
-
157 void register_library(const std::string& lib_name) {
-
158 if (auto it = library_map_.find(lib_name); it == library_map_.end()) {
-
159 register_library(lib_name, get_colocated_mtllib_path(lib_name));
-
160 }
-
161 }
+
145
+
146 void new_queue(int index);
+
147 MTL::CommandBuffer* get_command_buffer(int index);
+ + +
150 void commit_command_buffer(int index);
+ +
152 void end_encoding(int index);
+
153
+ +
155 const std::string& lib_name,
+
156 const std::string& lib_path);
+
157
+
158 // Note, this should remain in the header so that it is not dynamically
+
159 // linked
+
+
160 void register_library(const std::string& lib_name) {
+
161 if (auto it = library_map_.find(lib_name); it == library_map_.end()) {
+
162 register_library(lib_name, get_colocated_mtllib_path(lib_name));
+
163 }
+
164 }
-
162
-
163 MTL::Library* get_library(
-
164 const std::string& name,
-
165 const std::function<std::string(void)>& builder);
-
166
-
167 MTL::ComputePipelineState* get_kernel(
-
168 const std::string& base_name,
-
169 MTL::Library* mtl_lib,
-
170 const std::string& hash_name = "",
-
171 const MTLFCList& func_consts = {},
-
172 const std::vector<MTL::Function*>& linked_functions = {});
-
173
-
174 MTL::ComputePipelineState* get_kernel(
-
175 const std::string& base_name,
-
176 const std::string& lib_name = "mlx",
-
177 const std::string& hash_name = "",
-
178 const MTLFCList& func_consts = {},
-
179 const std::vector<MTL::Function*>& linked_functions = {});
-
180
-
181 MTL::ArgumentEncoder* argument_encoder(
-
182 const std::vector<MTL::ArgumentDescriptor*>& arg_descs) const;
+
165
+
166 MTL::Library* get_library(
+
167 const std::string& name,
+
168 const std::function<std::string(void)>& builder);
+
169
+
170 MTL::ComputePipelineState* get_kernel(
+
171 const std::string& base_name,
+
172 MTL::Library* mtl_lib,
+
173 const std::string& hash_name = "",
+
174 const MTLFCList& func_consts = {},
+
175 const std::vector<MTL::Function*>& linked_functions = {});
+
176
+
177 MTL::ComputePipelineState* get_kernel(
+
178 const std::string& base_name,
+
179 const std::string& lib_name = "mlx",
+
180 const std::string& hash_name = "",
+
181 const MTLFCList& func_consts = {},
+
182 const std::vector<MTL::Function*>& linked_functions = {});
183
-
184 // Record temporary arrays for the given stream index
-
185 void add_temporary(array arr, int index);
-
186 void add_temporaries(std::vector<array> arrays, int index);
-
187
-
188 void set_residency_set(const MTL::ResidencySet* residency_set);
-
189
-
190 private:
-
191 DeviceStream& get_stream_(int index) {
-
192 return stream_map_.find(index)->second;
-
193 }
-
194 MTL::Library* get_library_cache_(const std::string& name);
-
195
-
196 MTL::Library* get_library_(const std::string& name);
-
197 MTL::Library* build_library_(const std::string& source_string);
+
184 MTL::ArgumentEncoder* argument_encoder(
+
185 const std::vector<MTL::ArgumentDescriptor*>& arg_descs) const;
+
186
+
187 // Record temporary arrays for the given stream index
+
188 void add_temporary(array arr, int index);
+
189 void add_temporaries(std::vector<array> arrays, int index);
+
190
+
191 void set_residency_set(const MTL::ResidencySet* residency_set);
+
192
+
193 private:
+
194 DeviceStream& get_stream_(int index) {
+
195 return stream_map_.find(index)->second;
+
196 }
+
197 MTL::Library* get_library_cache_(const std::string& name);
198
-
199 MTL::Function* get_function_(const std::string& name, MTL::Library* mtl_lib);
-
200
-
201 MTL::Function* get_function_(
-
202 const std::string& name,
-
203 const std::string& specialized_name,
-
204 const MTLFCList& func_consts,
-
205 MTL::Library* mtl_lib);
-
206
-
207 MTL::LinkedFunctions* get_linked_functions_(
-
208 const std::vector<MTL::Function*>& funcs);
+
199 MTL::Library* get_library_(const std::string& name);
+
200 MTL::Library* build_library_(const std::string& source_string);
+
201
+
202 MTL::Function* get_function_(const std::string& name, MTL::Library* mtl_lib);
+
203
+
204 MTL::Function* get_function_(
+
205 const std::string& name,
+
206 const std::string& specialized_name,
+
207 const MTLFCList& func_consts,
+
208 MTL::Library* mtl_lib);
209
-
210 MTL::ComputePipelineState* get_kernel_(
-
211 const std::string& name,
-
212 const MTL::Function* mtl_function);
-
213
-
214 MTL::ComputePipelineState* get_kernel_(
-
215 const std::string& name,
-
216 const MTL::Function* mtl_function,
-
217 const MTL::LinkedFunctions* linked_functions);
-
218
-
219 MTL::ComputePipelineState* get_kernel_(
-
220 const std::string& base_name,
-
221 MTL::Library* mtl_lib,
-
222 const std::string& hash_name,
-
223 const MTLFCList& func_consts = {},
-
224 const std::vector<MTL::Function*>& linked_functions = {});
-
225
-
226 MTL::Device* device_;
-
227 std::unordered_map<int32_t, DeviceStream> stream_map_;
+
210 MTL::LinkedFunctions* get_linked_functions_(
+
211 const std::vector<MTL::Function*>& funcs);
+
212
+
213 MTL::ComputePipelineState* get_kernel_(
+
214 const std::string& name,
+
215 const MTL::Function* mtl_function);
+
216
+
217 MTL::ComputePipelineState* get_kernel_(
+
218 const std::string& name,
+
219 const MTL::Function* mtl_function,
+
220 const MTL::LinkedFunctions* linked_functions);
+
221
+
222 MTL::ComputePipelineState* get_kernel_(
+
223 const std::string& base_name,
+
224 MTL::Library* mtl_lib,
+
225 const std::string& hash_name,
+
226 const MTLFCList& func_consts = {},
+
227 const std::vector<MTL::Function*>& linked_functions = {});
228
-
229 std::shared_mutex kernel_mtx_;
-
230 std::unordered_map<std::string, MTL::ComputePipelineState*> kernel_map_;
+
229 MTL::Device* device_;
+
230 std::unordered_map<int32_t, DeviceStream> stream_map_;
231
-
232 std::shared_mutex library_mtx_;
-
233 std::unordered_map<std::string, MTL::Library*> library_map_;
-
234 const MTL::ResidencySet* residency_set_{nullptr};
-
235 std::string arch_;
-
236};
+
232 std::shared_mutex kernel_mtx_;
+
233 std::unordered_map<std::string, MTL::ComputePipelineState*> kernel_map_;
+
234
+
235 std::shared_mutex library_mtx_;
+
236 std::unordered_map<std::string, MTL::Library*> library_map_;
+
237 const MTL::ResidencySet* residency_set_{nullptr};
+
238 std::string arch_;
+
239};
-
237
- -
239
-
240} // namespace mlx::core::metal
+
240
+ +
242
+
243} // namespace mlx::core::metal
Definition array.h:20
-
Definition device.h:128
+
Definition device.h:131
void set_residency_set(const MTL::ResidencySet *residency_set)
int get_command_buffer_ops(int index)
-
MTL::Device * mtl_device()
Definition device.h:135
+
MTL::Device * mtl_device()
Definition device.h:138
void register_library(const std::string &lib_name, const std::string &lib_path)
MTL::CommandBuffer * get_command_buffer(int index)
void end_encoding(int index)
-
const std::string & get_architecture()
Definition device.h:139
+
const std::string & get_architecture()
Definition device.h:142
MTL::ComputePipelineState * get_kernel(const std::string &base_name, MTL::Library *mtl_lib, const std::string &hash_name="", const MTLFCList &func_consts={}, const std::vector< MTL::Function * > &linked_functions={})
MTL::ArgumentEncoder * argument_encoder(const std::vector< MTL::ArgumentDescriptor * > &arg_descs) const
void add_temporaries(std::vector< array > arrays, int index)
@@ -383,7 +386,7 @@ $(function(){ initResizable(false); });
void increment_command_buffer_ops(int index)
void new_queue(int index)
void commit_command_buffer(int index)
-
void register_library(const std::string &lib_name)
Definition device.h:157
+
void register_library(const std::string &lib_name)
Definition device.h:160
Device(const Device &)=delete
void add_temporary(array arr, int index)
Device & operator=(const Device &)=delete
@@ -402,31 +405,32 @@ $(function(){ initResizable(false); });
Definition device.h:41
void dispatchThreads(MTL::Size grid_dims, MTL::Size group_dims)
CommandEncoder(MTL::CommandBuffer *cbuf)
-
std::unordered_set< const void * > & inputs()
Definition device.h:76
+
std::unordered_set< const void * > & inputs()
Definition device.h:77
CommandEncoder & operator=(const CommandEncoder &)=delete
-
ConcurrentContext start_concurrent()
Definition device.h:70
+
ConcurrentContext start_concurrent()
Definition device.h:71
void set_output_array(array &a, int idx, int64_t offset=0)
void dispatchThreadgroups(MTL::Size grid_dims, MTL::Size group_dims)
MTL::ComputeCommandEncoder * operator->()
Definition device.h:61
void set_input_array(const array &a, int idx, int64_t offset=0)
CommandEncoder(const CommandEncoder &)=delete
-
std::unordered_set< const void * > outputs()
Definition device.h:81
-
Definition device.h:102
-
~DeviceStream()
Definition device.h:104
-
std::unordered_map< const void *, std::shared_ptr< Fence > > outputs
Definition device.h:112
-
DeviceStream(MTL::CommandQueue *queue)
Definition device.h:103
-
std::unique_ptr< CommandEncoder > encoder
Definition device.h:123
-
std::mutex fence_mtx
Definition device.h:114
-
MTL::CommandQueue * queue
Definition device.h:110
-
std::shared_ptr< Fence > fence
Definition device.h:124
-
MTL::CommandBuffer * buffer
Definition device.h:118
-
int buffer_ops
Definition device.h:119
-
std::vector< array > temporaries
Definition device.h:125
-
Definition device.h:94
-
Fence(MTL::Fence *fence)
Definition device.h:95
-
~Fence()
Definition device.h:96
-
MTL::Fence * fence
Definition device.h:99
+ +
std::unordered_set< const void * > outputs()
Definition device.h:82
+
Definition device.h:105
+
~DeviceStream()
Definition device.h:107
+
std::unordered_map< const void *, std::shared_ptr< Fence > > outputs
Definition device.h:115
+
DeviceStream(MTL::CommandQueue *queue)
Definition device.h:106
+
std::unique_ptr< CommandEncoder > encoder
Definition device.h:126
+
std::mutex fence_mtx
Definition device.h:117
+
MTL::CommandQueue * queue
Definition device.h:113
+
std::shared_ptr< Fence > fence
Definition device.h:127
+
MTL::CommandBuffer * buffer
Definition device.h:121
+
int buffer_ops
Definition device.h:122
+
std::vector< array > temporaries
Definition device.h:128
+
Definition device.h:97
+
Fence(MTL::Fence *fence)
Definition device.h:98
+
~Fence()
Definition device.h:99
+
MTL::Fence * fence
Definition device.h:102
+

Note: grid and threadgroup are parameters to the Metal dispatchThreads function.
This means we will launch mx.prod(grid) threads, subdivided into threadgroup size threadgroups.
For optimal performance, each thread group dimension should be less than or equal to the corresponding grid dimension.

Passing verbose=True to mx.fast.metal_kernel.__call__ will print the generated code for debugging purposes.
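For reference, a minimal sketch of a call that uses grid and threadgroup, adapted from the myexp example earlier in the Custom Metal Kernels guide (the input shape and the threadgroup size of 256 are illustrative choices, not requirements):

import mlx.core as mx

# Metal source for the kernel body; inp/out match input_names/output_names below.
source = """
    uint elem = thread_position_in_grid.x;
    T tmp = inp[elem];
    out[elem] = metal::exp(tmp);
"""

kernel = mx.fast.metal_kernel(
    name="myexp",
    input_names=["inp"],
    output_names=["out"],
    source=source,
)

a = mx.random.normal(shape=(4096,))
outputs = kernel(
    inputs=[a],
    template=[("T", mx.float32)],
    grid=(a.size, 1, 1),       # mx.prod(grid) == a.size threads in total
    threadgroup=(256, 1, 1),   # subdivided into threadgroups of 256 threads
    output_shapes=[a.shape],
    output_dtypes=[a.dtype],
    verbose=True,              # print the generated Metal source for debugging
)
print(outputs[0])

Here mx.prod(grid) == 4096 threads are launched, split into sixteen threadgroups of 256 along the x dimension, and each threadgroup dimension stays within the corresponding grid dimension.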

diff --git a/docs/build/html/doxygen_crawl.html b/docs/build/html/doxygen_crawl.html index d4fc5d84c..0236914c1 100644 --- a/docs/build/html/doxygen_crawl.html +++ b/docs/build/html/doxygen_crawl.html @@ -4330,9 +4330,9 @@ + - @@ -4443,9 +4443,9 @@ + - @@ -4850,9 +4850,11 @@ + + @@ -4900,7 +4902,6 @@ - @@ -5272,7 +5273,6 @@ - @@ -5933,11 +5933,11 @@ + - @@ -5952,6 +5952,7 @@ + @@ -6016,8 +6017,10 @@ + - + + @@ -6051,7 +6054,7 @@ - + @@ -6929,6 +6932,7 @@ + diff --git a/docs/build/html/functions_func_m.html b/docs/build/html/functions_func_m.html index d4f90d50b..b812c59a5 100644 --- a/docs/build/html/functions_func_m.html +++ b/docs/build/html/functions_func_m.html @@ -93,6 +93,7 @@ $(function(){ initResizable(false); });
  • Matmul() : mlx::core::Matmul
  • max() : metal::_numeric_limits_impl< bfloat16_t >
  • Maximum() : mlx::core::Maximum
  • +
  • maybeInsertBarrier() : mlx::core::metal::CommandEncoder
  • merge_partition() : BlockMergeSort< val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp >, KernelMultiBlockMergeSort< val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp >
  • merge_step() : BlockMergeSort< val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp >
  • min() : metal::_numeric_limits_impl< bfloat16_t >
  • diff --git a/docs/build/html/functions_m.html b/docs/build/html/functions_m.html index 550ae601f..3be734fe6 100644 --- a/docs/build/html/functions_m.html +++ b/docs/build/html/functions_m.html @@ -102,6 +102,7 @@ $(function(){ initResizable(false); });
  • max_exponent : metal::_numeric_limits_impl< bfloat16_t >
  • max_exponent10 : metal::_numeric_limits_impl< bfloat16_t >
  • Maximum() : mlx::core::Maximum
  • +
  • maybeInsertBarrier() : mlx::core::metal::CommandEncoder
  • merge_partition() : BlockMergeSort< val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp >, KernelMultiBlockMergeSort< val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp >
  • merge_step() : BlockMergeSort< val_t, idx_t, ARG_SORT, BLOCK_THREADS, N_PER_THREAD, CompareOp >
  • Min : mlx::core::distributed::AllReduce, mlx::core::Reduce, mlx::core::Scan, mlx::core::Scatter
  • diff --git a/docs/build/html/globals_c.html b/docs/build/html/globals_c.html index f2a87df5d..a9eed8c2d 100644 --- a/docs/build/html/globals_c.html +++ b/docs/build/html/globals_c.html @@ -92,8 +92,10 @@ $(function(){ initResizable(false); });
  • can_convert_to_bfloat : bf16.h
  • can_convert_to_complex64 : complex.h
  • ceildiv() : utils.h
  • +
  • col_reduce_2pass() : reduce_col.h
  • +
  • col_reduce_longcolumn() : reduce_col.h
  • col_reduce_looped() : reduce_col.h
  • -
  • col_reduce_small() : reduce_col.h
  • +
  • col_reduce_small() : reduce_col.h
  • complex_binop : complex.h
  • complex_binop_helper : complex.h
  • complex_mul() : radix.h
  • diff --git a/docs/build/html/globals_func_c.html b/docs/build/html/globals_func_c.html index 11545a40d..227276f3e 100644 --- a/docs/build/html/globals_func_c.html +++ b/docs/build/html/globals_func_c.html @@ -88,8 +88,10 @@ $(function(){ initResizable(false); });

    - c -

    diff --git a/docs/build/html/globals_func_s.html b/docs/build/html/globals_func_s.html index d8ecab0f0..ce63bfa00 100644 --- a/docs/build/html/globals_func_s.html +++ b/docs/build/html/globals_func_s.html @@ -88,7 +88,7 @@ $(function(){ initResizable(false); });

    - s -

    diff --git a/docs/build/html/globals_s.html b/docs/build/html/globals_s.html index a46a742a1..00ccc52f6 100644 --- a/docs/build/html/globals_s.html +++ b/docs/build/html/globals_s.html @@ -89,7 +89,7 @@ $(function(){ initResizable(false); });

    - s -

    • scatter_impl() : scatter.h
    • scatter_kernels : indexing.h
    • -
    • sdpa_vector() : sdpa_vector.h
    • +
    • sdpa_vector() : sdpa_vector.h
    • simd_shuffle() : utils.h
    • simd_shuffle_and_fill_up() : utils.h
    • simd_shuffle_down() : utils.h
    • diff --git a/docs/build/html/kernels_8h.html b/docs/build/html/kernels_8h.html index 3f1bb1307..4a5c35d1c 100644 --- a/docs/build/html/kernels_8h.html +++ b/docs/build/html/kernels_8h.html @@ -129,8 +129,8 @@ Functions   MTL::ComputePipelineState * mlx::core::get_mb_sort_kernel (metal::Device &d, const std::string &kernel_name, const array &in, const array &idx, int bn, int tn)   -MTL::ComputePipelineState * mlx::core::get_reduce_init_kernel (metal::Device &d, const std::string &kernel_name, const array &out) -  +MTL::ComputePipelineState * mlx::core::get_reduce_init_kernel (metal::Device &d, const std::string &kernel_name, const std::string &func_name, const std::string &op_name, const array &out) +  MTL::ComputePipelineState * mlx::core::get_reduce_kernel (metal::Device &d, const std::string &kernel_name, const std::string &func_name, const std::string &op_name, const array &in, const array &out, int ndim=-1, int bm=-1, int bn=-1)   MTL::ComputePipelineState * mlx::core::get_steel_gemm_fused_kernel (metal::Device &d, const std::string &kernel_name, const std::string &hash_name, const metal::MTLFCList &func_consts, const array &out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn) diff --git a/docs/build/html/kernels_8h_source.html b/docs/build/html/kernels_8h_source.html index 93d7a928c..de9608168 100644 --- a/docs/build/html/kernels_8h_source.html +++ b/docs/build/html/kernels_8h_source.html @@ -169,152 +169,154 @@ $(function(){ initResizable(false); });
      76 int bn,
      77 int tn);
      78
      -
      79MTL::ComputePipelineState* get_reduce_init_kernel(
      +
      79MTL::ComputePipelineState* get_reduce_init_kernel(
      81 const std::string& kernel_name,
      -
      82 const array& out);
      -
      83
      -
      84MTL::ComputePipelineState* get_reduce_kernel(
      - -
      86 const std::string& kernel_name,
      -
      87 const std::string& func_name,
      -
      88 const std::string& op_name,
      -
      89 const array& in,
      -
      90 const array& out,
      -
      91 int ndim = -1,
      -
      92 int bm = -1,
      -
      93 int bn = -1);
      -
      94
      -
      95MTL::ComputePipelineState* get_steel_gemm_fused_kernel(
      - -
      97 const std::string& kernel_name,
      -
      98 const std::string& hash_name,
      -
      99 const metal::MTLFCList& func_consts,
      -
      100 const array& out,
      -
      101 bool transpose_a,
      -
      102 bool transpose_b,
      -
      103 int bm,
      -
      104 int bn,
      -
      105 int bk,
      -
      106 int wm,
      -
      107 int wn);
      -
      108
      -
      109MTL::ComputePipelineState* get_steel_gemm_splitk_kernel(
      -
      110 metal::Device& d,
      -
      111 const std::string& kernel_name,
      -
      112 const array& in,
      -
      113 const array& out,
      -
      114 bool transpose_a,
      -
      115 bool transpose_b,
      -
      116 int bm,
      -
      117 int bn,
      -
      118 int bk,
      -
      119 int wm,
      -
      120 int wn,
      -
      121 bool mn_aligned,
      -
      122 bool k_aligned);
      -
      123
      -
      124MTL::ComputePipelineState* get_steel_gemm_splitk_accum_kernel(
      -
      125 metal::Device& d,
      -
      126 const std::string& kernel_name,
      -
      127 const array& in,
      -
      128 const array& out,
      -
      129 bool axbpy);
      -
      130
      -
      131MTL::ComputePipelineState* get_steel_gemm_masked_kernel(
      -
      132 metal::Device& d,
      -
      133 const std::string& kernel_name,
      -
      134 const array& out,
      -
      135 const std::optional<array>& mask_out,
      -
      136 const std::optional<array>& mask_op,
      -
      137 bool transpose_a,
      -
      138 bool transpose_b,
      -
      139 int bm,
      -
      140 int bn,
      -
      141 int bk,
      -
      142 int wm,
      -
      143 int wn,
      -
      144 bool mn_aligned,
      -
      145 bool k_aligned);
      -
      146
      -
      147MTL::ComputePipelineState* get_steel_conv_kernel(
      -
      148 metal::Device& d,
      -
      149 const std::string& kernel_name,
      -
      150 const array& out,
      -
      151 int bm,
      -
      152 int bn,
      -
      153 int bk,
      -
      154 int wm,
      -
      155 int wn,
      -
      156 int n_channel_specialization,
      -
      157 bool small_filter);
      -
      158
      -
      159MTL::ComputePipelineState* get_gemv_masked_kernel(
      -
      160 metal::Device& d,
      -
      161 const std::string& kernel_name,
      -
      162 const array& out,
      -
      163 const std::optional<array>& mask_out,
      -
      164 const std::optional<array>& mask_op,
      -
      165 bool transpose_mat,
      -
      166 int bm,
      -
      167 int bn,
      -
      168 int sm,
      -
      169 int sn,
      -
      170 int tm,
      -
      171 int tn,
      -
      172 bool contiguous);
      -
      173
      -
      174MTL::ComputePipelineState* get_steel_conv_general_kernel(
      -
      175 metal::Device& d,
      -
      176 const std::string& kernel_name,
      -
      177 const array& out,
      -
      178 int bm,
      -
      179 int bn,
      -
      180 int bk,
      -
      181 int wm,
      -
      182 int wn);
      -
      183
      -
      184MTL::ComputePipelineState* get_fft_kernel(
      -
      185 metal::Device& d,
      -
      186 const std::string& kernel_name,
      -
      187 const std::string& hash_name,
      -
      188 const metal::MTLFCList& func_consts,
      -
      189 const std::string& template_def);
      -
      190
      -
      191MTL::ComputePipelineState* get_quantized_kernel(
      -
      192 metal::Device& d,
      -
      193 const std::string& kernel_name,
      -
      194 const std::string& template_def);
      -
      195
      -
      196// Create a GPU kernel template definition for JIT compilation
      -
      197template <typename... Args>
      -
      198std::string
      -
      -
      199get_template_definition(std::string name, std::string func, Args... args) {
      -
      200 std::ostringstream s;
      -
      201 s << func << "<";
      -
      202 bool first = true;
      -
      203 auto add_arg = [&s, &first](const auto& arg) {
      -
      204 if (!first) {
      -
      205 s << ", ";
      -
      206 }
      -
      207 first = false;
      -
      208 s << arg;
      -
      209 };
      -
      210 (add_arg(args), ...);
      -
      211 s << ">";
      -
      212 return fmt::format(
      -
      213 "\ntemplate [[host_name(\"{0}\")]] [[kernel]] decltype({1}) {1};\n",
      -
      214 name,
      -
      215 s.str());
      -
      216}
      +
      82 const std::string& func_name,
      +
      83 const std::string& op_name,
      +
      84 const array& out);
      +
      85
      +
      86MTL::ComputePipelineState* get_reduce_kernel(
      + +
      88 const std::string& kernel_name,
      +
      89 const std::string& func_name,
      +
      90 const std::string& op_name,
      +
      91 const array& in,
      +
      92 const array& out,
      +
      93 int ndim = -1,
      +
      94 int bm = -1,
      +
      95 int bn = -1);
      +
      96
      +
      97MTL::ComputePipelineState* get_steel_gemm_fused_kernel(
      + +
      99 const std::string& kernel_name,
      +
      100 const std::string& hash_name,
      +
      101 const metal::MTLFCList& func_consts,
      +
      102 const array& out,
      +
      103 bool transpose_a,
      +
      104 bool transpose_b,
      +
      105 int bm,
      +
      106 int bn,
      +
      107 int bk,
      +
      108 int wm,
      +
      109 int wn);
      +
      110
      +
      111MTL::ComputePipelineState* get_steel_gemm_splitk_kernel(
      +
      112 metal::Device& d,
      +
      113 const std::string& kernel_name,
      +
      114 const array& in,
      +
      115 const array& out,
      +
      116 bool transpose_a,
      +
      117 bool transpose_b,
      +
      118 int bm,
      +
      119 int bn,
      +
      120 int bk,
      +
      121 int wm,
      +
      122 int wn,
      +
      123 bool mn_aligned,
      +
      124 bool k_aligned);
      +
      125
      +
      126MTL::ComputePipelineState* get_steel_gemm_splitk_accum_kernel(
      +
      127 metal::Device& d,
      +
      128 const std::string& kernel_name,
      +
      129 const array& in,
      +
      130 const array& out,
      +
      131 bool axbpy);
      +
      132
      +
      133MTL::ComputePipelineState* get_steel_gemm_masked_kernel(
      +
      134 metal::Device& d,
      +
      135 const std::string& kernel_name,
      +
      136 const array& out,
      +
      137 const std::optional<array>& mask_out,
      +
      138 const std::optional<array>& mask_op,
      +
      139 bool transpose_a,
      +
      140 bool transpose_b,
      +
      141 int bm,
      +
      142 int bn,
      +
      143 int bk,
      +
      144 int wm,
      +
      145 int wn,
      +
      146 bool mn_aligned,
      +
      147 bool k_aligned);
      +
      148
      +
      149MTL::ComputePipelineState* get_steel_conv_kernel(
      +
      150 metal::Device& d,
      +
      151 const std::string& kernel_name,
      +
      152 const array& out,
      +
      153 int bm,
      +
      154 int bn,
      +
      155 int bk,
      +
      156 int wm,
      +
      157 int wn,
      +
      158 int n_channel_specialization,
      +
      159 bool small_filter);
      +
      160
      +
      161MTL::ComputePipelineState* get_gemv_masked_kernel(
      +
      162 metal::Device& d,
      +
      163 const std::string& kernel_name,
      +
      164 const array& out,
      +
      165 const std::optional<array>& mask_out,
      +
      166 const std::optional<array>& mask_op,
      +
      167 bool transpose_mat,
      +
      168 int bm,
      +
      169 int bn,
      +
      170 int sm,
      +
      171 int sn,
      +
      172 int tm,
      +
      173 int tn,
      +
      174 bool contiguous);
      +
      175
      +
      176MTL::ComputePipelineState* get_steel_conv_general_kernel(
      +
      177 metal::Device& d,
      +
      178 const std::string& kernel_name,
      +
      179 const array& out,
      +
      180 int bm,
      +
      181 int bn,
      +
      182 int bk,
      +
      183 int wm,
      +
      184 int wn);
      +
      185
      +
      186MTL::ComputePipelineState* get_fft_kernel(
      +
      187 metal::Device& d,
      +
      188 const std::string& kernel_name,
      +
      189 const std::string& hash_name,
      +
      190 const metal::MTLFCList& func_consts,
      +
      191 const std::string& template_def);
      +
      192
      +
      193MTL::ComputePipelineState* get_quantized_kernel(
      +
      194 metal::Device& d,
      +
      195 const std::string& kernel_name,
      +
      196 const std::string& template_def);
      +
      197
      +
      198// Create a GPU kernel template definition for JIT compilation
      +
      199template <typename... Args>
      +
      200std::string
      +
      +
      201get_template_definition(std::string name, std::string func, Args... args) {
      +
      202 std::ostringstream s;
      +
      203 s << func << "<";
      +
      204 bool first = true;
      +
      205 auto add_arg = [&s, &first](const auto& arg) {
      +
      206 if (!first) {
      +
      207 s << ", ";
      +
      208 }
      +
      209 first = false;
      +
      210 s << arg;
      +
      211 };
      +
      212 (add_arg(args), ...);
      +
      213 s << ">";
      +
      214 return fmt::format(
      +
      215 "\ntemplate [[host_name(\"{0}\")]] [[kernel]] decltype({1}) {1};\n",
      +
      216 name,
      +
      217 s.str());
      +
      218}
      -
      217
      -
      218} // namespace mlx::core
      +
      219
      +
      220} // namespace mlx::core
      Definition array.h:20
      -
      Definition device.h:128
      +
      Definition device.h:131
      Op op
      Definition binary.h:129
      std::vector< std::tuple< const void *, MTL::DataType, NS::UInteger > > MTLFCList
      Definition device.h:38
      Definition allocator.h:7
      @@ -322,9 +324,9 @@ $(function(){ initResizable(false); });
      MTL::ComputePipelineState * get_steel_gemm_splitk_accum_kernel(metal::Device &d, const std::string &kernel_name, const array &in, const array &out, bool axbpy)
      MTL::ComputePipelineState * get_fft_kernel(metal::Device &d, const std::string &kernel_name, const std::string &hash_name, const metal::MTLFCList &func_consts, const std::string &template_def)
      MTL::ComputePipelineState * get_softmax_kernel(metal::Device &d, const std::string &kernel_name, bool precise, const array &out)
      +
      MTL::ComputePipelineState * get_reduce_init_kernel(metal::Device &d, const std::string &kernel_name, const std::string &func_name, const std::string &op_name, const array &out)
      MTL::ComputePipelineState * get_binary_kernel(metal::Device &d, const std::string &kernel_name, Dtype in_type, Dtype out_type, const std::string op)
      MTL::ComputePipelineState * get_binary_two_kernel(metal::Device &d, const std::string &kernel_name, Dtype in_type, Dtype out_type, const std::string op)
      -
      MTL::ComputePipelineState * get_reduce_init_kernel(metal::Device &d, const std::string &kernel_name, const array &out)
      MTL::ComputePipelineState * get_ternary_kernel(metal::Device &d, const std::string &kernel_name, Dtype type, const std::string op)
      MTL::ComputePipelineState * get_arange_kernel(metal::Device &d, const std::string &kernel_name, const array &out)
      MTL::ComputePipelineState * get_reduce_kernel(metal::Device &d, const std::string &kernel_name, const std::string &func_name, const std::string &op_name, const array &in, const array &out, int ndim=-1, int bm=-1, int bn=-1)
      @@ -332,7 +334,7 @@ $(function(){ initResizable(false); });
      MTL::ComputePipelineState * get_steel_gemm_fused_kernel(metal::Device &d, const std::string &kernel_name, const std::string &hash_name, const metal::MTLFCList &func_consts, const array &out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn)
      MTL::ComputePipelineState * get_gemv_masked_kernel(metal::Device &d, const std::string &kernel_name, const array &out, const std::optional< array > &mask_out, const std::optional< array > &mask_op, bool transpose_mat, int bm, int bn, int sm, int sn, int tm, int tn, bool contiguous)
      MTL::ComputePipelineState * get_quantized_kernel(metal::Device &d, const std::string &kernel_name, const std::string &template_def)
      -
      std::string get_template_definition(std::string name, std::string func, Args... args)
      Definition kernels.h:199
      +
      std::string get_template_definition(std::string name, std::string func, Args... args)
      Definition kernels.h:201
      MTL::ComputePipelineState * get_steel_gemm_masked_kernel(metal::Device &d, const std::string &kernel_name, const array &out, const std::optional< array > &mask_out, const std::optional< array > &mask_op, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn, bool mn_aligned, bool k_aligned)
      MTL::ComputePipelineState * get_steel_conv_general_kernel(metal::Device &d, const std::string &kernel_name, const array &out, int bm, int bn, int bk, int wm, int wn)
      MTL::ComputePipelineState * get_steel_conv_kernel(metal::Device &d, const std::string &kernel_name, const array &out, int bm, int bn, int bk, int wm, int wn, int n_channel_specialization, bool small_filter)
      diff --git a/docs/build/html/matmul_8h_source.html b/docs/build/html/matmul_8h_source.html index 4258466e4..ce4ccfe53 100644 --- a/docs/build/html/matmul_8h_source.html +++ b/docs/build/html/matmul_8h_source.html @@ -143,7 +143,7 @@ $(function(){ initResizable(false); });
      50} // namespace mlx::core
      Definition array.h:20
      -
      Definition device.h:128
      +
      Definition device.h:131
      Definition allocator.h:7
      void steel_matmul_regular(const Stream &s, metal::Device &d, const array &a, const array &b, array &out, int M, int N, int K, int batch_size_out, int lda, int ldb, int ldd, bool transpose_a, bool transpose_b, std::vector< int > batch_shape, std::vector< size_t > batch_strides, size_t A_batch_stride, size_t B_batch_stride, size_t matrix_stride_out, std::vector< array > &copies)
      void steel_matmul(const Stream &s, metal::Device &d, const array &a, const array &b, array &out, int M, int N, int K, int batch_size_out, int lda, int ldb, bool transpose_a, bool transpose_b, std::vector< array > &copies, std::vector< int > batch_shape={}, std::vector< size_t > A_batch_stride={}, std::vector< size_t > B_batch_stride={})
      diff --git a/docs/build/html/metal_2reduce_8h.html b/docs/build/html/metal_2reduce_8h.html index 43294f485..d08ed5e85 100644 --- a/docs/build/html/metal_2reduce_8h.html +++ b/docs/build/html/metal_2reduce_8h.html @@ -109,8 +109,8 @@ Namespaces - - + + diff --git a/docs/build/html/metal_2reduce_8h_source.html b/docs/build/html/metal_2reduce_8h_source.html index 9dfc6469d..79e30d6a2 100644 --- a/docs/build/html/metal_2reduce_8h_source.html +++ b/docs/build/html/metal_2reduce_8h_source.html @@ -103,44 +103,43 @@ $(function(){ initResizable(false); });
      10
      11using metal::CommandEncoder;
      12
      - +
      14 const array& in,
      15 array& out,
      16 const std::string& op_name,
      17 CommandEncoder& compute_encoder,
      -
      19 const Stream& s,
      -
      20 std::vector<array>& copies);
      -
      21
      - -
      23 const array& in,
      -
      24 array& out,
      -
      25 const std::string& op_name,
      -
      26 const ReductionPlan& plan,
      -
      27 const std::vector<int>& axes,
      -
      28 CommandEncoder& compute_encoder,
      - -
      30 const Stream& s);
      -
      31
      - -
      33 const array& in,
      -
      34 array& out,
      -
      35 const std::string& op_name,
      -
      36 const ReductionPlan& plan,
      -
      37 const std::vector<int>& axes,
      -
      38 CommandEncoder& compute_encoder,
      - -
      40 const Stream& s);
      -
      41
      -
      42} // namespace mlx::core
      +
      19 const Stream& s);
      +
      20
      + +
      22 const array& in,
      +
      23 array& out,
      +
      24 const std::string& op_name,
      +
      25 const ReductionPlan& plan,
      +
      26 const std::vector<int>& axes,
      +
      27 CommandEncoder& compute_encoder,
      + +
      29 const Stream& s);
      +
      30
      + +
      32 const array& in,
      +
      33 array& out,
      +
      34 const std::string& op_name,
      +
      35 const ReductionPlan& plan,
      +
      36 const std::vector<int>& axes,
      +
      37 CommandEncoder& compute_encoder,
      + +
      39 const Stream& s);
      +
      40
      +
      41} // namespace mlx::core
      Definition array.h:20
      -
      Definition device.h:128
      +
      Definition device.h:131
      Definition allocator.h:7
      +
      void all_reduce_dispatch(const array &in, array &out, const std::string &op_name, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
      void strided_reduce_general_dispatch(const array &in, array &out, const std::string &op_name, const ReductionPlan &plan, const std::vector< int > &axes, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
      void row_reduce_general_dispatch(const array &in, array &out, const std::string &op_name, const ReductionPlan &plan, const std::vector< int > &axes, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
      -
      void all_reduce_dispatch(const array &in, array &out, const std::string &op_name, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s, std::vector< array > &copies)
      Definition reduce.h:39
      Definition stream.h:9
      diff --git a/docs/build/html/namespacemembers.html b/docs/build/html/namespacemembers.html index ffff1cc90..5625555af 100644 --- a/docs/build/html/namespacemembers.html +++ b/docs/build/html/namespacemembers.html @@ -99,7 +99,7 @@ $(function(){ initResizable(false); });
    • aligned_dealloc() : pocketfft::detail
    • all() : mlx::core
    • all_gather() : mlx::core::distributed, mlx::core::distributed::detail
    • -
    • all_reduce_dispatch() : mlx::core
    • +
    • all_reduce_dispatch() : mlx::core
    • all_sum() : mlx::core::distributed, mlx::core::distributed::detail
    • allclose() : mlx::core
    • alloc_tmp() : pocketfft::detail
    • diff --git a/docs/build/html/namespacemembers_func.html b/docs/build/html/namespacemembers_func.html index 9e16a9a69..e838b45c1 100644 --- a/docs/build/html/namespacemembers_func.html +++ b/docs/build/html/namespacemembers_func.html @@ -98,7 +98,7 @@ $(function(){ initResizable(false); });
    • aligned_dealloc() : pocketfft::detail
    • all() : mlx::core
    • all_gather() : mlx::core::distributed, mlx::core::distributed::detail
    • -
    • all_reduce_dispatch() : mlx::core
    • +
    • all_reduce_dispatch() : mlx::core
    • all_sum() : mlx::core::distributed, mlx::core::distributed::detail
    • allclose() : mlx::core
    • alloc_tmp() : pocketfft::detail
    • diff --git a/docs/build/html/namespacemembers_func_g.html b/docs/build/html/namespacemembers_func_g.html index 5e2c379ff..1c777baeb 100644 --- a/docs/build/html/namespacemembers_func_g.html +++ b/docs/build/html/namespacemembers_func_g.html @@ -112,7 +112,7 @@ $(function(){ initResizable(false); });
    • get_pool() : pocketfft::detail::threading
    • get_primitive_string() : mlx::core
    • get_quantized_kernel() : mlx::core
    • -
    • get_reduce_init_kernel() : mlx::core
    • +
    • get_reduce_init_kernel() : mlx::core
    • get_reduce_kernel() : mlx::core
    • get_reduction_plan() : mlx::core
    • get_scan_kernel() : mlx::core
    • diff --git a/docs/build/html/namespacemembers_g.html b/docs/build/html/namespacemembers_g.html index 73be65412..0bbdf9db4 100644 --- a/docs/build/html/namespacemembers_g.html +++ b/docs/build/html/namespacemembers_g.html @@ -116,7 +116,7 @@ $(function(){ initResizable(false); });
    • get_pool() : pocketfft::detail::threading
    • get_primitive_string() : mlx::core
    • get_quantized_kernel() : mlx::core
    • -
    • get_reduce_init_kernel() : mlx::core
    • +
    • get_reduce_init_kernel() : mlx::core
    • get_reduce_kernel() : mlx::core
    • get_reduction_plan() : mlx::core
    • get_scan_kernel() : mlx::core
    • diff --git a/docs/build/html/namespacemlx_1_1core.html b/docs/build/html/namespacemlx_1_1core.html index cef5d8c69..09ca616f9 100644 --- a/docs/build/html/namespacemlx_1_1core.html +++ b/docs/build/html/namespacemlx_1_1core.html @@ -534,8 +534,8 @@ Functions - - + + @@ -563,8 +563,8 @@ Functions - - + + @@ -2634,8 +2634,8 @@ template<typename... T>

      Function Documentation

      - -

      ◆ all_reduce_dispatch()

      + +

      ◆ all_reduce_dispatch()

      @@ -2668,12 +2668,7 @@ template<typename... T>
      - - - - - - +

      Functions

      void mlx::core::all_reduce_dispatch (const array &in, array &out, const std::string &op_name, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s, std::vector< array > &copies)
       
      void mlx::core::all_reduce_dispatch (const array &in, array &out, const std::string &op_name, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
       
      void mlx::core::row_reduce_general_dispatch (const array &in, array &out, const std::string &op_name, const ReductionPlan &plan, const std::vector< int > &axes, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
       
      void mlx::core::strided_reduce_general_dispatch (const array &in, array &out, const std::string &op_name, const ReductionPlan &plan, const std::vector< int > &axes, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
       
      MTL::ComputePipelineState * get_mb_sort_kernel (metal::Device &d, const std::string &kernel_name, const array &in, const array &idx, int bn, int tn)
       
      MTL::ComputePipelineState * get_reduce_init_kernel (metal::Device &d, const std::string &kernel_name, const array &out)
       
      MTL::ComputePipelineState * get_reduce_init_kernel (metal::Device &d, const std::string &kernel_name, const std::string &func_name, const std::string &op_name, const array &out)
       
      MTL::ComputePipelineState * get_reduce_kernel (metal::Device &d, const std::string &kernel_name, const std::string &func_name, const std::string &op_name, const array &in, const array &out, int ndim=-1, int bm=-1, int bn=-1)
       
      MTL::ComputePipelineState * get_steel_gemm_fused_kernel (metal::Device &d, const std::string &kernel_name, const std::string &hash_name, const metal::MTLFCList &func_consts, const array &out, bool transpose_a, bool transpose_b, int bm, int bn, int bk, int wm, int wn)
       
      void steel_matmul (const Stream &s, metal::Device &d, const array &a, const array &b, array &out, int M, int N, int K, int batch_size_out, int lda, int ldb, bool transpose_a, bool transpose_b, std::vector< array > &copies, std::vector< int > batch_shape={}, std::vector< size_t > A_batch_stride={}, std::vector< size_t > B_batch_stride={})
       
      void all_reduce_dispatch (const array &in, array &out, const std::string &op_name, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s, std::vector< array > &copies)
       
      void all_reduce_dispatch (const array &in, array &out, const std::string &op_name, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
       
      void row_reduce_general_dispatch (const array &in, array &out, const std::string &op_name, const ReductionPlan &plan, const std::vector< int > &axes, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
       
      void strided_reduce_general_dispatch (const array &in, array &out, const std::string &op_name, const ReductionPlan &plan, const std::vector< int > &axes, CommandEncoder &compute_encoder, metal::Device &d, const Stream &s)
      const Stream & s,
      std::vector< array > & copies )const Stream & s )
      @@ -4418,8 +4413,8 @@ template<typename... Arrays, typename = enable_for_arrays_t<Arrays...>
    - -

    ◆ get_reduce_init_kernel()

    + +

    ◆ get_reduce_init_kernel()

    @@ -4434,6 +4429,16 @@ template<typename... Arrays, typename = enable_for_arrays_t<Arrays...> const std::string & kernel_name, + + + + const std::string & func_name, + + + + + const std::string & op_name, + diff --git a/docs/build/html/objects.inv b/docs/build/html/objects.inv index c31b0569f..5bbbf441f 100644 Binary files a/docs/build/html/objects.inv and b/docs/build/html/objects.inv differ diff --git a/docs/build/html/python/_autosummary/mlx.core.fast.metal_kernel.html b/docs/build/html/python/_autosummary/mlx.core.fast.metal_kernel.html index 085ae4f4a..a69f99808 100644 --- a/docs/build/html/python/_autosummary/mlx.core.fast.metal_kernel.html +++ b/docs/build/html/python/_autosummary/mlx.core.fast.metal_kernel.html @@ -867,6 +867,7 @@
    metal_kernel(name: str, input_names: Sequence[str], output_names: Sequence[str], source: str, header: str = '', ensure_row_contiguous: bool = True, atomic_outputs: bool = False) object#

    A jit-compiled custom Metal kernel defined from a source string.

    +

    Full documentation: Custom Metal Kernels.

    Parameters:
      diff --git a/docs/build/html/quantized_8h.html b/docs/build/html/quantized_8h.html index f865582b9..c0e18903c 100644 --- a/docs/build/html/quantized_8h.html +++ b/docs/build/html/quantized_8h.html @@ -140,9 +140,9 @@ Functions template<typename T , int group_size, int bits> METAL_FUNC void qmv_impl (const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)   -template<typename T , const int group_size, const int bits> -METAL_FUNC void qvm_impl (const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, uint3 tid, uint simd_gid, uint simd_lid) -  +template<typename T , const int group_size, const int bits> +METAL_FUNC void qvm_impl (const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const int in_vec_size, const int out_vec_size, uint3 tid, uint simd_gid, uint simd_lid) +  template<typename T , const int group_size, const int bits, const bool aligned_N, const int BM = 32, const int BK = 32, const int BN = 32> METAL_FUNC void qmm_t_impl (const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, threadgroup T *Xs, threadgroup T *Ws, const constant int &K, const constant int &N, const constant int &M, uint3 tid, uint lid, uint simd_gid, uint simd_lid)   @@ -167,6 +167,9 @@ Functions template<typename T , const int group_size, const int bits, bool batched> void qvm (const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)   +template<typename T , const int group_size, const int bits, int split_k = 32> +void qvm_split_k (const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &final_block_size, uint3 tid, uint simd_gid, uint simd_lid) +  template<typename T , const int group_size, const int bits, const bool aligned_N, const bool batched, const int BM = 32, const int BK = 32, const int BN = 32> void qmm_t (const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &K, const constant int &N, const constant int &M, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)   @@ -2485,8 +2488,8 @@ template<typename T , const int group_size, const int bits, bool batched>
    - -

    ◆ qvm_impl()

    + +

    ◆ qvm_impl()

    @@ -2518,6 +2521,69 @@ template<typename T , const int group_size, const int bits>
    device T * y, + + + + const int in_vec_size, + + + + + const int out_vec_size, + + + + + uint3 tid, + + + + + uint simd_gid, + + + + + uint simd_lid ) + + +
    + +
    +
    + +

    ◆ qvm_split_k()

    + +
    +
    +
    +template<typename T , const int group_size, const int bits, int split_k = 32>
    + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -2528,6 +2594,51 @@ template<typename T , const int group_size, const int bits> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/build/html/quantized_8h_source.html b/docs/build/html/quantized_8h_source.html index 07b25d33f..0a08db757 100644 --- a/docs/build/html/quantized_8h_source.html +++ b/docs/build/html/quantized_8h_source.html @@ -766,14 +766,14 @@ $(function(){ initResizable(false); });
    645
    646template <typename T, const int group_size, const int bits>
    -
    647METAL_FUNC void qvm_impl(
    +
    647METAL_FUNC void qvm_impl(
    648 const device uint32_t* w,
    649 const device T* scales,
    650 const device T* biases,
    651 const device T* x,
    652 device T* y,
    -
    653 const constant int& in_vec_size,
    -
    654 const constant int& out_vec_size,
    +
    653 const int in_vec_size,
    +
    654 const int out_vec_size,
    655 uint3 tid [[threadgroup_position_in_grid]],
    656 uint simd_gid [[simdgroup_index_in_threadgroup]],
    657 uint simd_lid [[thread_index_in_simdgroup]]) {
    @@ -1423,7 +1423,7 @@ $(function(){ initResizable(false); });
    1285 b_strides,
    1286 tid);
    1287 }
    - +
    1289 w,
    1290 scales,
    1291 biases,
    @@ -1437,610 +1437,667 @@ $(function(){ initResizable(false); });
    1299}
    1300
    -
    1301template <
    -
    1302 typename T,
    -
    1303 const int group_size,
    -
    1304 const int bits,
    -
    1305 const bool aligned_N,
    -
    1306 const bool batched,
    -
    1307 const int BM = 32,
    -
    1308 const int BK = 32,
    -
    1309 const int BN = 32>
    -
    -
    1310[[kernel]] void qmm_t(
    -
    1311 const device uint32_t* w [[buffer(0)]],
    -
    1312 const device T* scales [[buffer(1)]],
    -
    1313 const device T* biases [[buffer(2)]],
    -
    1314 const device T* x [[buffer(3)]],
    -
    1315 device T* y [[buffer(4)]],
    -
    1316 const constant int& K [[buffer(5)]],
    -
    1317 const constant int& N [[buffer(6)]],
    -
    1318 const constant int& M [[buffer(7)]],
    -
    1319 const constant int& x_batch_ndims [[buffer(8)]],
    -
    1320 const constant int* x_shape [[buffer(9)]],
    -
    1321 const constant size_t* x_strides [[buffer(10)]],
    -
    1322 const constant int& w_batch_ndims [[buffer(11)]],
    -
    1323 const constant int* w_shape [[buffer(12)]],
    -
    1324 const constant size_t* w_strides [[buffer(13)]],
    -
    1325 const constant size_t* s_strides [[buffer(14)]],
    -
    1326 const constant size_t* b_strides [[buffer(15)]],
    -
    1327 uint3 tid [[threadgroup_position_in_grid]],
    -
    1328 uint lid [[thread_index_in_threadgroup]],
    -
    1329 uint simd_gid [[simdgroup_index_in_threadgroup]],
    -
    1330 uint simd_lid [[thread_index_in_simdgroup]]) {
    -
    1331 (void)lid;
    -
    1332
    -
    1333 constexpr int BK_padded = (BK + 16 / sizeof(T));
    -
    1334
    -
    1335 threadgroup T Xs[BM * BK_padded];
    -
    1336 threadgroup T Ws[BN * BK_padded];
    -
    1337
    -
    1338 if (batched) {
    - -
    1340 x,
    -
    1341 w,
    -
    1342 scales,
    -
    1343 biases,
    -
    1344 y,
    -
    1345 M * N,
    -
    1346 x_batch_ndims,
    -
    1347 x_shape,
    -
    1348 x_strides,
    -
    1349 w_batch_ndims,
    -
    1350 w_shape,
    -
    1351 w_strides,
    -
    1352 s_strides,
    -
    1353 b_strides,
    -
    1354 tid);
    -
    1355 }
    - -
    1357 w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
    -
    1358}
    +
    1301template <typename T, const int group_size, const int bits, int split_k = 32>
    +
    +
    1302[[kernel]] void qvm_split_k(
    +
    1303 const device uint32_t* w [[buffer(0)]],
    +
    1304 const device T* scales [[buffer(1)]],
    +
    1305 const device T* biases [[buffer(2)]],
    +
    1306 const device T* x [[buffer(3)]],
    +
    1307 device T* y [[buffer(4)]],
    +
    1308 const constant int& in_vec_size [[buffer(5)]],
    +
    1309 const constant int& out_vec_size [[buffer(6)]],
    +
    1310 const constant int& x_batch_ndims [[buffer(7)]],
    +
    1311 const constant int* x_shape [[buffer(8)]],
    +
    1312 const constant size_t* x_strides [[buffer(9)]],
    +
    1313 const constant int& w_batch_ndims [[buffer(10)]],
    +
    1314 const constant int* w_shape [[buffer(11)]],
    +
    1315 const constant size_t* w_strides [[buffer(12)]],
    +
    1316 const constant size_t* s_strides [[buffer(13)]],
    +
    1317 const constant size_t* b_strides [[buffer(14)]],
    +
    1318 const constant int& final_block_size [[buffer(15)]],
    +
    1319 uint3 tid [[threadgroup_position_in_grid]],
    +
    1320 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    1321 uint simd_lid [[thread_index_in_simdgroup]]) {
    + +
    1323 x,
    +
    1324 w,
    +
    1325 scales,
    +
    1326 biases,
    +
    1327 y,
    +
    1328 out_vec_size,
    +
    1329 x_batch_ndims,
    +
    1330 x_shape,
    +
    1331 x_strides,
    +
    1332 w_batch_ndims,
    +
    1333 w_shape,
    +
    1334 w_strides,
    +
    1335 s_strides,
    +
    1336 b_strides,
    +
    1337 tid);
    +
    1338
    +
    1339 // When (in_vec_size % split_k != 0) the final block needs to be smaller
    +
    1340 int in_vec_size_adj =
    +
    1341 tid.z % split_k == split_k - 1 ? final_block_size : in_vec_size;
    +
    1342
    + +
    1344 w,
    +
    1345 scales,
    +
    1346 biases,
    +
    1347 x,
    +
    1348 y,
    +
    1349 in_vec_size_adj,
    +
    1350 out_vec_size,
    +
    1351 tid,
    +
    1352 simd_gid,
    +
    1353 simd_lid);
    +
    1354}
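
The block above is the newly added split-k variant of the quantized vector-matrix kernel: `tid.z` selects one of `split_k` partitions of the reduction dimension, and the last partition uses `final_block_size` whenever the reduction length does not divide evenly (see the comment at line 1339). The following is a minimal CPU sketch of that decomposition, illustration only and not MLX host code; the sizes `K = 70` and `split_k = 8` are made up.

```cpp
#include <cstdio>
#include <vector>

// Split-k on the CPU: a length-K dot product is cut into split_k chunks,
// each chunk produces a partial sum, and the partials are reduced afterwards.
// The last chunk is shorter whenever K is not a multiple of the chunk size,
// which is the role of the kernel's final_block_size argument.
int main() {
  const int K = 70;
  const int split_k = 8;
  std::vector<float> x(K, 1.0f), w(K, 2.0f);

  const int block = (K + split_k - 1) / split_k;      // 9 elements per chunk
  const int final_block = K - block * (split_k - 1);  // 70 - 63 = 7 for the last chunk

  std::vector<float> partials(split_k, 0.0f);
  for (int b = 0; b < split_k; ++b) {
    const int len = (b == split_k - 1) ? final_block : block;
    for (int i = 0; i < len; ++i) {
      partials[b] += x[b * block + i] * w[b * block + i];
    }
  }

  float total = 0.0f;
  for (float p : partials) {
    total += p;  // second-stage reduction over the split_k partial results
  }
  std::printf("split-k result = %.1f (expected %.1f)\n", total, 2.0f * K);
  return 0;
}
```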
    -
    1359
    -
    1360template <
    -
    1361 typename T,
    -
    1362 const int group_size,
    -
    1363 const int bits,
    -
    1364 const bool batched,
    -
    1365 const int BM = 32,
    -
    1366 const int BK = 32,
    -
    1367 const int BN = 32>
    -
    -
    1368[[kernel]] void qmm_n(
    -
    1369 const device uint32_t* w [[buffer(0)]],
    -
    1370 const device T* scales [[buffer(1)]],
    -
    1371 const device T* biases [[buffer(2)]],
    -
    1372 const device T* x [[buffer(3)]],
    -
    1373 device T* y [[buffer(4)]],
    -
    1374 const constant int& K [[buffer(5)]],
    -
    1375 const constant int& N [[buffer(6)]],
    -
    1376 const constant int& M [[buffer(7)]],
    -
    1377 const constant int& x_batch_ndims [[buffer(8)]],
    -
    1378 const constant int* x_shape [[buffer(9)]],
    -
    1379 const constant size_t* x_strides [[buffer(10)]],
    -
    1380 const constant int& w_batch_ndims [[buffer(11)]],
    -
    1381 const constant int* w_shape [[buffer(12)]],
    -
    1382 const constant size_t* w_strides [[buffer(13)]],
    -
    1383 const constant size_t* s_strides [[buffer(14)]],
    -
    1384 const constant size_t* b_strides [[buffer(15)]],
    -
    1385 uint3 tid [[threadgroup_position_in_grid]],
    -
    1386 uint lid [[thread_index_in_threadgroup]],
    -
    1387 uint simd_gid [[simdgroup_index_in_threadgroup]],
    -
    1388 uint simd_lid [[thread_index_in_simdgroup]]) {
    -
    1389 (void)lid;
    -
    1390
    -
    1391 constexpr int BK_padded = (BK + 16 / sizeof(T));
    -
    1392 constexpr int BN_padded = (BN + 16 / sizeof(T));
    -
    1393
    -
    1394 threadgroup T Xs[BM * BK_padded];
    -
    1395 threadgroup T Ws[BK * BN_padded];
    -
    1396
    -
    1397 if (batched) {
    - -
    1399 x,
    -
    1400 w,
    -
    1401 scales,
    -
    1402 biases,
    -
    1403 y,
    -
    1404 M * N,
    -
    1405 x_batch_ndims,
    -
    1406 x_shape,
    -
    1407 x_strides,
    -
    1408 w_batch_ndims,
    -
    1409 w_shape,
    -
    1410 w_strides,
    -
    1411 s_strides,
    -
    1412 b_strides,
    -
    1413 tid);
    -
    1414 }
    -
    1415
    - -
    1417 w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
    -
    1418}
    +
    1355
    +
    1356template <
    +
    1357 typename T,
    +
    1358 const int group_size,
    +
    1359 const int bits,
    +
    1360 const bool aligned_N,
    +
    1361 const bool batched,
    +
    1362 const int BM = 32,
    +
    1363 const int BK = 32,
    +
    1364 const int BN = 32>
    +
    +
    1365[[kernel]] void qmm_t(
    +
    1366 const device uint32_t* w [[buffer(0)]],
    +
    1367 const device T* scales [[buffer(1)]],
    +
    1368 const device T* biases [[buffer(2)]],
    +
    1369 const device T* x [[buffer(3)]],
    +
    1370 device T* y [[buffer(4)]],
    +
    1371 const constant int& K [[buffer(5)]],
    +
    1372 const constant int& N [[buffer(6)]],
    +
    1373 const constant int& M [[buffer(7)]],
    +
    1374 const constant int& x_batch_ndims [[buffer(8)]],
    +
    1375 const constant int* x_shape [[buffer(9)]],
    +
    1376 const constant size_t* x_strides [[buffer(10)]],
    +
    1377 const constant int& w_batch_ndims [[buffer(11)]],
    +
    1378 const constant int* w_shape [[buffer(12)]],
    +
    1379 const constant size_t* w_strides [[buffer(13)]],
    +
    1380 const constant size_t* s_strides [[buffer(14)]],
    +
    1381 const constant size_t* b_strides [[buffer(15)]],
    +
    1382 uint3 tid [[threadgroup_position_in_grid]],
    +
    1383 uint lid [[thread_index_in_threadgroup]],
    +
    1384 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    1385 uint simd_lid [[thread_index_in_simdgroup]]) {
    +
    1386 (void)lid;
    +
    1387
    +
    1388 constexpr int BK_padded = (BK + 16 / sizeof(T));
    +
    1389
    +
    1390 threadgroup T Xs[BM * BK_padded];
    +
    1391 threadgroup T Ws[BN * BK_padded];
    +
    1392
    +
    1393 if (batched) {
    + +
    1395 x,
    +
    1396 w,
    +
    1397 scales,
    +
    1398 biases,
    +
    1399 y,
    +
    1400 M * N,
    +
    1401 x_batch_ndims,
    +
    1402 x_shape,
    +
    1403 x_strides,
    +
    1404 w_batch_ndims,
    +
    1405 w_shape,
    +
    1406 w_strides,
    +
    1407 s_strides,
    +
    1408 b_strides,
    +
    1409 tid);
    +
    1410 }
    + +
    1412 w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
    +
    1413}
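
In `qmm_t` above (and `qmm_n` below), the threadgroup tiles are allocated with a padded inner dimension, `BK_padded = BK + 16 / sizeof(T)`, a common trick for staggering rows of a shared tile (typically to sidestep bank conflicts, though the listing itself does not state the motivation). A quick, purely illustrative check of what the padding works out to for a 4-byte and a 2-byte element type:

```cpp
#include <cstdint>
#include <cstdio>

// Evaluate BK_padded = BK + 16 / sizeof(T) for two element widths.
// uint16_t is used only as a stand-in for a half-precision T.
template <typename T, int BK = 32>
constexpr int bk_padded() {
  return BK + 16 / static_cast<int>(sizeof(T));
}

int main() {
  std::printf("sizeof(T) = 4: BK_padded = %d\n", bk_padded<float>());     // 32 + 4 = 36
  std::printf("sizeof(T) = 2: BK_padded = %d\n", bk_padded<uint16_t>());  // 32 + 8 = 40
  return 0;
}
```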
    -
    1419
    -
    1420template <typename T, int group_size, int bits>
    -
    -
    1421[[kernel]] void bs_qmv_fast(
    -
    1422 const device uint32_t* w [[buffer(0)]],
    -
    1423 const device T* scales [[buffer(1)]],
    -
    1424 const device T* biases [[buffer(2)]],
    -
    1425 const device T* x [[buffer(3)]],
    -
    1426 device T* y [[buffer(4)]],
    -
    1427 const constant int& in_vec_size [[buffer(5)]],
    -
    1428 const constant int& out_vec_size [[buffer(6)]],
    -
    1429 const constant int& x_batch_ndims [[buffer(7)]],
    -
    1430 const constant int* x_shape [[buffer(8)]],
    -
    1431 const constant size_t* x_strides [[buffer(9)]],
    -
    1432 const constant int& w_batch_ndims [[buffer(10)]],
    -
    1433 const constant int* w_shape [[buffer(11)]],
    -
    1434 const constant size_t* w_strides [[buffer(12)]],
    -
    1435 const constant size_t* s_strides [[buffer(13)]],
    -
    1436 const constant size_t* b_strides [[buffer(14)]],
    -
    1437 const constant int& batch_ndims [[buffer(15)]],
    -
    1438 const constant int* batch_shape [[buffer(16)]],
    -
    1439 const device uint32_t* lhs_indices [[buffer(17)]],
    -
    1440 const device uint32_t* rhs_indices [[buffer(18)]],
    -
    1441 const constant size_t* lhs_strides [[buffer(19)]],
    -
    1442 const constant size_t* rhs_strides [[buffer(20)]],
    -
    1443 uint3 tid [[threadgroup_position_in_grid]],
    -
    1444 uint simd_gid [[simdgroup_index_in_threadgroup]],
    -
    1445 uint simd_lid [[thread_index_in_simdgroup]]) {
    - -
    1447 x,
    -
    1448 w,
    -
    1449 scales,
    -
    1450 biases,
    -
    1451 lhs_indices,
    -
    1452 rhs_indices,
    -
    1453 y,
    -
    1454 out_vec_size,
    -
    1455 batch_ndims,
    -
    1456 batch_shape,
    -
    1457 lhs_strides,
    -
    1458 rhs_strides,
    -
    1459 x_batch_ndims,
    -
    1460 x_shape,
    -
    1461 x_strides,
    -
    1462 w_batch_ndims,
    -
    1463 w_shape,
    -
    1464 w_strides,
    -
    1465 s_strides,
    -
    1466 b_strides,
    -
    1467 tid);
    - -
    1469 w,
    -
    1470 scales,
    -
    1471 biases,
    -
    1472 x,
    -
    1473 y,
    -
    1474 in_vec_size,
    -
    1475 out_vec_size,
    -
    1476 tid,
    -
    1477 simd_gid,
    -
    1478 simd_lid);
    -
    1479}
    +
    1414
    +
    1415template <
    +
    1416 typename T,
    +
    1417 const int group_size,
    +
    1418 const int bits,
    +
    1419 const bool batched,
    +
    1420 const int BM = 32,
    +
    1421 const int BK = 32,
    +
    1422 const int BN = 32>
    +
    +
    1423[[kernel]] void qmm_n(
    +
    1424 const device uint32_t* w [[buffer(0)]],
    +
    1425 const device T* scales [[buffer(1)]],
    +
    1426 const device T* biases [[buffer(2)]],
    +
    1427 const device T* x [[buffer(3)]],
    +
    1428 device T* y [[buffer(4)]],
    +
    1429 const constant int& K [[buffer(5)]],
    +
    1430 const constant int& N [[buffer(6)]],
    +
    1431 const constant int& M [[buffer(7)]],
    +
    1432 const constant int& x_batch_ndims [[buffer(8)]],
    +
    1433 const constant int* x_shape [[buffer(9)]],
    +
    1434 const constant size_t* x_strides [[buffer(10)]],
    +
    1435 const constant int& w_batch_ndims [[buffer(11)]],
    +
    1436 const constant int* w_shape [[buffer(12)]],
    +
    1437 const constant size_t* w_strides [[buffer(13)]],
    +
    1438 const constant size_t* s_strides [[buffer(14)]],
    +
    1439 const constant size_t* b_strides [[buffer(15)]],
    +
    1440 uint3 tid [[threadgroup_position_in_grid]],
    +
    1441 uint lid [[thread_index_in_threadgroup]],
    +
    1442 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    1443 uint simd_lid [[thread_index_in_simdgroup]]) {
    +
    1444 (void)lid;
    +
    1445
    +
    1446 constexpr int BK_padded = (BK + 16 / sizeof(T));
    +
    1447 constexpr int BN_padded = (BN + 16 / sizeof(T));
    +
    1448
    +
    1449 threadgroup T Xs[BM * BK_padded];
    +
    1450 threadgroup T Ws[BK * BN_padded];
    +
    1451
    +
    1452 if (batched) {
    + +
    1454 x,
    +
    1455 w,
    +
    1456 scales,
    +
    1457 biases,
    +
    1458 y,
    +
    1459 M * N,
    +
    1460 x_batch_ndims,
    +
    1461 x_shape,
    +
    1462 x_strides,
    +
    1463 w_batch_ndims,
    +
    1464 w_shape,
    +
    1465 w_strides,
    +
    1466 s_strides,
    +
    1467 b_strides,
    +
    1468 tid);
    +
    1469 }
    +
    1470
    + +
    1472 w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
    +
    1473}
    -
    1480
    -
    1481template <typename T, int group_size, int bits>
    -
    -
    1482[[kernel]] void bs_qmv(
    -
    1483 const device uint32_t* w [[buffer(0)]],
    -
    1484 const device T* scales [[buffer(1)]],
    -
    1485 const device T* biases [[buffer(2)]],
    -
    1486 const device T* x [[buffer(3)]],
    -
    1487 device T* y [[buffer(4)]],
    -
    1488 const constant int& in_vec_size [[buffer(5)]],
    -
    1489 const constant int& out_vec_size [[buffer(6)]],
    -
    1490 const constant int& x_batch_ndims [[buffer(7)]],
    -
    1491 const constant int* x_shape [[buffer(8)]],
    -
    1492 const constant size_t* x_strides [[buffer(9)]],
    -
    1493 const constant int& w_batch_ndims [[buffer(10)]],
    -
    1494 const constant int* w_shape [[buffer(11)]],
    -
    1495 const constant size_t* w_strides [[buffer(12)]],
    -
    1496 const constant size_t* s_strides [[buffer(13)]],
    -
    1497 const constant size_t* b_strides [[buffer(14)]],
    -
    1498 const constant int& batch_ndims [[buffer(15)]],
    -
    1499 const constant int* batch_shape [[buffer(16)]],
    -
    1500 const device uint32_t* lhs_indices [[buffer(17)]],
    -
    1501 const device uint32_t* rhs_indices [[buffer(18)]],
    -
    1502 const constant size_t* lhs_strides [[buffer(19)]],
    -
    1503 const constant size_t* rhs_strides [[buffer(20)]],
    -
    1504 uint3 tid [[threadgroup_position_in_grid]],
    -
    1505 uint simd_gid [[simdgroup_index_in_threadgroup]],
    -
    1506 uint simd_lid [[thread_index_in_simdgroup]]) {
    - -
    1508 x,
    -
    1509 w,
    -
    1510 scales,
    -
    1511 biases,
    -
    1512 lhs_indices,
    -
    1513 rhs_indices,
    -
    1514 y,
    -
    1515 out_vec_size,
    -
    1516 batch_ndims,
    -
    1517 batch_shape,
    -
    1518 lhs_strides,
    -
    1519 rhs_strides,
    -
    1520 x_batch_ndims,
    -
    1521 x_shape,
    -
    1522 x_strides,
    -
    1523 w_batch_ndims,
    -
    1524 w_shape,
    -
    1525 w_strides,
    -
    1526 s_strides,
    -
    1527 b_strides,
    -
    1528 tid);
    - -
    1530 w,
    -
    1531 scales,
    -
    1532 biases,
    -
    1533 x,
    -
    1534 y,
    -
    1535 in_vec_size,
    -
    1536 out_vec_size,
    -
    1537 tid,
    -
    1538 simd_gid,
    -
    1539 simd_lid);
    -
    1540}
    +
    1474
    +
    1475template <typename T, int group_size, int bits>
    +
    +
    1476[[kernel]] void bs_qmv_fast(
    +
    1477 const device uint32_t* w [[buffer(0)]],
    +
    1478 const device T* scales [[buffer(1)]],
    +
    1479 const device T* biases [[buffer(2)]],
    +
    1480 const device T* x [[buffer(3)]],
    +
    1481 device T* y [[buffer(4)]],
    +
    1482 const constant int& in_vec_size [[buffer(5)]],
    +
    1483 const constant int& out_vec_size [[buffer(6)]],
    +
    1484 const constant int& x_batch_ndims [[buffer(7)]],
    +
    1485 const constant int* x_shape [[buffer(8)]],
    +
    1486 const constant size_t* x_strides [[buffer(9)]],
    +
    1487 const constant int& w_batch_ndims [[buffer(10)]],
    +
    1488 const constant int* w_shape [[buffer(11)]],
    +
    1489 const constant size_t* w_strides [[buffer(12)]],
    +
    1490 const constant size_t* s_strides [[buffer(13)]],
    +
    1491 const constant size_t* b_strides [[buffer(14)]],
    +
    1492 const constant int& batch_ndims [[buffer(15)]],
    +
    1493 const constant int* batch_shape [[buffer(16)]],
    +
    1494 const device uint32_t* lhs_indices [[buffer(17)]],
    +
    1495 const device uint32_t* rhs_indices [[buffer(18)]],
    +
    1496 const constant size_t* lhs_strides [[buffer(19)]],
    +
    1497 const constant size_t* rhs_strides [[buffer(20)]],
    +
    1498 uint3 tid [[threadgroup_position_in_grid]],
    +
    1499 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    1500 uint simd_lid [[thread_index_in_simdgroup]]) {
    + +
    1502 x,
    +
    1503 w,
    +
    1504 scales,
    +
    1505 biases,
    +
    1506 lhs_indices,
    +
    1507 rhs_indices,
    +
    1508 y,
    +
    1509 out_vec_size,
    +
    1510 batch_ndims,
    +
    1511 batch_shape,
    +
    1512 lhs_strides,
    +
    1513 rhs_strides,
    +
    1514 x_batch_ndims,
    +
    1515 x_shape,
    +
    1516 x_strides,
    +
    1517 w_batch_ndims,
    +
    1518 w_shape,
    +
    1519 w_strides,
    +
    1520 s_strides,
    +
    1521 b_strides,
    +
    1522 tid);
    + +
    1524 w,
    +
    1525 scales,
    +
    1526 biases,
    +
    1527 x,
    +
    1528 y,
    +
    1529 in_vec_size,
    +
    1530 out_vec_size,
    +
    1531 tid,
    +
    1532 simd_gid,
    +
    1533 simd_lid);
    +
    1534}
    -
    1541
    -
    1542template <typename T, int group_size, int bits>
    -
    -
    1543[[kernel]] void bs_qvm(
    -
    1544 const device uint32_t* w [[buffer(0)]],
    -
    1545 const device T* scales [[buffer(1)]],
    -
    1546 const device T* biases [[buffer(2)]],
    -
    1547 const device T* x [[buffer(3)]],
    -
    1548 device T* y [[buffer(4)]],
    -
    1549 const constant int& in_vec_size [[buffer(5)]],
    -
    1550 const constant int& out_vec_size [[buffer(6)]],
    -
    1551 const constant int& x_batch_ndims [[buffer(7)]],
    -
    1552 const constant int* x_shape [[buffer(8)]],
    -
    1553 const constant size_t* x_strides [[buffer(9)]],
    -
    1554 const constant int& w_batch_ndims [[buffer(10)]],
    -
    1555 const constant int* w_shape [[buffer(11)]],
    -
    1556 const constant size_t* w_strides [[buffer(12)]],
    -
    1557 const constant size_t* s_strides [[buffer(13)]],
    -
    1558 const constant size_t* b_strides [[buffer(14)]],
    -
    1559 const constant int& batch_ndims [[buffer(15)]],
    -
    1560 const constant int* batch_shape [[buffer(16)]],
    -
    1561 const device uint32_t* lhs_indices [[buffer(17)]],
    -
    1562 const device uint32_t* rhs_indices [[buffer(18)]],
    -
    1563 const constant size_t* lhs_strides [[buffer(19)]],
    -
    1564 const constant size_t* rhs_strides [[buffer(20)]],
    -
    1565 uint3 tid [[threadgroup_position_in_grid]],
    -
    1566 uint simd_gid [[simdgroup_index_in_threadgroup]],
    -
    1567 uint simd_lid [[thread_index_in_simdgroup]]) {
    - -
    1569 x,
    -
    1570 w,
    -
    1571 scales,
    -
    1572 biases,
    -
    1573 lhs_indices,
    -
    1574 rhs_indices,
    -
    1575 y,
    -
    1576 out_vec_size,
    -
    1577 batch_ndims,
    -
    1578 batch_shape,
    -
    1579 lhs_strides,
    -
    1580 rhs_strides,
    -
    1581 x_batch_ndims,
    -
    1582 x_shape,
    -
    1583 x_strides,
    -
    1584 w_batch_ndims,
    -
    1585 w_shape,
    -
    1586 w_strides,
    -
    1587 s_strides,
    -
    1588 b_strides,
    -
    1589 tid);
    - -
    1591 w,
    -
    1592 scales,
    -
    1593 biases,
    -
    1594 x,
    -
    1595 y,
    -
    1596 in_vec_size,
    -
    1597 out_vec_size,
    -
    1598 tid,
    -
    1599 simd_gid,
    -
    1600 simd_lid);
    -
    1601}
    +
    1535
    +
    1536template <typename T, int group_size, int bits>
    +
    +
    1537[[kernel]] void bs_qmv(
    +
    1538 const device uint32_t* w [[buffer(0)]],
    +
    1539 const device T* scales [[buffer(1)]],
    +
    1540 const device T* biases [[buffer(2)]],
    +
    1541 const device T* x [[buffer(3)]],
    +
    1542 device T* y [[buffer(4)]],
    +
    1543 const constant int& in_vec_size [[buffer(5)]],
    +
    1544 const constant int& out_vec_size [[buffer(6)]],
    +
    1545 const constant int& x_batch_ndims [[buffer(7)]],
    +
    1546 const constant int* x_shape [[buffer(8)]],
    +
    1547 const constant size_t* x_strides [[buffer(9)]],
    +
    1548 const constant int& w_batch_ndims [[buffer(10)]],
    +
    1549 const constant int* w_shape [[buffer(11)]],
    +
    1550 const constant size_t* w_strides [[buffer(12)]],
    +
    1551 const constant size_t* s_strides [[buffer(13)]],
    +
    1552 const constant size_t* b_strides [[buffer(14)]],
    +
    1553 const constant int& batch_ndims [[buffer(15)]],
    +
    1554 const constant int* batch_shape [[buffer(16)]],
    +
    1555 const device uint32_t* lhs_indices [[buffer(17)]],
    +
    1556 const device uint32_t* rhs_indices [[buffer(18)]],
    +
    1557 const constant size_t* lhs_strides [[buffer(19)]],
    +
    1558 const constant size_t* rhs_strides [[buffer(20)]],
    +
    1559 uint3 tid [[threadgroup_position_in_grid]],
    +
    1560 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    1561 uint simd_lid [[thread_index_in_simdgroup]]) {
    + +
    1563 x,
    +
    1564 w,
    +
    1565 scales,
    +
    1566 biases,
    +
    1567 lhs_indices,
    +
    1568 rhs_indices,
    +
    1569 y,
    +
    1570 out_vec_size,
    +
    1571 batch_ndims,
    +
    1572 batch_shape,
    +
    1573 lhs_strides,
    +
    1574 rhs_strides,
    +
    1575 x_batch_ndims,
    +
    1576 x_shape,
    +
    1577 x_strides,
    +
    1578 w_batch_ndims,
    +
    1579 w_shape,
    +
    1580 w_strides,
    +
    1581 s_strides,
    +
    1582 b_strides,
    +
    1583 tid);
    + +
    1585 w,
    +
    1586 scales,
    +
    1587 biases,
    +
    1588 x,
    +
    1589 y,
    +
    1590 in_vec_size,
    +
    1591 out_vec_size,
    +
    1592 tid,
    +
    1593 simd_gid,
    +
    1594 simd_lid);
    +
    1595}
    -
    1602
    -
    1603template <
    -
    1604 typename T,
    -
    1605 const int group_size,
    -
    1606 const int bits,
    -
    1607 const bool aligned_N,
    -
    1608 const int BM = 32,
    -
    1609 const int BK = 32,
    -
    1610 const int BN = 32>
    -
    -
    1611[[kernel]] void bs_qmm_t(
    -
    1612 const device uint32_t* w [[buffer(0)]],
    -
    1613 const device T* scales [[buffer(1)]],
    -
    1614 const device T* biases [[buffer(2)]],
    -
    1615 const device T* x [[buffer(3)]],
    -
    1616 device T* y [[buffer(4)]],
    -
    1617 const constant int& K [[buffer(5)]],
    -
    1618 const constant int& N [[buffer(6)]],
    -
    1619 const constant int& M [[buffer(7)]],
    -
    1620 const constant int& x_batch_ndims [[buffer(8)]],
    -
    1621 const constant int* x_shape [[buffer(9)]],
    -
    1622 const constant size_t* x_strides [[buffer(10)]],
    -
    1623 const constant int& w_batch_ndims [[buffer(11)]],
    -
    1624 const constant int* w_shape [[buffer(12)]],
    -
    1625 const constant size_t* w_strides [[buffer(13)]],
    -
    1626 const constant size_t* s_strides [[buffer(14)]],
    -
    1627 const constant size_t* b_strides [[buffer(15)]],
    -
    1628 const constant int& batch_ndims [[buffer(16)]],
    -
    1629 const constant int* batch_shape [[buffer(17)]],
    -
    1630 const device uint32_t* lhs_indices [[buffer(18)]],
    -
    1631 const device uint32_t* rhs_indices [[buffer(19)]],
    -
    1632 const constant size_t* lhs_strides [[buffer(20)]],
    -
    1633 const constant size_t* rhs_strides [[buffer(21)]],
    -
    1634 uint3 tid [[threadgroup_position_in_grid]],
    -
    1635 uint lid [[thread_index_in_threadgroup]],
    -
    1636 uint simd_gid [[simdgroup_index_in_threadgroup]],
    -
    1637 uint simd_lid [[thread_index_in_simdgroup]]) {
    -
    1638 (void)lid;
    -
    1639
    -
    1640 constexpr int BK_padded = (BK + 16 / sizeof(T));
    -
    1641
    -
    1642 threadgroup T Xs[BM * BK_padded];
    -
    1643 threadgroup T Ws[BN * BK_padded];
    -
    1644
    - -
    1646 x,
    -
    1647 w,
    -
    1648 scales,
    -
    1649 biases,
    -
    1650 lhs_indices,
    -
    1651 rhs_indices,
    -
    1652 y,
    -
    1653 M * N,
    -
    1654 batch_ndims,
    -
    1655 batch_shape,
    -
    1656 lhs_strides,
    -
    1657 rhs_strides,
    -
    1658 x_batch_ndims,
    -
    1659 x_shape,
    -
    1660 x_strides,
    -
    1661 w_batch_ndims,
    -
    1662 w_shape,
    -
    1663 w_strides,
    -
    1664 s_strides,
    -
    1665 b_strides,
    -
    1666 tid);
    - -
    1668 w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
    -
    1669}
    +
    1596
    +
    1597template <typename T, int group_size, int bits>
    +
    +
    1598[[kernel]] void bs_qvm(
    +
    1599 const device uint32_t* w [[buffer(0)]],
    +
    1600 const device T* scales [[buffer(1)]],
    +
    1601 const device T* biases [[buffer(2)]],
    +
    1602 const device T* x [[buffer(3)]],
    +
    1603 device T* y [[buffer(4)]],
    +
    1604 const constant int& in_vec_size [[buffer(5)]],
    +
    1605 const constant int& out_vec_size [[buffer(6)]],
    +
    1606 const constant int& x_batch_ndims [[buffer(7)]],
    +
    1607 const constant int* x_shape [[buffer(8)]],
    +
    1608 const constant size_t* x_strides [[buffer(9)]],
    +
    1609 const constant int& w_batch_ndims [[buffer(10)]],
    +
    1610 const constant int* w_shape [[buffer(11)]],
    +
    1611 const constant size_t* w_strides [[buffer(12)]],
    +
    1612 const constant size_t* s_strides [[buffer(13)]],
    +
    1613 const constant size_t* b_strides [[buffer(14)]],
    +
    1614 const constant int& batch_ndims [[buffer(15)]],
    +
    1615 const constant int* batch_shape [[buffer(16)]],
    +
    1616 const device uint32_t* lhs_indices [[buffer(17)]],
    +
    1617 const device uint32_t* rhs_indices [[buffer(18)]],
    +
    1618 const constant size_t* lhs_strides [[buffer(19)]],
    +
    1619 const constant size_t* rhs_strides [[buffer(20)]],
    +
    1620 uint3 tid [[threadgroup_position_in_grid]],
    +
    1621 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    1622 uint simd_lid [[thread_index_in_simdgroup]]) {
    + +
    1624 x,
    +
    1625 w,
    +
    1626 scales,
    +
    1627 biases,
    +
    1628 lhs_indices,
    +
    1629 rhs_indices,
    +
    1630 y,
    +
    1631 out_vec_size,
    +
    1632 batch_ndims,
    +
    1633 batch_shape,
    +
    1634 lhs_strides,
    +
    1635 rhs_strides,
    +
    1636 x_batch_ndims,
    +
    1637 x_shape,
    +
    1638 x_strides,
    +
    1639 w_batch_ndims,
    +
    1640 w_shape,
    +
    1641 w_strides,
    +
    1642 s_strides,
    +
    1643 b_strides,
    +
    1644 tid);
    + +
    1646 w,
    +
    1647 scales,
    +
    1648 biases,
    +
    1649 x,
    +
    1650 y,
    +
    1651 in_vec_size,
    +
    1652 out_vec_size,
    +
    1653 tid,
    +
    1654 simd_gid,
    +
    1655 simd_lid);
    +
    1656}
    -
    1670
    -
    1671template <
    -
    1672 typename T,
    -
    1673 const int group_size,
    -
    1674 const int bits,
    -
    1675 const int BM = 32,
    -
    1676 const int BK = 32,
    -
    1677 const int BN = 32>
    -
    -
    1678[[kernel]] void bs_qmm_n(
    -
    1679 const device uint32_t* w [[buffer(0)]],
    -
    1680 const device T* scales [[buffer(1)]],
    -
    1681 const device T* biases [[buffer(2)]],
    -
    1682 const device T* x [[buffer(3)]],
    -
    1683 device T* y [[buffer(4)]],
    -
    1684 const constant int& K [[buffer(5)]],
    -
    1685 const constant int& N [[buffer(6)]],
    -
    1686 const constant int& M [[buffer(7)]],
    -
    1687 const constant int& x_batch_ndims [[buffer(8)]],
    -
    1688 const constant int* x_shape [[buffer(9)]],
    -
    1689 const constant size_t* x_strides [[buffer(10)]],
    -
    1690 const constant int& w_batch_ndims [[buffer(11)]],
    -
    1691 const constant int* w_shape [[buffer(12)]],
    -
    1692 const constant size_t* w_strides [[buffer(13)]],
    -
    1693 const constant size_t* s_strides [[buffer(14)]],
    -
    1694 const constant size_t* b_strides [[buffer(15)]],
    -
    1695 const constant int& batch_ndims [[buffer(16)]],
    -
    1696 const constant int* batch_shape [[buffer(17)]],
    -
    1697 const device uint32_t* lhs_indices [[buffer(18)]],
    -
    1698 const device uint32_t* rhs_indices [[buffer(19)]],
    -
    1699 const constant size_t* lhs_strides [[buffer(20)]],
    -
    1700 const constant size_t* rhs_strides [[buffer(21)]],
    -
    1701 uint3 tid [[threadgroup_position_in_grid]],
    -
    1702 uint lid [[thread_index_in_threadgroup]],
    -
    1703 uint simd_gid [[simdgroup_index_in_threadgroup]],
    -
    1704 uint simd_lid [[thread_index_in_simdgroup]]) {
    -
    1705 (void)lid;
    -
    1706
    -
    1707 constexpr int BK_padded = (BK + 16 / sizeof(T));
    -
    1708 constexpr int BN_padded = (BN + 16 / sizeof(T));
    -
    1709
    -
    1710 threadgroup T Xs[BM * BK_padded];
    -
    1711 threadgroup T Ws[BK * BN_padded];
    -
    1712
    - -
    1714 x,
    -
    1715 w,
    -
    1716 scales,
    -
    1717 biases,
    -
    1718 lhs_indices,
    -
    1719 rhs_indices,
    -
    1720 y,
    -
    1721 M * N,
    -
    1722 batch_ndims,
    -
    1723 batch_shape,
    -
    1724 lhs_strides,
    -
    1725 rhs_strides,
    -
    1726 x_batch_ndims,
    -
    1727 x_shape,
    -
    1728 x_strides,
    -
    1729 w_batch_ndims,
    -
    1730 w_shape,
    -
    1731 w_strides,
    -
    1732 s_strides,
    -
    1733 b_strides,
    -
    1734 tid);
    - -
    1736 w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
    -
    1737}
    +
    1657
    +
    1658template <
    +
    1659 typename T,
    +
    1660 const int group_size,
    +
    1661 const int bits,
    +
    1662 const bool aligned_N,
    +
    1663 const int BM = 32,
    +
    1664 const int BK = 32,
    +
    1665 const int BN = 32>
    +
    +
    1666[[kernel]] void bs_qmm_t(
    +
    1667 const device uint32_t* w [[buffer(0)]],
    +
    1668 const device T* scales [[buffer(1)]],
    +
    1669 const device T* biases [[buffer(2)]],
    +
    1670 const device T* x [[buffer(3)]],
    +
    1671 device T* y [[buffer(4)]],
    +
    1672 const constant int& K [[buffer(5)]],
    +
    1673 const constant int& N [[buffer(6)]],
    +
    1674 const constant int& M [[buffer(7)]],
    +
    1675 const constant int& x_batch_ndims [[buffer(8)]],
    +
    1676 const constant int* x_shape [[buffer(9)]],
    +
    1677 const constant size_t* x_strides [[buffer(10)]],
    +
    1678 const constant int& w_batch_ndims [[buffer(11)]],
    +
    1679 const constant int* w_shape [[buffer(12)]],
    +
    1680 const constant size_t* w_strides [[buffer(13)]],
    +
    1681 const constant size_t* s_strides [[buffer(14)]],
    +
    1682 const constant size_t* b_strides [[buffer(15)]],
    +
    1683 const constant int& batch_ndims [[buffer(16)]],
    +
    1684 const constant int* batch_shape [[buffer(17)]],
    +
    1685 const device uint32_t* lhs_indices [[buffer(18)]],
    +
    1686 const device uint32_t* rhs_indices [[buffer(19)]],
    +
    1687 const constant size_t* lhs_strides [[buffer(20)]],
    +
    1688 const constant size_t* rhs_strides [[buffer(21)]],
    +
    1689 uint3 tid [[threadgroup_position_in_grid]],
    +
    1690 uint lid [[thread_index_in_threadgroup]],
    +
    1691 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    1692 uint simd_lid [[thread_index_in_simdgroup]]) {
    +
    1693 (void)lid;
    +
    1694
    +
    1695 constexpr int BK_padded = (BK + 16 / sizeof(T));
    +
    1696
    +
    1697 threadgroup T Xs[BM * BK_padded];
    +
    1698 threadgroup T Ws[BN * BK_padded];
    +
    1699
    + +
    1701 x,
    +
    1702 w,
    +
    1703 scales,
    +
    1704 biases,
    +
    1705 lhs_indices,
    +
    1706 rhs_indices,
    +
    1707 y,
    +
    1708 M * N,
    +
    1709 batch_ndims,
    +
    1710 batch_shape,
    +
    1711 lhs_strides,
    +
    1712 rhs_strides,
    +
    1713 x_batch_ndims,
    +
    1714 x_shape,
    +
    1715 x_strides,
    +
    1716 w_batch_ndims,
    +
    1717 w_shape,
    +
    1718 w_strides,
    +
    1719 s_strides,
    +
    1720 b_strides,
    +
    1721 tid);
    + +
    1723 w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
    +
    1724}
    -
    1738
    -
    1739template <typename T, const int group_size, const int bits>
    -
    -
    1740[[kernel]] void affine_quantize(
    -
    1741 const device T* w [[buffer(0)]],
    -
    1742 device uint8_t* out [[buffer(1)]],
    -
    1743 device T* scales [[buffer(2)]],
    -
    1744 device T* biases [[buffer(3)]],
    -
    1745 uint2 index [[thread_position_in_grid]],
    -
    1746 uint2 grid_dim [[threads_per_grid]]) {
    -
    1747 constexpr T eps = T(1e-7);
    -
    1748 constexpr int simd_size = 32;
    -
    1749 constexpr int uint8_bits = 8;
    -
    1750 constexpr T n_bins = (1 << bits) - 1;
    -
    1751 constexpr int packs_per_int = uint8_bits / bits;
    -
    1752 constexpr int values_per_reduce = group_size / simd_size;
    -
    1753 constexpr int writes_per_reduce = packs_per_int / values_per_reduce;
    -
    1754 constexpr int writes_per_pack =
    -
    1755 writes_per_reduce > 1 ? 1 : values_per_reduce / packs_per_int;
    -
    1756
    -
    1757 static_assert(
    -
    1758 group_size % simd_size == 0,
    -
    1759 "Group size must be divisible by simd size.");
    -
    1760
    -
    1761 size_t offset = index.x + grid_dim.x * size_t(index.y);
    -
    1762 size_t in_index = offset * values_per_reduce;
    -
    1763 size_t out_index = offset * writes_per_pack;
    +
    1725
    +
    1726template <
    +
    1727 typename T,
    +
    1728 const int group_size,
    +
    1729 const int bits,
    +
    1730 const int BM = 32,
    +
    1731 const int BK = 32,
    +
    1732 const int BN = 32>
    +
    +
    1733[[kernel]] void bs_qmm_n(
    +
    1734 const device uint32_t* w [[buffer(0)]],
    +
    1735 const device T* scales [[buffer(1)]],
    +
    1736 const device T* biases [[buffer(2)]],
    +
    1737 const device T* x [[buffer(3)]],
    +
    1738 device T* y [[buffer(4)]],
    +
    1739 const constant int& K [[buffer(5)]],
    +
    1740 const constant int& N [[buffer(6)]],
    +
    1741 const constant int& M [[buffer(7)]],
    +
    1742 const constant int& x_batch_ndims [[buffer(8)]],
    +
    1743 const constant int* x_shape [[buffer(9)]],
    +
    1744 const constant size_t* x_strides [[buffer(10)]],
    +
    1745 const constant int& w_batch_ndims [[buffer(11)]],
    +
    1746 const constant int* w_shape [[buffer(12)]],
    +
    1747 const constant size_t* w_strides [[buffer(13)]],
    +
    1748 const constant size_t* s_strides [[buffer(14)]],
    +
    1749 const constant size_t* b_strides [[buffer(15)]],
    +
    1750 const constant int& batch_ndims [[buffer(16)]],
    +
    1751 const constant int* batch_shape [[buffer(17)]],
    +
    1752 const device uint32_t* lhs_indices [[buffer(18)]],
    +
    1753 const device uint32_t* rhs_indices [[buffer(19)]],
    +
    1754 const constant size_t* lhs_strides [[buffer(20)]],
    +
    1755 const constant size_t* rhs_strides [[buffer(21)]],
    +
    1756 uint3 tid [[threadgroup_position_in_grid]],
    +
    1757 uint lid [[thread_index_in_threadgroup]],
    +
    1758 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    1759 uint simd_lid [[thread_index_in_simdgroup]]) {
    +
    1760 (void)lid;
    +
    1761
    +
    1762 constexpr int BK_padded = (BK + 16 / sizeof(T));
    +
    1763 constexpr int BN_padded = (BN + 16 / sizeof(T));
    1764
    -
    1765 T w_thread[values_per_reduce];
    -
    1766 T w_min = Limits<T>::max;
    -
    1767 T w_max = 0;
    -
    1768
    -
    1769#pragma clang loop unroll(full)
    -
    1770 for (int i = 0; i < values_per_reduce; i++) {
    -
    1771 T val = w[in_index + i];
    -
    1772 w_thread[i] = val;
    -
    1773 w_min = min(w_min, val);
    -
    1774 w_max = max(w_max, val);
    -
    1775 }
    -
    1776
    -
    1777 w_min = simd_min(w_min);
    -
    1778 w_max = simd_max(w_max);
    -
    1779
    -
    1780 T scale = max((w_max - w_min) / n_bins, eps);
    -
    1781 bool side = abs(w_min) > abs(w_max);
    -
    1782 scale = side ? scale : -scale;
    -
    1783 T edge = side ? w_min : w_max;
    -
    1784 T q0 = round(edge / scale);
    -
    1785 bool at_zero = q0 == 0.0f;
    -
    1786 scale = at_zero ? scale : edge / q0;
    -
    1787 T bias = at_zero ? T(0) : edge;
    -
    1788
    -
    1789 // Write out the scales and biases
    -
    1790 size_t gindex = in_index / group_size;
    -
    1791 if (in_index % group_size == 0) {
    -
    1792 scales[gindex] = scale;
    -
    1793 biases[gindex] = bias;
    -
    1794 }
    -
    1795
    -
    1796 uint8_t output = 0;
    -
    1797#pragma clang loop unroll(full)
    -
    1798 for (int i = 0; i < values_per_reduce; i++) {
    -
    1799 uint8_t val = min(round((w_thread[i] - bias) / scale), n_bins);
    -
    1800 if (bits == 8) {
    -
    1801 output = val;
    -
    1802 } else {
    -
    1803 output += val << (bits * (i % packs_per_int));
    -
    1804 }
    -
    1805
    -
    1806 if (packs_per_int < values_per_reduce &&
    -
    1807 i % packs_per_int == packs_per_int - 1) {
    -
    1808 out[out_index + i / packs_per_int] = output;
    -
    1809 output = 0;
    -
    1810 } else {
    -
    1811#pragma clang loop unroll(full)
    -
    1812 for (int j = 0; j < writes_per_reduce - 1; j++) {
    -
    1813 uint8_t sval = simd_shuffle_down(val, j + 1);
    -
    1814 output += sval << (bits * (values_per_reduce + j + i));
    -
    1815 }
    -
    1816 }
    -
    1817 }
    -
    1818 if (writes_per_reduce > 0 && out_index % writes_per_reduce == 0) {
    -
    1819 out[out_index / writes_per_reduce] = output;
    -
    1820 }
    -
    1821}
    +
    1765 threadgroup T Xs[BM * BK_padded];
    +
    1766 threadgroup T Ws[BK * BN_padded];
    +
    1767
    + +
    1769 x,
    +
    1770 w,
    +
    1771 scales,
    +
    1772 biases,
    +
    1773 lhs_indices,
    +
    1774 rhs_indices,
    +
    1775 y,
    +
    1776 M * N,
    +
    1777 batch_ndims,
    +
    1778 batch_shape,
    +
    1779 lhs_strides,
    +
    1780 rhs_strides,
    +
    1781 x_batch_ndims,
    +
    1782 x_shape,
    +
    1783 x_strides,
    +
    1784 w_batch_ndims,
    +
    1785 w_shape,
    +
    1786 w_strides,
    +
    1787 s_strides,
    +
    1788 b_strides,
    +
    1789 tid);
    + +
    1791 w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);
    +
    1792}
    -
    1822
    -
    1823template <typename T, const int group_size, const int bits>
    -
    - -
    1825 const device T* w [[buffer(0)]],
    -
    1826 const device T* scales [[buffer(1)]],
    -
    1827 const device T* biases [[buffer(2)]],
    -
    1828 device uint8_t* out [[buffer(3)]],
    -
    1829 uint2 index [[thread_position_in_grid]],
    -
    1830 uint2 grid_dim [[threads_per_grid]]) {
    -
    1831 constexpr int uint8_bits = 8;
    -
    1832 constexpr int packs_per_int = uint8_bits / bits;
    -
    1833 constexpr T n_bins = (1 << bits) - 1;
    +
    1793
    +
    1794template <typename T, const int group_size, const int bits>
    +
    +
    1795[[kernel]] void affine_quantize(
    +
    1796 const device T* w [[buffer(0)]],
    +
    1797 device uint8_t* out [[buffer(1)]],
    +
    1798 device T* scales [[buffer(2)]],
    +
    1799 device T* biases [[buffer(3)]],
    +
    1800 uint2 index [[thread_position_in_grid]],
    +
    1801 uint2 grid_dim [[threads_per_grid]]) {
    +
    1802 constexpr T eps = T(1e-7);
    +
    1803 constexpr int simd_size = 32;
    +
    1804 constexpr int uint8_bits = 8;
    +
    1805 constexpr T n_bins = (1 << bits) - 1;
    +
    1806 constexpr int packs_per_int = uint8_bits / bits;
    +
    1807 constexpr int values_per_reduce = group_size / simd_size;
    +
    1808 constexpr int writes_per_reduce = packs_per_int / values_per_reduce;
    +
    1809 constexpr int writes_per_pack =
    +
    1810 writes_per_reduce > 1 ? 1 : values_per_reduce / packs_per_int;
    +
    1811
    +
    1812 static_assert(
    +
    1813 group_size % simd_size == 0,
    +
    1814 "Group size must be divisible by simd size.");
    +
    1815
    +
    1816 size_t offset = index.x + grid_dim.x * size_t(index.y);
    +
    1817 size_t in_index = offset * values_per_reduce;
    +
    1818 size_t out_index = offset * writes_per_pack;
    +
    1819
    +
    1820 T w_thread[values_per_reduce];
    +
    1821 T w_min = Limits<T>::max;
    +
    1822 T w_max = 0;
    +
    1823
    +
    1824#pragma clang loop unroll(full)
    +
    1825 for (int i = 0; i < values_per_reduce; i++) {
    +
    1826 T val = w[in_index + i];
    +
    1827 w_thread[i] = val;
    +
    1828 w_min = min(w_min, val);
    +
    1829 w_max = max(w_max, val);
    +
    1830 }
    +
    1831
    +
    1832 w_min = simd_min(w_min);
    +
    1833 w_max = simd_max(w_max);
    1834
    -
    1835 size_t offset = index.x + grid_dim.x * size_t(index.y);
    -
    1836 size_t in_index = offset * packs_per_int;
    -
    1837 size_t gindex = in_index / group_size;
    -
    1838
    -
    1839 T scale = scales[gindex];
    -
    1840 T bias = biases[gindex];
    -
    1841
    -
    1842 uint8_t output = 0;
    -
    1843#pragma clang loop unroll(full)
    -
    1844 for (int i = 0; i < packs_per_int; i++) {
    -
    1845 uint8_t val = min(round((w[in_index + i] - bias) / scale), n_bins);
    -
    1846 if (bits == 8) {
    -
    1847 output = val;
    -
    1848 } else {
    -
    1849 output += val << (bits * i);
    -
    1850 }
    -
    1851 }
    -
    1852 out[offset] = output;
    -
    1853}
    +
    1835 T scale = max((w_max - w_min) / n_bins, eps);
    +
    1836 bool side = abs(w_min) > abs(w_max);
    +
    1837 scale = side ? scale : -scale;
    +
    1838 T edge = side ? w_min : w_max;
    +
    1839 T q0 = round(edge / scale);
    +
    1840 bool at_zero = q0 == 0.0f;
    +
    1841 scale = at_zero ? scale : edge / q0;
    +
    1842 T bias = at_zero ? T(0) : edge;
    +
    1843
    +
    1844 // Write out the scales and biases
    +
    1845 size_t gindex = in_index / group_size;
    +
    1846 if (in_index % group_size == 0) {
    +
    1847 scales[gindex] = scale;
    +
    1848 biases[gindex] = bias;
    +
    1849 }
    +
    1850
    +
    1851 uint8_t output = 0;
    +
    1852#pragma clang loop unroll(full)
    +
    1853 for (int i = 0; i < values_per_reduce; i++) {
    +
    1854 uint8_t val = min(round((w_thread[i] - bias) / scale), n_bins);
    +
    1855 if (bits == 8) {
    +
    1856 output = val;
    +
    1857 } else {
    +
    1858 output += val << (bits * (i % packs_per_int));
    +
    1859 }
    +
    1860
    +
    1861 if (packs_per_int < values_per_reduce &&
    +
    1862 i % packs_per_int == packs_per_int - 1) {
    +
    1863 out[out_index + i / packs_per_int] = output;
    +
    1864 output = 0;
    +
    1865 } else {
    +
    1866#pragma clang loop unroll(full)
    +
    1867 for (int j = 0; j < writes_per_reduce - 1; j++) {
    +
    1868 uint8_t sval = simd_shuffle_down(val, j + 1);
    +
    1869 output += sval << (bits * (values_per_reduce + j + i));
    +
    1870 }
    +
    1871 }
    +
    1872 }
    +
    1873 if (writes_per_reduce > 0 && out_index % writes_per_reduce == 0) {
    +
    1874 out[out_index / writes_per_reduce] = output;
    +
    1875 }
    +
    1876}
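
The group-wise arithmetic in `affine_quantize` above (lines 1835–1842 of the listing) picks the group edge with the larger magnitude, adjusts the scale so that `edge / scale` is an integer, and encodes each value as `round((w - bias) / scale)` capped at `n_bins`. Below is a scalar transcription of that math for a single group, followed by the `scale * d + bias` reconstruction that `affine_dequantize` applies further down; the sample weights and `bits = 4` are made up for illustration.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Scalar walk-through of the per-group affine quantization above (bits = 4,
// one group of eight made-up weights), followed by the reconstruction
// out = scale * d + bias used on the dequantization side.
int main() {
  constexpr int bits = 4;
  constexpr float eps = 1e-7f;
  constexpr float n_bins = (1 << bits) - 1;  // 15
  const float w[8] = {-0.9f, -0.4f, 0.0f, 0.3f, 0.7f, 1.1f, 1.4f, 1.8f};

  float w_min = w[0], w_max = w[0];
  for (float v : w) {
    w_min = std::min(w_min, v);
    w_max = std::max(w_max, v);
  }

  float scale = std::max((w_max - w_min) / n_bins, eps);
  bool side = std::fabs(w_min) > std::fabs(w_max);
  scale = side ? scale : -scale;
  float edge = side ? w_min : w_max;
  float q0 = std::round(edge / scale);
  bool at_zero = (q0 == 0.0f);
  scale = at_zero ? scale : edge / q0;
  float bias = at_zero ? 0.0f : edge;

  for (float v : w) {
    auto q = static_cast<uint8_t>(std::min(std::round((v - bias) / scale), n_bins));
    float back = scale * q + bias;  // reconstruction used by affine_dequantize
    std::printf("w = %+.2f  ->  q = %2u  ->  %+.3f\n", v, q, back);
  }
  return 0;
}
```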
    -
    1854
    -
    1855template <typename T, const int group_size, const int bits>
    -
    -
    1856[[kernel]] void affine_dequantize(
    -
    1857 const device uint8_t* w [[buffer(0)]],
    -
    1858 const device T* scales [[buffer(1)]],
    -
    1859 const device T* biases [[buffer(2)]],
    -
    1860 device T* out [[buffer(3)]],
    -
    1861 uint2 index [[thread_position_in_grid]],
    -
    1862 uint2 grid_dim [[threads_per_grid]]) {
    -
    1863 constexpr int uint8_bits = 8;
    -
    1864 constexpr int packs_per_int = uint8_bits / bits;
    -
    1865
    -
    1866 size_t offset = index.x + grid_dim.x * size_t(index.y);
    -
    1867 size_t oindex = offset * packs_per_int;
    -
    1868 size_t gindex = oindex / group_size;
    -
    1869 T scale = scales[gindex];
    -
    1870 T bias = biases[gindex];
    -
    1871 uint val = w[offset];
    -
    1872
    -
    1873#pragma clang loop unroll(full)
    -
    1874 for (int i = 0; i < packs_per_int; i++) {
    -
    1875 uint8_t d;
    -
    1876 if (bits == 2) {
    -
    1877 d = (val >> (bits * i)) & 0x03;
    -
    1878 } else if (bits == 4) {
    -
    1879 d = (val >> (bits * i)) & 0x0f;
    -
    1880 } else if (bits == 8) {
    -
    1881 d = val;
    -
    1882 }
    -
    1883 out[oindex + i] = scale * d + bias;
    -
    1884 }
    -
    1885}
    +
    1877
    +
    1878template <typename T, const int group_size, const int bits>
    +
    + +
    1880 const device T* w [[buffer(0)]],
    +
    1881 const device T* scales [[buffer(1)]],
    +
    1882 const device T* biases [[buffer(2)]],
    +
    1883 device uint8_t* out [[buffer(3)]],
    +
    1884 uint2 index [[thread_position_in_grid]],
    +
    1885 uint2 grid_dim [[threads_per_grid]]) {
    +
    1886 constexpr int uint8_bits = 8;
    +
    1887 constexpr int packs_per_int = uint8_bits / bits;
    +
    1888 constexpr T n_bins = (1 << bits) - 1;
    +
    1889
    +
    1890 size_t offset = index.x + grid_dim.x * size_t(index.y);
    +
    1891 size_t in_index = offset * packs_per_int;
    +
    1892 size_t gindex = in_index / group_size;
    +
    1893
    +
    1894 T scale = scales[gindex];
    +
    1895 T bias = biases[gindex];
    +
    1896
    +
    1897 uint8_t output = 0;
    +
    1898#pragma clang loop unroll(full)
    +
    1899 for (int i = 0; i < packs_per_int; i++) {
    +
    1900 uint8_t val = min(round((w[in_index + i] - bias) / scale), n_bins);
    +
    1901 if (bits == 8) {
    +
    1902 output = val;
    +
    1903 } else {
    +
    1904 output += val << (bits * i);
    +
    1905 }
    +
    1906 }
    +
    1907 out[offset] = output;
    +
    1908}
    +
    +
    1909
    +
    1910template <typename T, const int group_size, const int bits>
    +
    +
    1911[[kernel]] void affine_dequantize(
    +
    1912 const device uint8_t* w [[buffer(0)]],
    +
    1913 const device T* scales [[buffer(1)]],
    +
    1914 const device T* biases [[buffer(2)]],
    +
    1915 device T* out [[buffer(3)]],
    +
    1916 uint2 index [[thread_position_in_grid]],
    +
    1917 uint2 grid_dim [[threads_per_grid]]) {
    +
    1918 constexpr int uint8_bits = 8;
    +
    1919 constexpr int packs_per_int = uint8_bits / bits;
    +
    1920
    +
    1921 size_t offset = index.x + grid_dim.x * size_t(index.y);
    +
    1922 size_t oindex = offset * packs_per_int;
    +
    1923 size_t gindex = oindex / group_size;
    +
    1924 T scale = scales[gindex];
    +
    1925 T bias = biases[gindex];
    +
    1926 uint val = w[offset];
    +
    1927
    +
    1928#pragma clang loop unroll(full)
    +
    1929 for (int i = 0; i < packs_per_int; i++) {
    +
    1930 uint8_t d;
    +
    1931 if (bits == 2) {
    +
    1932 d = (val >> (bits * i)) & 0x03;
    +
    1933 } else if (bits == 4) {
    +
    1934 d = (val >> (bits * i)) & 0x0f;
    +
    1935 } else if (bits == 8) {
    +
    1936 d = val;
    +
    1937 }
    +
    1938 out[oindex + i] = scale * d + bias;
    +
    1939 }
    +
    1940}
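
`affine_dequantize` above reads one packed byte per thread and extracts `packs_per_int = 8 / bits` quantized values from it with shifts and masks before applying `scale * d + bias`. The unpacking step in isolation for `bits = 4`; the byte value, scale, and bias here are arbitrary example numbers, not taken from the sources.

```cpp
#include <cstdint>
#include <cstdio>

// Unpack two 4-bit values from one byte the same way the loop above does,
// then map each back through the affine reconstruction.
int main() {
  const int bits = 4;
  const int packs_per_int = 8 / bits;  // 2 values per byte
  const uint8_t packed = 0xA3;         // low nibble 0x3, high nibble 0xA
  const float scale = 0.18f;
  const float bias = -0.9f;

  for (int i = 0; i < packs_per_int; ++i) {
    uint8_t d = (packed >> (bits * i)) & 0x0f;  // 0x3, then 0xA
    std::printf("slot %d: d = %2u, out = %+.3f\n", i, d, scale * d + bias);
  }
  return 0;
}
```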
    static constant constexpr const uint8_t simd_size
    Definition ops.h:22
    METAL_FUNC ulong2 elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim)
    Definition utils.h:7
    @@ -2058,28 +2115,29 @@ $(function(){ initResizable(false); });
    #define MLX_MTL_CONST
    Definition quantized.h:8
    U qdot_safe(const device uint8_t *w, const thread U *x_thread, U scale, U bias, U sum, int N)
    Definition quantized.h:142
    METAL_FUNC void qmm_n_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, threadgroup T *Xs, threadgroup T *Ws, const constant int &K, const constant int &N, const constant int &M, uint3 tid, uint lid, uint simd_gid, uint simd_lid)
    Definition quantized.h:879
    -
    void bs_qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &K, const constant int &N, const constant int &M, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1678
    -
    void qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &K, const constant int &N, const constant int &M, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1368
    -
    void affine_quantize(const device T *w, device uint8_t *out, device T *scales, device T *biases, uint2 index, uint2 grid_dim)
    Definition quantized.h:1740
    -
    METAL_FUNC void qvm_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:647
    -
    void bs_qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1421
    -
    void affine_dequantize(const device uint8_t *w, const device T *scales, const device T *biases, device T *out, uint2 index, uint2 grid_dim)
    Definition quantized.h:1856
    +
    METAL_FUNC void qvm_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const int in_vec_size, const int out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:647
    +
    void bs_qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &K, const constant int &N, const constant int &M, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1733
    +
    void qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &K, const constant int &N, const constant int &M, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1423
    +
    void affine_quantize(const device T *w, device uint8_t *out, device T *scales, device T *biases, uint2 index, uint2 grid_dim)
    Definition quantized.h:1795
    +
    void bs_qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1476
    +
    void affine_dequantize(const device uint8_t *w, const device T *scales, const device T *biases, device T *out, uint2 index, uint2 grid_dim)
    Definition quantized.h:1911
    static constant constexpr const int SIMD_SIZE
    Definition quantized.h:10
    void qmv(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1200
    -
    void bs_qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1543
    -
    void affine_quantize_scales_biases(const device T *w, const device T *scales, const device T *biases, device uint8_t *out, uint2 index, uint2 grid_dim)
    Definition quantized.h:1824
    +
    void bs_qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1598
    +
    void affine_quantize_scales_biases(const device T *w, const device T *scales, const device T *biases, device uint8_t *out, uint2 index, uint2 grid_dim)
    Definition quantized.h:1879
    void qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1149
    void qmv_quad(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint quad_gid, uint quad_lid)
    Definition quantized.h:1098
    static constant constexpr const int QUAD_SIZE
    Definition quantized.h:11
    U load_vector(const device T *x, thread U *x_thread)
    Definition quantized.h:14
    METAL_FUNC void qmv_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:498
    U load_vector_safe(const device T *x, thread U *x_thread, int N)
    Definition quantized.h:52
    void bs_qmm_t(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &K, const constant int &N, const constant int &M, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1666
    U qdot(const device uint8_t *w, const thread U *x_thread, U scale, U bias, U sum)
    Definition quantized.h:99
    void qvm_split_k(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &final_block_size, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1302
    METAL_FUNC void qmv_fast_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:434
    void qmm_t(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &K, const constant int &N, const constant int &M, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1365
    METAL_FUNC void adjust_matrix_offsets(const device T *&x, const device uint32_t *&w, const device T *&scales, const device T *&biases, device T *&y, int output_stride, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid)
    Definition quantized.h:1005
    void bs_qmv(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int &batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1537
    METAL_FUNC void qmv_quad_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, constant int &in_vec_size, const constant int &out_vec_size, uint3 tid, uint quad_gid, uint quad_lid)
    Definition quantized.h:376
    void qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int &in_vec_size, const constant int &out_vec_size, const constant int &x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int &w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)
    Definition quantized.h:1251
    void qouter(const thread uint8_t *w, U x, U scale, U bias, thread U *result)
    Definition quantized.h:187
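These kernels presumably back MLX's quantized matrix products on the GPU. For orientation only, a host-side sketch assuming the standard mlx.core Python API (mx.quantize / mx.quantized_matmul and their group_size, bits, and transpose parameters are assumptions here, not taken from this page):

import mlx.core as mx

w = mx.random.normal(shape=(1024, 512))
x = mx.random.normal(shape=(1, 512))

# Pack the weights into uint32 words plus per-group scales and biases.
w_q, scales, biases = mx.quantize(w, group_size=64, bits=4)

# Quantized matmul; kernels such as qmv/qvm/qmm_t above do this work on the GPU.
y = mx.quantized_matmul(
    x, w_q, scales, biases, transpose=True, group_size=64, bits=4)  # (1, 1024)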
    diff --git a/docs/build/html/reduce__col_8h.html b/docs/build/html/reduce__col_8h.html index cd6b1d3c6..b7dda2eda 100644 --- a/docs/build/html/reduce__col_8h.html +++ b/docs/build/html/reduce__col_8h.html @@ -98,15 +98,207 @@ $(function(){ initResizable(false); });

    Functions

    template<typename T , typename U , typename Op , int NDIMS>
    void col_reduce_small (const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)
     
    template<typename T , typename U , typename Op , int NDIMS>
    void col_reduce_longcolumn (const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, const constant size_t &out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)
     
    template<typename T , typename U , typename Op , int NDIMS, int BM, int BN>
    void col_reduce_looped (const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)
     Our approach is the following simple looped approach:
     
    template<typename T , typename U , typename Op , int NDIMS, int BM, int BN>
    void col_reduce_2pass (const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, const constant size_t &out_size, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)
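For orientation, these kernels presumably back reductions over strided (column) axes in the Python API. A minimal sketch of the operation they compute, assuming the standard mlx.core API:

import mlx.core as mx

x = mx.random.uniform(shape=(4096, 512))
# Reducing over axis 0 means each output element accumulates values that are
# a full row apart in memory, i.e. a column (strided) reduction.
col_sum = mx.sum(x, axis=0)
mx.eval(col_sum)
print(col_sum.shape)  # (512,)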
     

    diff --git a/docs/build/html/reduce__col_8h_source.html b/docs/build/html/reduce__col_8h_source.html index 835b0cd1c..caf4a5f82 100644 --- a/docs/build/html/reduce__col_8h_source.html +++ b/docs/build/html/reduce__col_8h_source.html @@ -93,334 +93,392 @@ $(function(){ initResizable(false); });
// Copyright © 2023-2024 Apple Inc.

template <typename T, typename U, typename Op, int NDIMS>
[[kernel]] void col_reduce_small(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant size_t& reduction_stride [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant size_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant size_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    const constant size_t& non_col_reductions [[buffer(10)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]]) {
  constexpr int n_reads = 4;
  Op op;
  // (loop helper declaration elided in the extracted listing; see utils.h:197)
  const device T* row;

  U totals[n_reads];
  for (int i = 0; i < n_reads; i++) {
    totals[i] = Op::init;
  }

  size_t column = size_t(gid.x) * lsize.x * n_reads + lid.x * n_reads;
  if (column >= reduction_stride) {
    return;
  }
  bool safe = column + n_reads <= reduction_stride;

  size_t out_idx = gid.y + gsize.y * size_t(gid.z);
  size_t in_idx = elem_to_loc(out_idx, shape, strides, ndim);
  in += in_idx + column;

  size_t total_rows = non_col_reductions * reduction_size;
  loop.next(lid.y, reduce_shape, reduce_strides);
  for (size_t r = lid.y; r < total_rows; r += lsize.y) {
    row = in + loop.location(r, reduce_shape, reduce_strides, reduce_ndim);
    if (safe) {
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(static_cast<U>(row[i]), totals[i]);
      }
    } else {
      U vals[n_reads];
      for (int i = 0; i < n_reads; i++) {
        vals[i] =
            (column + i < reduction_stride) ? static_cast<U>(row[i]) : op.init;
      }
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(vals[i], totals[i]);
      }
    }
    loop.next(lsize.y, reduce_shape, reduce_strides);
  }

  if (lsize.y > 1) {
    // lsize.y should be <= 8
    threadgroup U shared_vals[32 * 8 * n_reads];
    for (int i = 0; i < n_reads; i++) {
      shared_vals[lid.y * lsize.x * n_reads + lid.x * n_reads + i] = totals[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (lid.y == 0) {
      for (int i = 0; i < n_reads; i++) {
        totals[i] = shared_vals[lid.x * n_reads + i];
      }
      for (uint j = 1; j < lsize.y; j++) {
        for (int i = 0; i < n_reads; i++) {
          totals[i] =
              op(shared_vals[j * lsize.x * n_reads + lid.x * n_reads + i],
                 totals[i]);
        }
      }
    }
  }

  if (lid.y == 0) {
    out += out_idx * reduction_stride + column;
    if (safe) {
      for (int i = 0; i < n_reads; i++) {
        out[i] = totals[i];
      }
    } else {
      for (int i = 0; column + i < reduction_stride; i++) {
        out[i] = totals[i];
      }
    }
  }
}

template <typename T, typename U, typename Op, int NDIMS>
[[kernel]] void col_reduce_longcolumn(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant size_t& reduction_stride [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant size_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant size_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    const constant size_t& non_col_reductions [[buffer(10)]],
    const constant size_t& out_size [[buffer(11)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint3 lid [[thread_position_in_threadgroup]],
    uint3 lsize [[threads_per_threadgroup]]) {
  Op op;
  // (loop helper declaration elided in the extracted listing; see utils.h:197)
  const device T* row;

  size_t out_idx = gid.x + gsize.x * size_t(gid.y);
  size_t in_idx = elem_to_loc(out_idx, shape, strides, ndim);
  in += in_idx + lid.x;

  U total = Op::init;
  size_t total_rows = non_col_reductions * reduction_size;
  loop.next(gid.z * lsize.y + lid.y, reduce_shape, reduce_strides);
  for (size_t r = gid.z * lsize.y + lid.y; r < total_rows;
       r += lsize.y * gsize.z) {
    row = in + loop.location(r, reduce_shape, reduce_strides, reduce_ndim);
    total = op(static_cast<U>(*row), total);
    loop.next(lsize.y * gsize.z, reduce_shape, reduce_strides);
  }

  threadgroup U shared_vals[32 * 32];
  shared_vals[lid.y * lsize.x + lid.x] = total;
  threadgroup_barrier(mem_flags::mem_threadgroup);
  if (lid.y == 0) {
    for (uint i = 1; i < lsize.y; i++) {
      total = op(total, shared_vals[i * lsize.x + lid.x]);
    }
    out[gid.z * out_size + out_idx * reduction_stride + lid.x] = total;
  }
}

template <typename T, typename U, typename Op, int NDIMS, int BM, int BN>
[[kernel]] void col_reduce_looped(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant size_t& reduction_stride [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant size_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant size_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    const constant size_t& non_col_reductions [[buffer(10)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  Op op;
  constexpr int n_simdgroups = 8;
  constexpr short tgp_size = n_simdgroups * simd_size;
  constexpr short n_reads = (BM * BN) / tgp_size;
  constexpr short n_read_blocks = BN / n_reads;

  threadgroup U shared_vals[BN * BM];
  U totals[n_reads];
  // (loop helper declaration elided in the extracted listing; see utils.h:197)
  const device T* row;

  for (int i = 0; i < n_reads; i++) {
    totals[i] = Op::init;
  }

  short lid = simd_group_id * simd_size + simd_lane_id;
  short2 offset((lid % n_read_blocks) * n_reads, lid / n_read_blocks);
  size_t column = BN * gid.x + offset.x;
  bool safe = column + n_reads <= reduction_stride;

  size_t out_idx = gid.y + gsize.y * size_t(gid.z);
  size_t in_idx = elem_to_loc(out_idx, shape, strides, ndim);
  in += in_idx + column;

  size_t total = non_col_reductions * reduction_size;
  loop.next(offset.y, reduce_shape, reduce_strides);
  for (size_t r = offset.y; r < total; r += BM) {
    row = in + loop.location(r, reduce_shape, reduce_strides, reduce_ndim);

    if (safe) {
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(static_cast<U>(row[i]), totals[i]);
      }
    } else {
      U vals[n_reads];
      for (int i = 0; i < n_reads; i++) {
        vals[i] =
            (column + i < reduction_stride) ? static_cast<U>(row[i]) : op.init;
      }
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(vals[i], totals[i]);
      }
    }

    loop.next(BM, reduce_shape, reduce_strides);
  }

  // We can use a simd reduction to accumulate across BM so each thread writes
  // the partial output to SM and then each simdgroup does BN / n_simdgroups
  // accumulations.
  if (BM == 32) {
    constexpr int n_outputs = BN / n_simdgroups;
    static_assert(
        BM != 32 || n_outputs == n_reads,
        "The tile should be selected such that n_outputs == n_reads");
    for (int i = 0; i < n_reads; i++) {
      shared_vals[offset.y * BN + offset.x + i] = totals[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    short2 out_offset(simd_group_id * n_outputs, simd_lane_id);
    for (int i = 0; i < n_outputs; i++) {
      totals[i] =
          op.simd_reduce(shared_vals[out_offset.y * BN + out_offset.x + i]);
    }

    // Write the output.
    if (simd_lane_id == 0) {
      size_t out_column = BN * gid.x + out_offset.x;
      out += out_idx * reduction_stride + out_column;
      if (out_column + n_outputs <= reduction_stride) {
        for (int i = 0; i < n_outputs; i++) {
          out[i] = totals[i];
        }
      } else {
        for (int i = 0; out_column + i < reduction_stride; i++) {
          out[i] = totals[i];
        }
      }
    }
  }

  // Each thread holds n_reads partial results. We write them all out to shared
  // memory and threads with offset.y == 0 aggregate the columns and write the
  // outputs.
  else {
    short x_block = offset.x / n_reads;
    for (int i = 0; i < n_reads; i++) {
      shared_vals[x_block * BM * n_reads + i * BM + offset.y] = totals[i];
    }
    threadgroup_barrier(mem_flags::mem_threadgroup);
    if (offset.y == 0) {
      for (int i = 0; i < n_reads; i++) {
        for (int j = 1; j < BM; j++) {
          totals[i] =
              op(shared_vals[x_block * BM * n_reads + i * BM + j], totals[i]);
        }
      }
    }

    // Write the output.
    if (offset.y == 0) {
      out += out_idx * reduction_stride + column;
      if (safe) {
        for (int i = 0; i < n_reads; i++) {
          out[i] = totals[i];
        }
      } else {
        for (int i = 0; column + i < reduction_stride; i++) {
          out[i] = totals[i];
        }
      }
    }
  }
}

template <typename T, typename U, typename Op, int NDIMS, int BM, int BN>
[[kernel]] void col_reduce_2pass(
    const device T* in [[buffer(0)]],
    device U* out [[buffer(1)]],
    const constant size_t& reduction_size [[buffer(2)]],
    const constant size_t& reduction_stride [[buffer(3)]],
    const constant int* shape [[buffer(4)]],
    const constant size_t* strides [[buffer(5)]],
    const constant int& ndim [[buffer(6)]],
    const constant int* reduce_shape [[buffer(7)]],
    const constant size_t* reduce_strides [[buffer(8)]],
    const constant int& reduce_ndim [[buffer(9)]],
    const constant size_t& non_col_reductions [[buffer(10)]],
    const constant size_t& out_size [[buffer(11)]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint3 gsize [[threadgroups_per_grid]],
    uint simd_lane_id [[thread_index_in_simdgroup]],
    uint simd_group_id [[simdgroup_index_in_threadgroup]]) {
  Op op;
  constexpr int n_simdgroups = 8;
  constexpr short tgp_size = n_simdgroups * simd_size;
  constexpr short n_reads = (BM * BN) / tgp_size;
  constexpr short n_read_blocks = BN / n_reads;
  constexpr int n_outputs = BN / n_simdgroups;
  constexpr short outer_blocks = 32;
  static_assert(BM == 32, "BM should be equal to 32");

  threadgroup U shared_vals[BN * BM];
  U totals[n_reads];
  // (loop helper declaration elided in the extracted listing; see utils.h:197)
  const device T* row;

  for (int i = 0; i < n_reads; i++) {
    totals[i] = Op::init;
  }

  short lid = simd_group_id * simd_size + simd_lane_id;
  short2 offset((lid % n_read_blocks) * n_reads, lid / n_read_blocks);
  size_t column = BN * gid.x + offset.x;
  bool safe = column + n_reads <= reduction_stride;

  size_t full_idx = gid.y + gsize.y * size_t(gid.z);
  size_t block_idx = full_idx / out_size;
  size_t out_idx = full_idx % out_size;
  size_t in_idx = elem_to_loc(out_idx, shape, strides, ndim);
  in += in_idx + column;

  size_t total = non_col_reductions * reduction_size;
  loop.next(offset.y + block_idx * BM, reduce_shape, reduce_strides);
  for (size_t r = offset.y + block_idx * BM; r < total;
       r += outer_blocks * BM) {
    row = in + loop.location(r, reduce_shape, reduce_strides, reduce_ndim);

    if (safe) {
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(static_cast<U>(row[i]), totals[i]);
      }
    } else {
      U vals[n_reads];
      for (int i = 0; i < n_reads; i++) {
        vals[i] =
            (column + i < reduction_stride) ? static_cast<U>(row[i]) : op.init;
      }
      for (int i = 0; i < n_reads; i++) {
        totals[i] = op(vals[i], totals[i]);
      }
    }

    loop.next(outer_blocks * BM, reduce_shape, reduce_strides);
  }

  // We can use a simd reduction to accumulate across BM so each thread writes
  // the partial output to SM and then each simdgroup does BN / n_simdgroups
  // accumulations.
  for (int i = 0; i < n_reads; i++) {
    shared_vals[offset.y * BN + offset.x + i] = totals[i];
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  short2 out_offset(simd_group_id * n_outputs, simd_lane_id);
  for (int i = 0; i < n_outputs; i++) {
    totals[i] =
        op.simd_reduce(shared_vals[out_offset.y * BN + out_offset.x + i]);
  }

  // Write the output.
  if (simd_lane_id == 0) {
    size_t out_column = BN * gid.x + out_offset.x;
    out += full_idx * reduction_stride + out_column;
    if (out_column + n_outputs <= reduction_stride) {
      for (int i = 0; i < n_outputs; i++) {
        out[i] = totals[i];
      }
    } else {
      for (int i = 0; out_column + i < reduction_stride; i++) {
        out[i] = totals[i];
      }
    }
  }
}
static constant constexpr const uint8_t simd_size
Definition ops.h:22
METAL_FUNC stride_t elem_to_loc(uint elem, constant const int *shape, constant const stride_t *strides, int ndim)
Definition utils.h:87
Op op
Definition binary.h:129
void col_reduce_2pass(const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, const constant size_t &out_size, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)
Definition reduce_col.h:287
void col_reduce_looped(const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, uint3 gid, uint3 gsize, uint simd_lane_id, uint simd_group_id)
Our approach is the following simple looped approach:
Definition reduce_col.h:155
void col_reduce_longcolumn(const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, const constant size_t &out_size, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)
Definition reduce_col.h:97
void col_reduce_small(const device T *in, device U *out, const constant size_t &reduction_size, const constant size_t &reduction_stride, const constant int *shape, const constant size_t *strides, const constant int &ndim, const constant int *reduce_shape, const constant size_t *reduce_strides, const constant int &reduce_ndim, const constant size_t &non_col_reductions, uint3 gid, uint3 gsize, uint3 lid, uint3 lsize)
Definition reduce_col.h:4
    Definition utils.h:197
    void next(const constant int *shape, const constant size_t *strides)
    Definition utils.h:202
    offset_t location(offset_t, const constant int *, const constant size_t *, int)
    Definition utils.h:229
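The elem_to_loc helper used throughout the listing above maps a flat, row-major element index to a strided memory offset. An equivalent Python sketch, for illustration only:

def elem_to_loc(elem, shape, strides):
    # Peel off the coordinate along each axis, innermost first, and
    # accumulate the strided offset.
    loc = 0
    for dim in reversed(range(len(shape))):
        loc += (elem % shape[dim]) * strides[dim]
        elem //= shape[dim]
    return loc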
    diff --git a/docs/build/html/sdpa__vector_8h.html b/docs/build/html/sdpa__vector_8h.html index df9d5fd42..daee8ad0a 100644 --- a/docs/build/html/sdpa__vector_8h.html +++ b/docs/build/html/sdpa__vector_8h.html @@ -99,13 +99,13 @@ $(function(){ initResizable(false); }); - - - + + +

    Functions

    template<typename T , int D>
    void sdpa_vector (const device T *queries, const device T *keys, const device T *values, device T *out, const constant int &gqa_factor, const constant int &N, const constant size_t &k_stride, const constant size_t &v_stride, const constant float &scale, uint3 tid, uint simd_gid, uint simd_lid)
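For orientation, per query this kernel evaluates scaled dot-product attention over N key/value rows, keeping a running max and sum of exponentials instead of materializing the scores. A host-side sketch of the same math, for illustration only (assuming the mlx.core API, with q of shape (D,) and k, v of shape (N, D)):

import mlx.core as mx

def sdpa_vector_reference(q, k, v, scale):
    scores = (scale * q) @ k.T    # (N,) attention logits
    weights = mx.softmax(scores)  # the kernel computes this with an online max/sum
    return weights @ v            # (D,) attention output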
     

Function Documentation

◆ sdpa_vector()

template<typename T , int D>
void sdpa_vector (const device T * queries, const device T * keys, const device T * values, device T * out, const constant int & gqa_factor, const constant int & N, const constant size_t & k_stride, const constant size_t & v_stride, const constant float & scale, uint3 tid, uint simd_gid, uint simd_lid)

diff --git a/docs/build/html/sdpa__vector_8h_source.html index 2d695b09b..390193135 100644 --- a/docs/build/html/sdpa__vector_8h_source.html +++ b/docs/build/html/sdpa__vector_8h_source.html @@ -99,7 +99,7 @@ $(function(){ initResizable(false); });
    7template <typename T, int D>
    -
    8[[kernel]] void sdpa_vector(
    +
    8[[kernel]] void sdpa_vector(
    9 const device T* queries [[buffer(0)]],
    10 const device T* keys [[buffer(1)]],
    11 const device T* values [[buffer(2)]],
    @@ -107,113 +107,114 @@ $(function(){ initResizable(false); });
    13 const constant int& gqa_factor,
    14 const constant int& N,
    15 const constant size_t& k_stride,
    -
    16 const constant float& scale,
    -
    17 uint3 tid [[threadgroup_position_in_grid]],
    -
    18 uint simd_gid [[simdgroup_index_in_threadgroup]],
    -
    19 uint simd_lid [[thread_index_in_simdgroup]]) {
    -
    20 constexpr int BN = 32;
    -
    21 constexpr int BD = 32;
    -
    22 constexpr int elem_per_thread = D / BD;
    -
    23
    -
    24 const int stride = BN * D;
    -
    25
    -
    26 typedef float U;
    -
    27
    -
    28 thread U q[elem_per_thread];
    -
    29 thread U k[elem_per_thread];
    -
    30 thread U o[elem_per_thread];
    -
    31
    -
    32 threadgroup U outputs[BN * BD];
    -
    33 threadgroup U max_scores[BN];
    -
    34 threadgroup U sum_exp_scores[BN];
    -
    35
    -
    36 // Adjust positions
    -
    37 const int head_idx = tid.y;
    -
    38 const int kv_head_idx = head_idx / gqa_factor;
    -
    39 queries += head_idx * D + simd_lid * elem_per_thread;
    -
    40 keys += kv_head_idx * k_stride + simd_gid * D + simd_lid * elem_per_thread;
    -
    41 values += kv_head_idx * k_stride + simd_gid * D + simd_lid * elem_per_thread;
    -
    42 out += head_idx * D + simd_gid * elem_per_thread;
    -
    43
    -
    44 // Read the query and 0 the output accumulator
    -
    45 for (int i = 0; i < elem_per_thread; i++) {
    -
    46 q[i] = static_cast<U>(scale) * queries[i];
    -
    47 }
    -
    48 for (int i = 0; i < elem_per_thread; i++) {
    -
    49 o[i] = 0;
    -
    50 }
    -
    51
    -
    52 U max_score = -INFINITY;
    -
    53 U sum_exp_score = 0;
    -
    54
    -
    55 // For each key
    -
    56 for (int i = simd_gid; i < N; i += BN) {
    -
    57 // Read the key
    -
    58 for (int i = 0; i < elem_per_thread; i++) {
    -
    59 k[i] = keys[i];
    -
    60 }
    -
    61
    -
    62 // Compute the i-th score
    -
    63 U score = 0;
    -
    64 for (int i = 0; i < elem_per_thread; i++) {
    -
    65 score += q[i] * k[i];
    -
    66 }
    -
    67 score = simd_sum(score);
    -
    68
    -
    69 // Update the accumulators
    -
    70 U new_max = max(max_score, score);
    -
    71 U factor = fast::exp(max_score - new_max);
    -
    72 U exp_score = fast::exp(score - new_max);
    -
    73
    -
    74 max_score = new_max;
    -
    75 sum_exp_score = sum_exp_score * factor + exp_score;
    -
    76
    -
    77 // Update the output accumulator
    -
    78 for (int i = 0; i < elem_per_thread; i++) {
    -
    79 o[i] = o[i] * factor + exp_score * values[i];
    -
    80 }
    -
    81
    -
    82 // Move the pointers to the next kv
    -
    83 keys += stride;
    -
    84 values += stride;
    -
    85 }
    -
    86 threadgroup_barrier(mem_flags::mem_threadgroup);
    -
    87
    -
    88 // Each thread has a partial part of the output so we need to combine them.
    -
    89
    -
    90 // First let's communicate the max and sum_exp
    -
    91 if (simd_lid == 0) {
    -
    92 max_scores[simd_gid] = max_score;
    -
    93 sum_exp_scores[simd_gid] = sum_exp_score;
    -
    94 }
    -
    95 threadgroup_barrier(mem_flags::mem_threadgroup);
    -
    96 max_score = max_scores[simd_lid];
    -
    97 U new_max = simd_max(max_score);
    -
    98 U factor = fast::exp(max_score - new_max);
    -
    99 sum_exp_score = simd_sum(sum_exp_scores[simd_lid] * factor);
    -
    100
    -
    101 // Now we need to aggregate all the outputs
    -
    102 for (int i = 0; i < elem_per_thread; i++) {
    -
    103 outputs[simd_lid * BD + simd_gid] = o[i];
    -
    104 threadgroup_barrier(mem_flags::mem_threadgroup);
    -
    105 o[i] = simd_sum(outputs[simd_gid * BD + simd_lid] * factor) / sum_exp_score;
    -
    106 threadgroup_barrier(mem_flags::mem_threadgroup);
    -
    107 }
    -
    108
    -
    109 // And write the output
    -
    110 if (simd_lid == 0) {
    -
    111 for (int i = 0; i < elem_per_thread; i++) {
    -
    112 out[i] = static_cast<T>(o[i]);
    -
    113 }
    -
    114 }
    -
    115}
    +
    16 const constant size_t& v_stride,
    +
    17 const constant float& scale,
    +
    18 uint3 tid [[threadgroup_position_in_grid]],
    +
    19 uint simd_gid [[simdgroup_index_in_threadgroup]],
    +
    20 uint simd_lid [[thread_index_in_simdgroup]]) {
    +
    21 constexpr int BN = 32;
    +
  constexpr int BD = 32;
  constexpr int elem_per_thread = D / BD;

  const int stride = BN * D;

  typedef float U;

  thread U q[elem_per_thread];
  thread U k[elem_per_thread];
  thread U o[elem_per_thread];

  threadgroup U outputs[BN * BD];
  threadgroup U max_scores[BN];
  threadgroup U sum_exp_scores[BN];

  // Adjust positions
  const int head_idx = tid.y;
  const int kv_head_idx = head_idx / gqa_factor;
  queries += head_idx * D + simd_lid * elem_per_thread;
  keys += kv_head_idx * k_stride + simd_gid * D + simd_lid * elem_per_thread;
  values += kv_head_idx * v_stride + simd_gid * D + simd_lid * elem_per_thread;
  out += head_idx * D + simd_gid * elem_per_thread;

  // Read the query and 0 the output accumulator
  for (int i = 0; i < elem_per_thread; i++) {
    q[i] = static_cast<U>(scale) * queries[i];
  }
  for (int i = 0; i < elem_per_thread; i++) {
    o[i] = 0;
  }

  U max_score = -INFINITY;
  U sum_exp_score = 0;

  // For each key
  for (int i = simd_gid; i < N; i += BN) {
    // Read the key
    for (int i = 0; i < elem_per_thread; i++) {
      k[i] = keys[i];
    }

    // Compute the i-th score
    U score = 0;
    for (int i = 0; i < elem_per_thread; i++) {
      score += q[i] * k[i];
    }
    score = simd_sum(score);

    // Update the accumulators
    U new_max = max(max_score, score);
    U factor = fast::exp(max_score - new_max);
    U exp_score = fast::exp(score - new_max);

    max_score = new_max;
    sum_exp_score = sum_exp_score * factor + exp_score;

    // Update the output accumulator
    for (int i = 0; i < elem_per_thread; i++) {
      o[i] = o[i] * factor + exp_score * values[i];
    }

    // Move the pointers to the next kv
    keys += stride;
    values += stride;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);

  // Each thread has a partial part of the output so we need to combine them.

  // First let's communicate the max and sum_exp
  if (simd_lid == 0) {
    max_scores[simd_gid] = max_score;
    sum_exp_scores[simd_gid] = sum_exp_score;
  }
  threadgroup_barrier(mem_flags::mem_threadgroup);
  max_score = max_scores[simd_lid];
  U new_max = simd_max(max_score);
  U factor = fast::exp(max_score - new_max);
  sum_exp_score = simd_sum(sum_exp_scores[simd_lid] * factor);

  // Now we need to aggregate all the outputs
  for (int i = 0; i < elem_per_thread; i++) {
    outputs[simd_lid * BD + simd_gid] = o[i];
    threadgroup_barrier(mem_flags::mem_threadgroup);
    o[i] = simd_sum(outputs[simd_gid * BD + simd_lid] * factor) / sum_exp_score;
    threadgroup_barrier(mem_flags::mem_threadgroup);
  }

  // And write the output
  if (simd_lid == 0) {
    for (int i = 0; i < elem_per_thread; i++) {
      out[i] = static_cast<T>(o[i]);
    }
  }
}
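For reference, the per-key update in the listing above is the standard one-pass ("online") softmax: whenever a larger score appears, the running sum and the output accumulator are rescaled by exp(old_max - new_max) before the new term is added. A didactic NumPy sketch of that recurrence (single query, no scaling, no threadgroup parallelism) might look like:

    import numpy as np

    def online_softmax_attention(q, keys, values):
        # q: (D,), keys: (N, D), values: (N, D)
        max_score = -np.inf
        sum_exp = 0.0
        out = np.zeros(values.shape[1], dtype=np.float32)
        for k, v in zip(keys, values):
            score = np.dot(q, k)
            new_max = max(max_score, score)
            factor = np.exp(max_score - new_max)   # rescale the previous state
            exp_score = np.exp(score - new_max)
            max_score = new_max
            sum_exp = sum_exp * factor + exp_score
            out = out * factor + exp_score * v
        return out / sum_exp

The kernel additionally pre-scales q by scale, splits each dot product across the BD lanes of a simdgroup, and distributes keys across the BN simdgroups of the threadgroup, which is why the final reduction over max_scores, sum_exp_scores, and outputs is needed.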
-void sdpa_vector(const device T *queries, const device T *keys, const device T *values, device T *out, const constant int &gqa_factor, const constant int &N, const constant size_t &k_stride, const constant float &scale, uint3 tid, uint simd_gid, uint simd_lid)
+void sdpa_vector(const device T *queries, const device T *keys, const device T *values, device T *out, const constant int &gqa_factor, const constant int &N, const constant size_t &k_stride, const constant size_t &v_stride, const constant float &scale, uint3 tid, uint simd_gid, uint simd_lid)
 Definition sdpa_vector.h:8

    ◆ maybeInsertBarrier()

    void mlx::core::metal::CommandEncoder::maybeInsertBarrier ()
diff --git a/docs/build/html/usage/function_transforms.html b/docs/build/html/usage/function_transforms.html
index 1b08af1eb..e903d77a8 100644
--- a/docs/build/html/usage/function_transforms.html
+++ b/docs/build/html/usage/function_transforms.html
@@ -986,13 +986,13 @@ We will prioritize including it.

 ys = mx.random.uniform(shape=(100, 4096))

 def naive_add(xs, ys):
-    return [xs[i] + ys[:, i] for i in range(xs.shape[1])]
+    return [xs[i] + ys[:, i] for i in range(xs.shape[0])]

    Instead you can use vmap() to automatically vectorize the addition:

    # Vectorize over the second dimension of x and the
     # first dimension of y
    -vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(1, 0))
    +vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(0, 1))
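For context, here is a small runnable sketch of the corrected call. The shape of xs is not shown in this hunk, so the (4096, 100) shape below is inferred from the surrounding example and should be treated as an assumption:

    import mlx.core as mx

    # Assumed shapes: xs is (4096, 100) and ys is (100, 4096), so that
    # axis 0 of xs and axis 1 of ys are the mapped (vectorized) axes.
    xs = mx.random.uniform(shape=(4096, 100))
    ys = mx.random.uniform(shape=(100, 4096))

    vmap_add = mx.vmap(lambda x, y: x + y, in_axes=(0, 1))
    out = vmap_add(xs, ys)
    print(out.shape)  # (4096, 100) with the default out_axes=0

This produces the same values as the naive_add loop above, stacked along the first axis of the result.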
     

The in_axes parameter can be used to specify which dimensions of the

diff --git a/docs/build/html/usage/indexing.html b/docs/build/html/usage/indexing.html
index 37d46eff8..55e3de841 100644
--- a/docs/build/html/usage/indexing.html
+++ b/docs/build/html/usage/indexing.html
@@ -922,7 +922,7 @@ undefined behavior.

    from the GPU. Performing bounds checking for array indices before launching the kernel would be extremely inefficient.

 Indexing with boolean masks is something that MLX may support in the future. In
-general, MLX has limited support for operations for which outputs
+general, MLX has limited support for operations for which output
 shapes are dependent on input data. Other examples of these types of operations which MLX does not yet support include numpy.nonzero() and the single input version of numpy.where().
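To make the data-dependence concrete, a short NumPy illustration (not MLX code) of why such output shapes cannot be known ahead of time:

    import numpy as np

    a = np.array([0, 3, 0, 5])
    print(np.nonzero(a))      # (array([1, 3]),)  -> two indices
    print(np.nonzero(0 * a))  # (array([], dtype=int64),)  -> zero indices

    # Two inputs with identical shapes produce outputs of different shapes,
    # so the output shape depends on the values, not just the input shape.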

diff --git a/docs/build/html/usage/lazy_evaluation.html b/docs/build/html/usage/lazy_evaluation.html
index cac88c1c1..59143b41a 100644
--- a/docs/build/html/usage/lazy_evaluation.html
+++ b/docs/build/html/usage/lazy_evaluation.html
@@ -952,7 +952,7 @@ stochastic gradient descent). A natural and usually efficient place to use

 An important behavior to be aware of is when the graph will be implicitly
 evaluated. Anytime you print an array, convert it to an
-numpy.ndarray, or otherwise access it’s memory via memoryview,
+numpy.ndarray, or otherwise access its memory via memoryview,
 the graph will be evaluated. Saving arrays via save() (or any other MLX saving functions) will also evaluate the array.
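A tiny sketch of these triggers (each of the last three lines forces the lazy graph to be evaluated):

    import mlx.core as mx
    import numpy as np

    a = mx.ones((2, 2)) * 3   # only builds the graph; nothing is computed yet
    print(a)                  # printing evaluates the graph
    b = np.array(a)           # converting to a numpy.ndarray evaluates it
    v = memoryview(a)         # accessing its memory directly also evaluates it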

    Calling array.item() on a scalar array will also evaluate it. In the