post nanobind docs fixes and some updates (#889)

* post nanobind docs fixes and some updates
* one more doc nit
* fix for stubs and latex
@@ -156,7 +156,7 @@ def glorot_normal(
     (``fan_out``) units according to:
 
     .. math::
-        \sigma = \gamma \sqrt{\frac{2.0}{\text{fan_in} + \text{fan_out}}}
+        \sigma = \gamma \sqrt{\frac{2.0}{\text{fan\_in} + \text{fan\_out}}}
 
     For more details see the original reference: `Understanding the difficulty
     of training deep feedforward neural networks
@@ -199,7 +199,7 @@ def glorot_uniform(
     units according to:
 
     .. math::
-        \sigma = \gamma \sqrt{\frac{6.0}{\text{fan_in} + \text{fan_out}}}
+        \sigma = \gamma \sqrt{\frac{6.0}{\text{fan\_in} + \text{fan\_out}}}
 
     For more details see the original reference: `Understanding the difficulty
     of training deep feedforward neural networks
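Both hunks above escape the underscores in ``\text{fan_in}`` / ``\text{fan_out}`` so LaTeX renders them literally instead of as subscripts. As a quick sanity check of the formulas themselves, here is a plain-Python sketch (illustrative only, not the mlx implementation; the helper name is made up):

import math

def glorot_scales(fan_in: int, fan_out: int, gamma: float = 1.0):
    # Glorot normal: sigma = gamma * sqrt(2 / (fan_in + fan_out))
    sigma = gamma * math.sqrt(2.0 / (fan_in + fan_out))
    # Glorot uniform: samples come from [-limit, limit] with
    # limit = gamma * sqrt(6 / (fan_in + fan_out))
    limit = gamma * math.sqrt(6.0 / (fan_in + fan_out))
    return sigma, limit

print(glorot_scales(256, 128))  # (0.0721..., 0.125)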
@@ -166,7 +166,7 @@ class MaxPool1d(_Pool1d):
            \text{input}(N_i, \text{stride} \times t + m, C_j),
 
    where :math:`L_{out} = \left\lfloor \frac{L + 2 \times \text{padding} -
-   \text{kernel_size}}{\text{stride}}\right\rfloor + 1`.
+   \text{kernel\_size}}{\text{stride}}\right\rfloor + 1`.
 
    Args:
        kernel_size (int or tuple(int)): The size of the pooling window kernel.
@@ -205,7 +205,7 @@ class AvgPool1d(_Pool1d):
            \text{input}(N_i, \text{stride} \times t + m, C_j),
 
    where :math:`L_{out} = \left\lfloor \frac{L + 2 \times \text{padding} -
-   \text{kernel_size}}{\text{stride}}\right\rfloor + 1`.
+   \text{kernel\_size}}{\text{stride}}\right\rfloor + 1`.
 
    Args:
        kernel_size (int or tuple(int)): The size of the pooling window kernel.
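The corrected expression is the standard output-length rule for 1D pooling. As a worked check (hypothetical helper in plain Python, not part of mlx):

def pool_out_length(L: int, kernel_size: int, stride: int, padding: int = 0) -> int:
    # L_out = floor((L + 2 * padding - kernel_size) / stride) + 1
    return (L + 2 * padding - kernel_size) // stride + 1

print(pool_out_length(L=10, kernel_size=2, stride=2))            # 5
print(pool_out_length(L=7, kernel_size=3, stride=2, padding=1))  # 4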
@@ -246,8 +246,8 @@ class MaxPool2d(_Pool2d):
                \text{stride[1]} \times w + n, C_j),
        \end{aligned}
 
-   where :math:`H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel_size[0]}}{\text{stride[0]}}\right\rfloor + 1`,
-   :math:`W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel_size[1]}}{\text{stride[1]}}\right\rfloor + 1`.
+   where :math:`H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel\_size[0]}}{\text{stride[0]}}\right\rfloor + 1`,
+   :math:`W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel\_size[1]}}{\text{stride[1]}}\right\rfloor + 1`.
 
    The parameters ``kernel_size``, ``stride``, ``padding``, can either be:
 
@@ -295,8 +295,8 @@ class AvgPool2d(_Pool2d):
                \text{stride[1]} \times w + n, C_j),
        \end{aligned}
 
-   where :math:`H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel_size[0]}}{\text{stride[0]}}\right\rfloor + 1`,
-   :math:`W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel_size[1]}}{\text{stride[1]}}\right\rfloor + 1`.
+   where :math:`H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel\_size[0]}}{\text{stride[0]}}\right\rfloor + 1`,
+   :math:`W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel\_size[1]}}{\text{stride[1]}}\right\rfloor + 1`.
 
    The parameters ``kernel_size``, ``stride``, ``padding``, can either be:
 
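The 2D variants apply the same rule independently per spatial dimension. A sketch under the same caveats (hypothetical helper, not mlx API):

def pool_out_hw(H, W, kernel_size, stride, padding=(0, 0)):
    # H_out = floor((H + 2 * padding[0] - kernel_size[0]) / stride[0]) + 1
    H_out = (H + 2 * padding[0] - kernel_size[0]) // stride[0] + 1
    # W_out = floor((W + 2 * padding[1] - kernel_size[1]) / stride[1]) + 1
    W_out = (W + 2 * padding[1] - kernel_size[1]) // stride[1] + 1
    return H_out, W_out

print(pool_out_hw(32, 32, kernel_size=(2, 2), stride=(2, 2)))  # (16, 16)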
@@ -103,12 +103,12 @@ class GRU(Module):
 
    .. math::
 
-       \begin{align*}
+       \begin{aligned}
        r_t &= \sigma (W_{xr}x_t + W_{hr}h_t + b_{r}) \\
        z_t &= \sigma (W_{xz}x_t + W_{hz}h_t + b_{z}) \\
        n_t &= \text{tanh}(W_{xn}x_t + b_{n} + r_t \odot (W_{hn}h_t + b_{hn})) \\
        h_{t + 1} &= (1 - z_t) \odot n_t + z_t \odot h_t
-       \end{align*}
+       \end{aligned}
 
    The hidden state :math:`h` has shape ``NH`` or ``H`` depending on
    whether the input is batched or not. Returns the hidden state at each
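For readers checking the math: a minimal NumPy transcription of the four GRU equations, one time step at a time. Illustrative only; mlx.nn.GRU packs its parameters differently, and the weight names here are made up:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x_t, h_t, W, b):
    # r_t = sigma(W_xr x_t + W_hr h_t + b_r)
    r = sigmoid(W["xr"] @ x_t + W["hr"] @ h_t + b["r"])
    # z_t = sigma(W_xz x_t + W_hz h_t + b_z)
    z = sigmoid(W["xz"] @ x_t + W["hz"] @ h_t + b["z"])
    # n_t = tanh(W_xn x_t + b_n + r_t * (W_hn h_t + b_hn)), * elementwise
    n = np.tanh(W["xn"] @ x_t + b["n"] + r * (W["hn"] @ h_t + b["hn"]))
    # h_{t+1} = (1 - z_t) * n_t + z_t * h_t
    return (1.0 - z) * n + z * h_t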
@@ -206,14 +206,14 @@ class LSTM(Module):
    Concretely, for each element of the sequence, this layer computes:
 
    .. math::
-       \begin{align*}
+       \begin{aligned}
        i_t &= \sigma (W_{xi}x_t + W_{hi}h_t + b_{i}) \\
        f_t &= \sigma (W_{xf}x_t + W_{hf}h_t + b_{f}) \\
        g_t &= \text{tanh} (W_{xg}x_t + W_{hg}h_t + b_{g}) \\
        o_t &= \sigma (W_{xo}x_t + W_{ho}h_t + b_{o}) \\
        c_{t + 1} &= f_t \odot c_t + i_t \odot g_t \\
        h_{t + 1} &= o_t \text{tanh}(c_{t + 1})
-       \end{align*}
+       \end{aligned}
 
    The hidden state :math:`h` and cell state :math:`c` have shape ``NH``
    or ``H``, depending on whether the input is batched or not.
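And the matching NumPy sketch for one LSTM step (same caveats: illustrative, made-up parameter names, not the mlx layout):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_t, c_t, W, b):
    i = sigmoid(W["xi"] @ x_t + W["hi"] @ h_t + b["i"])  # input gate
    f = sigmoid(W["xf"] @ x_t + W["hf"] @ h_t + b["f"])  # forget gate
    g = np.tanh(W["xg"] @ x_t + W["hg"] @ h_t + b["g"])  # candidate cell
    o = sigmoid(W["xo"] @ x_t + W["ho"] @ h_t + b["o"])  # output gate
    c_next = f * c_t + i * g        # c_{t+1} = f_t * c_t + i_t * g_t
    h_next = o * np.tanh(c_next)    # h_{t+1} = o_t * tanh(c_{t+1})
    return h_next, c_next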
@@ -343,10 +343,9 @@ def smooth_l1_loss(
 
    .. math::
 
-      l =
-         \begin{cases}
-           0.5 (x - y)^2, & \text{ if } & (x - y) < \beta \\
-           |x - y| - 0.5 \beta, & &  \text{otherwise}
+      l = \begin{cases}
+            0.5 (x - y)^2, & \text{if } (x - y) < \beta \\
+            |x - y| - 0.5 \beta, & \text{otherwise}
          \end{cases}
 
    Args:
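Transcribing the tidied cases into NumPy (illustrative sketch; the real function is mlx.nn.losses.smooth_l1_loss). Note the docstring writes the condition as (x - y) < beta, while the conventional smooth-L1 branch uses the absolute difference, which is what this sketch does:

import numpy as np

def smooth_l1(x, y, beta=1.0):
    diff = x - y
    # 0.5 * (x - y)^2        if |x - y| < beta
    # |x - y| - 0.5 * beta   otherwise
    return np.where(np.abs(diff) < beta, 0.5 * diff**2, np.abs(diff) - 0.5 * beta)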