post nanobind docs fixes and some updates (#889)

* post nanobind docs fixes and some updates
* one more doc nit
* fix for stubs and latex
@@ -156,7 +156,7 @@ def glorot_normal(
     (``fan_out``) units according to:
 
     .. math::
-        \sigma = \gamma \sqrt{\frac{2.0}{\text{fan_in} + \text{fan_out}}}
+        \sigma = \gamma \sqrt{\frac{2.0}{\text{fan\_in} + \text{fan\_out}}}
 
     For more details see the original reference: `Understanding the difficulty
     of training deep feedforward neural networks
@@ -199,7 +199,7 @@ def glorot_uniform(
     units according to:
 
     .. math::
-        \sigma = \gamma \sqrt{\frac{6.0}{\text{fan_in} + \text{fan_out}}}
+        \sigma = \gamma \sqrt{\frac{6.0}{\text{fan\_in} + \text{fan\_out}}}
 
     For more details see the original reference: `Understanding the difficulty
     of training deep feedforward neural networks
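Both hunks above escape the underscores in ``\text{fan_in}`` / ``\text{fan_out}`` so LaTeX renders them literally instead of as subscripts. As a quick sanity check of the formulas themselves, here is a plain-Python sketch (illustrative only, not the mlx implementation; the helper name is made up):

import math

def glorot_scales(fan_in: int, fan_out: int, gamma: float = 1.0):
    # Glorot normal: sigma = gamma * sqrt(2 / (fan_in + fan_out))
    sigma = gamma * math.sqrt(2.0 / (fan_in + fan_out))
    # Glorot uniform: samples come from [-limit, limit] with
    # limit = gamma * sqrt(6 / (fan_in + fan_out))
    limit = gamma * math.sqrt(6.0 / (fan_in + fan_out))
    return sigma, limit

print(glorot_scales(256, 128))  # (0.0721..., 0.125)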
@@ -166,7 +166,7 @@ class MaxPool1d(_Pool1d):
            \text{input}(N_i, \text{stride} \times t + m, C_j),
 
    where :math:`L_{out} = \left\lfloor \frac{L + 2 \times \text{padding} -
-   \text{kernel_size}}{\text{stride}}\right\rfloor + 1`.
+   \text{kernel\_size}}{\text{stride}}\right\rfloor + 1`.
 
    Args:
        kernel_size (int or tuple(int)): The size of the pooling window kernel.
@@ -205,7 +205,7 @@ class AvgPool1d(_Pool1d):
            \text{input}(N_i, \text{stride} \times t + m, C_j),
 
    where :math:`L_{out} = \left\lfloor \frac{L + 2 \times \text{padding} -
-   \text{kernel_size}}{\text{stride}}\right\rfloor + 1`.
+   \text{kernel\_size}}{\text{stride}}\right\rfloor + 1`.
 
    Args:
        kernel_size (int or tuple(int)): The size of the pooling window kernel.
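The corrected expression is the standard output-length rule for 1D pooling. As a worked check (hypothetical helper in plain Python, not part of mlx):

def pool_out_length(L: int, kernel_size: int, stride: int, padding: int = 0) -> int:
    # L_out = floor((L + 2 * padding - kernel_size) / stride) + 1
    return (L + 2 * padding - kernel_size) // stride + 1

print(pool_out_length(L=10, kernel_size=2, stride=2))            # 5
print(pool_out_length(L=7, kernel_size=3, stride=2, padding=1))  # 4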
@@ -246,8 +246,8 @@ class MaxPool2d(_Pool2d):
                \text{stride[1]} \times w + n, C_j),
        \end{aligned}
 
-   where :math:`H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel_size[0]}}{\text{stride[0]}}\right\rfloor + 1`,
-   :math:`W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel_size[1]}}{\text{stride[1]}}\right\rfloor + 1`.
+   where :math:`H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel\_size[0]}}{\text{stride[0]}}\right\rfloor + 1`,
+   :math:`W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel\_size[1]}}{\text{stride[1]}}\right\rfloor + 1`.
 
    The parameters ``kernel_size``, ``stride``, ``padding``, can either be:
 
@@ -295,8 +295,8 @@ class AvgPool2d(_Pool2d):
                \text{stride[1]} \times w + n, C_j),
        \end{aligned}
 
-   where :math:`H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel_size[0]}}{\text{stride[0]}}\right\rfloor + 1`,
-   :math:`W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel_size[1]}}{\text{stride[1]}}\right\rfloor + 1`.
+   where :math:`H_{out} = \left\lfloor\frac{H + 2 * \text{padding[0]} - \text{kernel\_size[0]}}{\text{stride[0]}}\right\rfloor + 1`,
+   :math:`W_{out} = \left\lfloor\frac{W + 2 * \text{padding[1]} - \text{kernel\_size[1]}}{\text{stride[1]}}\right\rfloor + 1`.
 
    The parameters ``kernel_size``, ``stride``, ``padding``, can either be:
 
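The 2D variants apply the same rule independently per spatial dimension. A sketch under the same caveats (hypothetical helper, not mlx API):

def pool_out_hw(H, W, kernel_size, stride, padding=(0, 0)):
    # H_out = floor((H + 2 * padding[0] - kernel_size[0]) / stride[0]) + 1
    H_out = (H + 2 * padding[0] - kernel_size[0]) // stride[0] + 1
    # W_out = floor((W + 2 * padding[1] - kernel_size[1]) / stride[1]) + 1
    W_out = (W + 2 * padding[1] - kernel_size[1]) // stride[1] + 1
    return H_out, W_out

print(pool_out_hw(32, 32, kernel_size=(2, 2), stride=(2, 2)))  # (16, 16)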
@@ -103,12 +103,12 @@ class GRU(Module):
 
    .. math::
 
-       \begin{align*}
+       \begin{aligned}
        r_t &= \sigma (W_{xr}x_t + W_{hr}h_t + b_{r}) \\
        z_t &= \sigma (W_{xz}x_t + W_{hz}h_t + b_{z}) \\
        n_t &= \text{tanh}(W_{xn}x_t + b_{n} + r_t \odot (W_{hn}h_t + b_{hn})) \\
        h_{t + 1} &= (1 - z_t) \odot n_t + z_t \odot h_t
-       \end{align*}
+       \end{aligned}
 
    The hidden state :math:`h` has shape ``NH`` or ``H`` depending on
    whether the input is batched or not. Returns the hidden state at each
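For readers checking the math: a minimal NumPy transcription of the four GRU equations, one time step at a time. Illustrative only; mlx.nn.GRU packs its parameters differently, and the weight names here are made up:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x_t, h_t, W, b):
    # r_t = sigma(W_xr x_t + W_hr h_t + b_r)
    r = sigmoid(W["xr"] @ x_t + W["hr"] @ h_t + b["r"])
    # z_t = sigma(W_xz x_t + W_hz h_t + b_z)
    z = sigmoid(W["xz"] @ x_t + W["hz"] @ h_t + b["z"])
    # n_t = tanh(W_xn x_t + b_n + r_t * (W_hn h_t + b_hn)), * elementwise
    n = np.tanh(W["xn"] @ x_t + b["n"] + r * (W["hn"] @ h_t + b["hn"]))
    # h_{t+1} = (1 - z_t) * n_t + z_t * h_t
    return (1.0 - z) * n + z * h_t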
@@ -206,14 +206,14 @@ class LSTM(Module):
    Concretely, for each element of the sequence, this layer computes:
 
    .. math::
-       \begin{align*}
+       \begin{aligned}
        i_t &= \sigma (W_{xi}x_t + W_{hi}h_t + b_{i}) \\
        f_t &= \sigma (W_{xf}x_t + W_{hf}h_t + b_{f}) \\
        g_t &= \text{tanh} (W_{xg}x_t + W_{hg}h_t + b_{g}) \\
        o_t &= \sigma (W_{xo}x_t + W_{ho}h_t + b_{o}) \\
        c_{t + 1} &= f_t \odot c_t + i_t \odot g_t \\
        h_{t + 1} &= o_t \text{tanh}(c_{t + 1})
-       \end{align*}
+       \end{aligned}
 
    The hidden state :math:`h` and cell state :math:`c` have shape ``NH``
    or ``H``, depending on whether the input is batched or not.
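And the matching NumPy sketch for one LSTM step (same caveats: illustrative, made-up parameter names, not the mlx layout):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x_t, h_t, c_t, W, b):
    i = sigmoid(W["xi"] @ x_t + W["hi"] @ h_t + b["i"])  # input gate
    f = sigmoid(W["xf"] @ x_t + W["hf"] @ h_t + b["f"])  # forget gate
    g = np.tanh(W["xg"] @ x_t + W["hg"] @ h_t + b["g"])  # candidate cell
    o = sigmoid(W["xo"] @ x_t + W["ho"] @ h_t + b["o"])  # output gate
    c_next = f * c_t + i * g        # c_{t+1} = f_t * c_t + i_t * g_t
    h_next = o * np.tanh(c_next)    # h_{t+1} = o_t * tanh(c_{t+1})
    return h_next, c_next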
@@ -343,10 +343,9 @@ def smooth_l1_loss(
 
    .. math::
 
-      l =
-         \begin{cases}
-           0.5 (x - y)^2, & \text{ if } & (x - y) < \beta \\
-           |x - y| - 0.5 \beta, & &  \text{otherwise}
+      l = \begin{cases}
+            0.5 (x - y)^2, & \text{if } (x - y) < \beta \\
+            |x - y| - 0.5 \beta, & \text{otherwise}
          \end{cases}
 
    Args:
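Transcribing the tidied cases into NumPy (illustrative sketch; the real function is mlx.nn.losses.smooth_l1_loss). Note the docstring writes the condition as (x - y) < beta, while the conventional smooth-L1 branch uses the absolute difference, which is what this sketch does:

import numpy as np

def smooth_l1(x, y, beta=1.0):
    diff = x - y
    # 0.5 * (x - y)^2        if |x - y| < beta
    # |x - y| - 0.5 * beta   otherwise
    return np.where(np.abs(diff) < beta, 0.5 * diff**2, np.abs(diff) - 0.5 * beta)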