diff --git a/docs/.gitignore b/docs/.gitignore
index 27834a90d..5c2693cb6 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1 +1,2 @@
 src/python/_autosummary*/
+src/python/nn/_autosummary*/
diff --git a/docs/src/examples/mlp.rst b/docs/src/examples/mlp.rst
index c003618ce..36890e95c 100644
--- a/docs/src/examples/mlp.rst
+++ b/docs/src/examples/mlp.rst
@@ -61,7 +61,10 @@ set:
     def eval_fn(model, X, y):
         return mx.mean(mx.argmax(model(X), axis=1) == y)
 
-Next, setup the problem parameters and load the data:
+Next, set up the problem parameters and load the data. To load the data, you need our
+`mnist data loader
+`_, which
+we will import as `mnist`.
 
 .. code-block:: python
 
diff --git a/docs/src/install.rst b/docs/src/install.rst
index 5244fac2f..92669ab6e 100644
--- a/docs/src/install.rst
+++ b/docs/src/install.rst
@@ -35,8 +35,7 @@ Probably you are using a non-native Python. The output of
 
 should be ``arm``. If it is ``i386`` (and you have M series machine) then you
 are using a non-native Python. Switch your Python to a native Python. A good
-way to do this is with
-`Conda `_.
+way to do this is with `Conda `_.
 
 
 Build from source
@@ -166,3 +165,27 @@ should point to the path to the built metal library.
 .. code-block:: shell
 
   xcrun -sdk macosx --show-sdk-version
+
+Troubleshooting
+^^^^^^^^^^^^^^^
+
+Metal not found
+~~~~~~~~~~~~~~~
+
+You see the following error when you try to build:
+
+.. code-block:: shell
+
+  error: unable to find utility "metal", not a developer tool or in PATH
+
+To fix this, first make sure you have Xcode installed:
+
+.. code-block:: shell
+
+  xcode-select --install
+
+Then set the active developer directory:
+
+.. code-block:: shell
+
+  sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
diff --git a/docs/src/python/nn.rst b/docs/src/python/nn.rst
index 93cfd8c78..bc19a8162 100644
--- a/docs/src/python/nn.rst
+++ b/docs/src/python/nn.rst
@@ -64,7 +64,6 @@ Quick Start with Neural Networks
         # gradient with respect to `mlp.trainable_parameters()`
         loss_and_grad = nn.value_and_grad(mlp, l2_loss)
 
-
 .. _module_class:
 
 The Module Class
@@ -86,20 +85,58 @@ name should not start with ``_``). It can be arbitrarily nested in other
 :meth:`Module.parameters` can be used to extract a nested dictionary with all
 the parameters of a module and its submodules.
 
-A :class:`Module` can also keep track of "frozen" parameters.
-:meth:`Module.trainable_parameters` returns only the subset of
-:meth:`Module.parameters` that is not frozen. When using
-:meth:`mlx.nn.value_and_grad` the gradients returned will be with respect to these
-trainable parameters.
+A :class:`Module` can also keep track of "frozen" parameters. See the
+:meth:`Module.freeze` method for more details. When using :meth:`mlx.nn.value_and_grad`
+the gradients returned will be with respect to these trainable parameters.
 
-Updating the parameters
+
+Updating the Parameters
 ^^^^^^^^^^^^^^^^^^^^^^^
 
 MLX modules allow accessing and updating individual parameters. However, most
 times we need to update large subsets of a module's parameters. This action is
 performed by :meth:`Module.update`.
 
-Value and grad
+
+Inspecting Modules
+^^^^^^^^^^^^^^^^^^
+
+The simplest way to see the model architecture is to print it. Following along with
+the above example, you can print the ``MLP`` with:
+
+.. code-block:: python
+
+    print(mlp)
+
+This will display:
+
+.. code-block:: shell
+
+  MLP(
+    (layers.0): Linear(input_dims=2, output_dims=128, bias=True)
+    (layers.1): Linear(input_dims=128, output_dims=128, bias=True)
+    (layers.2): Linear(input_dims=128, output_dims=10, bias=True)
+  )
+
+To get more detailed information on the arrays in a :class:`Module` you can use
+:func:`mlx.utils.tree_map` on the parameters. For example, to see the shapes of
+all the parameters in a :class:`Module` do:
+
+.. code-block:: python
+
+    from mlx.utils import tree_map
+    shapes = tree_map(lambda p: p.shape, mlp.parameters())
+
+As another example, you can count the number of parameters in a :class:`Module`
+with:
+
+.. code-block:: python
+
+    from mlx.utils import tree_flatten
+    num_params = sum(v.size for _, v in tree_flatten(mlp.parameters()))
+
+
+Value and Grad
 --------------
 
 Using a :class:`Module` does not preclude using MLX's high order function
@@ -133,62 +170,14 @@ In detail:
   :meth:`mlx.core.value_and_grad`
 
 .. autosummary::
+   :recursive:
    :toctree: _autosummary
 
    value_and_grad
+   Module
 
-Neural Network Layers
----------------------
+.. toctree::
 
-.. autosummary::
-   :toctree: _autosummary
-   :template: nn-module-template.rst
-
-   Embedding
-   ReLU
-   PReLU
-   GELU
-   SiLU
-   Step
-   SELU
-   Mish
-   Linear
-   Conv1d
-   Conv2d
-   LayerNorm
-   RMSNorm
-   GroupNorm
-   RoPE
-   MultiHeadAttention
-   Sequential
-
-Layers without parameters (e.g. activation functions) are also provided as
-simple functions.
-
-.. autosummary::
-   :toctree: _autosummary_functions
-   :template: nn-module-template.rst
-
-   gelu
-   gelu_approx
-   gelu_fast_approx
-   relu
-   prelu
-   silu
-   step
-   selu
-   mish
-
-Loss Functions
---------------
-
-.. autosummary::
-   :toctree: _autosummary_functions
-   :template: nn-module-template.rst
-
-   losses.cross_entropy
-   losses.binary_cross_entropy
-   losses.l1_loss
-   losses.mse_loss
-   losses.nll_loss
-   losses.kl_div_loss
+   nn/layers
+   nn/functions
+   nn/losses
diff --git a/docs/src/python/nn/functions.rst b/docs/src/python/nn/functions.rst
new file mode 100644
index 000000000..f13cbe7b4
--- /dev/null
+++ b/docs/src/python/nn/functions.rst
@@ -0,0 +1,23 @@
+.. _nn_functions:
+
+.. currentmodule:: mlx.nn
+
+Functions
+---------
+
+Layers without parameters (e.g. activation functions) are also provided as
+simple functions.
+
+.. autosummary::
+   :toctree: _autosummary_functions
+   :template: nn-module-template.rst
+
+   gelu
+   gelu_approx
+   gelu_fast_approx
+   relu
+   prelu
+   silu
+   step
+   selu
+   mish
diff --git a/docs/src/python/nn/layers.rst b/docs/src/python/nn/layers.rst
new file mode 100644
index 000000000..5628134d6
--- /dev/null
+++ b/docs/src/python/nn/layers.rst
@@ -0,0 +1,28 @@
+.. _layers:
+
+.. currentmodule:: mlx.nn
+
+Layers
+------
+
+.. autosummary::
+   :toctree: _autosummary
+   :template: nn-module-template.rst
+
+   Embedding
+   ReLU
+   PReLU
+   GELU
+   SiLU
+   Step
+   SELU
+   Mish
+   Linear
+   Conv1d
+   Conv2d
+   LayerNorm
+   RMSNorm
+   GroupNorm
+   RoPE
+   MultiHeadAttention
+   Sequential
diff --git a/docs/src/python/nn/losses.rst b/docs/src/python/nn/losses.rst
new file mode 100644
index 000000000..4808ce5ab
--- /dev/null
+++ b/docs/src/python/nn/losses.rst
@@ -0,0 +1,17 @@
+.. _losses:
+
+.. currentmodule:: mlx.nn.losses
+
+Loss Functions
+--------------
+
+.. autosummary::
+   :toctree: _autosummary_functions
+   :template: nn-module-template.rst
+
+   cross_entropy
+   binary_cross_entropy
+   l1_loss
+   mse_loss
+   nll_loss
+   kl_div_loss
diff --git a/docs/src/python/nn/module.rst b/docs/src/python/nn/module.rst
deleted file mode 100644
index e14ba96f4..000000000
--- a/docs/src/python/nn/module.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-mlx.nn.Module
-=============
-
-.. currentmodule:: mlx.nn
-
-.. autoclass:: Module
-   :members:
diff --git a/python/mlx/nn/layers/linear.py b/python/mlx/nn/layers/linear.py
index d4a503384..0c7a1b907 100644
--- a/python/mlx/nn/layers/linear.py
+++ b/python/mlx/nn/layers/linear.py
@@ -7,12 +7,21 @@ from mlx.nn.layers.base import Module
 
 
 class Linear(Module):
-    """Applies an affine transformation to the input.
+    r"""Applies an affine transformation to the input.
+
+    Concretely:
+
+    .. math::
+
+        y = W^\top x + b
+
+    where :math:`W` has shape ``[output_dims, input_dims]``.
 
     Args:
         input_dims (int): The dimensionality of the input features
         output_dims (int): The dimensionality of the output features
-        bias (bool): If set to False then the layer will not use a bias
+        bias (bool, optional): If set to ``False`` then the layer will
+            not use a bias. Default ``True``.
     """
 
     def __init__(self, input_dims: int, output_dims: int, bias: bool = True):
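
The updated ``Linear`` docstring documents the transform :math:`y = W^\top x + b` with
:math:`W` stored as ``[output_dims, input_dims]``. A minimal sketch of what that implies for
shapes (the sizes ``4`` and ``8`` here are arbitrary, chosen only for illustration):

.. code-block:: python

    import mlx.core as mx
    import mlx.nn as nn

    layer = nn.Linear(input_dims=4, output_dims=8, bias=True)

    x = mx.zeros((2, 4))  # a batch of two 4-dimensional inputs
    y = layer(x)          # the affine transform is applied to each row
    print(y.shape)        # a batch of two 8-dimensional outputs

    # The weight is stored as [output_dims, input_dims], as the docstring states.
    print(layer.parameters()["weight"].shape)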
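
The ``Inspecting Modules`` section added to ``nn.rst`` can be exercised the same way. A small
sketch assuming ``mlx`` is installed, using ``Sequential``, ``Linear``, and ``ReLU`` from the
new ``nn/layers.rst`` list (the layer sizes are arbitrary):

.. code-block:: python

    import mlx.nn as nn
    from mlx.utils import tree_flatten, tree_map

    model = nn.Sequential(
        nn.Linear(2, 128),
        nn.ReLU(),
        nn.Linear(128, 10),
    )

    # Print the architecture, as described in "Inspecting Modules".
    print(model)

    # Shapes of every parameter array in the model.
    shapes = tree_map(lambda p: p.shape, model.parameters())

    # Total number of parameters.
    num_params = sum(v.size for _, v in tree_flatten(model.parameters()))
    print(num_params)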