angelos's commit files

Author: Angelos Katharopoulos
Date: 2023-11-29 10:42:59 -08:00
parent 8ca7f9e8e9
commit d1f86272a2
56 changed files with 12350 additions and 0 deletions

python/mlx/nn/__init__.py Normal file

@@ -0,0 +1,3 @@
from mlx.nn.layers import *
from mlx.nn import losses
from mlx.nn.utils import value_and_grad

python/mlx/nn/layers/__init__.py Normal file

@@ -0,0 +1,23 @@
from mlx.nn.layers.base import Module
from mlx.nn.layers.activations import (
GELU,
ReLU,
SiLU,
gelu,
gelu_approx,
gelu_fast_approx,
relu,
silu,
)
from mlx.nn.layers.containers import Sequential
from mlx.nn.layers.convolution import Conv1d, Conv2d
from mlx.nn.layers.dropout import Dropout
from mlx.nn.layers.embedding import Embedding
from mlx.nn.layers.linear import Linear
from mlx.nn.layers.normalization import GroupNorm, LayerNorm, RMSNorm
from mlx.nn.layers.positional_encoding import RoPE, SinusoidalPositionalEncoding
from mlx.nn.layers.transformer import (
MultiHeadAttention,
TransformerEncoder,
TransformerEncoderLayer,
)
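
These re-exports make every layer importable directly from mlx.nn. As a quick sketch of how the exported pieces compose (assuming only the call conventions defined in the files below):

import mlx.core as mx
import mlx.nn as nn

# a tiny MLP assembled from the exported layers
model = nn.Sequential(
    nn.Linear(10, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
)
x = mx.array([[0.0] * 10] * 4)  # a batch of four dummy inputs
y = model(x)                    # shape (4, 1)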

python/mlx/nn/layers/activations.py Normal file

@@ -0,0 +1,129 @@
import math
import mlx.core as mx
from mlx.nn.layers.base import Module
# Create a Module subclass whose __call__ dispatches to the functional
# activation ``f`` and which inherits its docstring.
def _make_activation_module(f):
def decorator(klass):
klass.__doc__ = f.__doc__
klass.__call__ = lambda self, x: f(x)
return klass
return decorator
def relu(x):
"""Applies the Rectified Linear Unit.
Simply ``mx.maximum(x, 0)``.
"""
return mx.maximum(x, 0)
def silu(x):
r"""Applies the Sigmoid Linear Unit.
Applies :math:`x \sigma(x)` element wise, where :math:`\sigma(\cdot)` is
the logistic sigmoid.
"""
return x * mx.sigmoid(x)
def gelu(x):
"""Applies the Gaussian Error Linear Units function.
.. math::
\\textrm{GELU}(x) = x * \Phi(x)
where :math:`\Phi(x)` is the Gaussian CDF.
See also :func:`gelu_approx` and :func:`gelu_fast_approx` for faster
approximations.
"""
return x * (1 + mx.erf(x / math.sqrt(2))) / 2
def gelu_approx(x):
r"""An approximation to Gaussian Error Linear Unit.
See :func:`gelu` for the exact computation.
This function approximates ``gelu`` with a maximum absolute error :math:`<
0.0003` in the range :math:`[-6, 6]` using the following
.. math::
x = x \sigma\left(1.60033 x \left(1 + 0.0433603 x^2\right)\right)
where :math:`\sigma(\cdot)` is the logistic sigmoid.
"""
return x * mx.sigmoid(1.60033 * x * (1 + 0.0433603 * x.square()))
def gelu_fast_approx(x):
r"""A fast approximation to Gaussian Error Linear Unit.
See :func:`gelu` for the exact computation.
This function approximates ``gelu`` with a maximum absolute error :math:`<
0.015` in the range :math:`[-6, 6]` using the following
.. math::
x = x \sigma\left(1.773 x\right)
where :math:`\sigma(\cdot)` is the logistic sigmoid.
"""
return x * mx.sigmoid(1.773 * x)
@_make_activation_module(relu)
class ReLU(Module):
pass
@_make_activation_module(silu)
class SiLU(Module):
pass
class GELU(Module):
r"""Applies the Gaussian Error Linear Units.
.. math::
\textrm{GELU}(x) = x * \Phi(x)
where :math:`\Phi(x)` is the Gaussian CDF.
However, if ``approx`` is set to 'precise' or 'fast' it applies
.. math::
\textrm{GELUApprox}(x) &= x * \sigma\left(1.60033 * x \left(1 + 0.0433603 * x^2\right)\right) \\
\textrm{GELUFast}(x) &= x * \sigma\left(1.773 * x\right)
respectively.
See :func:`gelu`, :func:`gelu_approx` and :func:`gelu_fast_approx` for the
functional equivalents and information regarding error bounds.
Args:
        approx ('none' | 'precise' | 'fast'): Which approximation to GELU to use, if any.
"""
def __init__(self, approx="none"):
super().__init__()
if approx == "none":
self._act = gelu
elif approx == "precise":
self._act = gelu_approx
elif approx == "fast":
self._act = gelu_fast_approx
else:
raise ValueError(
f"The approximation should be in ['none', 'precise', 'fast'] but '{approx}' was given"
)
def __call__(self, x):
return self._act(x)
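
The docstrings above state concrete error bounds for the two approximations. A minimal check of those bounds (a sketch; assumes the package builds and the functions import as defined above):

import mlx.core as mx
from mlx.nn.layers.activations import gelu, gelu_approx, gelu_fast_approx

# sample [-6, 6], the range the error bounds are quoted for
x = mx.array([i / 100.0 - 6.0 for i in range(1201)])

err_precise = mx.max(mx.abs(gelu(x) - gelu_approx(x)))
err_fast = mx.max(mx.abs(gelu(x) - gelu_fast_approx(x)))
print(err_precise.item())  # expected < 0.0003
print(err_fast.item())     # expected < 0.015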

python/mlx/nn/losses.py Normal file

@@ -0,0 +1,6 @@
import mlx.core as mx
def cross_entropy(logits: mx.array, targets: mx.array, axis: int = -1):
score = mx.take_along_axis(logits, targets[..., None], axis).squeeze(-1)
return mx.logsumexp(logits, axis=axis) - score
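
This computes the per-example loss through the identity -log softmax(logits)[target] = logsumexp(logits) - logits[target]; no reduction is applied. A small usage sketch with made-up values:

import mlx.core as mx
from mlx.nn import losses

logits = mx.array([[2.0, 1.0, 0.1], [0.5, 2.5, 0.0]])
targets = mx.array([0, 1])  # one integer class index per row

per_example = losses.cross_entropy(logits, targets)
print(per_example)                  # one loss per example
print(mx.mean(per_example).item()) # reduce manually if a scalar is needed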

python/mlx/utils.py Normal file

@@ -0,0 +1,136 @@
def tree_map(fn, tree, *rest):
"""Applies ``fn`` to the leaves of the python tree ``tree`` and
returns a new collection with the results.
If ``rest`` is provided, every item is assumed to be a superset of ``tree``
and the corresponding leaves are provided as extra positional arguments to
``fn``. In that respect, :meth:`tree_map` is closer to :func:`itertools.starmap`
than to :func:`map`.
.. code-block:: python
import mlx.nn as nn
from mlx.utils import tree_map
model = nn.Linear(10, 10)
print(model.parameters().keys())
# dict_keys(['weight', 'bias'])
# square the parameters
model.update(tree_map(lambda x: x*x, model.parameters()))
Args:
fn (Callable): The function that processes the leaves of the tree
tree (Any): The main python tree that will be iterated upon
rest (Tuple[Any]): Extra trees to be iterated together with tree
Returns:
A python tree with the new values returned by ``fn``.
"""
if isinstance(tree, list):
return [
tree_map(fn, child, *(r[i] for r in rest)) for i, child in enumerate(tree)
]
elif isinstance(tree, tuple):
return tuple(
tree_map(fn, child, *(r[i] for r in rest)) for i, child in enumerate(tree)
)
elif isinstance(tree, dict):
return {
k: tree_map(fn, child, *(r[k] for r in rest)) for k, child in tree.items()
}
else:
return fn(tree, *rest)
def tree_flatten(tree, prefix="", is_leaf=None):
"""Flattens a python tree to a list of key, value tuples.
The keys are using the dot notation to define trees of arbitrary depth and
complexity.
.. code-block:: python
from mlx.utils import tree_flatten
print(tree_flatten([[[0]]]))
# [("0.0.0", 0)]
print(tree_flatten([[[0]]], ".hello"))
# [("hello.0.0.0", 0)]
.. note::
Dictionaries should have keys that are valid python identifiers.
Args:
tree (Any): The python tree to be flattened.
prefix (str): A prefix to use for the keys. The first character is
always discarded.
is_leaf (Callable): An optional callable that returns True if the
passed object is considered a leaf or False otherwise.
Returns:
List[Tuple[str, Any]]: The flat representation of the python tree.
"""
flat_tree = []
if is_leaf is None or not is_leaf(tree):
if isinstance(tree, (list, tuple)):
for i, t in enumerate(tree):
flat_tree.extend(tree_flatten(t, f"{prefix}.{i}", is_leaf))
return flat_tree
if isinstance(tree, dict):
for k, t in tree.items():
flat_tree.extend(tree_flatten(t, f"{prefix}.{k}", is_leaf))
return flat_tree
return [(prefix[1:], tree)]
def tree_unflatten(tree):
"""Recreate a python tree from its flat representation.
.. code-block:: python
from mlx.utils import tree_unflatten
d = tree_unflatten([("hello.world", 42)])
print(d)
# {"hello": {"world": 42}}
Args:
tree (List[Tuple[str, Any]]): The flat representation of a python tree.
For instance as returned by :meth:`tree_flatten`.
Returns:
A python tree.
"""
if len(tree) == 1 and tree[0][0] == "":
return tree[0][1]
try:
int(tree[0][0].split(".", maxsplit=1)[0])
is_list = True
except ValueError:
is_list = False
# collect children
children = {}
for key, value in tree:
current_idx, *next_idx = key.split(".", maxsplit=1)
next_idx = "" if not next_idx else next_idx[0]
if current_idx not in children:
children[current_idx] = []
children[current_idx].append((next_idx, value))
# recursively map them to the original container
if is_list:
keys = sorted((int(idx), idx) for idx in children.keys())
l = []
for i, k in keys:
while i > len(l):
l.append({})
l.append(tree_unflatten(children[k]))
return l
else:
return {k: tree_unflatten(v) for k, v in children.items()}
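
A short sketch tying the three helpers together (plain Python containers, nothing mlx-specific):

from mlx.utils import tree_flatten, tree_map, tree_unflatten

params = {"w": 1.0, "b": [2.0, 3.0]}
grads = {"w": 0.5, "b": [0.1, 0.2]}

# tree_map with an extra tree: an SGD-style update over matching leaves
new_params = tree_map(lambda p, g: p - 0.1 * g, params, grads)

# the flatten / unflatten round-trip reconstructs the structure
flat = tree_flatten(new_params)
print(flat)  # [('w', 0.95), ('b.0', 1.99...), ('b.1', 2.98...)]
assert tree_unflatten(flat) == new_params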

python/src/array.cpp Normal file

File diff suppressed because it is too large

python/src/device.cpp Normal file

@@ -0,0 +1,42 @@
#include <sstream>
#include <pybind11/pybind11.h>
#include "mlx/device.h"
#include "mlx/utils.h"
namespace py = pybind11;
using namespace py::literals;
using namespace mlx::core;
void init_device(py::module_& m) {
py::enum_<Device::DeviceType>(m, "DeviceType")
.value("cpu", Device::DeviceType::cpu)
.value("gpu", Device::DeviceType::gpu)
.export_values()
.def(
"__eq__",
[](const Device::DeviceType& d1, const Device& d2) {
return d1 == d2;
},
py::prepend());
py::class_<Device>(m, "Device")
.def(py::init<Device::DeviceType, int>(), "type"_a, "index"_a = 0)
.def_readonly("type", &Device::type)
.def(
"__repr__",
[](const Device& d) {
std::ostringstream os;
os << d;
return os.str();
})
.def("__eq__", [](const Device& d1, const Device& d2) {
return d1 == d2;
});
py::implicitly_convertible<Device::DeviceType, Device>();
m.def("default_device", &default_device);
m.def("set_default_device", &set_default_device, "device"_a);
}
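
From Python these bindings behave roughly as follows (a sketch; cpu and gpu are the enum values exported by export_values above):

import mlx.core as mx

print(mx.default_device())     # repr comes from operator<< on Device
mx.set_default_device(mx.cpu)  # DeviceType converts implicitly to Device

# the prepended __eq__ overload makes comparison work in either order
print(mx.default_device() == mx.cpu)  # True
print(mx.cpu == mx.default_device())  # True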

python/src/metal.cpp Normal file

@@ -0,0 +1,12 @@
#include <pybind11/pybind11.h>
#include "mlx/backend/metal/metal.h"
namespace py = pybind11;
using namespace mlx::core;
void init_metal(py::module_& m) {
py::module_ metal = m.def_submodule("metal", "mlx.metal");
metal.def("is_available", &metal::is_available);
}
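
A typical use is gating GPU work on Metal availability, e.g.:

import mlx.core as mx

device = mx.gpu if mx.metal.is_available() else mx.cpu
mx.set_default_device(device)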

python/src/stream.cpp Normal file

@@ -0,0 +1,32 @@
#include <sstream>
#include <pybind11/pybind11.h>
#include "mlx/stream.h"
#include "mlx/utils.h"
namespace py = pybind11;
using namespace py::literals;
using namespace mlx::core;
void init_stream(py::module_& m) {
py::class_<Stream>(m, "Stream")
.def(py::init<int, Device>(), "index"_a, "device"_a)
.def_readonly("device", &Stream::device)
.def(
"__repr__",
[](const Stream& s) {
std::ostringstream os;
os << s;
return os.str();
})
.def("__eq__", [](const Stream& s1, const Stream& s2) {
return s1 == s2;
});
py::implicitly_convertible<Device::DeviceType, Device>();
m.def("default_stream", &default_stream, "device"_a);
m.def("set_default_stream", &set_default_stream, "stream"_a);
m.def("new_stream", &new_stream, "device"_a);
}
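
A sketch of the stream API from Python (assumes a machine where the gpu device is available):

import mlx.core as mx

s = mx.new_stream(mx.gpu)  # DeviceType converts implicitly to Device
print(s.device == mx.gpu)  # True
mx.set_default_stream(s)
print(mx.default_stream(mx.gpu) == s)  # True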

python/tests/test_bf16.py Normal file

@@ -0,0 +1,188 @@
import unittest
from itertools import permutations
import math
import mlx.core as mx
import numpy as np
import mlx_tests
try:
import torch
has_torch = True
except ImportError:
has_torch = False
class TestBF16(mlx_tests.MLXTestCase):
def __test_ops(
self,
ref_op, # Function that outputs array_like
mlx_op, # Function that outputs array_like
np_args, # Numpy arguments
ref_transform=lambda x: x,
mlx_transform=lambda x: mx.array(x),
atol=1e-5,
):
ref_args = map(ref_transform, np_args)
mlx_args = map(mlx_transform, np_args)
r_ref = ref_op(*ref_args)
r_mlx = mlx_op(*mlx_args)
self.assertTrue(np.allclose(r_mlx, r_ref, atol=atol))
def __default_test(
self,
op,
np_args,
simple_transform=lambda x: x,
atol_np=1e-3,
atol_torch=1e-5,
np_kwargs=dict(),
mlx_kwargs=dict(),
torch_kwargs=dict(),
torch_op=None,
):
with self.subTest(reference="numpy"):
            def np_transform(x):
                # round-trip through bfloat16 so the numpy reference sees the
                # same precision loss as the mlx result it is compared against
                x_mx_bf16 = mx.array(x).astype(mx.bfloat16)
                x_mx_fp32 = x_mx_bf16.astype(mx.float32)
                return np.asarray(x_mx_fp32)
def mlx_fn(*args):
out_bf16 = getattr(mx, op)(*args, **mlx_kwargs)
return np.asarray(out_bf16.astype(mx.float32))
def np_fn(*args):
out_fp32 = getattr(np, op)(*args, **np_kwargs)
return np_transform(out_fp32)
ref_op = np_fn
mlx_op = mlx_fn
ref_transform = lambda x: simple_transform(np_transform(x))
mlx_transform = lambda x: simple_transform(mx.array(x).astype(mx.bfloat16))
self.__test_ops(
ref_op,
mlx_op,
np_args,
ref_transform=ref_transform,
mlx_transform=mlx_transform,
atol=atol_np,
)
if has_torch:
with self.subTest(reference="torch"):
torch_op = op if torch_op is None else torch_op
def torch_fn(*args):
out_bf16 = getattr(torch, torch_op)(*args, **torch_kwargs)
return out_bf16.to(torch.float32).numpy()
ref_op = torch_fn
ref_transform = lambda x: simple_transform(
torch.from_numpy(x).to(torch.bfloat16)
)
self.__test_ops(
ref_op,
mlx_op,
np_args,
ref_transform=ref_transform,
mlx_transform=mlx_transform,
atol=atol_torch,
)
def test_unary_ops(self):
x = np.random.rand(18, 28, 38)
for op in ["abs", "exp", "log", "square", "sqrt"]:
with self.subTest(op=op):
np_args = (x.astype(np.float32),)
self.__default_test(op, np_args)
def test_binary_ops(self):
x = np.random.rand(18, 28, 38)
y = np.random.rand(18, 28, 38)
for op in ["add", "subtract", "multiply", "divide", "maximum", "minimum"]:
with self.subTest(op=op):
np_args = (
x.astype(np.float32),
y.astype(np.float32),
)
self.__default_test(op, np_args, simple_transform=lambda x: x)
self.__default_test(op, np_args, simple_transform=lambda x: x[:1])
self.__default_test(op, np_args, simple_transform=lambda x: x[:, :1])
def test_reduction_ops(self):
x = np.random.rand(18, 28, 38).astype(np.float32)
for op in ("min", "max"):
with self.subTest(op=op):
for axes in (0, 1, 2, (0, 1), (0, 2), (1, 2), (0, 1, 2)):
with self.subTest(axes=axes):
np_args = (x.astype(np.float32),)
self.__default_test(
op,
np_args,
np_kwargs={"axis": axes},
mlx_kwargs={"axis": axes},
torch_kwargs={"dim": axes},
torch_op="a" + op,
)
def test_arg_reduction_ops(self):
data = np.random.rand(10, 12, 13).astype(np.float32)
x = mx.array(data).astype(mx.bfloat16)
data = np.asarray(x.astype(mx.float32))
for op in ["argmin", "argmax"]:
for axis in range(3):
for kd in [True, False]:
a = getattr(mx, op)(x, axis, kd)
b = getattr(np, op)(data, axis, keepdims=kd)
a = a.astype(mx.float32)
self.assertEqual(a.tolist(), b.tolist())
for op in ["argmin", "argmax"]:
a = getattr(mx, op)(x, keepdims=True)
b = getattr(np, op)(data, keepdims=True)
a = a.astype(mx.float32)
self.assertEqual(a.tolist(), b.tolist())
a = getattr(mx, op)(x)
b = getattr(np, op)(data)
a = a.astype(mx.float32)
self.assertEqual(a.item(), b)
def test_blas_ops(self):
        # bf16 matmul is only exercised when the GPU is the default device
        if mx.default_device() != mx.gpu:
            return
def test_blas(shape_x, shape_y):
np.random.seed(42)
with self.subTest(shape_x=shape_x, shape_y=shape_y):
x = np.random.normal(0.0, 1.0 / shape_x[-1], size=shape_x)
y = np.random.normal(0.0, 1.0 / shape_x[-1], size=shape_y)
np_args = (
x.astype(np.float32),
y.astype(np.float32),
)
op = "matmul"
self.__default_test(op, np_args, atol_np=1e-3, atol_torch=1e-3)
for shape_x, shape_y in [
[(32, 32), (32, 32)],
[(23, 57), (57, 1)],
[(1, 3), (3, 128)],
[(8, 128, 768), (768, 16)],
]:
test_blas(shape_x, shape_y)
if __name__ == "__main__":
unittest.main()