Mirror of https://github.com/ml-explore/mlx.git, synced 2025-06-25 01:41:17 +08:00
Add Lion optimizer (#209)
* Add Lion optimizer
* Update acknowledgements also with past contributions
This commit is contained in:
parent f40d17047d
commit 4912ff3ec2
ACKNOWLEDGMENTS.md
@@ -8,6 +8,7 @@ with a short description of your contribution(s) below. For example:
 MLX was developed with contributions from the following individuals:
 
 - Juarez Bochi: Fixed bug in cross attention.
+- Justin Deschenaux: Sine, Cosine, arange, randint, truncated normal, bernoulli, lion optimizer, linear and logistic regression python example.
 
 # Third-Party Software
docs/src/python/optimizers.rst
@@ -44,3 +44,4 @@ model's parameters and the **optimizer state**.
    Adam
    AdamW
    Adamax
+   Lion
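The optimizers listed above live in the mlx.optimizers module and share the same construction and update pattern. The sketch below shows the newly listed Lion next to the usage one would have with AdamW, using the 3-10x smaller learning rate recommended in its docstring; the mlx.nn helpers, nn.value_and_grad, and optimizer.update calls are assumptions about the surrounding MLX API rather than part of this diff.

# Hedged usage sketch (not part of this diff): module paths and helper calls
# below are assumptions about the MLX API around this commit.
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim

model = nn.Linear(4, 1)

def loss_fn(model, x, y):
    return nn.losses.mse_loss(model(x), y)

# The Lion docstring below recommends a 3-10x smaller learning rate (and a
# correspondingly larger weight decay) than one would use with AdamW.
optimizer = optim.Lion(learning_rate=1e-4, betas=[0.9, 0.99], weight_decay=0.1)

x = mx.random.normal((8, 4))
y = mx.random.normal((8, 1))
loss, grads = nn.value_and_grad(model, loss_fn)(model, x, y)
optimizer.update(model, grads)                    # sign-based Lion step
mx.eval(model.parameters(), optimizer.state)      # materialize lazy updates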
python/mlx/optimizers.py
@@ -442,3 +442,59 @@ class Adamax(Adam):
         state["v"] = v
 
         return parameter - lr * m / (v + eps)
+
+
+class Lion(Optimizer):
+    r"""Implementation of the Lion optimizer [1].
+
+    Since updates are computed through the sign operation, they tend to
+    have larger norm than for other optimizers such as SGD and Adam.
+    We recommend a learning rate that is 3-10x smaller than AdamW and a
+    weight decay 3-10x larger than AdamW to maintain the strength
+    (lr * wd). Our Lion implementation follows the original paper. In
+    detail,
+
+    [1]: Chen, X. Symbolic Discovery of Optimization Algorithms. arXiv
+    preprint arXiv:2302.06675.
+
+    .. math::
+
+        c_{t + 1} &= \beta_1 m_t + (1 - \beta_1) g_t \\
+        m_{t + 1} &= \beta_2 m_t + (1 - \beta_2) g_t \\
+        w_{t + 1} &= w_t - \eta (\text{sign}(c_t) + \lambda w_t)
+
+    Args:
+        learning_rate (float): The learning rate :math:`\eta`.
+        betas (Tuple[float, float], optional): The coefficients
+          :math:`(\beta_1, \beta_2)` used for computing the gradient
+          momentum and update direction. Default: ``(0.9, 0.99)``
+        weight_decay (float, optional): The weight decay :math:`\lambda`. Default: ``0.0``
+    """
+
+    def __init__(
+        self,
+        learning_rate: float,
+        betas: List[float] = [0.9, 0.99],
+        weight_decay: float = 0.0,
+    ):
+        super().__init__()
+
+        self.learning_rate = learning_rate
+        self.betas = betas
+        self.weight_decay = weight_decay
+
+    def apply_single(
+        self, gradient: mx.array, parameter: mx.array, state: OptimizerState
+    ):
+        """Performs the Lion parameter update and stores :math:`m`
+        in the optimizer state."""
+        lr = self.learning_rate
+        b1, b2 = self.betas
+        weight_decay = self.weight_decay
+
+        m = state.get("m", gradient)
+        c = b1 * m + (1 - b1) * gradient
+        state["m"] = b2 * m + (1 - b2) * gradient
+        if weight_decay > 0:
+            parameter = (1 - lr * weight_decay) * parameter
+        return parameter - lr * mx.sign(c)
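For readers checking the docstring math against the code, here is a minimal single-step sketch that mirrors apply_single above with concrete numbers; the values and the mlx.core import are illustrative assumptions, not part of the commit.

# One Lion step on a toy parameter, mirroring apply_single line by line.
import mlx.core as mx

lr, b1, b2, weight_decay = 1e-4, 0.9, 0.99, 0.1

parameter = mx.array([1.0, -2.0, 3.0])
gradient = mx.array([0.5, -0.5, 0.25])
m = gradient                                      # first step: state.get("m", gradient)

c = b1 * m + (1 - b1) * gradient                  # update direction uses beta_1
m = b2 * m + (1 - b2) * gradient                  # stored momentum uses beta_2

parameter = (1 - lr * weight_decay) * parameter   # decoupled weight decay
parameter = parameter - lr * mx.sign(c)           # sign-based step
print(parameter)

Because the step is lr * sign(c), every coordinate moves by exactly lr (before weight decay), which is why the docstring recommends a smaller learning rate than AdamW.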