Use fast rope (#945)

* use fast rope
* fix llama
* use fast rope for llama3.1
* requires unreleased mlx
* fix su
* fix deepseek v2
* only one of base or freqs
* nit
* fix
* hard code freqs
2025-12-16 02:08:55 +08:00 · 2024-08-23 13:18:51 -07:00
parent 58591a1b41
commit 6731254e76
7 changed files with 65 additions and 137 deletions
--- a/llms/mlx_lm/models/su_rope.py
+++ b/llms/mlx_lm/models/su_rope.py
@@ -4,15 +4,14 @@ import math
 from typing import List, Union

 import mlx.core as mx
+import mlx.nn as nn


-class SuScaledRotaryEmbedding:
+class SuScaledRotaryEmbedding(nn.Module):
    def __init__(
        self,
        dims: int,
-        traditional: bool = False,
        base: float = 10000.0,
-        scale: float = 1.0,
        max_position_embeddings: int = 131072,
        original_max_position_embeddings: int = 4096,
        short_factor: Union[List[float], float] = 1.0,
@@ -23,10 +22,7 @@ class SuScaledRotaryEmbedding:

        Args:
            dims (int): The feature dimensions to be rotated.
-            traditional (bool, optional): Unused. Default: ``False``.
            base (int, optional): Base for the exponential scaling.
-            scale (float, optional): The scale used to scale the positions.
-              Default: ``1.0``.
            max_position_embeddings (int, optional): The maximum sequence
              length that this model was trained with. This is used to determine
              the size of the original RoPE embeddings when using long scaling.
@@ -42,40 +38,23 @@ class SuScaledRotaryEmbedding:
              factors for sequences of length greater than
              ``original_max_position_embeddings``.  Default: ``1.0``.
        """
-        self.inv_freq_short = 1.0 / (
-            mx.array(short_factor, dtype=mx.float32)
-            * base ** (mx.arange(0, dims, 2, dtype=mx.float32) / dims)
-        )
-        self.inv_freq_long = 1.0 / (
-            scale
-            * mx.array(long_factor, dtype=mx.float32)
-            * base ** (mx.arange(0, dims, 2, dtype=mx.float32) / dims)
-        )
+        super().__init__()
+        freqs = base ** (mx.arange(0, dims, 2, dtype=mx.float32) / dims)
+        self._freqs = mx.array(long_factor, dtype=mx.float32) * freqs
        self.original_max_position_embeddings = original_max_position_embeddings
-        self.scaling_factor = math.sqrt(
+        self.scale = math.sqrt(
            1
            + math.log(max_position_embeddings / original_max_position_embeddings)
            / math.log(original_max_position_embeddings)
        )

-    def _get_cos_sin(self, offset, L):
-        position_ids = mx.arange(offset, offset + L, dtype=mx.float32)
-        inv_freq = (
-            self.inv_freq_long
-            if (offset + L) > self.original_max_position_embeddings
-            else self.inv_freq_short
-        )
-        freqs = position_ids[:, None] * inv_freq[None, :]
-        emb = mx.concatenate([freqs, freqs], axis=-1)
-        cos = mx.cos(emb) * self.scaling_factor
-        sin = mx.sin(emb) * self.scaling_factor
-        return cos, sin
-
    def __call__(self, x, offset: int = 0):
-        def _rotate_half(_x):
-            midpoint = _x.shape[-1] // 2
-            x1, x2 = _x[..., :midpoint], _x[..., midpoint:]
-            return mx.concatenate([-x2, x1], axis=-1)
-
-        cos, sin = self._get_cos_sin(offset, x.shape[2])
-        return (x * cos) + (_rotate_half(x) * sin)
+        return mx.fast.rope(
+            self.scale * x,
+            x.shape[-1],
+            traditional=False,
+            base=None,
+            scale=1.0,
+            offset=offset,
+            freqs=self._freqs,
+        )