mlx-examples/llms/mlx_lm/models/olmo.py

# Copyright © 2023-2024 Apple Inc.

import sys
from dataclasses import dataclass
from typing import Any, Optional, Tuple

import mlx.core as mx
import mlx.nn as nn

from .base import BaseModelArgs, create_attention_mask

# Import guard for the optional ai2-olmo dependency. `hf_olmo` is not
# referenced anywhere else in the visible code, so it is presumably imported
# only for its side effects.
# NOTE(review): likely registers the OLMo config/tokenizer classes with
# `transformers` -- confirm against the ai2-olmo package.
# When the package is missing, print install instructions and terminate the
# process with exit status 1 instead of propagating the ImportError.
try:
    import hf_olmo
except ImportError:
    print("To run olmo install ai2-olmo: pip install ai2-olmo")
    sys.exit(1)


@dataclass
class ModelArgs(BaseModelArgs):
    """Hyper-parameters consumed by the OLMo modules defined in this file."""

    # Identifier of the model family; stored on `Model` as-is.
    model_type: str
    # Width of the residual stream (embedding and attention dimension).
    d_model: int
    # Number of `TransformerBlock`s stacked in `Transformer`.
    n_layers: int
    # Output width of the feed-forward up-projection (split in two halves
    # inside the block). May be None, in which case it is derived below.
    mlp_hidden_size: int
    # Number of attention heads; the per-head size is d_model // n_heads.
    n_heads: int
    # Not read by the visible code; kept as part of the configuration.
    vocab_size: int
    # Number of rows in the token embedding table and width of the logits.
    embedding_size: int
    # Base passed to `nn.RoPE`.
    rope_theta: float = 10000
    # Forwarded to `nn.RoPE(traditional=...)`.
    rope_traditional: bool = False
    # Multiplier used to derive `mlp_hidden_size` when it is None.
    mlp_ratio: int = 4
    # When True the embedding table doubles as the output projection.
    weight_tying: bool = False

    def __post_init__(self):
        """Fill in `mlp_hidden_size` from `mlp_ratio * d_model` if unset."""
        if self.mlp_hidden_size is None:
            self.mlp_hidden_size = self.mlp_ratio * self.d_model


class TransformerBlock(nn.Module):
    """One decoder block: normalised self-attention then a gated feed-forward.

    Both sub-layers are applied to a non-affine LayerNorm of their input and
    their result is added back onto the un-normalised input (residual).
    """

    def __init__(self, args: ModelArgs):
        """Create the projections, norms and rotary embedding from ``args``."""
        super().__init__()
        self.n_heads = args.n_heads
        width = args.d_model
        hidden = args.mlp_hidden_size

        # Feed-forward: the up-projection output is later split into two
        # halves, so the down-projection only sees half of the hidden width.
        self.ff_proj = nn.Linear(width, hidden, bias=False)
        self.ff_out = nn.Linear(hidden // 2, width, bias=False)

        self.att_norm = nn.LayerNorm(width, affine=False)
        self.ff_norm = nn.LayerNorm(width, affine=False)

        per_head = width // self.n_heads
        self.scale = per_head**-0.5

        # A single fused projection yields queries, keys and values.
        self.att_proj = nn.Linear(width, 3 * width, bias=False)
        self.attn_out = nn.Linear(width, width, bias=False)

        self.rope = nn.RoPE(
            per_head,
            traditional=args.rope_traditional,
            base=args.rope_theta,
        )

        self.args = args

    def attend(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        """Multi-head self-attention over ``x``.

        Args:
            x: Rank-3 input; the last axis must match ``d_model``
                (presumably laid out as batch, sequence, features).
            mask: Optional additive mask added to the attention scores.
            cache: Optional object exposing ``offset`` and
                ``update_and_fetch(keys, values)``; when given, rotary
                positions start at ``cache.offset`` and the keys/values
                returned by the cache are attended over.

        Returns:
            The attention output after the ``attn_out`` projection.
        """
        batch, seq_len, _ = x.shape

        def split_heads(t: mx.array) -> mx.array:
            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
            return t.reshape(batch, seq_len, self.n_heads, -1).transpose(0, 2, 1, 3)

        q, k, v = (split_heads(t) for t in mx.split(self.att_proj(x), 3, axis=-1))

        if cache is None:
            q = self.rope(q)
            k = self.rope(k)
        else:
            # Rotate with the position offset before the cache is updated.
            q = self.rope(q, offset=cache.offset)
            k = self.rope(k, offset=cache.offset)
            k, v = cache.update_and_fetch(k, v)

        weights = (q * self.scale) @ k.transpose(0, 1, 3, 2)
        if mask is not None:
            weights += mask

        # Softmax is evaluated in float32 and cast back to the score dtype.
        score_dtype = weights.dtype
        weights = mx.softmax(weights.astype(mx.float32), axis=-1)
        weights = weights.astype(score_dtype)

        merged = (weights @ v).transpose(0, 2, 1, 3).reshape(batch, seq_len, -1)
        return self.attn_out(merged)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        """Run attention and the gated feed-forward, each with a residual add."""
        h = x + self.attend(self.att_norm(x), mask, cache)

        # First half is the value path, second half is the SiLU gate.
        value, gate = mx.split(self.ff_proj(self.ff_norm(h)), 2, axis=-1)

        return h + self.ff_out(nn.silu(gate) * value)


class Transformer(nn.Module):
    """Token embedding, a stack of ``TransformerBlock``s and the output head.

    When ``args.weight_tying`` is true no separate output projection is
    created and the embedding table is reused to produce the logits.
    """

    def __init__(self, args: ModelArgs):
        """Build the embedding, blocks, optional output head and final norm."""
        super().__init__()
        self.n_layers = args.n_layers
        self.weight_tying = args.weight_tying

        self.wte = nn.Embedding(args.embedding_size, args.d_model)
        self.blocks = [TransformerBlock(args=args) for _ in range(args.n_layers)]
        if not self.weight_tying:
            # Only untied models own a dedicated output projection.
            self.ff_out = nn.Linear(args.d_model, args.embedding_size, bias=False)
        self.norm = nn.LayerNorm(args.d_model, affine=False)

    def __call__(
        self,
        inputs: mx.array,
        cache=None,
    ):
        """Compute logits for ``inputs``.

        Args:
            inputs: Token ids looked up in the embedding table.
            cache: Optional sequence with one cache entry per block; when
                None every block runs without a cache.

        Returns:
            The logits array. Both the tied and the untied head return the
            logits alone, so callers get the same type regardless of
            ``weight_tying``.
        """
        h = self.wte(inputs)

        # The mask must be built from the caller's cache, before `cache` is
        # replaced by the per-block list of None placeholders below.
        mask = create_attention_mask(h, cache)

        if cache is None:
            cache = [None] * len(self.blocks)

        for block, c in zip(self.blocks, cache):
            h = block(h, mask, c)

        h = self.norm(h)

        if self.weight_tying:
            # Previously this branch returned `(logits, cache)` while the
            # untied branch returned bare logits; return logits only so the
            # two code paths agree.
            return self.wte.as_linear(h)

        return self.ff_out(h)


class OlmoModel(nn.Module):
    """Wrapper that exposes the ``Transformer`` under the ``transformer`` attribute."""

    def __init__(self, args: ModelArgs):
        """Instantiate the wrapped ``Transformer`` from ``args``."""
        super().__init__()
        self.transformer = Transformer(args)

    def __call__(self, inputs: mx.array, cache=None):
        """Delegate to the wrapped transformer and return its result unchanged."""
        result = self.transformer(inputs, cache)
        return result


class Model(nn.Module):
    """Top-level OLMo module: keeps the config and wraps ``OlmoModel``."""

    def __init__(self, args: ModelArgs):
        """Store ``args``/``model_type`` and build the inner ``OlmoModel``."""
        super().__init__()
        self.model_type = args.model_type
        self.model = OlmoModel(args)
        self.args = args

    def __call__(self, inputs: mx.array, cache=None):
        """Forward ``inputs`` (and the optional ``cache``) to the inner model."""
        result = self.model(inputs, cache)
        return result

    @property
    def layers(self):
        """The list of transformer blocks held by the inner transformer."""
        transformer = self.model.transformer
        return transformer.blocks
Handle longer prompt/generation (#931) * rebase * nits * nit * fix rotating cache with step prefill * update version 2024-08-17 06:28:39 +08:00			`# Copyright © 2023-2024 Apple Inc.`

More cache improvements (#1015) * fix rotating kv cache for chat use case * reorg + fixes to caching, unify prompt caching across types and use cases for e.g. caching during a chat * nit in chat * fix tests * fix tests * fix tests * docs * chat command * comments + docs * Define meta_state on all Cache implementations * fixes + trim_prompt_cache api * fix default model --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com> 2024-10-08 11:45:51 +08:00			`import sys`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`from dataclasses import dataclass`
More cache improvements (#1015) * fix rotating kv cache for chat use case * reorg + fixes to caching, unify prompt caching across types and use cases for e.g. caching during a chat * nit in chat * fix tests * fix tests * fix tests * docs * chat command * comments + docs * Define meta_state on all Cache implementations * fixes + trim_prompt_cache api * fix default model --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com> 2024-10-08 11:45:51 +08:00			`from typing import Any, Optional, Tuple`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
			`import mlx.core as mx`
			`import mlx.nn as nn`

Unify attention mask in LLMs (#911) * Unify attention mask creation in LLMs. Currently, each model implementation in `mlx-examples/llms/models` has ad-hoc code to create a mask for the attention mechanism. This usually takes the form: ``` mask = None if h.shape[1] > 1: mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) mask = mask.astype(h.dtype) ``` This correctly creates a mask only if the input consists of more than one token. But this code assumes the multi-token input is at the beginning of inference. If, for example, we are evaluating multiple tokens because of speculative decoding or prompt cache reuse, this mask will not have the correct shape and and will cause the raising of an exception in the attention computation. Some of the models correctly implement the mask creation with code like this: ``` mask = None if h.shape[1] > 1: mask = create_additive_causal_mask( h.shape[1], cache[0].offset if cache is not None else 0 ) mask = mask.astype(h.dtype) ``` This commit unifies the attention mask creation for all models with a new function `create_attention_mask`, reducing code duplication and helping all models support inference performance enhancements like those mentioned above. * Allow batches in LLM key-value cache The current implementation of the LLM key-value cache assumes that the input batch is of size 1. Input batching (evaluating multiple alterative inputs at the same time) can be a valuable tool for speculative sampling and other techniques. This change removes the hard-coded batch size from the code that resizes the key-value cache. * Simplify causal mask creation Use the same codepath regardless of whether there's an offset or not. Addresses [this comment](https://github.com/ml-explore/mlx-examples/pull/911#discussion_r1691459717). * Use old-style type annotation to avoid linter error 2024-07-26 07:45:22 +08:00			`from .base import BaseModelArgs, create_attention_mask`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`try:`
			`import hf_olmo`
			`except ImportError:`
			`print("To run olmo install ai2-olmo: pip install ai2-olmo")`
More cache improvements (#1015) * fix rotating kv cache for chat use case * reorg + fixes to caching, unify prompt caching across types and use cases for e.g. caching during a chat * nit in chat * fix tests * fix tests * fix tests * docs * chat command * comments + docs * Define meta_state on all Cache implementations * fixes + trim_prompt_cache api * fix default model --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com> 2024-10-08 11:45:51 +08:00			`sys.exit(1)`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
			`@dataclass`
			`class ModelArgs(BaseModelArgs):`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`model_type: str`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`d_model: int`
			`n_layers: int`
			`mlp_hidden_size: int`
			`n_heads: int`
			`vocab_size: int`
			`embedding_size: int`
			`rope_theta: float = 10000`
			`rope_traditional: bool = False`
fix(mlx-lm): olmo 1b model (#417) 2024-02-06 21:27:05 +08:00			`mlp_ratio: int = 4`
			`weight_tying: bool = False`

			`def __post_init__(self):`
			`self.mlp_hidden_size = (`
			`self.mlp_hidden_size`
			`if self.mlp_hidden_size is not None`
			`else self.mlp_ratio * self.d_model`
			`)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00

			`class TransformerBlock(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.n_heads = args.n_heads`
			`dim = args.d_model`

			`self.ff_proj = nn.Linear(dim, args.mlp_hidden_size, bias=False)`
			`self.ff_out = nn.Linear(args.mlp_hidden_size // 2, dim, bias=False)`

Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.att_norm = nn.LayerNorm(dim, affine=False)`
			`self.ff_norm = nn.LayerNorm(dim, affine=False)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
			`head_dim = dim // self.n_heads`
			`self.scale = head_dim**-0.5`

			`self.att_proj = nn.Linear(dim, 3 * dim, bias=False)`
			`self.attn_out = nn.Linear(dim, dim, bias=False)`

			`self.rope = nn.RoPE(`
			`head_dim,`
			`traditional=args.rope_traditional,`
			`base=args.rope_theta,`
			`)`

			`self.args = args`

			`def attend(`
			`self,`
			`x: mx.array,`
			`mask: Optional[mx.array] = None,`
More cache improvements (#1015) * fix rotating kv cache for chat use case * reorg + fixes to caching, unify prompt caching across types and use cases for e.g. caching during a chat * nit in chat * fix tests * fix tests * fix tests * docs * chat command * comments + docs * Define meta_state on all Cache implementations * fixes + trim_prompt_cache api * fix default model --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com> 2024-10-08 11:45:51 +08:00			`cache: Optional[Any] = None,`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`) -> mx.array:`
			`B, L, D = x.shape`

			`queries, keys, values = mx.split(self.att_proj(x), 3, axis=-1)`

			`# Prepare the queries, keys and values for the attention computation`
			`queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)`
			`keys = keys.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)`
			`values = values.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)`

			`if cache is not None:`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`queries = self.rope(queries, offset=cache.offset)`
			`keys = self.rope(keys, offset=cache.offset)`
			`keys, values = cache.update_and_fetch(keys, values)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`else:`
			`queries = self.rope(queries)`
			`keys = self.rope(keys)`

			`scores = (queries * self.scale) @ keys.transpose(0, 1, 3, 2)`
			`if mask is not None:`
			`scores += mask`
			`scores = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)`
			`output = (scores @ values).transpose(0, 2, 1, 3).reshape(B, L, -1)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`return self.attn_out(output)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
			`def __call__(`
			`self,`
			`x: mx.array,`
			`mask: Optional[mx.array] = None,`
More cache improvements (#1015) * fix rotating kv cache for chat use case * reorg + fixes to caching, unify prompt caching across types and use cases for e.g. caching during a chat * nit in chat * fix tests * fix tests * fix tests * docs * chat command * comments + docs * Define meta_state on all Cache implementations * fixes + trim_prompt_cache api * fix default model --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com> 2024-10-08 11:45:51 +08:00			`cache: Optional[Any] = None,`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`) -> mx.array:`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`r = self.attend(self.att_norm(x), mask, cache)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`h = x + r`

			`x1, x2 = mx.split(self.ff_proj(self.ff_norm(h)), 2, axis=-1)`
fix(mlx-lm): olmo 1b model (#417) 2024-02-06 21:27:05 +08:00
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`out = h + self.ff_out(nn.silu(x2) * x1)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`return out`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00

			`class Transformer(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.n_layers = args.n_layers`
fix(mlx-lm): olmo 1b model (#417) 2024-02-06 21:27:05 +08:00			`self.weight_tying = args.weight_tying`

Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`self.wte = nn.Embedding(args.embedding_size, args.d_model)`
			`self.blocks = [TransformerBlock(args=args) for _ in range(args.n_layers)]`
fix(mlx-lm): olmo 1b model (#417) 2024-02-06 21:27:05 +08:00			`if not self.weight_tying:`
			`self.ff_out = nn.Linear(args.d_model, args.embedding_size, bias=False)`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.norm = nn.LayerNorm(args.d_model, affine=False)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
			`def __call__(`
			`self,`
			`inputs: mx.array,`
			`cache=None,`
			`):`
			`h = self.wte(inputs)`

Unify attention mask in LLMs (#911) * Unify attention mask creation in LLMs. Currently, each model implementation in `mlx-examples/llms/models` has ad-hoc code to create a mask for the attention mechanism. This usually takes the form: ``` mask = None if h.shape[1] > 1: mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) mask = mask.astype(h.dtype) ``` This correctly creates a mask only if the input consists of more than one token. But this code assumes the multi-token input is at the beginning of inference. If, for example, we are evaluating multiple tokens because of speculative decoding or prompt cache reuse, this mask will not have the correct shape and and will cause the raising of an exception in the attention computation. Some of the models correctly implement the mask creation with code like this: ``` mask = None if h.shape[1] > 1: mask = create_additive_causal_mask( h.shape[1], cache[0].offset if cache is not None else 0 ) mask = mask.astype(h.dtype) ``` This commit unifies the attention mask creation for all models with a new function `create_attention_mask`, reducing code duplication and helping all models support inference performance enhancements like those mentioned above. * Allow batches in LLM key-value cache The current implementation of the LLM key-value cache assumes that the input batch is of size 1. Input batching (evaluating multiple alterative inputs at the same time) can be a valuable tool for speculative sampling and other techniques. This change removes the hard-coded batch size from the code that resizes the key-value cache. * Simplify causal mask creation Use the same codepath regardless of whether there's an offset or not. Addresses [this comment](https://github.com/ml-explore/mlx-examples/pull/911#discussion_r1691459717). * Use old-style type annotation to avoid linter error 2024-07-26 07:45:22 +08:00			`mask = create_attention_mask(h, cache)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
			`if cache is None:`
			`cache = [None] * len(self.blocks)`

Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`for block, c in zip(self.blocks, cache):`
			`h = block(h, mask, c)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
fix(mlx-lm): olmo 1b model (#417) 2024-02-06 21:27:05 +08:00			`h = self.norm(h)`

			`if self.weight_tying:`
Quantize embedding / Update quantize API (#680) * more async eval * quantize embedding / update quantize api * more updates for quantize * update for quantize embeddings * update sd quant API * update sdxl quants * error for datasets < batch_size * async * fix config loading * fix quant * fix tests * fix req * remove lm head if tie weights is true * fix test 2024-04-19 09:16:10 +08:00			`return self.wte.as_linear(h), cache`
fix(mlx-lm): olmo 1b model (#417) 2024-02-06 21:27:05 +08:00
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`return self.ff_out(h)`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00

			`class OlmoModel(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.transformer = Transformer(args)`

			`def __call__(`
			`self,`
			`inputs: mx.array,`
			`cache=None,`
			`):`
			`return self.transformer(inputs, cache)`


			`class Model(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`self.model_type = args.model_type`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00			`self.model = OlmoModel(args)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`self.args = args`
Olmo in MLX LM (#415) * run olmo * format 2024-02-06 13:13:49 +08:00
			`def __call__(`
			`self,`
			`inputs: mx.array,`
			`cache=None,`
			`):`
			`return self.model(inputs, cache)`
Support for slerp merging models (#455) * support for slerp merging models * docs * update docs * format' 2024-02-20 12:37:15 +08:00
			`@property`
			`def layers(self):`
			`return self.model.transformer.blocks`