mlx-examples/llms/mlx_lm/models/stablelm.py

# Copyright © 2023-2024 Apple Inc.

import math
from dataclasses import dataclass

import mlx.core as mx
import mlx.nn as nn

from .base import BaseModelArgs, create_attention_mask


@dataclass
class ModelArgs(BaseModelArgs):
    """Hyper-parameters consumed by the StableLM modules in this file.

    The fields without defaults are required; ``use_parallel_residual`` and
    ``qk_layernorm`` are optional switches that default to ``False``.
    """

    # Model family identifier; copied verbatim onto ``Model.model_type``.
    model_type: str
    # Number of rows in the token embedding and output width of ``lm_head``.
    vocab_size: int
    # Width of the residual stream (embedding / hidden dimension).
    hidden_size: int
    # Number of query heads; head_dim is ``hidden_size // num_attention_heads``.
    num_attention_heads: int
    # Number of ``DecoderLayer`` instances stacked in ``StableLM``.
    num_hidden_layers: int
    # Number of heads produced by the key and value projections.
    num_key_value_heads: int
    # Inner width of the gated MLP (gate/up projection output size).
    intermediate_size: int
    # Passed to ``nn.RoPE`` as ``base``.
    rope_theta: float
    # Whether the q/k/v projections carry a bias term (o_proj never does).
    use_qkv_bias: bool
    # Fraction of head_dim that receives rotary embeddings; the product
    # ``partial_rotary_factor * head_dim`` is truncated with ``int()``.
    partial_rotary_factor: float
    # Epsilon used by every LayerNorm in the model.
    layer_norm_eps: float
    # If True, attention and MLP both read the same normalised input and are
    # summed into the residual together; otherwise they run sequentially with
    # a separate post-attention LayerNorm.
    use_parallel_residual: bool = False
    # If True, queries and keys are layer-normalised per head before RoPE.
    qk_layernorm: bool = False


class LayerNormPerHead(nn.Module):
    """Bias-free layer norm over the last axis with a separate scale per head.

    One ``nn.LayerNorm`` is kept per head purely as a holder for that head's
    ``weight`` vector (presumably so parameter names match the checkpoint
    layout -- verify against the weight files). The normalisation itself is
    done once for all heads with ``mx.fast.layer_norm``.
    """

    def __init__(self, head_dim, num_heads, eps):
        super().__init__()
        self.norms = [
            nn.LayerNorm(head_dim, eps=eps, bias=False) for _ in range(num_heads)
        ]
        self.eps = eps

    def __call__(self, x):
        """Normalise ``x`` over its last axis and apply the per-head scales.

        As called from ``Attention``, ``x`` is ``(B, L, num_heads, head_dim)``;
        the stacked scales have shape ``(num_heads, head_dim)`` and broadcast
        over the leading axes.
        """
        # Weight- and bias-free normalisation shared by every head.
        normalized = mx.fast.layer_norm(x, None, None, self.eps)
        # Gather each head's learned scale into one (num_heads, head_dim) array.
        per_head_scale = mx.stack([norm.weight for norm in self.norms])
        return per_head_scale * normalized


class Attention(nn.Module):
    """Self-attention with separate key/value head count and partial RoPE.

    Queries use ``num_attention_heads`` heads while keys and values use
    ``num_key_value_heads`` heads. Rotary embeddings are applied to the first
    ``int(partial_rotary_factor * head_dim)`` dimensions of each head, and
    queries/keys can optionally be layer-normalised per head beforehand.
    """

    def __init__(self, config: ModelArgs):
        """Build the projections, rotary embedding and optional q/k norms.

        Raises:
            ValueError: if ``hidden_size`` is not a multiple of
                ``num_attention_heads``.
        """
        super().__init__()

        hidden = config.hidden_size
        heads = config.num_attention_heads
        kv_heads = config.num_key_value_heads

        self.hidden_size = hidden
        self.num_heads = heads
        self.head_dim = hidden // heads
        self.num_key_value_heads = kv_heads
        self.rope_theta = config.rope_theta
        self.partial_rotary_factor = config.partial_rotary_factor

        # A non-zero remainder means the heads cannot tile the hidden size.
        if hidden % heads != 0:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        q_width = heads * self.head_dim
        kv_width = kv_heads * self.head_dim
        qkv_bias = config.use_qkv_bias

        self.q_proj = nn.Linear(hidden, q_width, bias=qkv_bias)
        self.k_proj = nn.Linear(hidden, kv_width, bias=qkv_bias)
        self.v_proj = nn.Linear(hidden, kv_width, bias=qkv_bias)
        self.o_proj = nn.Linear(q_width, hidden, bias=False)

        # Only a leading slice of each head is rotated.
        rotary_dims = int(self.partial_rotary_factor * self.head_dim)
        self.rope = nn.RoPE(rotary_dims, traditional=False, base=self.rope_theta)

        self.qk_layernorm = config.qk_layernorm
        if self.qk_layernorm:
            eps = config.layer_norm_eps
            self.q_layernorm = LayerNormPerHead(self.head_dim, heads, eps=eps)
            self.k_layernorm = LayerNormPerHead(self.head_dim, kv_heads, eps=eps)

    def __call__(self, x, mask=None, cache=None):
        """Run attention over ``x``.

        Args:
            x: input whose projections are 3-D ``(B, L, features)``.
            mask: optional mask forwarded to the fused attention kernel.
            cache: optional key/value cache exposing ``offset`` and
                ``update_and_fetch(keys, values)``.

        Returns:
            The output projection of the attention result, shaped
            ``(B, L, hidden_size)``.
        """
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # The projection output must be 3-D; the feature size is not needed.
        batch, seq_len, _ = q.shape

        # Split the feature axis into heads.
        q = q.reshape(batch, seq_len, self.num_heads, -1)
        k = k.reshape(batch, seq_len, self.num_key_value_heads, -1)
        v = v.reshape(batch, seq_len, self.num_key_value_heads, -1)

        # Per-head normalisation is applied while heads are on axis 2.
        if self.qk_layernorm:
            q = self.q_layernorm(q)
            k = self.k_layernorm(k)

        # (B, L, H, Dh) -> (B, H, L, Dh)
        q, k, v = (t.transpose(0, 2, 1, 3) for t in (q, k, v))

        # Rotate queries/keys; with a cache, positions start at its offset and
        # the new keys/values are merged with what is already stored.
        if cache is None:
            q = self.rope(q)
            k = self.rope(k)
        else:
            q = self.rope(q, offset=cache.offset)
            k = self.rope(k, offset=cache.offset)
            k, v = cache.update_and_fetch(k, v)

        # Queries and keys go through attention in float32 (presumably for
        # numerical stability -- TODO confirm); the result is cast back to the
        # dtype of the values.
        q = q.astype(mx.float32)
        k = k.astype(mx.float32)

        head_width = q.shape[-1]
        scale = math.sqrt(1 / head_width)
        attended = mx.fast.scaled_dot_product_attention(
            q, k, v, scale=scale, mask=mask
        ).astype(v.dtype)

        # (B, H, L, Dh) -> (B, L, H * Dh)
        merged = attended.transpose(0, 2, 1, 3).reshape(batch, seq_len, -1)
        return self.o_proj(merged)


class MLP(nn.Module):
    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``, no biases."""

    def __init__(self, dim, hidden_dim):
        """Create the three bias-free projections.

        Args:
            dim: input and output width.
            hidden_dim: inner width of the gate/up projections.
        """
        super().__init__()
        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)
        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)

    def __call__(self, x) -> mx.array:
        """Apply the gated MLP to ``x`` and return an array of width ``dim``."""
        gate = nn.silu(self.gate_proj(x))
        gated = gate * self.up_proj(x)
        return self.down_proj(gated)


class DecoderLayer(nn.Module):
    """One transformer block: attention plus gated MLP around a residual stream.

    With ``use_parallel_residual`` the attention and MLP branches both consume
    the same normalised input and are added to the residual together.
    Otherwise the MLP runs after attention on a second LayerNorm of the
    updated residual.
    """

    def __init__(self, config: ModelArgs):
        super().__init__()
        width = config.hidden_size
        norm_eps = config.layer_norm_eps

        self.self_attn = Attention(config=config)
        self.mlp = MLP(width, config.intermediate_size)
        self.input_layernorm = nn.LayerNorm(width, eps=norm_eps)
        self.use_parallel_residual = config.use_parallel_residual
        # The second norm only exists in the sequential arrangement.
        if not self.use_parallel_residual:
            self.post_attention_layernorm = nn.LayerNorm(width, eps=norm_eps)

    def __call__(self, x, mask, cache):
        """Apply the block to ``x``.

        Args:
            x: residual-stream input.
            mask: attention mask forwarded to ``self_attn``.
            cache: this layer's key/value cache (may be ``None``).

        Returns:
            The updated residual stream.
        """
        normed = self.input_layernorm(x)
        attn_out = self.self_attn(normed, mask, cache)

        if self.use_parallel_residual:
            # Both branches read the same normalised input.
            return x + attn_out + self.mlp(normed)

        # Sequential form: add attention first, then feed the MLP from a
        # fresh normalisation of the updated residual.
        residual = x + attn_out
        mlp_out = self.mlp(self.post_attention_layernorm(residual))
        return residual + mlp_out


class StableLM(nn.Module):
    """Decoder stack: token embedding, ``num_hidden_layers`` blocks, final norm."""

    def __init__(self, config: ModelArgs):
        super().__init__()
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = [DecoderLayer(config) for _ in range(config.num_hidden_layers)]
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def __call__(self, x, mask, cache):
        """Embed token ids and run them through every decoder layer.

        Args:
            x: token ids to embed.
            mask: attention mask shared by all layers.
            cache: per-layer caches in layer order, or ``None`` for no
                caching.

        Returns:
            The final-layer hidden states after the closing LayerNorm.
        """
        # Without a cache every layer simply receives ``None``.
        layer_caches = cache if cache is not None else [None] * len(self.layers)

        hidden = self.embed_tokens(x)
        for block, block_cache in zip(self.layers, layer_caches):
            hidden = block(hidden, mask, cache=block_cache)

        return self.norm(hidden)


class Model(nn.Module):
    """StableLM language model: decoder stack plus an untied output head."""

    def __init__(self, config: ModelArgs):
        super().__init__()
        self.model_type = config.model_type
        self.model = StableLM(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.args = config

    def __call__(
        self,
        x: mx.array,
        mask: mx.array = None,
        cache=None,
    ) -> mx.array:
        """Compute next-token logits for the token ids in ``x``.

        Args:
            x: token ids.
            mask: optional attention mask. When omitted, one is built with
                ``create_attention_mask(x, cache)``.
            cache: optional per-layer key/value caches.

        Returns:
            Logits over the vocabulary from ``lm_head``.
        """
        # Honour a caller-supplied mask; previously the argument was accepted
        # but unconditionally overwritten, so it had no effect.
        if mask is None:
            mask = create_attention_mask(x, cache)
        y = self.model(x, mask, cache)
        return self.lm_head(y)

    @property
    def layers(self):
        """The list of decoder layers held by the inner ``StableLM``."""
        return self.model.layers
Handle longer prompt/generation (#931) * rebase * nits * nit * fix rotating cache with step prefill * update version 2024-08-17 06:28:39 +08:00			`# Copyright © 2023-2024 Apple Inc.`

Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00			`import math`
			`from dataclasses import dataclass`

			`import mlx.core as mx`
			`import mlx.nn as nn`

Unify attention mask in LLMs (#911) * Unify attention mask creation in LLMs. Currently, each model implementation in `mlx-examples/llms/models` has ad-hoc code to create a mask for the attention mechanism. This usually takes the form: ``` mask = None if h.shape[1] > 1: mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) mask = mask.astype(h.dtype) ``` This correctly creates a mask only if the input consists of more than one token. But this code assumes the multi-token input is at the beginning of inference. If, for example, we are evaluating multiple tokens because of speculative decoding or prompt cache reuse, this mask will not have the correct shape and and will cause the raising of an exception in the attention computation. Some of the models correctly implement the mask creation with code like this: ``` mask = None if h.shape[1] > 1: mask = create_additive_causal_mask( h.shape[1], cache[0].offset if cache is not None else 0 ) mask = mask.astype(h.dtype) ``` This commit unifies the attention mask creation for all models with a new function `create_attention_mask`, reducing code duplication and helping all models support inference performance enhancements like those mentioned above. * Allow batches in LLM key-value cache The current implementation of the LLM key-value cache assumes that the input batch is of size 1. Input batching (evaluating multiple alterative inputs at the same time) can be a valuable tool for speculative sampling and other techniques. This change removes the hard-coded batch size from the code that resizes the key-value cache. * Simplify causal mask creation Use the same codepath regardless of whether there's an offset or not. Addresses [this comment](https://github.com/ml-explore/mlx-examples/pull/911#discussion_r1691459717). * Use old-style type annotation to avoid linter error 2024-07-26 07:45:22 +08:00			`from .base import BaseModelArgs, create_attention_mask`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00

			`@dataclass`
			`class ModelArgs(BaseModelArgs):`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`model_type: str`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00			`vocab_size: int`
			`hidden_size: int`
			`num_attention_heads: int`
			`num_hidden_layers: int`
			`num_key_value_heads: int`
			`intermediate_size: int`
			`rope_theta: float`
			`use_qkv_bias: bool`
Stable lm 2 (#666) * stable lm 2 * test and lora * version bump * merge stable models 2024-04-09 05:18:55 +08:00			`partial_rotary_factor: float`
			`layer_norm_eps: float`
			`use_parallel_residual: bool = False`
			`qk_layernorm: bool = False`


			`class LayerNormPerHead(nn.Module):`

			`def __init__(self, head_dim, num_heads, eps):`
			`super().__init__()`
			`self.norms = [`
			`nn.LayerNorm(head_dim, eps=eps, bias=False) for _ in range(num_heads)`
			`]`
			`self.eps = eps`

			`def __call__(self, x):`
			`w = mx.stack([n.weight for n in self.norms])`
			`return w * mx.fast.layer_norm(x, None, None, self.eps)`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00

			`class Attention(nn.Module):`
			`def __init__(self, config: ModelArgs):`
			`super().__init__()`

			`self.hidden_size = config.hidden_size`
			`self.num_heads = config.num_attention_heads`
			`self.head_dim = self.hidden_size // self.num_heads`
			`self.num_key_value_heads = config.num_key_value_heads`
			`self.rope_theta = config.rope_theta`
Update to StableLM code (#514) * StableLM now part of Transformers as stablelm rather than stablelm_epoch; changed config to match new changes * removing old file * reference new stablelm 2024-03-02 01:53:38 +08:00			`self.partial_rotary_factor = config.partial_rotary_factor`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00
			`if (self.head_dim * self.num_heads) != self.hidden_size:`
			`raise ValueError(`
			f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
			f" and `num_heads`: {self.num_heads})."
			`)`

			`self.q_proj = nn.Linear(`
			`self.hidden_size, self.num_heads * self.head_dim, bias=config.use_qkv_bias`
			`)`
			`self.k_proj = nn.Linear(`
			`self.hidden_size,`
			`self.num_key_value_heads * self.head_dim,`
			`bias=config.use_qkv_bias,`
			`)`
			`self.v_proj = nn.Linear(`
			`self.hidden_size,`
			`self.num_key_value_heads * self.head_dim,`
			`bias=config.use_qkv_bias,`
			`)`
			`self.o_proj = nn.Linear(`
			`self.num_heads * self.head_dim, self.hidden_size, bias=False`
			`)`

			`self.rope = nn.RoPE(`
Update to StableLM code (#514) * StableLM now part of Transformers as stablelm rather than stablelm_epoch; changed config to match new changes * removing old file * reference new stablelm 2024-03-02 01:53:38 +08:00			`int(self.partial_rotary_factor * self.head_dim),`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00			`traditional=False,`
			`base=self.rope_theta,`
			`)`

Stable lm 2 (#666) * stable lm 2 * test and lora * version bump * merge stable models 2024-04-09 05:18:55 +08:00			`self.qk_layernorm = config.qk_layernorm`
			`if self.qk_layernorm:`
			`self.q_layernorm = LayerNormPerHead(`
			`self.head_dim, self.num_heads, eps=config.layer_norm_eps`
			`)`
			`self.k_layernorm = LayerNormPerHead(`
			`self.head_dim, self.num_key_value_heads, eps=config.layer_norm_eps`
			`)`

Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00			`def __call__(self, x, mask=None, cache=None):`
			`queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)`

			`# Extract some shapes`
			`B, L, D = queries.shape`

Stable lm 2 (#666) * stable lm 2 * test and lora * version bump * merge stable models 2024-04-09 05:18:55 +08:00			`queries = queries.reshape(B, L, self.num_heads, -1)`
			`keys = keys.reshape(B, L, self.num_key_value_heads, -1)`
			`if self.qk_layernorm:`
			`queries = self.q_layernorm(queries)`
			`keys = self.k_layernorm(keys)`
			`queries = queries.transpose(0, 2, 1, 3)`
			`keys = keys.transpose(0, 2, 1, 3)`
			`values = values.reshape(B, L, self.num_key_value_heads, -1).transpose(`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00			`0, 2, 1, 3`
			`)`

			`# Add RoPE to the queries and keys and combine them with the cache`
			`if cache is not None:`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`queries = self.rope(queries, offset=cache.offset)`
			`keys = self.rope(keys, offset=cache.offset)`
			`keys, values = cache.update_and_fetch(keys, values)`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00			`else:`
			`queries = self.rope(queries)`
			`keys = self.rope(keys)`

			`queries = queries.astype(mx.float32)`
			`keys = keys.astype(mx.float32)`

			`# Finally perform the attention computation`
			`scale = math.sqrt(1 / queries.shape[-1])`
[mlx-lm] Use sdpa in llama / mistral model (#515) * use sdpa * update a few more models * version * fix stablelm type 2024-03-08 09:41:23 +08:00			`output = mx.fast.scaled_dot_product_attention(`
			`queries, keys, values, scale=scale, mask=mask`
			`).astype(values.dtype)`
			`output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`return self.o_proj(output)`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00

			`class MLP(nn.Module):`
			`def __init__(self, dim, hidden_dim):`
			`super().__init__()`
			`self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)`
			`self.down_proj = nn.Linear(hidden_dim, dim, bias=False)`
			`self.up_proj = nn.Linear(dim, hidden_dim, bias=False)`

			`def __call__(self, x) -> mx.array:`
			`return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))`


			`class DecoderLayer(nn.Module):`
			`def __init__(self, config: ModelArgs):`
			`super().__init__()`
			`self.self_attn = Attention(config=config)`
			`self.mlp = MLP(config.hidden_size, config.intermediate_size)`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.input_layernorm = nn.LayerNorm(`
Stable lm 2 (#666) * stable lm 2 * test and lora * version bump * merge stable models 2024-04-09 05:18:55 +08:00			`config.hidden_size,`
			`eps=config.layer_norm_eps,`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00			`)`
Stable lm 2 (#666) * stable lm 2 * test and lora * version bump * merge stable models 2024-04-09 05:18:55 +08:00			`self.use_parallel_residual = config.use_parallel_residual`
			`if not self.use_parallel_residual:`
			`self.post_attention_layernorm = nn.LayerNorm(`
			`config.hidden_size,`
			`eps=config.layer_norm_eps,`
			`)`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00
			`def __call__(self, x, mask, cache):`
Stable lm 2 (#666) * stable lm 2 * test and lora * version bump * merge stable models 2024-04-09 05:18:55 +08:00			`h = self.input_layernorm(x)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`r = self.self_attn(h, mask, cache)`
Stable lm 2 (#666) * stable lm 2 * test and lora * version bump * merge stable models 2024-04-09 05:18:55 +08:00
			`if self.use_parallel_residual:`
			`out = x + r + self.mlp(h)`
			`else:`
			`h = x + r`
			`r = self.mlp(self.post_attention_layernorm(h))`
			`out = h + r`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`return out`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00

			`class StableLM(nn.Module):`
			`def __init__(self, config: ModelArgs):`
			`super().__init__()`
			`self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)`
			`self.layers = [DecoderLayer(config) for i in range(config.num_hidden_layers)]`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00
			`def __call__(self, x, mask, cache):`
			`x = self.embed_tokens(x)`
			`if cache is None:`
			`cache = [None] * len(self.layers)`

Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`for layer, c in zip(self.layers, cache):`
			`x = layer(x, mask, cache=c)`

			`return self.norm(x)`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00

			`class Model(nn.Module):`
			`def __init__(self, config: ModelArgs):`
			`super().__init__()`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`self.model_type = config.model_type`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00			`self.model = StableLM(config)`
			`self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`self.args = config`
Add StableLM-2 1.6B (#378) * init * stablelm * add to readme * bump version --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-27 02:28:00 +08:00
			`def __call__(`
			`self,`
			`x: mx.array,`
			`mask: mx.array = None,`
More cache improvements (#1015) * fix rotating kv cache for chat use case * reorg + fixes to caching, unify prompt caching across types and use cases for e.g. caching during a chat * nit in chat * fix tests * fix tests * fix tests * docs * chat command * comments + docs * Define meta_state on all Cache implementations * fixes + trim_prompt_cache api * fix default model --------- Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com> 2024-10-08 11:45:51 +08:00			`cache=None,`
			`) -> mx.array:`
Unify attention mask in LLMs (#911) * Unify attention mask creation in LLMs. Currently, each model implementation in `mlx-examples/llms/models` has ad-hoc code to create a mask for the attention mechanism. This usually takes the form: ``` mask = None if h.shape[1] > 1: mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) mask = mask.astype(h.dtype) ``` This correctly creates a mask only if the input consists of more than one token. But this code assumes the multi-token input is at the beginning of inference. If, for example, we are evaluating multiple tokens because of speculative decoding or prompt cache reuse, this mask will not have the correct shape and and will cause the raising of an exception in the attention computation. Some of the models correctly implement the mask creation with code like this: ``` mask = None if h.shape[1] > 1: mask = create_additive_causal_mask( h.shape[1], cache[0].offset if cache is not None else 0 ) mask = mask.astype(h.dtype) ``` This commit unifies the attention mask creation for all models with a new function `create_attention_mask`, reducing code duplication and helping all models support inference performance enhancements like those mentioned above. * Allow batches in LLM key-value cache The current implementation of the LLM key-value cache assumes that the input batch is of size 1. Input batching (evaluating multiple alterative inputs at the same time) can be a valuable tool for speculative sampling and other techniques. This change removes the hard-coded batch size from the code that resizes the key-value cache. * Simplify causal mask creation Use the same codepath regardless of whether there's an offset or not. Addresses [this comment](https://github.com/ml-explore/mlx-examples/pull/911#discussion_r1691459717). * Use old-style type annotation to avoid linter error 2024-07-26 07:45:22 +08:00			`mask = create_attention_mask(x, cache)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`y = self.model(x, mask, cache)`
			`return self.lm_head(y)`
Support for slerp merging models (#455) * support for slerp merging models * docs * update docs * format' 2024-02-20 12:37:15 +08:00
			`@property`
			`def layers(self):`
			`return self.model.layers`