mlx-examples/llms/mlx_lm/models/cohere.py

from dataclasses import dataclass
from typing import Optional, Tuple

import mlx.core as mx
import mlx.nn as nn

from .base import BaseModelArgs, create_attention_mask


@dataclass
class ModelArgs(BaseModelArgs):
    """Configuration values consumed by the Cohere model classes in this file.

    Every field except ``model_type`` has a default, so only ``model_type``
    must be supplied when constructing the dataclass directly.
    """

    # Identifier string; copied onto ``Model.model_type``.
    model_type: str
    # Embedding width; also the input/output width of the attention and MLP
    # projections and the size normalised by the LayerNorm layers.
    hidden_size: int = 8192
    # Number of ``TransformerBlock`` instances built by ``CohereModel``.
    num_hidden_layers: int = 40
    # Hidden width of the gated ``MLP`` inside each block.
    intermediate_size: int = 22528
    # Number of query heads; ``hidden_size // num_attention_heads`` is the
    # per-head dimension.
    num_attention_heads: int = 64
    # Number of key/value heads used by the k/v projections.
    num_key_value_heads: int = 64
    # ``base`` argument passed to ``nn.RoPE``.
    rope_theta: float = 8000000.0
    # Number of rows in the token embedding table.
    vocab_size: int = 256000
    # ``eps`` used by every LayerNorm (including the optional q/k norms).
    layer_norm_eps: float = 1e-05
    # Multiplier applied to the output logits in ``Model.__call__``.
    logit_scale: float = 0.0625
    # Whether the q/k/v/o attention projections carry a bias term.
    attention_bias: bool = False
    # Whether the ``nn.LayerNorm`` layers carry a bias term.
    layer_norm_bias: bool = False
    # When true, ``Attention`` normalises queries and keys per head before
    # applying the rotary embedding.
    use_qk_norm: bool = False


class LayerNorm2D(nn.Module):
    """Layer normalisation with a learned ``(d1, d2)`` scale and no bias.

    The input is normalised by ``mx.fast.layer_norm`` without any affine
    parameters, and the result is then multiplied element-wise (with
    broadcasting) by ``weight``.
    """

    def __init__(self, d1, d2, eps):
        super().__init__()
        # Zero-initialised; presumably overwritten when checkpoint weights
        # are loaded -- TODO confirm against the loader.
        weight_shape = (d1, d2)
        self.weight = mx.zeros(weight_shape)
        self.eps = eps

    def __call__(self, x):
        # Normalise with no built-in weight/bias, then apply the 2-D scale.
        normalized = mx.fast.layer_norm(x, None, None, self.eps)
        return self.weight * normalized

class Attention(nn.Module):
    """Self-attention layer with rotary position embeddings.

    Queries, keys and values come from linear projections of the input. When
    ``args.use_qk_norm`` is set, queries and keys are additionally normalised
    per head with ``LayerNorm2D`` before the rotary embedding is applied.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args

        hidden = args.hidden_size
        self.n_heads = args.num_attention_heads
        self.n_kv_heads = args.num_key_value_heads

        per_head = args.hidden_size // args.num_attention_heads
        self.scale = per_head**-0.5

        use_bias = args.attention_bias
        q_width = self.n_heads * per_head
        kv_width = self.n_kv_heads * per_head

        # Construction order (q, k, v, o) is kept so parameter registration
        # and random initialisation happen in the same sequence.
        self.q_proj = nn.Linear(hidden, q_width, bias=use_bias)
        self.k_proj = nn.Linear(hidden, kv_width, bias=use_bias)
        self.v_proj = nn.Linear(hidden, kv_width, bias=use_bias)
        self.o_proj = nn.Linear(q_width, hidden, bias=use_bias)

        self.use_qk_norm = args.use_qk_norm
        if self.use_qk_norm:
            norm_eps = args.layer_norm_eps
            self.q_norm = LayerNorm2D(self.n_heads, per_head, eps=norm_eps)
            self.k_norm = LayerNorm2D(self.n_kv_heads, per_head, eps=norm_eps)

        self.rope = nn.RoPE(per_head, traditional=True, base=args.rope_theta)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> mx.array:
        """Apply attention to ``x`` and return the output projection.

        Args:
            x: Input whose shape unpacks into three dimensions; the first two
                are reused when reshaping the projections and the output.
            mask: Forwarded unchanged to
                ``mx.fast.scaled_dot_product_attention``.
            cache: Optional key/value cache. Despite the tuple annotation,
                the code reads ``cache.offset`` and calls
                ``cache.update_and_fetch(keys, values)``, so the object must
                expose both.

        Returns:
            The result of ``o_proj`` applied to the merged attention heads.
        """
        B, L, _ = x.shape

        # Project, then split the last axis into (heads, head_dim).
        q = self.q_proj(x).reshape(B, L, self.n_heads, -1)
        k = self.k_proj(x).reshape(B, L, self.n_kv_heads, -1)
        v = self.v_proj(x).reshape(B, L, self.n_kv_heads, -1)

        # The optional q/k norm runs while heads are still on axis 2, which
        # lines up with the (heads, head_dim) weight of LayerNorm2D.
        if self.use_qk_norm:
            q = self.q_norm(q)
            k = self.k_norm(k)

        # Move heads ahead of the sequence axis for attention.
        q = q.transpose(0, 2, 1, 3)
        k = k.transpose(0, 2, 1, 3)
        v = v.transpose(0, 2, 1, 3)

        if cache is None:
            q = self.rope(q)
            k = self.rope(k)
        else:
            # Read the offset before the cache is updated so queries and
            # keys are rotated with the same starting position.
            start = cache.offset
            q = self.rope(q, offset=start)
            k = self.rope(k, offset=start)
            k, v = cache.update_and_fetch(k, v)

        attended = mx.fast.scaled_dot_product_attention(
            q, k, v, scale=self.scale, mask=mask
        )

        # Put the sequence axis back in front of heads and merge the heads.
        merged = attended.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(merged)


class MLP(nn.Module):
    """Gated feed-forward network built from three bias-free linear layers.

    Computes ``down_proj(silu(gate_proj(x)) * up_proj(x))``.
    """

    def __init__(self, dim, hidden_dim):
        super().__init__()
        # dim -> hidden_dim for the gate and up branches, hidden_dim -> dim
        # for the projection back down.
        self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.up_proj = nn.Linear(dim, hidden_dim, bias=False)
        self.down_proj = nn.Linear(hidden_dim, dim, bias=False)

    def __call__(self, x):
        gate = nn.silu(self.gate_proj(x))
        gated = gate * self.up_proj(x)
        return self.down_proj(gated)


class TransformerBlock(nn.Module):
    """One decoder block with attention and MLP applied in parallel.

    A single LayerNorm feeds both the attention layer and the MLP; their
    outputs are summed together with the un-normalised block input.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.hidden_size = args.hidden_size
        self.n_heads = args.num_attention_heads

        self.self_attn = Attention(args)
        self.mlp = MLP(args.hidden_size, args.intermediate_size)
        self.input_layernorm = nn.LayerNorm(
            args.hidden_size,
            eps=args.layer_norm_eps,
            bias=args.layer_norm_bias,
        )
        self.args = args

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> mx.array:
        """Run the block on ``x``; ``mask`` and ``cache`` go to attention."""
        normed = self.input_layernorm(x)
        attn_out = self.self_attn(normed, mask, cache)
        # The MLP reads the same normalised input as attention, not the
        # attention output.
        mlp_out = self.mlp(normed)
        return attn_out + mlp_out + x


class CohereModel(nn.Module):
    """Token embedding, a stack of ``TransformerBlock``s and a final norm."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args
        self.vocab_size = args.vocab_size
        self.num_hidden_layers = args.num_hidden_layers
        assert self.vocab_size > 0
        self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
        self.layers = [
            TransformerBlock(args=args)
            for _ in range(args.num_hidden_layers)
        ]
        self.norm = nn.LayerNorm(
            args.hidden_size,
            eps=args.layer_norm_eps,
            bias=args.layer_norm_bias,
        )

    def __call__(
        self,
        inputs: mx.array,
        cache=None,
    ):
        """Return the final-normalised hidden states for ``inputs``.

        Args:
            inputs: Token ids looked up in ``embed_tokens``.
            cache: Optional sequence with one cache entry per layer; when
                ``None`` every layer receives ``None``.
        """
        hidden = self.embed_tokens(inputs)

        # The mask helper must see the caller's cache value, so it runs
        # before the ``None`` placeholder list is substituted.
        mask = create_attention_mask(hidden, cache)

        if cache is None:
            cache = [None] * len(self.layers)

        for block, layer_cache in zip(self.layers, cache):
            hidden = block(hidden, mask, layer_cache)

        return self.norm(hidden)


class Model(nn.Module):
    """Cohere language model: ``CohereModel`` plus a tied output projection.

    Logits are produced by reusing the token embedding as a linear layer and
    multiplying the result by ``logit_scale``.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.model_type = args.model_type
        self.model = CohereModel(args)
        self.args = args

    def __call__(
        self,
        inputs: mx.array,
        cache=None,
    ):
        """Return scaled logits for ``inputs``; ``cache`` goes to the backbone."""
        hidden = self.model(inputs, cache)
        # No separate lm_head: the embedding table doubles as the output
        # projection.
        logits = self.model.embed_tokens.as_linear(hidden)
        return logits * self.model.args.logit_scale

    @property
    def layers(self):
        """The list of transformer blocks held by the backbone."""
        return self.model.layers

    @property
    def head_dim(self):
        """Per-head dimension, ``hidden_size // num_attention_heads``."""
        cfg = self.args
        return cfg.hidden_size // cfg.num_attention_heads

    @property
    def n_kv_heads(self):
        """Number of key/value heads taken from the model arguments."""
        return self.args.num_key_value_heads
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00			`from dataclasses import dataclass`
			`from typing import Optional, Tuple`

			`import mlx.core as mx`
			`import mlx.nn as nn`

Unify attention mask in LLMs (#911) * Unify attention mask creation in LLMs. Currently, each model implementation in `mlx-examples/llms/models` has ad-hoc code to create a mask for the attention mechanism. This usually takes the form: ``` mask = None if h.shape[1] > 1: mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) mask = mask.astype(h.dtype) ``` This correctly creates a mask only if the input consists of more than one token. But this code assumes the multi-token input is at the beginning of inference. If, for example, we are evaluating multiple tokens because of speculative decoding or prompt cache reuse, this mask will not have the correct shape and will cause the raising of an exception in the attention computation. Some of the models correctly implement the mask creation with code like this: ``` mask = None if h.shape[1] > 1: mask = create_additive_causal_mask( h.shape[1], cache[0].offset if cache is not None else 0 ) mask = mask.astype(h.dtype) ``` This commit unifies the attention mask creation for all models with a new function `create_attention_mask`, reducing code duplication and helping all models support inference performance enhancements like those mentioned above. * Allow batches in LLM key-value cache The current implementation of the LLM key-value cache assumes that the input batch is of size 1. Input batching (evaluating multiple alternative inputs at the same time) can be a valuable tool for speculative sampling and other techniques. This change removes the hard-coded batch size from the code that resizes the key-value cache. * Simplify causal mask creation Use the same codepath regardless of whether there's an offset or not. Addresses [this comment](https://github.com/ml-explore/mlx-examples/pull/911#discussion_r1691459717). * Use old-style type annotation to avoid linter error 2024-07-26 07:45:22 +08:00			`from .base import BaseModelArgs, create_attention_mask`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00

			`@dataclass`
			`class ModelArgs(BaseModelArgs):`
			`model_type: str`
			`hidden_size: int = 8192`
			`num_hidden_layers: int = 40`
			`intermediate_size: int = 22528`
			`num_attention_heads: int = 64`
			`num_key_value_heads: int = 64`
			`rope_theta: float = 8000000.0`
			`vocab_size: int = 256000`
			`layer_norm_eps: float = 1e-05`
			`logit_scale: float = 0.0625`
			`attention_bias: bool = False`
			`layer_norm_bias: bool = False`
Fix for cohere plus (#650) * fix for cohere plus * version bump 2024-04-06 05:11:24 +08:00			`use_qk_norm: bool = False`


			`class LayerNorm2D(nn.Module):`

			`def __init__(self, d1, d2, eps):`
			`super().__init__()`
			`self.weight = mx.zeros((d1, d2))`
			`self.eps = eps`

			`def __call__(self, x):`
			`return self.weight * mx.fast.layer_norm(x, None, None, self.eps)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00

			`class Attention(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.args = args`

			`dim = args.hidden_size`
			`self.n_heads = n_heads = args.num_attention_heads`
			`self.n_kv_heads = n_kv_heads = args.num_key_value_heads`

			`head_dim = args.hidden_size // args.num_attention_heads`
			`self.scale = head_dim**-0.5`

			`attetion_bias = args.attention_bias`

			`self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=attetion_bias)`
			`self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attetion_bias)`
			`self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attetion_bias)`
			`self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=attetion_bias)`

Fix for cohere plus (#650) * fix for cohere plus * version bump 2024-04-06 05:11:24 +08:00			`self.use_qk_norm = args.use_qk_norm`
			`if self.use_qk_norm:`
			`self.q_norm = LayerNorm2D(self.n_heads, head_dim, eps=args.layer_norm_eps)`
			`self.k_norm = LayerNorm2D(`
			`self.n_kv_heads, head_dim, eps=args.layer_norm_eps`
			`)`

Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00			`self.rope = nn.RoPE(head_dim, traditional=True, base=args.rope_theta)`

			`def __call__(`
			`self,`
			`x: mx.array,`
			`mask: Optional[mx.array] = None,`
			`cache: Optional[Tuple[mx.array, mx.array]] = None,`
			`) -> mx.array:`
			`B, L, D = x.shape`

			`queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)`

Fix for cohere plus (#650) * fix for cohere plus * version bump 2024-04-06 05:11:24 +08:00			`queries = queries.reshape(B, L, self.n_heads, -1)`
			`keys = keys.reshape(B, L, self.n_kv_heads, -1)`
			`if self.use_qk_norm:`
			`queries = self.q_norm(queries)`
			`keys = self.k_norm(keys)`

			`queries = queries.transpose(0, 2, 1, 3)`
			`keys = keys.transpose(0, 2, 1, 3)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00			`values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)`

			`if cache is not None:`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`queries = self.rope(queries, offset=cache.offset)`
			`keys = self.rope(keys, offset=cache.offset)`
			`keys, values = cache.update_and_fetch(keys, values)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00			`else:`
			`queries = self.rope(queries)`
			`keys = self.rope(keys)`

			`output = mx.fast.scaled_dot_product_attention(`
			`queries, keys, values, scale=self.scale, mask=mask`
			`)`

			`output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`return self.o_proj(output)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00

			`class MLP(nn.Module):`
			`def __init__(self, dim, hidden_dim):`
			`super().__init__()`
			`self.gate_proj = nn.Linear(dim, hidden_dim, bias=False)`
			`self.up_proj = nn.Linear(dim, hidden_dim, bias=False)`
			`self.down_proj = nn.Linear(hidden_dim, dim, bias=False)`

			`def __call__(self, x):`
			`return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))`


			`class TransformerBlock(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.hidden_size = args.hidden_size`
			`self.n_heads = args.num_attention_heads`

			`self.self_attn = Attention(args)`
			`self.mlp = MLP(args.hidden_size, args.intermediate_size)`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.input_layernorm = nn.LayerNorm(`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00			`args.hidden_size, eps=args.layer_norm_eps, bias=args.layer_norm_bias`
			`)`
			`self.args = args`

			`def __call__(`
			`self,`
			`x: mx.array,`
			`mask: Optional[mx.array] = None,`
			`cache: Optional[Tuple[mx.array, mx.array]] = None,`
			`) -> mx.array:`
			`h = self.input_layernorm(x)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`attn_h = self.self_attn(h, mask, cache)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00			`ff_h = self.mlp(h)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`return attn_h + ff_h + x`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00

			`class CohereModel(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.args = args`
			`self.vocab_size = args.vocab_size`
			`self.num_hidden_layers = args.num_hidden_layers`
			`assert self.vocab_size > 0`
			`self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)`
			`self.layers = [`
			`TransformerBlock(args=args) for _ in range(args.num_hidden_layers)`
			`]`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.norm = nn.LayerNorm(`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00			`args.hidden_size, eps=args.layer_norm_eps, bias=args.layer_norm_bias`
			`)`

			`def __call__(`
			`self,`
			`inputs: mx.array,`
			`cache=None,`
			`):`
			`h = self.embed_tokens(inputs)`

Unify attention mask in LLMs (#911) * Unify attention mask creation in LLMs. Currently, each model implementation in `mlx-examples/llms/models` has ad-hoc code to create a mask for the attention mechanism. This usually takes the form: ``` mask = None if h.shape[1] > 1: mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1]) mask = mask.astype(h.dtype) ``` This correctly creates a mask only if the input consists of more than one token. But this code assumes the multi-token input is at the beginning of inference. If, for example, we are evaluating multiple tokens because of speculative decoding or prompt cache reuse, this mask will not have the correct shape and will cause the raising of an exception in the attention computation. Some of the models correctly implement the mask creation with code like this: ``` mask = None if h.shape[1] > 1: mask = create_additive_causal_mask( h.shape[1], cache[0].offset if cache is not None else 0 ) mask = mask.astype(h.dtype) ``` This commit unifies the attention mask creation for all models with a new function `create_attention_mask`, reducing code duplication and helping all models support inference performance enhancements like those mentioned above. * Allow batches in LLM key-value cache The current implementation of the LLM key-value cache assumes that the input batch is of size 1. Input batching (evaluating multiple alternative inputs at the same time) can be a valuable tool for speculative sampling and other techniques. This change removes the hard-coded batch size from the code that resizes the key-value cache. * Simplify causal mask creation Use the same codepath regardless of whether there's an offset or not. Addresses [this comment](https://github.com/ml-explore/mlx-examples/pull/911#discussion_r1691459717). * Use old-style type annotation to avoid linter error 2024-07-26 07:45:22 +08:00			`mask = create_attention_mask(h, cache)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00
			`if cache is None:`
			`cache = [None] * len(self.layers)`

Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`for layer, c in zip(self.layers, cache):`
			`h = layer(h, mask, c)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`return self.norm(h)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00

			`class Model(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.model_type = args.model_type`
			`self.model = CohereModel(args)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`self.args = args`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00
			`def __call__(`
			`self,`
			`inputs: mx.array,`
			`cache=None,`
			`):`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`out = self.model(inputs, cache)`
Quantize embedding / Update quantize API (#680) * more async eval * quantize embedding / update quantize api * more updates for quantize * update for quantize embeddings * update sd quant API * update sdxl quants * error for datasets < batch_size * async * fix config loading * fix quant * fix tests * fix req * remove lm head if tie weights is true * fix test 2024-04-19 09:16:10 +08:00			`out = self.model.embed_tokens.as_linear(out)`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00			`out = out * self.model.args.logit_scale`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00			`return out`
Add support for Cohere's Command-R (#565) * initial commit for command-R * update mlp, layernorm, lm_head and model args * add custom layernorm * add default to tie_word_embeddings * add layernorm weight type and refactor * update layernorm (bias conditional) in model/layers * fix layer norm use traditional rope * add test --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-03-13 22:03:36 +08:00
			`@property`
			`def layers(self):`
			`return self.model.layers`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more tests + kv cache * more kv cache * update all models to use kv cache 2024-05-08 23:18:13 +08:00
			`@property`
			`def head_dim(self):`
			`return self.args.hidden_size // self.args.num_attention_heads`

			`@property`
			`def n_kv_heads(self):`
			`return self.args.num_key_value_heads`