# mlx-examples/llms/mlx_lm/models/mixtral.py

from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union

import mlx.core as mx
import mlx.nn as nn
import numpy as np

from .base import BaseModelArgs


@dataclass
class ModelArgs(BaseModelArgs):
    """Configuration values consumed by the Mixtral modules in this file.

    Only ``model_type`` is required; every other field carries a default.
    """

    model_type: str
    # Size of the token embedding table and of the LM head output.
    vocab_size: int = 32000
    # Width of the residual stream.
    hidden_size: int = 4096
    # Inner width of each expert feed-forward network.
    intermediate_size: int = 14336
    # Number of decoder layers stacked in MixtralModel.
    num_hidden_layers: int = 32
    # Number of query heads; head_dim is hidden_size // num_attention_heads.
    num_attention_heads: int = 32
    # How many experts are selected for each token by the MoE router.
    num_experts_per_tok: int = 2
    # Number of key/value heads. ``None`` is accepted and is replaced by
    # ``num_attention_heads`` in ``__post_init__``.
    num_key_value_heads: Optional[int] = 8
    # Total number of experts instantiated in each MoE block.
    num_local_experts: int = 8
    rms_norm_eps: float = 1e-5
    # Base passed to nn.RoPE.
    rope_theta: float = 1e6
    rope_traditional: bool = False
    # NOTE(review): not read by any code visible in this file.
    rope_scaling: Optional[Dict[str, Union[float, str]]] = None

    def __post_init__(self):
        """Fall back to one key/value head per query head when unset."""
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads


class MixtralAttention(nn.Module):
    """Self-attention with rotary embeddings and a separate key/value head count."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.hidden_size = args.hidden_size
        self.num_heads = args.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = args.num_key_value_heads
        self.rope_theta = args.rope_theta

        self.scale = self.head_dim**-0.5

        # Queries use num_heads heads; keys and values share the (possibly
        # smaller) num_key_value_heads count.
        q_width = self.num_heads * self.head_dim
        kv_width = self.num_key_value_heads * self.head_dim

        self.q_proj = nn.Linear(self.hidden_size, q_width, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias=False)
        self.o_proj = nn.Linear(q_width, self.hidden_size, bias=False)

        self.rope = nn.RoPE(
            self.head_dim,
            traditional=args.rope_traditional,
            base=args.rope_theta,
        )

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> mx.array:
        """Attend over ``x`` (unpacked as batch, length, features).

        ``mask`` is forwarded unchanged to the fused attention kernel. When
        ``cache`` is given, its ``offset`` positions the rotary embedding and
        ``update_and_fetch`` supplies the full key/value history.
        """
        B, L, _ = x.shape

        def split_heads(t: mx.array, n_heads: int) -> mx.array:
            # (B, L, n_heads * head_dim) -> (B, n_heads, L, head_dim)
            return t.reshape(B, L, n_heads, -1).transpose(0, 2, 1, 3)

        queries = split_heads(self.q_proj(x), self.num_heads)
        keys = split_heads(self.k_proj(x), self.num_key_value_heads)
        values = split_heads(self.v_proj(x), self.num_key_value_heads)

        # Without a cache the rotary offset is RoPE's default of 0.
        offset = 0 if cache is None else cache.offset
        queries = self.rope(queries, offset=offset)
        keys = self.rope(keys, offset=offset)
        if cache is not None:
            keys, values = cache.update_and_fetch(keys, values)

        attended = mx.fast.scaled_dot_product_attention(
            queries, keys, values, scale=self.scale, mask=mask
        )
        # (B, num_heads, L, head_dim) -> (B, L, num_heads * head_dim)
        merged = attended.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(merged)


class MixtralBLockSparseTop2MLP(nn.Module):
    """One expert feed-forward network: ``w2(silu(w1(x)) * w3(x))``."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.ffn_dim = args.intermediate_size
        self.hidden_dim = args.hidden_size

        hidden, inner = self.hidden_dim, self.ffn_dim
        self.w1 = nn.Linear(hidden, inner, bias=False)
        self.w2 = nn.Linear(inner, hidden, bias=False)
        self.w3 = nn.Linear(hidden, inner, bias=False)

        self.act_fn = nn.silu

    def __call__(self, x: mx.array) -> mx.array:
        """Apply the gated feed-forward transform to ``x``."""
        # w1 feeds the activation, w3 is the multiplicative branch, and w2
        # projects the product back to the hidden width.
        gated = self.act_fn(self.w1(x)) * self.w3(x)
        return self.w2(gated)


class MixtralSparseMoeBlock(nn.Module):
    """Mixture-of-experts layer: a linear router picks experts per token."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.hidden_dim = args.hidden_size
        self.ffn_dim = args.intermediate_size
        self.num_experts = args.num_local_experts
        self.num_experts_per_tok = args.num_experts_per_tok

        # Router: one logit per expert for every token.
        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)

        self.experts = [
            MixtralBLockSparseTop2MLP(args=args) for _ in range(self.num_experts)
        ]

    def __call__(self, x: mx.array) -> mx.array:
        """Route each token to its selected experts and mix their outputs.

        The input is flattened to ``(tokens, features)``, processed, and
        reshaped back to its original shape before being returned.
        """
        top_k = self.num_experts_per_tok
        input_shape = x.shape
        tokens = x.reshape(-1, input_shape[-1])

        router_logits = self.gate(tokens)

        # Negating the logits makes argpartition place the top_k largest
        # first; the selection itself is excluded from the gradient.
        chosen = mx.stop_gradient(
            mx.argpartition(-router_logits, kth=top_k - 1, axis=-1)[:, :top_k]
        )

        # Softmax over only the selected logits, computed in float32 and cast
        # back to the router's dtype.
        chosen_logits = mx.take_along_axis(router_logits, chosen, axis=-1)
        weights = mx.softmax(
            chosen_logits.astype(mx.float32),
            axis=-1,
        ).astype(router_logits.dtype)

        if self.training:
            # Batch per expert: collect every (token, slot) pair assigned to
            # an expert and run that expert once over those tokens.
            chosen_np = np.array(chosen)
            per_slot = mx.zeros(
                (tokens.shape[0], top_k, tokens.shape[-1]), tokens.dtype
            )
            for expert_id, expert in enumerate(self.experts):
                rows, slots = map(mx.array, np.where(chosen_np == expert_id))
                if rows.size == 0:
                    continue
                per_slot[rows, slots] = expert(tokens[rows])

            mixed = (per_slot * weights[:, :, None]).sum(axis=1)
        else:
            # Token-by-token: evaluate only the experts selected for each one.
            pieces = []
            for token, token_weights, expert_ids in zip(
                tokens, weights, chosen.tolist()
            ):
                stacked = mx.stack(
                    [self.experts[e](token) for e in expert_ids], axis=-1
                )
                combined = (stacked * token_weights).sum(axis=-1)
                pieces.append(combined[None, :])
            mixed = mx.concatenate(pieces)

        return mixed.reshape(input_shape)


class MixtralDecoderLayer(nn.Module):
    """Pre-norm decoder layer: attention then MoE, each with a residual add."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.hidden_size = args.hidden_size

        self.self_attn = MixtralAttention(args)

        self.block_sparse_moe = MixtralSparseMoeBlock(args)
        norm_eps = args.rms_norm_eps
        self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = nn.RMSNorm(args.hidden_size, eps=norm_eps)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> mx.array:
        """Run one decoder layer; ``mask`` and ``cache`` go to the attention."""
        # Attention sub-layer on the normalized input, added back to x.
        attn_out = self.self_attn(self.input_layernorm(x), mask, cache)
        h = x + attn_out
        # MoE sub-layer on the normalized attention result, added back to h.
        moe_out = self.block_sparse_moe(self.post_attention_layernorm(h))
        return h + moe_out


class MixtralModel(nn.Module):
    """Token embedding, a stack of decoder layers, and a final RMSNorm."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.vocab_size = args.vocab_size
        self.num_hidden_layers = args.num_hidden_layers

        self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
        self.layers = [
            MixtralDecoderLayer(args=args) for _ in range(args.num_hidden_layers)
        ]
        self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)

    def __call__(
        self,
        inputs: mx.array,
        cache=None,
    ):
        """Embed ``inputs`` and run them through every decoder layer.

        Args:
            inputs: Token ids; the embedded result is indexed as
                ``(batch, length, features)``.
            cache: Optional sequence with one entry per layer. Entries are
                either ``None`` or objects exposing ``offset`` and
                ``update_and_fetch`` (as used by ``MixtralAttention``).

        Returns:
            The normalized hidden states after the last layer.
        """
        h = self.embed_tokens(inputs)

        if cache is None:
            cache = [None] * len(self.layers)
        else:
            # Materialize so the first entry can be inspected without
            # consuming a one-shot iterable before the layer loop.
            cache = list(cache)

        # Number of positions already held in the KV cache. The attention
        # layers apply the same offset to the rotary embedding.
        offset = 0
        if cache and cache[0] is not None:
            offset = cache[0].offset

        mask = None
        T = h.shape[1]
        if T > 1:
            if offset == 0:
                mask = nn.MultiHeadAttention.create_additive_causal_mask(T)
            else:
                # With cached history the keys span offset + T positions, so
                # the mask must be (T, offset + T): query i sits at absolute
                # position offset + i and may see every key up to it.
                key_positions = mx.arange(offset + T)
                query_positions = mx.arange(offset, offset + T)
                mask = (query_positions[:, None] < key_positions[None]) * -1e9
            mask = mask.astype(h.dtype)

        for layer, c in zip(self.layers, cache):
            h = layer(h, mask, c)

        return self.norm(h)


class Model(nn.Module):
    """Mixtral language model: the decoder stack followed by a vocabulary head."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.model_type = args.model_type
        self.model = MixtralModel(args)
        self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
        self.args = args

    def __call__(
        self,
        inputs: mx.array,
        cache=None,
    ):
        """Return the ``lm_head`` projection of the decoder output for ``inputs``."""
        hidden = self.model(inputs, cache)
        logits = self.lm_head(hidden)
        return logits

    @property
    def layers(self):
        """The decoder layers of the wrapped ``MixtralModel``."""
        return self.model.layers

    @property
    def head_dim(self):
        """Per-head width: ``hidden_size // num_attention_heads``."""
        cfg = self.args
        return cfg.hidden_size // cfg.num_attention_heads

    @property
    def n_kv_heads(self):
        """Number of key/value heads taken from the model arguments."""
        return self.args.num_key_value_heads
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00			`from dataclasses import dataclass`
			`from typing import Dict, Optional, Tuple, Union`

			`import mlx.core as mx`
			`import mlx.nn as nn`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`import numpy as np`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00
			`from .base import BaseModelArgs`


			`@dataclass`
			`class ModelArgs(BaseModelArgs):`
Mixtral: Fix non-default arg follows default exception (#450) Mixtral models throw the following exception ``` Traceback (most recent call last): File "<frozen runpy>", line 198, in _run_module_as_main File "<frozen runpy>", line 88, in _run_code File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/mlx_lm/generate.py", line 119, in <module> main(args) File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/mlx_lm/generate.py", line 96, in main model, tokenizer = load(args.model, tokenizer_config=tokenizer_config) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/mlx_lm/utils.py", line 278, in load model = load_model(model_path) ^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/mlx_lm/utils.py", line 221, in load_model model_class, model_args_class = _get_classes(config=config) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/mlx_lm/utils.py", line 46, in _get_classes arch = importlib.import_module(f"mlx_lm.models.{model_type}") ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/anaconda3/lib/python3.11/importlib/__init__.py", line 126, in import_module return _bootstrap._gcd_import(name[level:], package, level) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "<frozen importlib._bootstrap>", line 1204, in _gcd_import File "<frozen importlib._bootstrap>", line 1176, in _find_and_load File "<frozen importlib._bootstrap>", line 1147, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 690, in _load_unlocked File "<frozen importlib._bootstrap_external>", line 940, in exec_module File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/mlx_lm/models/mixtral.py", line 11, in <module> @dataclass ^^^^^^^^^ File "/opt/homebrew/anaconda3/lib/python3.11/dataclasses.py", line 1230, in dataclass return wrap(cls) 
^^^^^^^^^ File "/opt/homebrew/anaconda3/lib/python3.11/dataclasses.py", line 1220, in wrap return _process_class(cls, init, repr, eq, order, unsafe_hash, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/homebrew/anaconda3/lib/python3.11/dataclasses.py", line 1027, in _process_class _init_fn(all_init_fields, File "/opt/homebrew/anaconda3/lib/python3.11/dataclasses.py", line 545, in _init_fn raise TypeError(f'non-default argument {f.name!r} ' TypeError: non-default argument 'model_type' follows default argument ``` 2024-02-19 05:30:26 +08:00			`model_type: str`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00			`vocab_size: int = 32000`
			`hidden_size: int = 4096`
			`intermediate_size: int = 14336`
			`num_hidden_layers: int = 32`
			`num_attention_heads: int = 32`
			`num_experts_per_tok: int = 2`
			`num_key_value_heads: int = 8`
			`num_local_experts: int = 8`
			`rms_norm_eps: float = 1e-5`
			`rope_theta: float = 1e6`
			`rope_traditional: bool = False`
			`rope_scaling: Optional[Dict[str, Union[float, str]]] = None`

			`def __post_init__(self):`
			`if self.num_key_value_heads is None:`
			`self.num_key_value_heads = self.num_attention_heads`


			`class MixtralAttention(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.hidden_size = args.hidden_size`
			`self.num_heads = args.num_attention_heads`
			`self.head_dim = self.hidden_size // self.num_heads`
			`self.num_key_value_heads = args.num_key_value_heads`
			`self.rope_theta = args.rope_theta`

			`self.scale = self.head_dim**-0.5`

			`self.q_proj = nn.Linear(`
			`self.hidden_size, self.num_heads * self.head_dim, bias=False`
			`)`
			`self.k_proj = nn.Linear(`
			`self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False`
			`)`
			`self.v_proj = nn.Linear(`
			`self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False`
			`)`
			`self.o_proj = nn.Linear(`
			`self.num_heads * self.head_dim, self.hidden_size, bias=False`
			`)`

			`self.rope = nn.RoPE(`
			`self.head_dim,`
			`traditional=args.rope_traditional,`
			`base=args.rope_theta,`
			`)`

			`def __call__(`
			`self,`
			`x: mx.array,`
			`mask: Optional[mx.array] = None,`
			`cache: Optional[Tuple[mx.array, mx.array]] = None,`
			`) -> mx.array:`
			`B, L, D = x.shape`

			`queries, keys, values = self.q_proj(x), self.k_proj(x), self.v_proj(x)`

			`# Prepare the queries, keys and values for the attention computation`
			`queries = queries.reshape(B, L, self.num_heads, -1).transpose(0, 2, 1, 3)`
			`keys = keys.reshape(B, L, self.num_key_value_heads, -1).transpose(0, 2, 1, 3)`
			`values = values.reshape(B, L, self.num_key_value_heads, -1).transpose(`
			`0, 2, 1, 3`
			`)`

			`if cache is not None:`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`queries = self.rope(queries, offset=cache.offset)`
			`keys = self.rope(keys, offset=cache.offset)`
			`keys, values = cache.update_and_fetch(keys, values)`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00			`else:`
			`queries = self.rope(queries)`
			`keys = self.rope(keys)`

Make attention faster for a some models (#574) * make attention faster for a couple models * remove unused generation flags * add comment on lora * include text files as well 2024-03-15 12:35:54 +08:00			`output = mx.fast.scaled_dot_product_attention(`
			`queries, keys, values, scale=self.scale, mask=mask`
			`)`
			`output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`return self.o_proj(output)`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00

			`class MixtralBLockSparseTop2MLP(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.ffn_dim = args.intermediate_size`
			`self.hidden_dim = args.hidden_size`

			`self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)`
			`self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)`
			`self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)`

			`self.act_fn = nn.silu`

			`def __call__(self, x: mx.array) -> mx.array:`
			`current_hidden_states = self.act_fn(self.w1(x)) * self.w3(x)`
			`current_hidden_states = self.w2(current_hidden_states)`
			`return current_hidden_states`


			`class MixtralSparseMoeBlock(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.hidden_dim = args.hidden_size`
			`self.ffn_dim = args.intermediate_size`
			`self.num_experts = args.num_local_experts`
			`self.num_experts_per_tok = args.num_experts_per_tok`

			`# gating`
			`self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)`

			`self.experts = [`
			`MixtralBLockSparseTop2MLP(args=args) for _ in range(self.num_experts)`
			`]`

			`def __call__(self, x: mx.array) -> mx.array:`
			`ne = self.num_experts_per_tok`
			`orig_shape = x.shape`
			`x = x.reshape(-1, x.shape[-1])`

			`gates = self.gate(x)`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00
Fix argpartition call in Mixtral and other MOES (#676) * Update mixtral.py * fix all moes --------- Co-authored-by: yuhai-china <yuhai.china@gmail.com> 2024-04-13 02:00:56 +08:00			`inds = mx.stop_gradient(mx.argpartition(-gates, kth=ne - 1, axis=-1)[:, :ne])`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00			`scores = mx.softmax(`
			`mx.take_along_axis(gates, inds, axis=-1).astype(mx.float32),`
			`axis=-1,`
			`).astype(gates.dtype)`

feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`if self.training:`
			`inds = np.array(inds)`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`y = mx.zeros((x.shape[0], ne, x.shape[-1]), x.dtype)`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`for e, expert in enumerate(self.experts):`
			`idx1, idx2 = map(mx.array, np.where(inds == e))`
			`if idx1.size == 0:`
			`continue`
			`y[idx1, idx2] = expert(x[idx1])`

			`y = (y * scores[:, :, None]).sum(axis=1)`
			`else:`
			`y = []`
			`for xt, st, it in zip(x, scores, inds.tolist()):`
DBRX (#628) * dbrx * format * format * comments * change scores slightly * remove inadvertant import 2024-03-29 12:03:53 +08:00			`yt = mx.stack([self.experts[e](xt) for e in it], axis=-1)`
feat: move lora into mlx-lm (#337) * feat: Add lora and qlora training to mlx-lm --------- Co-authored-by: Awni Hannun <awni@apple.com> 2024-01-24 00:44:37 +08:00			`yt = (yt * st).sum(axis=-1)`
			`y.append(yt[None, :])`
			`y = mx.concatenate(y)`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00
			`return y.reshape(orig_shape)`


			`class MixtralDecoderLayer(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.hidden_size = args.hidden_size`

			`self.self_attn = MixtralAttention(args)`

			`self.block_sparse_moe = MixtralSparseMoeBlock(args)`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)`
			`self.post_attention_layernorm = nn.RMSNorm(`
			`args.hidden_size, eps=args.rms_norm_eps`
			`)`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00
			`def __call__(`
			`self,`
			`x: mx.array,`
			`mask: Optional[mx.array] = None,`
			`cache: Optional[Tuple[mx.array, mx.array]] = None,`
			`) -> mx.array:`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`r = self.self_attn(self.input_layernorm(x), mask, cache)`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00			`h = x + r`
			`r = self.block_sparse_moe(self.post_attention_layernorm(h))`
			`out = h + r`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`return out`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00

			`class MixtralModel(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.vocab_size = args.vocab_size`
			`self.num_hidden_layers = args.num_hidden_layers`

			`self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)`
			`self.layers = [`
			`MixtralDecoderLayer(args=args) for _ in range(args.num_hidden_layers)`
			`]`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00
			`def __call__(`
			`self,`
			`inputs: mx.array,`
			`cache=None,`
			`):`
			`h = self.embed_tokens(inputs)`

			`mask = None`
			`T = h.shape[1]`
			`if T > 1:`
			`mask = nn.MultiHeadAttention.create_additive_causal_mask(T)`
			`mask = mask.astype(h.dtype)`

			`if cache is None:`
			`cache = [None] * len(self.layers)`

Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`for layer, c in zip(self.layers, cache):`
			`h = layer(h, mask, c)`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`return self.norm(h)`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00

			`class Model(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
Lazy import + refactor Lora layer addition (#426) * lazy model import in mlx_lm * change lora loading * fix olmo lora * remove a bunch of unused stuff from plamo * move phixtral to mlx-lm and out of llms/ 2024-02-13 02:51:02 +08:00			`self.model_type = args.model_type`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00			`self.model = MixtralModel(args)`
			`self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`self.args = args`
feat(mlx_lm): add mixtral support in mlx_lm (#318) * feat: add mixtral support in mlx_lm * chore: update doc 2024-01-15 23:18:14 +08:00
			`def __call__(`
			`self,`
			`inputs: mx.array,`
			`cache=None,`
			`):`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00			`out = self.model(inputs, cache)`
			`return self.lm_head(out)`
Support for slerp merging models (#455) * support for slerp merging models * docs * update docs * format' 2024-02-20 12:37:15 +08:00
			`@property`
			`def layers(self):`
			`return self.model.layers`
Kv cache (#643) * in place kv_cache * fix * fix kv cache size * partially fix kv cache dtype * step kv cache * multiple of step size * more teests + kv cache * more kv cache * udpate all models to use kv cache 2024-05-08 23:18:13 +08:00
			`@property`
			`def head_dim(self):`
			`return self.args.hidden_size // self.args.num_attention_heads`

			`@property`
			`def n_kv_heads(self):`
			`return self.args.num_key_value_heads`