# mlx-examples/llms/mistral/mistral.py

# Copyright © 2023 Apple Inc.

import argparse
import json
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_unflatten
from sentencepiece import SentencePieceProcessor


@dataclass
class ModelArgs:
    """Hyperparameters that fix the shapes of every layer in the model.

    ``load_model`` builds an instance directly from the entries of
    ``config.json`` (after removing the keys it handles separately), so the
    field names must match the keys stored in that file.
    """

    # Width of the token embeddings and of the residual stream; also the
    # input size of every attention / feed-forward projection and RMSNorm.
    dim: int
    # Number of TransformerBlock layers stacked in the model.
    n_layers: int
    # Size of a single attention head; also the RoPE dimension and the value
    # the attention scale (head_dim ** -0.5) is derived from.
    head_dim: int
    # Inner width of the gated feed-forward network.
    hidden_dim: int
    # Number of query heads.
    n_heads: int
    # Number of key/value heads (queries are split into n_heads heads, keys
    # and values into n_kv_heads heads).
    n_kv_heads: int
    # Epsilon passed to every nn.RMSNorm.
    norm_eps: float
    # Number of rows of the embedding table and of output logits per position.
    vocab_size: int
    # Base passed to nn.RoPE.
    rope_theta: float = 10000


class Attention(nn.Module):
    """Self-attention layer with rotary position embeddings and a KV cache.

    Queries are projected to ``n_heads`` heads while keys and values are
    projected to ``n_kv_heads`` heads; all heads have size ``head_dim``.
    """

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args

        self.n_heads: int = args.n_heads
        self.n_kv_heads: int = args.n_kv_heads

        # Number of query heads per key/value head. Not used by __call__;
        # the keys/values are handed to the attention kernel un-repeated.
        self.repeats = self.n_heads // self.n_kv_heads

        self.scale = self.args.head_dim**-0.5

        self.wq = nn.Linear(args.dim, args.n_heads * args.head_dim, bias=False)
        self.wk = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=False)
        self.wv = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=False)
        self.wo = nn.Linear(args.n_heads * args.head_dim, args.dim, bias=False)
        self.rope = nn.RoPE(args.head_dim, traditional=True, base=args.rope_theta)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> Tuple[mx.array, Tuple[mx.array, mx.array]]:
        """Attend over ``x`` plus any previously cached positions.

        Args:
            x: Input of shape ``(batch, length, dim)``.
            mask: Optional additive mask forwarded to the attention kernel.
            cache: Optional ``(keys, values)`` pair returned by a previous
                call, each laid out as ``(batch, n_kv_heads, seen, head_dim)``.

        Returns:
            ``(output, (keys, values))`` where ``output`` has the same
            leading shape as ``x`` and the pair is the updated cache holding
            both the old and the new positions.
        """
        B, L, _ = x.shape

        queries, keys, values = self.wq(x), self.wk(x), self.wv(x)

        # Split the projections into heads: (B, heads, L, head_dim)
        queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)
        keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)
        values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)

        if cache is not None:
            key_cache, value_cache = cache
            # New positions start right after the ones already cached, so
            # rotate them with that offset before appending along the
            # sequence axis.
            queries = self.rope(queries, offset=key_cache.shape[2])
            keys = self.rope(keys, offset=key_cache.shape[2])
            keys = mx.concatenate([key_cache, keys], axis=2)
            values = mx.concatenate([value_cache, values], axis=2)
        else:
            queries = self.rope(queries)
            keys = self.rope(keys)

        output = mx.fast.scaled_dot_product_attention(
            queries, keys, values, scale=self.scale, mask=mask
        )
        # Merge the heads back: (B, heads, L, head_dim) -> (B, L, heads * head_dim)
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.wo(output), (keys, values)


class FeedForward(nn.Module):
    """Gated feed-forward network: ``w2(silu(w1(x)) * w3(x))``."""

    def __init__(self, args: ModelArgs):
        super().__init__()

        model_dim, inner_dim = args.dim, args.hidden_dim
        # Creation order (w1, w2, w3) is kept as-is.
        self.w1 = nn.Linear(model_dim, inner_dim, bias=False)
        self.w2 = nn.Linear(inner_dim, model_dim, bias=False)
        self.w3 = nn.Linear(model_dim, inner_dim, bias=False)

    def __call__(self, x) -> mx.array:
        """Apply the gated projection to ``x`` and map it back to ``dim``."""
        gate = nn.silu(self.w1(x))
        up = self.w3(x)
        return self.w2(gate * up)


class TransformerBlock(nn.Module):
    """One pre-norm transformer layer: attention then feed-forward.

    Each sub-layer is applied to an RMS-normalised copy of its input and the
    result is added back to the un-normalised input (residual connection).
    """

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.attention = Attention(args)
        self.feed_forward = FeedForward(args=args)
        self.attention_norm = nn.RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = nn.RMSNorm(args.dim, eps=args.norm_eps)
        self.args = args

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> Tuple[mx.array, Tuple[mx.array, mx.array]]:
        """Run the layer on ``x``.

        Args:
            x: Layer input.
            mask: Optional additive attention mask, passed to the attention
                sub-layer unchanged.
            cache: Optional ``(keys, values)`` pair from a previous call.

        Returns:
            ``(out, cache)``: the layer output and the updated
            ``(keys, values)`` pair produced by the attention sub-layer.
        """
        r, cache = self.attention(self.attention_norm(x), mask, cache)
        h = x + r
        r = self.feed_forward(self.ffn_norm(h))
        out = h + r
        return out, cache


class Mistral(nn.Module):
    """Decoder-only transformer: embedding, ``n_layers`` blocks, norm, logits."""

    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args
        self.vocab_size = args.vocab_size
        self.n_layers = args.n_layers
        assert self.vocab_size > 0
        self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim)
        self.layers = [TransformerBlock(args=args) for _ in range(args.n_layers)]
        self.norm = nn.RMSNorm(args.dim, eps=args.norm_eps)
        self.output = nn.Linear(args.dim, args.vocab_size, bias=False)

    def __call__(
        self,
        inputs: mx.array,
        cache=None,
    ):
        """Compute next-token logits for ``inputs``.

        Args:
            inputs: Token ids of shape ``(batch, length)``.
            cache: Optional list with one entry per layer, each either
                ``None`` or the ``(keys, values)`` pair returned for that
                layer by a previous call. When given, the list is updated in
                place and also returned.

        Returns:
            ``(logits, cache)`` where ``logits`` has one vocabulary-sized
            row per input position and ``cache`` is the per-layer list of
            updated ``(keys, values)`` pairs.
        """
        h = self.tok_embeddings(inputs)
        L = h.shape[1]

        if cache is None:
            cache = [None] * len(self.layers)

        # Number of positions already stored in the cache (sequence axis of
        # the cached keys). Zero when starting from scratch.
        offset = 0
        if cache and cache[0] is not None:
            offset = cache[0][0].shape[2]

        mask = None
        if L > 1:
            # The keys seen by attention span the cached positions plus the
            # new ones, so build the causal mask for the full length and keep
            # only the rows of the new queries. With an empty cache this is
            # the plain L x L causal mask.
            mask = nn.MultiHeadAttention.create_additive_causal_mask(offset + L)
            mask = mask[offset:].astype(h.dtype)

        for e, layer in enumerate(self.layers):
            h, cache[e] = layer(h, mask, cache[e])

        return self.output(self.norm(h)), cache


class Tokenizer:
    """Small wrapper around a SentencePiece model file."""

    def __init__(self, model_path: str):
        """Load the SentencePiece model stored at ``model_path``."""
        assert Path(model_path).exists(), model_path
        self._model = SentencePieceProcessor(model_file=model_path)
        # Marker that a piece starts with when it begins a new word.
        self._sep = "▁"
        assert self._model.vocab_size() == self._model.get_piece_size()

    @property
    def eos_id(self) -> int:
        """Id of the end-of-sequence token."""
        return self._model.eos_id()

    @property
    def pad_id(self) -> int:
        """Id of the padding token."""
        return self._model.pad_id()

    def encode(self, s: str) -> List[int]:
        """Return the token ids for ``s`` with the BOS id prepended."""
        return [self._model.bos_id(), *self._model.encode(s)]

    def decode(self, t: List[int]) -> str:
        """Return the text for token ids ``t``.

        A single space is prepended when the first piece starts with the
        word-boundary marker, so that text decoded chunk by chunk can be
        concatenated without losing the space between chunks.
        """
        out = self._model.decode(t)
        # startswith (rather than indexing [0]) also copes with an empty piece.
        if t and self._model.id_to_piece(t[0]).startswith(self._sep):
            return " " + out
        return out


def load_model(folder: str):
    """Build the model and tokenizer stored in ``folder``.

    The folder is read for ``tokenizer.model``, ``config.json`` and
    ``weights.npz``. If the config holds a ``quantization`` entry, the model
    is quantized with those settings before the weights are applied.

    Returns:
        ``(model, tokenizer)`` with the model parameters already evaluated.
    """
    root = Path(folder)
    tokenizer = Tokenizer(str(root / "tokenizer.model"))

    with open(root / "config.json", "r") as fp:
        config = json.load(fp)
    # These entries are not ModelArgs fields; drop them when present.
    for ignored_key in ("sliding_window", "model_type"):
        config.pop(ignored_key, None)
    quantization = config.pop("quantization", None)
    model_args = ModelArgs(**config)

    flat_weights = mx.load(str(root / "weights.npz"))
    nested_weights = tree_unflatten(list(flat_weights.items()))

    model = Mistral(model_args)
    if quantization is not None:
        nn.quantize(model, **quantization)
    model.update(nested_weights)
    # Force evaluation of the parameters before returning.
    mx.eval(model.parameters())
    return model, tokenizer


def generate(prompt: mx.array, model: Mistral, temp: Optional[float] = 0.0):
    """Yield sampled tokens from ``model`` indefinitely.

    Args:
        prompt: Token ids of the prompt, without a batch axis (one is added
            here before the first model call).
        model: The model to sample from.
        temp: Sampling temperature. ``0`` or ``None`` selects the most likely
            token; any other value scales the logits by ``1 / temp`` before
            sampling from the resulting categorical distribution.

    Yields:
        An array holding the token id sampled at each step. The generator
        never stops on its own; the caller decides when to stop iterating.
    """
    # None is allowed by the signature; treat it like 0 instead of failing
    # on the division below.
    greedy = temp is None or temp == 0

    def sample(logits):
        if greedy:
            return mx.argmax(logits, axis=-1)
        return mx.random.categorical(logits * (1 / temp))

    # Process the whole prompt once and sample from the last position.
    logits, cache = model(prompt[None])
    y = sample(logits[:, -1, :])
    yield y

    # Then feed back one token at a time, reusing the cache.
    while True:
        logits, cache = model(y[:, None], cache)
        y = sample(logits.squeeze(1))
        yield y


if __name__ == "__main__":
    # Command-line entry point: load a model, stream a completion of the
    # prompt to stdout, then report prompt and generation throughput.
    parser = argparse.ArgumentParser(description="Mistral inference script")
    parser.add_argument(
        "--model-path",
        type=str,
        default="mlx_model",
        help="The path to the model weights and tokenizer",
    )
    parser.add_argument(
        "--prompt",
        help="The message to be processed by the model",
        default="In the beginning the Universe was created.",
    )
    parser.add_argument(
        "--max-tokens",
        "-m",
        type=int,
        default=100,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--temp",
        help="The sampling temperature.",
        type=float,
        default=0.0,
    )
    parser.add_argument(
        "--tokens-per-eval",
        help="The batch size of tokens to generate.",
        type=int,
        default=10,
    )
    parser.add_argument("--seed", type=int, default=0, help="The PRNG seed")

    args = parser.parse_args()
    if args.tokens_per_eval < 1:
        # Used as a modulus below; zero would raise ZeroDivisionError.
        parser.error("--tokens-per-eval must be a positive integer")

    mx.random.seed(args.seed)
    print("[INFO] Loading model from disk.")
    model, tokenizer = load_model(args.model_path)

    print("[INFO] Starting generation...")
    tic = time.time()
    print(args.prompt, end="", flush=True)
    prompt = mx.array(tokenizer.encode(args.prompt))
    tokens = []
    # Defaults so the summary below still works when no token is generated
    # (for example with --max-tokens 0).
    prompt_tps = 0.0
    ntoks = 0
    # range() comes first so that zip stops before asking the generator for
    # a token beyond the requested maximum.
    for ntoks, token in zip(
        range(args.max_tokens), generate(prompt, model, args.temp)
    ):
        tokens.append(token)
        if ntoks == 0:
            # The first token includes processing the whole prompt; time it
            # separately and restart the clock for the remaining tokens.
            mx.eval(tokens)
            toc = time.time()
            prompt_tps = prompt.size / (toc - tic)
            tic = time.time()

        if (len(tokens) % args.tokens_per_eval) == 0:
            # Evaluate and print a full batch of tokens at once.
            mx.eval(tokens)
            s = tokenizer.decode([t.item() for t in tokens])
            print(s, end="", flush=True)
            tokens = []

    # Flush whatever is left from the last, possibly partial, batch.
    mx.eval(tokens)
    s = tokenizer.decode([t.item() for t in tokens])
    print(s, flush=True)
    print("------")
    # ntoks is the index of the last token, i.e. the number of tokens
    # generated after the first one, which is what was timed since `tic`.
    elapsed = time.time() - tic
    generation_tps = ntoks / elapsed if elapsed > 0 else 0.0
    print(
        f"Tokens per second: prompt {prompt_tps:.3f}, "
        f"generation {generation_tps:.3f}"
    )
mistral 2023-12-06 03:02:52 +08:00			`# Copyright © 2023 Apple Inc.`

			`import argparse`
			`import json`
feat: add mistral tps (#173) * feat: add mistral tps * eval params before timing + format --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-22 23:55:57 +08:00			`import time`
Add llms subdir + update README (#145) * add llms subdir + update README * nits * use same pre-commit as mlx * update readmes a bit * format 2023-12-21 02:22:25 +08:00			`from dataclasses import dataclass`
mistral 2023-12-06 03:02:52 +08:00			`from pathlib import Path`
Add llms subdir + update README (#145) * add llms subdir + update README * nits * use same pre-commit as mlx * update readmes a bit * format 2023-12-21 02:22:25 +08:00			`from typing import List, Optional, Tuple`
mistral 2023-12-06 03:02:52 +08:00
			`import mlx.core as mx`
			`import mlx.nn as nn`
Quantize example (#162) * testing quantization * conversion + quantization working * one config processor * quantization in mistral / nits in llama * args for quantization * llama / mistral conversion in good shape * phi2 quantized * mixtral * qwen conversion 2023-12-22 04:59:37 +08:00			`from mlx.utils import tree_unflatten`
Add llms subdir + update README (#145) * add llms subdir + update README * nits * use same pre-commit as mlx * update readmes a bit * format 2023-12-21 02:22:25 +08:00			`from sentencepiece import SentencePieceProcessor`
mistral 2023-12-06 03:02:52 +08:00

			`@dataclass`
			`class ModelArgs:`
			`dim: int`
			`n_layers: int`
			`head_dim: int`
			`hidden_dim: int`
			`n_heads: int`
			`n_kv_heads: int`
			`norm_eps: float`
			`vocab_size: int`
Fix conversion + inference errors. - Mistral (#176) * Fix conversion + inference errors. * wire rope_theta throuugh to nn.RoPE --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-23 06:10:25 +08:00			`rope_theta: float = 10000`
mistral 2023-12-06 03:02:52 +08:00

			`class Attention(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.args = args`

			`self.n_heads: int = args.n_heads`
			`self.n_kv_heads: int = args.n_kv_heads`

			`self.repeats = self.n_heads // self.n_kv_heads`

			`self.scale = self.args.head_dim**-0.5`

			`self.wq = nn.Linear(args.dim, args.n_heads * args.head_dim, bias=False)`
			`self.wk = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=False)`
			`self.wv = nn.Linear(args.dim, args.n_kv_heads * args.head_dim, bias=False)`
			`self.wo = nn.Linear(args.n_heads * args.head_dim, args.dim, bias=False)`
Fix conversion + inference errors. - Mistral (#176) * Fix conversion + inference errors. * wire rope_theta throuugh to nn.RoPE --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-23 06:10:25 +08:00			`self.rope = nn.RoPE(args.head_dim, traditional=True, base=args.rope_theta)`
mistral 2023-12-06 03:02:52 +08:00
			`def __call__(`
			`self,`
			`x: mx.array,`
			`mask: Optional[mx.array] = None,`
			`cache: Optional[Tuple[mx.array, mx.array]] = None,`
			`) -> mx.array:`
			`B, L, D = x.shape`

			`queries, keys, values = self.wq(x), self.wk(x), self.wv(x)`

			`# Prepare the queries, keys and values for the attention computation`
			`queries = queries.reshape(B, L, self.n_heads, -1).transpose(0, 2, 1, 3)`
			`keys = keys.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)`
			`values = values.reshape(B, L, self.n_kv_heads, -1).transpose(0, 2, 1, 3)`

			`if cache is not None:`
			`key_cache, value_cache = cache`
			`queries = self.rope(queries, offset=key_cache.shape[2])`
			`keys = self.rope(keys, offset=key_cache.shape[2])`
			`keys = mx.concatenate([key_cache, keys], axis=2)`
			`values = mx.concatenate([value_cache, values], axis=2)`
			`else:`
			`queries = self.rope(queries)`
			`keys = self.rope(keys)`

Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`output = mx.fast.scaled_dot_product_attention(`
			`queries, keys, values, scale=self.scale, mask=mask`
			`)`
			`output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)`
mistral 2023-12-06 03:02:52 +08:00			`return self.wo(output), (keys, values)`


			`class FeedForward(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`

			`self.w1 = nn.Linear(args.dim, args.hidden_dim, bias=False)`
			`self.w2 = nn.Linear(args.hidden_dim, args.dim, bias=False)`
			`self.w3 = nn.Linear(args.dim, args.hidden_dim, bias=False)`

			`def __call__(self, x) -> mx.array:`
			`return self.w2(nn.silu(self.w1(x)) * self.w3(x))`


			`class TransformerBlock(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.n_heads = args.n_heads`
			`self.dim = args.dim`
			`self.attention = Attention(args)`
			`self.feed_forward = FeedForward(args=args)`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.attention_norm = nn.RMSNorm(args.dim, eps=args.norm_eps)`
			`self.ffn_norm = nn.RMSNorm(args.dim, eps=args.norm_eps)`
mistral 2023-12-06 03:02:52 +08:00			`self.args = args`

			`def __call__(`
			`self,`
			`x: mx.array,`
			`mask: Optional[mx.array] = None,`
			`cache: Optional[Tuple[mx.array, mx.array]] = None,`
			`) -> mx.array:`
			`r, cache = self.attention(self.attention_norm(x), mask, cache)`
			`h = x + r`
			`r = self.feed_forward(self.ffn_norm(h))`
			`out = h + r`
			`return out, cache`


			`class Mistral(nn.Module):`
			`def __init__(self, args: ModelArgs):`
			`super().__init__()`
			`self.args = args`
			`self.vocab_size = args.vocab_size`
			`self.n_layers = args.n_layers`
			`assert self.vocab_size > 0`
			`self.tok_embeddings = nn.Embedding(args.vocab_size, args.dim)`
			`self.layers = [TransformerBlock(args=args) for _ in range(args.n_layers)]`
Switch to fast RMS/LN Norm (#603) * use nn.RMSNorm, use sdpa, cleanup * bump mlx versions * minor update * use fast layer norm * version bump * update requirement for whisper * update requirement for gguf 2024-03-23 22:13:51 +08:00			`self.norm = nn.RMSNorm(args.dim, eps=args.norm_eps)`
mistral 2023-12-06 03:02:52 +08:00			`self.output = nn.Linear(args.dim, args.vocab_size, bias=False)`

			`def __call__(`
			`self,`
			`inputs: mx.array,`
			`cache=None,`
			`):`
			`h = self.tok_embeddings(inputs)`

			`mask = None`
			`if h.shape[1] > 1:`
			`mask = nn.MultiHeadAttention.create_additive_causal_mask(h.shape[1])`
			`mask = mask.astype(h.dtype)`

			`if cache is None:`
			`cache = [None] * len(self.layers)`

			`for e, layer in enumerate(self.layers):`
			`h, cache[e] = layer(h, mask, cache[e])`

			`return self.output(self.norm(h)), cache`


			`class Tokenizer:`
			`def __init__(self, model_path: str):`
			`assert Path(model_path).exists(), model_path`
			`self._model = SentencePieceProcessor(model_file=model_path)`
			`self._sep = "▁"`
			`assert self._model.vocab_size() == self._model.get_piece_size()`

			`@property`
			`def eos_id(self) -> int:`
			`return self._model.eos_id()`

			`@property`
			`def pad_id(self) -> int:`
			`return self._model.pad_id()`

			`def encode(self, s: str) -> List[int]:`
			`return [self._model.bos_id(), *self._model.encode(s)]`

			`def decode(self, t: List[int]) -> str:`
			`out = self._model.decode(t)`
			`if t and self._model.id_to_piece(t[0])[0] == self._sep:`
			`return " " + out`
			`return out`


Quantize example (#162) * testing quantization * conversion + quantization working * one config processor * quantization in mistral / nits in llama * args for quantization * llama / mistral conversion in good shape * phi2 quantized * mixtral * qwen conversion 2023-12-22 04:59:37 +08:00			`def load_model(folder: str):`
mistral 2023-12-06 03:02:52 +08:00			`model_path = Path(folder)`
			`tokenizer = Tokenizer(str(model_path / "tokenizer.model"))`
Use config.json, add model_type (#157) * Use config.json, add model_type * Update convert to generate config.json 2023-12-21 00:39:37 +08:00			`with open(model_path / "config.json", "r") as f:`
mistral 2023-12-06 03:02:52 +08:00			`config = json.loads(f.read())`
Use config.json, add model_type (#157) * Use config.json, add model_type * Update convert to generate config.json 2023-12-21 00:39:37 +08:00			`config.pop("sliding_window", None)`
			`config.pop("model_type", None)`
Quantize example (#162) * testing quantization * conversion + quantization working * one config processor * quantization in mistral / nits in llama * args for quantization * llama / mistral conversion in good shape * phi2 quantized * mixtral * qwen conversion 2023-12-22 04:59:37 +08:00			`quantization = config.pop("quantization", None)`
mistral 2023-12-06 03:02:52 +08:00			`model_args = ModelArgs(**config)`
mixtral runs a bit faster 2023-12-13 00:36:40 +08:00			`weights = mx.load(str(model_path / "weights.npz"))`
mistral 2023-12-06 03:02:52 +08:00			`weights = tree_unflatten(list(weights.items()))`
			`model = Mistral(model_args)`
Quantize example (#162) * testing quantization * conversion + quantization working * one config processor * quantization in mistral / nits in llama * args for quantization * llama / mistral conversion in good shape * phi2 quantized * mixtral * qwen conversion 2023-12-22 04:59:37 +08:00			`if quantization is not None:`
Quantize embedding / Update quantize API (#680) * more async eval * quantize embedding / update quantize api * more updates for quantize * update for quantize embeddings * update sd quant API * update sdxl quants * error for datasets < batch_size * async * fix config loading * fix quant * fix tests * fix req * remove lm head if tie weights is true * fix test 2024-04-19 09:16:10 +08:00			`nn.quantize(model, **quantization)`
mistral 2023-12-06 03:02:52 +08:00			`model.update(weights)`
feat: add mistral tps (#173) * feat: add mistral tps * eval params before timing + format --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-22 23:55:57 +08:00			`mx.eval(model.parameters())`
mistral 2023-12-06 03:02:52 +08:00			`return model, tokenizer`


			`def generate(prompt: mx.array, model: Mistral, temp: Optional[float] = 0.0):`
			`def sample(logits):`
			`if temp == 0:`
			`return mx.argmax(logits, axis=-1)`
			`else:`
			`return mx.random.categorical(logits * (1 / temp))`

			`logits, cache = model(prompt[None])`
			`y = sample(logits[:, -1, :])`
			`yield y`

			`while True:`
			`logits, cache = model(y[:, None], cache)`
			`y = sample(logits.squeeze(1))`
			`yield y`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser(description="Mistral inference script")`
			`parser.add_argument(`
rename --model_path to --model-path (#151) use same argument convention for mistral/mixtral as for llama convert. 2023-12-21 22:28:57 +08:00			`"--model-path",`
mistral 2023-12-06 03:02:52 +08:00			`type=str,`
Quantize example (#162) * testing quantization * conversion + quantization working * one config processor * quantization in mistral / nits in llama * args for quantization * llama / mistral conversion in good shape * phi2 quantized * mixtral * qwen conversion 2023-12-22 04:59:37 +08:00			`default="mlx_model",`
mistral 2023-12-06 03:02:52 +08:00			`help="The path to the model weights and tokenizer",`
			`)`
			`parser.add_argument(`
			`"--prompt",`
			`help="The message to be processed by the model",`
			`default="In the beginning the Universe was created.",`
			`)`
			`parser.add_argument(`
Quantize example (#162) * testing quantization * conversion + quantization working * one config processor * quantization in mistral / nits in llama * args for quantization * llama / mistral conversion in good shape * phi2 quantized * mixtral * qwen conversion 2023-12-22 04:59:37 +08:00			`"--max-tokens",`
mistral 2023-12-06 03:02:52 +08:00			`"-m",`
			`type=int,`
			`default=100,`
			`help="Maximum number of tokens to generate",`
			`)`
nits 2023-12-06 03:24:30 +08:00			`parser.add_argument(`
			`"--temp",`
			`help="The sampling temperature.",`
			`type=float,`
Quantize example (#162) * testing quantization * conversion + quantization working * one config processor * quantization in mistral / nits in llama * args for quantization * llama / mistral conversion in good shape * phi2 quantized * mixtral * qwen conversion 2023-12-22 04:59:37 +08:00			`default=0.0,`
nits 2023-12-06 03:24:30 +08:00			`)`
Add arg tokens_per_eval for token generation 2023-12-10 02:43:44 +08:00			`parser.add_argument(`
make parameter naming consistent with other examples. (#214) 2024-01-03 00:18:12 +08:00			`"--tokens-per-eval",`
Add arg tokens_per_eval for token generation 2023-12-10 02:43:44 +08:00			`help="The batch size of tokens to generate.",`
			`type=int,`
			`default=10,`
			`)`
mistral 2023-12-06 03:02:52 +08:00			`parser.add_argument("--seed", type=int, default=0, help="The PRNG seed")`

			`args = parser.parse_args()`

			`mx.random.seed(args.seed)`
			`print("[INFO] Loading model from disk.")`
			`model, tokenizer = load_model(args.model_path)`

			`print("[INFO] Starting generation...")`
feat: add mistral tps (#173) * feat: add mistral tps * eval params before timing + format --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-22 23:55:57 +08:00			`tic = time.time()`
mistral 2023-12-06 03:02:52 +08:00			`print(args.prompt, end="", flush=True)`
			`prompt = mx.array(tokenizer.encode(args.prompt))`
			`tokens = []`
feat: add mistral tps (#173) * feat: add mistral tps * eval params before timing + format --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-22 23:55:57 +08:00			`for token, ntoks in zip(generate(prompt, model, args.temp), range(args.max_tokens)):`
mistral 2023-12-06 03:02:52 +08:00			`tokens.append(token)`
feat: add mistral tps (#173) * feat: add mistral tps * eval params before timing + format --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-22 23:55:57 +08:00			`if ntoks == 0:`
			`mx.eval(tokens)`
Fix conversion + inference errors. - Mistral (#176) * Fix conversion + inference errors. * wire rope_theta throuugh to nn.RoPE --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-23 06:10:25 +08:00			`toc = time.time()`
feat: add mistral tps (#173) * feat: add mistral tps * eval params before timing + format --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-22 23:55:57 +08:00			`prompt_tps = prompt.size / (toc - tic)`
			`tic = time.time()`
mistral 2023-12-06 03:02:52 +08:00
Add arg tokens_per_eval for token generation 2023-12-10 02:43:44 +08:00			`if (len(tokens) % args.tokens_per_eval) == 0:`
mistral 2023-12-06 03:02:52 +08:00			`mx.eval(tokens)`
			`s = tokenizer.decode([t.item() for t in tokens])`
			`print(s, end="", flush=True)`
			`tokens = []`

			`mx.eval(tokens)`
			`s = tokenizer.decode([t.item() for t in tokens])`
			`print(s, flush=True)`
			`print("------")`
feat: add mistral tps (#173) * feat: add mistral tps * eval params before timing + format --------- Co-authored-by: Awni Hannun <awni@apple.com> 2023-12-22 23:55:57 +08:00			`generation_tps = ntoks / (time.time() - tic)`
			`print(`
			`f"Tokens per second: prompt {prompt_tps:.3f}, "`
			`f"generation {generation_tps:.3f}"`
			`)`