refactor: merge deepseek coder example into hf_llm example (#234)

* refactor: merge deepseek coder example into hf_llm example

* remove deepseek example

* chore: fix format in readme

* chore: remove default rope_scaling dict and use `get` to access `type` and `factor` to avoid a KeyError

* Update llms/hf_llm/models.py

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>

* chore: fix lint

---------

Co-authored-by: Awni Hannun <awni.hannun@gmail.com>
Anchen, 2024-01-06 07:53:46 -08:00, committed by GitHub
commit 758f05c09a (parent cf0ad26a89)
7 changed files with 22 additions and 527 deletions


@@ -1,49 +0,0 @@
# Deepseek Coder

Deepseek Coder is a family of code-generating language models based on the
Llama architecture.[^1] The models were trained from scratch on a corpus of 2T
tokens, with a composition of 87% code and 13% natural language in both
English and Chinese.

### Setup

Install the dependencies:

```
pip install -r requirements.txt
```

Next, download and convert the model:

```sh
python convert.py --hf-path <path_to_huggingface_model>
```

To generate a 4-bit quantized model, use `-q`. For a full list of options, run:

```
python convert.py --help
```
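
For example, a 4-bit quantized conversion of the default model could be run as
follows; the bit width and group size shown are simply the script's defaults
made explicit:

```sh
python convert.py \
    --hf-path deepseek-ai/deepseek-coder-6.7b-instruct \
    --mlx-path mlx_model \
    -q --q-bits 4 --q-group-size 64
```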
The converter downloads the model from Hugging Face. The default model is
`deepseek-ai/deepseek-coder-6.7b-instruct`. Check out the [Hugging Face
page](https://huggingface.co/deepseek-ai) to see a list of available models.

By default, the conversion script will save the converted `weights.npz`,
tokenizer, and `config.json` in the `mlx_model` directory.

> [!TIP]
> Alternatively, you can also download a few converted checkpoints from
> the [MLX Community](https://huggingface.co/mlx-community) organization on
> Hugging Face and skip the conversion step.

### Run

Once you've converted the weights, you can interact with the Deepseek Coder
model:

```
python deepseek_coder.py --prompt "write a quick sort algorithm in python."
```

[^1]: For more information, see the [blog post](https://deepseekcoder.github.io/) by DeepSeek AI.


@@ -1,159 +0,0 @@
import argparse
import copy
import json
from pathlib import Path
import mlx.core as mx
import mlx.nn as nn
import numpy as np
import torch
from deepseek_coder import DeepseekCoder, ModelArgs
from mlx.utils import tree_flatten, tree_map, tree_unflatten
from transformers import AutoModelForCausalLM, AutoTokenizer

def quantize(weights, config, args):
    quantized_config = copy.deepcopy(config)

    # Load the model:
    model_args = ModelArgs(**config)
    model = DeepseekCoder(model_args)
    weights = tree_map(mx.array, weights)
    model.update(tree_unflatten(list(weights.items())))

    # Quantize the model:
    nn.QuantizedLinear.quantize_module(model, args.q_group_size, args.q_bits)

    # Update the config:
    quantized_config["quantization"] = {
        "group_size": args.q_group_size,
        "bits": args.q_bits,
    }
    quantized_weights = dict(tree_flatten(model.parameters()))

    return quantized_weights, quantized_config


def convert(args):
    hf_path = Path(args.hf_path)

    model = AutoModelForCausalLM.from_pretrained(
        str(hf_path), trust_remote_code=True, torch_dtype=torch.float16
    )
    config = model.config.to_dict()
    state_dict = model.state_dict()
    tokenizer = AutoTokenizer.from_pretrained(
        str(hf_path), trust_remote_code=True, use_fast=False
    )

    # things to change
    # 1. there's no "model." in the weight names
    state_dict = {k.replace("model.", ""): v for k, v in state_dict.items()}

    # 2. mlp is called feed_forward
    state_dict = {k.replace("mlp", "feed_forward"): v for k, v in state_dict.items()}

    # 3. up_proj, down_proj, gate_proj
    state_dict = {k.replace("down_proj", "w2"): v for k, v in state_dict.items()}
    state_dict = {k.replace("up_proj", "w3"): v for k, v in state_dict.items()}
    state_dict = {k.replace("gate_proj", "w1"): v for k, v in state_dict.items()}

    # 4. layernorms
    state_dict = {
        k.replace("input_layernorm", "attention_norm"): v
        for k, v in state_dict.items()
    }
    state_dict = {
        k.replace("post_attention_layernorm", "ffn_norm"): v
        for k, v in state_dict.items()
    }

    # 5. lm head
    state_dict = {k.replace("lm_head", "output"): v for k, v in state_dict.items()}

    # 6. token emb
    state_dict = {
        k.replace("embed_tokens", "tok_embeddings"): v for k, v in state_dict.items()
    }

    # 7. attention
    state_dict = {k.replace("self_attn", "attention"): v for k, v in state_dict.items()}
    state_dict = {k.replace("q_proj", "wq"): v for k, v in state_dict.items()}
    state_dict = {k.replace("k_proj", "wk"): v for k, v in state_dict.items()}
    state_dict = {k.replace("v_proj", "wv"): v for k, v in state_dict.items()}
    state_dict = {k.replace("o_proj", "wo"): v for k, v in state_dict.items()}

    weights = {k: v.numpy() for k, v in state_dict.items()}

    config["rope_scaling_factor"] = (
        config["rope_scaling"]["factor"] if config["rope_scaling"] is not None else 1.0
    )
    keep_keys = set(
        [
            "vocab_size",
            "hidden_size",
            "num_attention_heads",
            "num_key_value_heads",
            "num_hidden_layers",
            "max_position_embeddings",
            "rms_norm_eps",
            "intermediate_size",
            "rope_scaling_factor",
            "rope_theta",
        ]
    )
    for k in list(config.keys()):
        if k not in keep_keys:
            config.pop(k)

    return weights, config, tokenizer


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert Deepseek coder model to npz")
    parser.add_argument(
        "--hf-path",
        help="The huggingface model to be converted",
        default="deepseek-ai/deepseek-coder-6.7b-instruct",
    )
    parser.add_argument(
        "--mlx-path",
        type=str,
        default="mlx_model",
        help="The path to save the MLX model.",
    )
    parser.add_argument(
        "-q",
        "--quantize",
        help="Generate a quantized model.",
        action="store_true",
    )
    parser.add_argument(
        "--q-group-size",
        help="Group size for quantization.",
        type=int,
        default=64,
    )
    parser.add_argument(
        "--q-bits",
        help="Bits per weight for quantization.",
        type=int,
        default=4,
    )
    args = parser.parse_args()

    mlx_path = Path(args.mlx_path)
    mlx_path.mkdir(parents=True, exist_ok=True)

    weights, config, tokenizer = convert(args)
    if args.quantize:
        print("[INFO] Quantizing")
        weights, config = quantize(weights, config, args)

    np.savez(str(mlx_path / "weights.npz"), **weights)
    tokenizer.save_pretrained(mlx_path)
    with open(mlx_path / "config.json", "w") as f:
        config["model_type"] = "deepseek_coder"
        json.dump(config, f, indent=4)


@@ -1,313 +0,0 @@
import argparse
import json
import math
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Tuple
import mlx.core as mx
import mlx.nn as nn
from mlx.utils import tree_unflatten
from transformers import AutoTokenizer

@dataclass
class ModelArgs:
    hidden_size: int = 4096
    num_attention_heads: int = 32
    num_hidden_layers: int = 32
    num_key_value_heads: int = 32
    max_position_embeddings: int = 16384
    rms_norm_eps: float = 1e-6
    intermediate_size: int = 11008
    rope_theta: float = 100000
    rope_scaling_factor: float = 4.0
    vocab_size: int = 32256


class RMSNorm(nn.Module):
    def __init__(self, dims: int, eps: float = 1e-5):
        super().__init__()
        self.weight = mx.ones((dims,))
        self.eps = eps

    def _norm(self, x):
        return x * mx.rsqrt(x.square().mean(-1, keepdims=True) + self.eps)

    def __call__(self, x):
        output = self._norm(x.astype(mx.float32)).astype(x.dtype)
        return self.weight * output


class LinearScalingRoPE(nn.RoPE):
    def __init__(
        self, dims: int, rope_scaling_factor: float = 4.0, base: float = 10000
    ):
        super().__init__(dims)
        self.base = base
        self.rope_scaling_factor = rope_scaling_factor

    def __call__(self, x, offset: int = 0):
        shape = x.shape
        x = mx.reshape(x, (-1, shape[-2], shape[-1]))
        N = x.shape[1] + offset
        costheta, sintheta = LinearScalingRoPE.create_cos_sin_theta(
            N,
            self.dims,
            offset=offset,
            base=self.base,
            rope_scaling_factor=self.rope_scaling_factor,
            dtype=x.dtype,
        )
        rx = self._compute_rope(costheta, sintheta, x)
        return mx.reshape(rx, shape)

    @staticmethod
    def create_cos_sin_theta(
        N: int,
        D: int,
        offset: int = 0,
        base: float = 10000,
        rope_scaling_factor: float = 1.0,
        dtype=mx.float32,
    ):
        D = D // 2
        positions = mx.arange(offset, N, dtype=dtype)
        positions = positions / rope_scaling_factor
        freqs = mx.exp(-mx.arange(0.0, D, dtype=dtype) * (math.log(base) / D))
        theta = mx.reshape(positions, (-1, 1)) * mx.reshape(freqs, (1, -1))
        return mx.cos(theta), mx.sin(theta)


class Attention(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.num_attention_heads: int = args.num_attention_heads
        self.num_key_value_heads: int = args.num_key_value_heads
        self.repeats = self.num_attention_heads // self.num_key_value_heads
        self.head_dim = args.hidden_size // args.num_attention_heads
        self.scale = self.head_dim**-0.5

        self.wq = nn.Linear(
            args.hidden_size, args.num_attention_heads * self.head_dim, bias=False
        )
        self.wk = nn.Linear(
            args.hidden_size, args.num_key_value_heads * self.head_dim, bias=False
        )
        self.wv = nn.Linear(
            args.hidden_size, args.num_key_value_heads * self.head_dim, bias=False
        )
        self.wo = nn.Linear(
            args.num_attention_heads * self.head_dim, args.hidden_size, bias=False
        )
        self.rope = LinearScalingRoPE(
            self.head_dim,
            rope_scaling_factor=args.rope_scaling_factor,
            base=args.rope_theta,
        )

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> mx.array:
        B, L, D = x.shape

        queries, keys, values = self.wq(x), self.wk(x), self.wv(x)

        # Prepare the queries, keys and values for the attention computation
        queries = queries.reshape(B, L, self.num_attention_heads, -1).transpose(
            0, 2, 1, 3
        )
        keys = keys.reshape(B, L, self.num_key_value_heads, -1).transpose(0, 2, 1, 3)
        values = values.reshape(B, L, self.num_key_value_heads, -1).transpose(
            0, 2, 1, 3
        )

        def repeat(a):
            a = mx.concatenate([mx.expand_dims(a, 2)] * self.repeats, axis=2)
            return a.reshape([B, self.num_attention_heads, L, -1])

        keys, values = map(repeat, (keys, values))

        if cache is not None:
            key_cache, value_cache = cache
            queries = self.rope(queries, offset=key_cache.shape[2])
            keys = self.rope(keys, offset=key_cache.shape[2])
            keys = mx.concatenate([key_cache, keys], axis=2)
            values = mx.concatenate([value_cache, values], axis=2)
        else:
            queries = self.rope(queries)
            keys = self.rope(keys)

        scores = (queries * self.scale) @ keys.transpose(0, 1, 3, 2)
        if mask is not None:
            scores += mask
        scores = mx.softmax(scores.astype(mx.float32), axis=-1).astype(scores.dtype)
        output = (scores @ values).transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.wo(output), (keys, values)


class FeedForward(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.w1 = nn.Linear(args.hidden_size, args.intermediate_size, bias=False)
        self.w2 = nn.Linear(args.intermediate_size, args.hidden_size, bias=False)
        self.w3 = nn.Linear(args.hidden_size, args.intermediate_size, bias=False)

    def __call__(self, x) -> mx.array:
        return self.w2(nn.silu(self.w1(x)) * self.w3(x))


class TransformerBlock(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.attention = Attention(args)
        self.feed_forward = FeedForward(args=args)
        self.attention_norm = RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
        self.ffn_norm = RMSNorm(args.hidden_size, eps=args.rms_norm_eps)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Tuple[mx.array, mx.array]] = None,
    ) -> mx.array:
        r, cache = self.attention(self.attention_norm(x), mask, cache)
        h = x + r
        r = self.feed_forward(self.ffn_norm(h))
        out = h + r
        return out, cache


class DeepseekCoder(nn.Module):
    def __init__(self, args: ModelArgs):
        super().__init__()
        self.args = args
        self.vocab_size = args.vocab_size
        self.tok_embeddings = nn.Embedding(args.vocab_size, args.hidden_size)
        self.layers = [
            TransformerBlock(args=args) for _ in range(args.num_hidden_layers)
        ]
        self.norm = RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
        self.output = nn.Linear(args.hidden_size, args.vocab_size, bias=False)

    def __call__(self, x, mask=None, cache=None):
        x = self.tok_embeddings(x)

        mask = None
        T = x.shape[1]
        if T > 1:
            mask = nn.MultiHeadAttention.create_additive_causal_mask(T)
            mask = mask.astype(x.dtype)

        if cache is None:
            cache = [None] * len(self.layers)

        for e, layer in enumerate(self.layers):
            x, cache[e] = layer(x, mask, cache[e])

        x = self.norm(x)
        return self.output(x), cache


def generate(
    prompt: mx.array,
    model: DeepseekCoder,
    temp: float = 0.0,
):
    def sample(logits):
        if temp == 0:
            return mx.argmax(logits, axis=-1)
        else:
            return mx.random.categorical(logits * (1 / temp))

    y = prompt
    cache = None
    while True:
        logits, cache = model(y[None], cache=cache)
        logits = logits[:, -1, :]
        y = sample(logits)
        yield y


def load_model(model_path: str):
    model_path = Path(model_path)
    with open(model_path / "config.json", "r") as f:
        config = json.load(f)
        config.pop("model_type")
        quantization = config.pop("quantization", None)
    model_args = ModelArgs(**config)
    model = DeepseekCoder(model_args)
    weights = mx.load(str(model_path / "weights.npz"))
    if quantization is not None:
        nn.QuantizedLinear.quantize_module(model, **quantization)
    model.update(tree_unflatten(list(weights.items())))
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    return model, tokenizer


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Deepseek coder inference script")
    parser.add_argument(
        "--model-path",
        type=str,
        default="mlx_model",
        help="The path to the mlx model weights, tokenizer, and config",
    )
    parser.add_argument(
        "--prompt",
        help="The message to be processed by the model",
        default="### Instruction: \nwrite a quick sort algorithm in python.\n### Response: \n",
    )
    parser.add_argument(
        "--max-tokens",
        "-m",
        type=int,
        default=100,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--temp",
        help="The sampling temperature.",
        type=float,
        default=0.6,
    )
    parser.add_argument("--seed", type=int, default=0, help="The PRNG seed")
    args = parser.parse_args()

    mx.random.seed(args.seed)

    model, tokenizer = load_model(args.model_path)

    prompt = tokenizer(
        args.prompt,
        return_tensors="np",
        return_attention_mask=False,
    )["input_ids"][0]
    prompt = mx.array(prompt)

    print(args.prompt, end="", flush=True)

    tokens = []
    skip = 0
    for token, _ in zip(
        generate(prompt, model, args.temp),
        range(args.max_tokens),
    ):
        if token == tokenizer.eos_token_id:
            break
        tokens.append(token.item())
        s = tokenizer.decode(tokens)
        print(s[skip:], end="", flush=True)
        skip = len(s)
    print(tokenizer.decode(tokens)[skip:], flush=True)


@@ -1,4 +0,0 @@
torch
mlx
numpy
transformers>=4.35


@@ -45,6 +45,8 @@ Here are a few examples of Hugging Face models which work with this example:
 - [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
 - [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
 - [TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T](https://huggingface.co/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T)
+- [deepseek-ai/deepseek-coder-6.7b-instruct](https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct)
+- [01-ai/Yi-6B-Chat](https://huggingface.co/01-ai/Yi-6B-Chat)

 Most
 [Mistral](https://huggingface.co/models?library=transformers,safetensors&other=mistral&sort=trending)

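After this merge, Deepseek Coder runs through the shared `hf_llm` example rather than
its own scripts. A hypothetical invocation, assuming the example's `generate.py` entry
point and its `--model`/`--prompt` flags, might look like:

```sh
python generate.py \
    --model deepseek-ai/deepseek-coder-6.7b-instruct \
    --prompt "write a quick sort algorithm in python."
```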
llms/hf_llm/models.py

@@ -5,7 +5,7 @@ import inspect
 import json
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Optional, Tuple
+from typing import Dict, Optional, Tuple, Union

 import mlx.core as mx
 import mlx.nn as nn
@@ -26,11 +26,20 @@ class ModelArgs:
     rope_theta: float = 10000
     rope_traditional: bool = False
     model_type: str = None
+    rope_scaling: Optional[Dict[str, Union[float, str]]] = None

     def __post_init__(self):
         if self.num_key_value_heads is None:
             self.num_key_value_heads = self.num_attention_heads

+        if self.rope_scaling:
+            required_keys = {"factor", "type"}
+            if not all(key in self.rope_scaling for key in required_keys):
+                raise ValueError(f"rope_scaling must contain keys {required_keys}")
+
+            if self.rope_scaling["type"] != "linear":
+                raise ValueError("rope_scaling 'type' currently only supports 'linear'")
+
     @classmethod
     def from_dict(cls, params):
         return cls(
@@ -73,8 +82,16 @@ class Attention(nn.Module):
         self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
         self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=False)
         self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=False)
+        rope_scale = (
+            1 / args.rope_scaling["factor"]
+            if args.rope_scaling is not None and args.rope_scaling["type"] == "linear"
+            else 1
+        )
         self.rope = nn.RoPE(
-            head_dim, traditional=args.rope_traditional, base=args.rope_theta
+            head_dim,
+            traditional=args.rope_traditional,
+            base=args.rope_theta,
+            scale=rope_scale,
         )

     def __call__(

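The `scale` passed to `nn.RoPE` above plays the same role as the position division in
the removed `LinearScalingRoPE`: with linear scaling, positions are effectively
multiplied by `1 / factor` before the rotary angles are computed. Below is a minimal,
standalone sketch of that relationship; the helper names and the example config are
illustrative, not part of the example's API:

```python
import math


def rope_scale(rope_scaling):
    # Mirrors the hunk above: 1/factor for linear scaling, otherwise 1.
    if rope_scaling is not None and rope_scaling.get("type") == "linear":
        return 1 / rope_scaling["factor"]
    return 1


def rope_angle(position, pair_index, dims, base=10000.0, scale=1.0):
    # Rotary angle for one (position, frequency-pair) combination,
    # with an optional linear position scale applied.
    freq = base ** (-2 * pair_index / dims)
    return (position * scale) * freq


rope_scaling = {"type": "linear", "factor": 4.0}
scale = rope_scale(rope_scaling)  # 0.25

# Scaling positions by 1/factor is equivalent to dividing them by the factor,
# which is what the deleted LinearScalingRoPE.create_cos_sin_theta did.
assert math.isclose(
    rope_angle(100, 3, 128, scale=scale),
    rope_angle(100 / rope_scaling["factor"], 3, 128),
)
```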

@@ -1,3 +1,4 @@
 mlx>=0.0.7
 numpy
 transformers
+protobuf