Example reading directly from gguf file (#222)

* Draft of tiny llama from gguf * Transpose all * No transposition with new layout * Read config from gguf * Create tokenizer from gguf * move gguf and update to be similar to hf_llm * change model to HF style + updates to README * nits in README * nit readme * only use mlx for metadata * fix eos/bos tokenizer * fix tokenization * quantization runs * 8-bit works * tokenizer fix * bump mlx version --------- Co-authored-by: Juarez Bochi <juarez.bochi@grammarly.com> Co-authored-by: Awni Hannun <awni@apple.com>
2025-12-14 17:28:59 +08:00 · 2024-01-23 18:41:54 -05:00
parent 40b61c1719
commit f5b80c95fb
5 changed files with 536 additions and 0 deletions
--- a/llms/gguf_llm/utils.py
+++ b/llms/gguf_llm/utils.py
@@ -0,0 +1,53 @@
+import sentencepiece as spm
+import sentencepiece.sentencepiece_model_pb2 as model
+
+
+def spm_tokenizer(metadata):
+    tokens = metadata["tokenizer.ggml.tokens"]
+    bos = metadata["tokenizer.ggml.bos_token_id"].item()
+    eos = metadata["tokenizer.ggml.eos_token_id"].item()
+    unk = metadata["tokenizer.ggml.unknown_token_id"].item()
+
+    normalizer_spec = model.NormalizerSpec(
+        name="identity",
+        precompiled_charsmap=b"",
+        add_dummy_prefix=True,
+        remove_extra_whitespaces=False,
+        normalization_rule_tsv=b"",
+    )
+    trainer_spec = model.TrainerSpec(
+        model_type="BPE",
+        vocab_size=len(tokens),
+        input_format="text",
+        split_by_unicode_script=True,
+        split_by_whitespace=True,
+        split_by_number=True,
+        treat_whitespace_as_suffix=False,
+        split_digits=True,
+        allow_whitespace_only_pieces=True,
+        vocabulary_output_piece_score=True,
+        byte_fallback=True,
+        unk_id=unk,
+        bos_id=bos,
+        eos_id=eos,
+        pad_id=-1,
+        unk_piece="<unk>",
+        bos_piece="<s>",
+        eos_piece="</s>",
+        pad_piece="<pad>",
+        pretokenization_delimiter="",
+    )
+    m = model.ModelProto(trainer_spec=trainer_spec, normalizer_spec=normalizer_spec)
+    scores = metadata.get("tokenizer.ggml.scores", None)
+    scores = scores.tolist() if scores is not None else None
+    token_types = metadata.get("tokenizer.ggml.token_type", None)
+    token_types = token_types.tolist() if token_types is not None else None
+
+    for i, token in enumerate(tokens):
+        score = scores[i] if scores else 0
+        token_type = token_types[i] if token_types else 0
+        m.pieces.append(
+            model.ModelProto.SentencePiece(piece=token, score=score, type=token_type)
+        )
+    tokenizer = spm.SentencePieceProcessor(model_proto=m.SerializeToString())
+    return tokenizer