From 796d5e40e4cce0e0d49d3b3b3c00957b31702fe0 Mon Sep 17 00:00:00 2001
From: Angelos Katharopoulos
Date: Fri, 20 Sep 2024 13:33:45 -0700
Subject: [PATCH] Fix export to gguf (#993)

---
 llms/mlx_lm/gguf.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/llms/mlx_lm/gguf.py b/llms/mlx_lm/gguf.py
index 5d524580..241ac35a 100644
--- a/llms/mlx_lm/gguf.py
+++ b/llms/mlx_lm/gguf.py
@@ -67,7 +67,7 @@ class HfVocab:
     def get_token_type(
         self, token_id: int, token_text: bytes, special_ids: Set[int]
     ) -> TokenType:
-        if re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text.encode("utf-8")):
+        if re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token_text):
            return TokenType.BYTE
         return TokenType.CONTROL if token_id in special_ids else TokenType.NORMAL
 
@@ -77,9 +77,7 @@ class HfVocab:
     def added_tokens(self) -> Iterable[Tuple[bytes, float, TokenType]]:
         for text in self.added_tokens_list:
             if text in self.specials:
-                toktype = self.get_token_type(
-                    self.specials[text], b"", self.special_ids
-                )
+                toktype = self.get_token_type(self.specials[text], "", self.special_ids)
                 score = self.get_token_score(self.specials[text])
             else:
                 toktype = TokenType.USER_DEFINED
@@ -243,15 +241,18 @@ def prepare_metadata(config, vocab):
     metadata["tokenizer.ggml.tokens"] = tokens
     metadata["tokenizer.ggml.scores"] = mx.array(scores, dtype=mx.float32)
     metadata["tokenizer.ggml.token_type"] = mx.array(toktypes, dtype=mx.uint32)
-    metadata["tokenizer.ggml.bos_token_id"] = mx.array(
-        vocab.tokenizer.bos_token_id, dtype=mx.uint32
-    )
-    metadata["tokenizer.ggml.eos_token_id"] = mx.array(
-        vocab.tokenizer.eos_token_id, dtype=mx.uint32
-    )
-    metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
-        vocab.tokenizer.unk_token_id, dtype=mx.uint32
-    )
+    if vocab.tokenizer.bos_token_id is not None:
+        metadata["tokenizer.ggml.bos_token_id"] = mx.array(
+            vocab.tokenizer.bos_token_id, dtype=mx.uint32
+        )
+    if vocab.tokenizer.eos_token_id is not None:
+        metadata["tokenizer.ggml.eos_token_id"] = mx.array(
+            vocab.tokenizer.eos_token_id, dtype=mx.uint32
+        )
+    if vocab.tokenizer.unk_token_id is not None:
+        metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
+            vocab.tokenizer.unk_token_id, dtype=mx.uint32
+        )
     metadata = {k: v for k, v in metadata.items() if v is not None}
     return metadata
 
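
Note (annotation, not part of the patch): the diff addresses two failure
modes. First, `get_token_type` matched a bytes pattern against
`token_text.encode("utf-8")`, but the special-token path passed `b""`,
and `bytes` has no `.encode()`, so export crashed with an
AttributeError; since the token text appears to arrive as `str` despite
the `bytes` annotation, the fix matches a `str` pattern directly.
Second, tokenizers that define no BOS/EOS/UNK token report `None` for
the corresponding `*_token_id`, and `mx.array(None, dtype=mx.uint32)`
raises before the trailing `{k: v ... if v is not None}` filter can
drop the entry. Below is a minimal sketch of that second failure and
the guarded pattern, assuming only `mlx.core`; `unk_token_id = None`
stands in for such a tokenizer:

    import mlx.core as mx

    unk_token_id = None  # e.g. a tokenizer with no <unk> token

    # Pre-patch shape: mx.array(None, ...) raises before the
    # None-filtering dict comprehension ever runs.
    try:
        mx.array(unk_token_id, dtype=mx.uint32)
    except Exception as e:
        print("export would fail:", e)

    # Post-patch shape: skip the key entirely when the id is missing.
    metadata = {}
    if unk_token_id is not None:
        metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
            unk_token_id, dtype=mx.uint32
        )
    print(metadata)  # {} -- key omitted, export proceeds

Guarding at assignment time rather than relying on the final filter is
the simpler choice here, because the crash happens while constructing
the value, not while storing it.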