Fix export to gguf (#993)

This commit is contained in:
Angelos Katharopoulos 2024-09-20 13:33:45 -07:00 committed by GitHub
parent f530f56df2
commit 796d5e40e4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -67,7 +67,7 @@ class HfVocab:
     def get_token_type(
         self, token_id: int, token_text: bytes, special_ids: Set[int]
     ) -> TokenType:
-        if re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text.encode("utf-8")):
+        if re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token_text):
             return TokenType.BYTE
         return TokenType.CONTROL if token_id in special_ids else TokenType.NORMAL
@@ -77,9 +77,7 @@ class HfVocab:
     def added_tokens(self) -> Iterable[Tuple[bytes, float, TokenType]]:
         for text in self.added_tokens_list:
             if text in self.specials:
-                toktype = self.get_token_type(
-                    self.specials[text], b"", self.special_ids
-                )
+                toktype = self.get_token_type(self.specials[text], "", self.special_ids)
                 score = self.get_token_score(self.specials[text])
             else:
                 toktype = TokenType.USER_DEFINED
@@ -243,15 +241,18 @@ def prepare_metadata(config, vocab):
     metadata["tokenizer.ggml.tokens"] = tokens
     metadata["tokenizer.ggml.scores"] = mx.array(scores, dtype=mx.float32)
     metadata["tokenizer.ggml.token_type"] = mx.array(toktypes, dtype=mx.uint32)
-    metadata["tokenizer.ggml.bos_token_id"] = mx.array(
-        vocab.tokenizer.bos_token_id, dtype=mx.uint32
-    )
-    metadata["tokenizer.ggml.eos_token_id"] = mx.array(
-        vocab.tokenizer.eos_token_id, dtype=mx.uint32
-    )
-    metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
-        vocab.tokenizer.unk_token_id, dtype=mx.uint32
-    )
+    if vocab.tokenizer.bos_token_id is not None:
+        metadata["tokenizer.ggml.bos_token_id"] = mx.array(
+            vocab.tokenizer.bos_token_id, dtype=mx.uint32
+        )
+    if vocab.tokenizer.eos_token_id is not None:
+        metadata["tokenizer.ggml.eos_token_id"] = mx.array(
+            vocab.tokenizer.eos_token_id, dtype=mx.uint32
+        )
+    if vocab.tokenizer.unk_token_id is not None:
+        metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
+            vocab.tokenizer.unk_token_id, dtype=mx.uint32
+        )
     metadata = {k: v for k, v in metadata.items() if v is not None}
     return metadata