From 324184d670ec11916a5e92314171d497b312eefe Mon Sep 17 00:00:00 2001
From: Angelos Katharopoulos
Date: Fri, 6 Sep 2024 20:19:27 -0700
Subject: [PATCH 1/5] Fix the cache_prompt (#979)

---
 llms/mlx_lm/cache_prompt.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llms/mlx_lm/cache_prompt.py b/llms/mlx_lm/cache_prompt.py
index fe088118..9829efb4 100644
--- a/llms/mlx_lm/cache_prompt.py
+++ b/llms/mlx_lm/cache_prompt.py
@@ -139,8 +139,8 @@ def main():
     print("Saving...")
     cache_dict = {}
     for i, c in enumerate(cache):
-        cache_dict[f"{i}_keys"] = c.state[0]
-        cache_dict[f"{i}_values"] = c.state[1]
+        cache_dict[f"{i}_keys"] = c.state[0][..., : c.offset, :]
+        cache_dict[f"{i}_values"] = c.state[1][..., : c.offset, :]
     metadata = {}
     metadata["model"] = args.model
     metadata["chat_template"] = tokenizer.chat_template

From c3e3411756e098a3f5f29d988e9221034f1af47c Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Sat, 7 Sep 2024 06:06:15 -0700
Subject: [PATCH 2/5] Update LLM generation docs to use chat template (#973)

* fix docs
* add template to model cards as well
* revert
* version
---
 llms/README.md          | 14 +++++++++++++-
 llms/mlx_lm/_version.py |  2 +-
 llms/mlx_lm/utils.py    | 11 ++++++++++-
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/llms/README.md b/llms/README.md
index 79f26d41..b8e1914d 100644
--- a/llms/README.md
+++ b/llms/README.md
@@ -29,7 +29,14 @@ from mlx_lm import load, generate
 
 model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")
 
-response = generate(model, tokenizer, prompt="hello", verbose=True)
+prompt = "Write a story about Einstein"
+
+messages = [{"role": "user", "content": prompt}]
+prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
+)
+
+response = generate(model, tokenizer, prompt=prompt, verbose=True)
 ```
 
 To see a description of all the arguments you can do:
@@ -79,6 +86,11 @@ model, tokenizer = load(repo)
 
 prompt = "Write a story about Einstein"
 
+messages = [{"role": "user", "content": prompt}]
+prompt = tokenizer.apply_chat_template(
+    messages, tokenize=False, add_generation_prompt=True
+)
+
 for t in stream_generate(model, tokenizer, prompt, max_tokens=512):
     print(t, end="", flush=True)
 print()
diff --git a/llms/mlx_lm/_version.py b/llms/mlx_lm/_version.py
index a2eb9a25..8110c823 100644
--- a/llms/mlx_lm/_version.py
+++ b/llms/mlx_lm/_version.py
@@ -1,3 +1,3 @@
 # Copyright © 2023-2024 Apple Inc.
 
-__version__ = "0.18.1"
+__version__ = "0.18.2"
diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py
index eee28c9c..ad9b3221 100644
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -577,7 +577,16 @@ def upload_to_hub(path: str, upload_repo: str, hf_path: str):
         from mlx_lm import load, generate
 
         model, tokenizer = load("{upload_repo}")
-        response = generate(model, tokenizer, prompt="hello", verbose=True)
+
+        prompt="hello"
+
+        if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
+            messages = [{"role": "user", "content": prompt}]
+            prompt = tokenizer.apply_chat_template(
+                messages, tokenize=False, add_generation_prompt=True
+            )
+
+        response = generate(model, tokenizer, prompt=prompt, verbose=True)
         ```
         """
     )

From 6c2369e4b97f49fb5906ec46033497b39931b25d Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Sat, 7 Sep 2024 14:46:57 -0700
Subject: [PATCH 3/5] Fix bug in upload + docs nit (#981)

* fix bug in upload + docs nit
* nit
---
 llms/mlx_lm/LORA.md  | 30 +++++++-----------------------
 llms/mlx_lm/utils.py |  2 +-
 2 files changed, 8 insertions(+), 24 deletions(-)

diff --git a/llms/mlx_lm/LORA.md b/llms/mlx_lm/LORA.md
index 2e739d0f..2d9a2553 100644
--- a/llms/mlx_lm/LORA.md
+++ b/llms/mlx_lm/LORA.md
@@ -166,44 +166,28 @@ Currently, `*.jsonl` files support three data formats: `chat`,
 `chat`:
 
 ```jsonl
-{
-    "messages": [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant."
-        },
-        {
-            "role": "user",
-            "content": "Hello."
-        },
-        {
-            "role": "assistant",
-            "content": "How can I assistant you today."
-        }
-    ]
-}
+{"messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello."}, {"role": "assistant", "content": "How can I assistant you today."}]}
 ```
 
 `completions`:
 
 ```jsonl
-{
-    "prompt": "What is the capital of France?",
-    "completion": "Paris."
-}
+{"prompt": "What is the capital of France?", "completion": "Paris."}
 ```
 
 `text`:
 
 ```jsonl
-{
-    "text": "This is an example for the model."
-}
+{"text": "This is an example for the model."}
 ```
 
 Note, the format is automatically determined by the dataset. Note also, keys
 in each line not expected by the loader will be ignored.
 
+> [!NOTE]
+> Each example in the datasets must be on a single line. Do not put more than
+> one example per line and do not split an example across multiple lines.
+
 ### Hugging Face Datasets
 
 To use Hugging Face datasets, first install the `datasets` package:
diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py
index ad9b3221..b4a2ea51 100644
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -581,7 +581,7 @@ def upload_to_hub(path: str, upload_repo: str, hf_path: str):
         prompt="hello"
 
         if hasattr(tokenizer, "apply_chat_template") and tokenizer.chat_template is not None:
-            messages = [{"role": "user", "content": prompt}]
+            messages = [{{"role": "user", "content": prompt}}]
             prompt = tokenizer.apply_chat_template(
                 messages, tokenize=False, add_generation_prompt=True
             )

From f530f56df2738a54982c4541189a8c8d7cd94c44 Mon Sep 17 00:00:00 2001
From: Awni Hannun
Date: Tue, 17 Sep 2024 16:22:48 -0700
Subject: [PATCH 4/5] don't use internal exception (#990)

---
 llms/mlx_lm/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py
index b4a2ea51..5621609d 100644
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -14,7 +14,6 @@ from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Type,
 import mlx.core as mx
 import mlx.nn as nn
 from huggingface_hub import snapshot_download
-from huggingface_hub.utils._errors import RepositoryNotFoundError
 from mlx.utils import tree_flatten
 from transformers import PreTrainedTokenizer
 
@@ -91,7 +90,7 @@ def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -> Path
                 ],
             )
         )
-    except RepositoryNotFoundError:
+    except:
         raise ModelNotFoundError(
             f"Model not found for path or HF repo: {path_or_hf_repo}.\n"
             "Please make sure you specified the local path or Hugging Face"

From 796d5e40e4cce0e0d49d3b3b3c00957b31702fe0 Mon Sep 17 00:00:00 2001
From: Angelos Katharopoulos
Date: Fri, 20 Sep 2024 13:33:45 -0700
Subject: [PATCH 5/5] Fix export to gguf (#993)

---
 llms/mlx_lm/gguf.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/llms/mlx_lm/gguf.py b/llms/mlx_lm/gguf.py
index 5d524580..241ac35a 100644
--- a/llms/mlx_lm/gguf.py
+++ b/llms/mlx_lm/gguf.py
@@ -67,7 +67,7 @@ class HfVocab:
     def get_token_type(
         self, token_id: int, token_text: bytes, special_ids: Set[int]
     ) -> TokenType:
-        if re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text.encode("utf-8")):
+        if re.fullmatch(r"<0x[0-9A-Fa-f]{2}>", token_text):
             return TokenType.BYTE
         return TokenType.CONTROL if token_id in special_ids else TokenType.NORMAL
 
@@ -77,9 +77,7 @@ class HfVocab:
     def added_tokens(self) -> Iterable[Tuple[bytes, float, TokenType]]:
         for text in self.added_tokens_list:
             if text in self.specials:
-                toktype = self.get_token_type(
-                    self.specials[text], b"", self.special_ids
-                )
+                toktype = self.get_token_type(self.specials[text], "", self.special_ids)
                 score = self.get_token_score(self.specials[text])
             else:
                 toktype = TokenType.USER_DEFINED
@@ -243,15 +241,18 @@ def prepare_metadata(config, vocab):
     metadata["tokenizer.ggml.tokens"] = tokens
     metadata["tokenizer.ggml.scores"] = mx.array(scores, dtype=mx.float32)
     metadata["tokenizer.ggml.token_type"] = mx.array(toktypes, dtype=mx.uint32)
-    metadata["tokenizer.ggml.bos_token_id"] = mx.array(
-        vocab.tokenizer.bos_token_id, dtype=mx.uint32
-    )
-    metadata["tokenizer.ggml.eos_token_id"] = mx.array(
-        vocab.tokenizer.eos_token_id, dtype=mx.uint32
-    )
-    metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
-        vocab.tokenizer.unk_token_id, dtype=mx.uint32
-    )
+    if vocab.tokenizer.bos_token_id is not None:
+        metadata["tokenizer.ggml.bos_token_id"] = mx.array(
+            vocab.tokenizer.bos_token_id, dtype=mx.uint32
+        )
+    if vocab.tokenizer.eos_token_id is not None:
+        metadata["tokenizer.ggml.eos_token_id"] = mx.array(
+            vocab.tokenizer.eos_token_id, dtype=mx.uint32
+        )
+    if vocab.tokenizer.unk_token_id is not None:
+        metadata["tokenizer.ggml.unknown_token_id"] = mx.array(
+            vocab.tokenizer.unk_token_id, dtype=mx.uint32
+        )
 
     metadata = {k: v for k, v in metadata.items() if v is not None}
     return metadata
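
For context on the last hunk: Hugging Face tokenizers report `bos_token_id`, `eos_token_id`, and `unk_token_id` as `None` when the corresponding special token is not defined, and `mx.array` cannot be constructed from `None`, which is why PATCH 5 adds each GGUF entry only when the id exists. The sketch below restates that guard as a standalone helper; the function name `optional_token_ids` and the `getattr` defaults are illustrative assumptions, not code from `mlx_lm`.

```python
import mlx.core as mx


def optional_token_ids(tokenizer) -> dict:
    # Illustrative helper (not part of mlx_lm): gather GGUF special-token
    # metadata, skipping any id the tokenizer does not define.
    candidates = {
        "tokenizer.ggml.bos_token_id": getattr(tokenizer, "bos_token_id", None),
        "tokenizer.ggml.eos_token_id": getattr(tokenizer, "eos_token_id", None),
        "tokenizer.ggml.unknown_token_id": getattr(tokenizer, "unk_token_id", None),
    }
    # Only ids that exist become uint32 scalars; missing ones are dropped,
    # mirroring the `is not None` checks added in PATCH 5.
    return {
        key: mx.array(token_id, dtype=mx.uint32)
        for key, token_id in candidates.items()
        if token_id is not None
    }
```

With a tokenizer that lacks, say, an unknown token, the returned dict simply omits `tokenizer.ggml.unknown_token_id`, so the later `{k: v for k, v in metadata.items() if v is not None}` filter has nothing left to remove.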