diff --git a/llms/mlx_lm/_version.py b/llms/mlx_lm/_version.py
index 89e6cd00..839089b6 100644
--- a/llms/mlx_lm/_version.py
+++ b/llms/mlx_lm/_version.py
@@ -1,3 +1,3 @@
 # Copyright © 2023-2024 Apple Inc.
 
-__version__ = "0.21.5"
+__version__ = "0.21.6"
diff --git a/llms/mlx_lm/chat.py b/llms/mlx_lm/chat.py
index e52ad10d..5c0b78db 100644
--- a/llms/mlx_lm/chat.py
+++ b/llms/mlx_lm/chat.py
@@ -65,12 +65,25 @@ def main():
         tokenizer_config={"trust_remote_code": True},
     )
 
-    print(f"[INFO] Starting chat session with {args.model}. To exit, enter 'q'.")
+    def print_help():
+        print("The command list:")
+        print("- 'q' to exit")
+        print("- 'r' to reset the chat")
+        print("- 'h' to display these commands")
+
+    print(f"[INFO] Starting chat session with {args.model}.")
+    print_help()
     prompt_cache = make_prompt_cache(model, args.max_kv_size)
     while True:
         query = input(">> ")
         if query == "q":
             break
+        if query == "r":
+            prompt_cache = make_prompt_cache(model, args.max_kv_size)
+            continue
+        if query == "h":
+            print_help()
+            continue
         messages = [{"role": "user", "content": query}]
         prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
         for response in stream_generate(
diff --git a/llms/mlx_lm/models/deepseek_v3.py b/llms/mlx_lm/models/deepseek_v3.py
index 47e17236..5cd40a0d 100644
--- a/llms/mlx_lm/models/deepseek_v3.py
+++ b/llms/mlx_lm/models/deepseek_v3.py
@@ -181,30 +181,37 @@ class DeepseekV3Attention(nn.Module):
             bias=config.attention_bias,
         )
 
-        mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
-        scaling_factor = self.config.rope_scaling["factor"]
-        if mscale_all_dim:
-            mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
-            self.scale = self.scale * mscale * mscale
+        if self.config.rope_scaling is not None:
+            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
+            scaling_factor = self.config.rope_scaling["factor"]
+            if mscale_all_dim:
+                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+                self.scale = self.scale * mscale * mscale
 
-        rope_kwargs = {
-            key: self.config.rope_scaling[key]
-            for key in [
-                "original_max_position_embeddings",
-                "beta_fast",
-                "beta_slow",
-                "mscale",
-                "mscale_all_dim",
-            ]
-            if key in self.config.rope_scaling
-        }
-        self.rope = DeepseekV3YarnRotaryEmbedding(
-            dim=self.qk_rope_head_dim,
-            max_position_embeddings=self.max_position_embeddings,
-            scaling_factor=scaling_factor,
-            base=self.rope_theta,
-            **rope_kwargs,
-        )
+            rope_kwargs = {
+                key: self.config.rope_scaling[key]
+                for key in [
+                    "original_max_position_embeddings",
+                    "beta_fast",
+                    "beta_slow",
+                    "mscale",
+                    "mscale_all_dim",
+                ]
+                if key in self.config.rope_scaling
+            }
+            self.rope = DeepseekV3YarnRotaryEmbedding(
+                dim=self.qk_rope_head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                scaling_factor=scaling_factor,
+                base=self.rope_theta,
+                **rope_kwargs,
+            )
+        else:
+            self.rope = nn.RoPE(
+                dims=self.qk_rope_head_dim,
+                base=self.rope_theta,
+                traditional=True,
+            )
 
     def __call__(
         self,
@@ -487,8 +494,12 @@ class Model(nn.Module):
                 ]
                 weights[f"{prefix}.mlp.switch_mlp.{m}.{k}"] = mx.stack(to_join)
 
-        # Remove multi-token prediction layer
-        return {k: v for k, v in weights.items() if not k.startswith("model.layers.61")}
+        # Remove multi-token prediction layer and any unused precomputed rotary freqs
+        return {
+            k: v
+            for k, v in weights.items()
+            if not k.startswith("model.layers.61") and "rotary_emb.inv_freq" not in k
+        }
 
     @property
     def layers(self):
diff --git a/llms/mlx_lm/utils.py b/llms/mlx_lm/utils.py
index 7dff0ee3..05fac92f 100644
--- a/llms/mlx_lm/utils.py
+++ b/llms/mlx_lm/utils.py
@@ -191,6 +191,7 @@ def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -> Path
                         "*.py",
                         "tokenizer.model",
                         "*.tiktoken",
+                        "tiktoken.model",
                         "*.txt",
                         "*.jsonl",
                     ],
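
A minimal standalone sketch of the fallback behavior the deepseek_v3.py change introduces: when a checkpoint's config has no rope_scaling entry, the attention layer builds a plain rotary embedding instead of assuming YaRN parameters are present. The select_rope helper and the example values below are hypothetical illustrations, not part of the diff.

import mlx.nn as nn

def select_rope(rope_scaling, qk_rope_head_dim, rope_theta):
    # Mirrors the branch added in DeepseekV3Attention.__init__: no rope_scaling
    # in config.json -> standard (traditional) RoPE with the model's base theta.
    if rope_scaling is None:
        return nn.RoPE(dims=qk_rope_head_dim, base=rope_theta, traditional=True)
    # Otherwise the existing YaRN path (DeepseekV3YarnRotaryEmbedding) is used,
    # unchanged from before; omitted here for brevity.
    return None

rope = select_rope(rope_scaling=None, qk_rope_head_dim=64, rope_theta=10000)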