Mirror of https://github.com/ml-explore/mlx-examples.git (synced 2025-06-24 01:17:28 +08:00)
support kimi + more options in chat mode (#1312)
parent b2108a0de6
commit 845cd8c01e
@@ -1,3 +1,3 @@
 # Copyright © 2023-2024 Apple Inc.
 
-__version__ = "0.21.5"
+__version__ = "0.21.6"
@@ -65,12 +65,25 @@ def main():
         tokenizer_config={"trust_remote_code": True},
     )
 
-    print(f"[INFO] Starting chat session with {args.model}. To exit, enter 'q'.")
+    def print_help():
+        print("The command list:")
+        print("- 'q' to exit")
+        print("- 'r' to reset the chat")
+        print("- 'h' to display these commands")
+
+    print(f"[INFO] Starting chat session with {args.model}.")
+    print_help()
     prompt_cache = make_prompt_cache(model, args.max_kv_size)
     while True:
         query = input(">> ")
         if query == "q":
             break
+        if query == "r":
+            prompt_cache = make_prompt_cache(model, args.max_kv_size)
+            continue
+        if query == "h":
+            print_help()
+            continue
         messages = [{"role": "user", "content": query}]
         prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
         for response in stream_generate(
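With this hunk the chat REPL accepts 'r' (reset the prompt cache) and 'h' (show the commands) in addition to 'q'. As a quick smoke test, the updated CLI can be driven non-interactively; a minimal sketch, assuming mlx-lm is installed and using a hypothetical model name:

import subprocess

# Pipe the commands into the chat REPL: show help, reset the prompt cache,
# then quit. The model repo id below is a placeholder, not from this diff.
subprocess.run(
    ["python", "-m", "mlx_lm.chat", "--model", "mlx-community/some-4bit-model"],
    input="h\nr\nq\n",
    text=True,
    check=True,
)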
@@ -181,30 +181,37 @@ class DeepseekV3Attention(nn.Module):
             bias=config.attention_bias,
         )
 
-        mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
-        scaling_factor = self.config.rope_scaling["factor"]
-        if mscale_all_dim:
-            mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
-            self.scale = self.scale * mscale * mscale
+        if self.config.rope_scaling is not None:
+            mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
+            scaling_factor = self.config.rope_scaling["factor"]
+            if mscale_all_dim:
+                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+                self.scale = self.scale * mscale * mscale
 
-        rope_kwargs = {
-            key: self.config.rope_scaling[key]
-            for key in [
-                "original_max_position_embeddings",
-                "beta_fast",
-                "beta_slow",
-                "mscale",
-                "mscale_all_dim",
-            ]
-            if key in self.config.rope_scaling
-        }
-        self.rope = DeepseekV3YarnRotaryEmbedding(
-            dim=self.qk_rope_head_dim,
-            max_position_embeddings=self.max_position_embeddings,
-            scaling_factor=scaling_factor,
-            base=self.rope_theta,
-            **rope_kwargs,
-        )
+            rope_kwargs = {
+                key: self.config.rope_scaling[key]
+                for key in [
+                    "original_max_position_embeddings",
+                    "beta_fast",
+                    "beta_slow",
+                    "mscale",
+                    "mscale_all_dim",
+                ]
+                if key in self.config.rope_scaling
+            }
+            self.rope = DeepseekV3YarnRotaryEmbedding(
+                dim=self.qk_rope_head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                scaling_factor=scaling_factor,
+                base=self.rope_theta,
+                **rope_kwargs,
+            )
+        else:
+            self.rope = nn.RoPE(
+                dims=self.qk_rope_head_dim,
+                base=self.rope_theta,
+                traditional=True,
+            )
 
     def __call__(
         self,
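This hunk makes the YaRN setup in the DeepSeek v3 attention module conditional: models that ship without a rope_scaling config (such as Kimi variants) now fall back to plain nn.RoPE. For reference, yarn_get_mscale follows the standard YaRN attention-scaling rule; the sketch below reproduces the commonly used reference definition for context and is not part of this diff:

import math

def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # No adjustment when the context window is not extended; otherwise the
    # attention scale grows logarithmically with the scaling factor.
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0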
@@ -487,8 +494,12 @@ class Model(nn.Module):
                 ]
                 weights[f"{prefix}.mlp.switch_mlp.{m}.{k}"] = mx.stack(to_join)
 
-        # Remove multi-token prediction layer
-        return {k: v for k, v in weights.items() if not k.startswith("model.layers.61")}
+        # Remove multi-token prediction layer and any unused precomputed rotary freqs
+        return {
+            k: v
+            for k, v in weights.items()
+            if not k.startswith("model.layers.61") and "rotary_emb.inv_freq" not in k
+        }
 
     @property
     def layers(self):
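The effect of the widened filter is easiest to see on a toy weight dictionary; the key names below are hypothetical stand-ins, not keys from an actual checkpoint:

weights = {
    "model.layers.0.self_attn.q_proj.weight": 0,        # kept
    "model.layers.0.self_attn.rotary_emb.inv_freq": 0,  # dropped: precomputed freqs
    "model.layers.61.mlp.gate.weight": 0,               # dropped: MTP layer
}
kept = {
    k: v
    for k, v in weights.items()
    if not k.startswith("model.layers.61") and "rotary_emb.inv_freq" not in k
}
assert list(kept) == ["model.layers.0.self_attn.q_proj.weight"]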
@@ -191,6 +191,7 @@ def get_model_path(path_or_hf_repo: str, revision: Optional[str] = None) -> Path
                     "*.py",
                     "tokenizer.model",
                     "*.tiktoken",
+                    "tiktoken.model",
                     "*.txt",
                     "*.jsonl",
                 ],
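The pattern list above feeds the Hugging Face Hub download in get_model_path; Kimi models ship their tokenizer as tiktoken.model, which is why the extra pattern is needed. A minimal sketch of how such an allow-list is typically consumed, assuming huggingface_hub, with a placeholder repo id and the list abridged to the lines visible in the hunk:

from huggingface_hub import snapshot_download

model_path = snapshot_download(
    repo_id="some-org/some-kimi-model",  # hypothetical placeholder
    allow_patterns=[
        "*.py",
        "tokenizer.model",
        "*.tiktoken",
        "tiktoken.model",  # newly allowed so Kimi tokenizers are fetched
        "*.txt",
        "*.jsonl",
    ],
)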