adding OLMoE architecture (#1321)

* initial commit

* update ACKNOWLEDGMENTS.md

* adding olmoe to training

* clean up

* faster generation

* remove sanitize method

* more clean ups

* adding SwitchGLU

* clean up

* a little faster and adding norm_topk_prob (see the routing sketch after this list)

* formatted
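
Two of the bullets above concern the MoE block: SwitchGLU batches the selected experts' feed-forward computation, and norm_topk_prob optionally renormalizes the router's top-k probabilities so they sum to one. Below is a minimal sketch of that routing step, not the commit's exact code; names such as `gate`, `switch_mlp`, and `top_k` are illustrative assumptions:

```python
import mlx.core as mx

def moe_forward(x, gate, switch_mlp, top_k, norm_topk_prob):
    # Router logits -> a probability over all experts, per token.
    probs = mx.softmax(gate(x), axis=-1)
    # Indices of the top-k experts for each token.
    inds = mx.argpartition(-probs, kth=top_k - 1, axis=-1)[..., :top_k]
    scores = mx.take_along_axis(probs, inds, axis=-1)
    if norm_topk_prob:
        # Renormalize the selected probabilities so they sum to one.
        scores = scores / scores.sum(axis=-1, keepdims=True)
    # A SwitchGLU-style layer evaluates only the selected experts.
    y = switch_mlp(x, inds)
    # Combine expert outputs, weighted by the (re)normalized scores.
    return (y * scores[..., None]).sum(axis=-2)
```

Without the renormalization, expert outputs are weighted by raw softmax mass, which shrinks as more probability lands on unselected experts; with it, the k weights always sum to one.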
Author: Gökdeniz Gülmez
Date: 2025-03-05 22:46:06 +01:00
Committed by: GitHub
Parent: e7267d30f8
Commit: 56d2db23e1

3 changed files with 221 additions and 1 deletion


@@ -98,6 +98,7 @@ def linear_to_lora_layers(
         "minicpm",
         "deepseek",
         "olmo2",
+        "olmoe",
         "internlm3",
     ]:
         keys = set(["self_attn.q_proj", "self_attn.v_proj"])
@@ -106,6 +107,8 @@ def linear_to_lora_layers(
         if model.model_type == "qwen2_moe":
            keys.add("mlp.gate")
            keys.add("mlp.shared_expert_gate")
+        if model.model_type == "olmoe":
+            keys.add("mlp.gate")
     elif model.model_type == "gpt_bigcode":
         keys = set(["attn.c_attn"])