adding OLMoE architecture (#1321)

* initial commit

* update ACKNOWLEDGMENTS.md

* adding olmoe to training

* clean up

* faster generation

* remove sanitize method

* more clean ups

* adding SwitchGLU

* clean up

* a little faster and adding norm_topk_prob (see the routing sketch after this list)

* formatted
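
Two of the bullets above concern the MoE block: SwitchGLU batches the selected experts' feed-forward computation, and norm_topk_prob optionally renormalizes the router's top-k probabilities so they sum to one. Below is a minimal sketch of that routing step, not the commit's exact code; names such as `gate`, `switch_mlp`, and `top_k` are illustrative assumptions:

```python
import mlx.core as mx

def moe_forward(x, gate, switch_mlp, top_k, norm_topk_prob):
    # Router logits -> a probability over all experts, per token.
    probs = mx.softmax(gate(x), axis=-1)
    # Indices of the top-k experts for each token.
    inds = mx.argpartition(-probs, kth=top_k - 1, axis=-1)[..., :top_k]
    scores = mx.take_along_axis(probs, inds, axis=-1)
    if norm_topk_prob:
        # Renormalize the selected probabilities so they sum to one.
        scores = scores / scores.sum(axis=-1, keepdims=True)
    # A SwitchGLU-style layer evaluates only the selected experts.
    y = switch_mlp(x, inds)
    # Combine expert outputs, weighted by the (re)normalized scores.
    return (y * scores[..., None]).sum(axis=-2)
```

Without the renormalization, expert outputs are weighted by raw softmax mass, which shrinks as more probability lands on unselected experts; with it, the k weights always sum to one.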
Author: Gökdeniz Gülmez
Date: 2025-03-05 22:46:06 +01:00
Committed by: GitHub
Parent: e7267d30f8
Commit: 56d2db23e1

3 changed files with 221 additions and 1 deletion


@@ -98,6 +98,7 @@ def linear_to_lora_layers(
         "minicpm",
         "deepseek",
         "olmo2",
+        "olmoe",
         "internlm3",
     ]:
         keys = set(["self_attn.q_proj", "self_attn.v_proj"])
@@ -106,6 +107,8 @@ def linear_to_lora_layers(
         if model.model_type == "qwen2_moe":
            keys.add("mlp.gate")
            keys.add("mlp.shared_expert_gate")
+        if model.model_type == "olmoe":
+            keys.add("mlp.gate")
     elif model.model_type == "gpt_bigcode":
         keys = set(["attn.c_attn"])