clean up conversion to use a single qkv matrix

2025-09-01 12:49:50 +08:00 · 2023-12-14 09:19:44 -08:00
parent 45c1800fc6
commit fa9e34b041
5 changed files with 11 additions and 57 deletions
--- a/phi2/convert.py
+++ b/phi2/convert.py
@@ -1,34 +1,5 @@
 from transformers import AutoModelForCausalLM
-
-import numpy
-
-
-def split_attention_matrix(state_dict, key) -> dict:
-    # "transformer.h.0.mixer"
-    _, model_dim = state_dict[key + ".weight"].shape
-    # (3 * model_dim, model_dim)
-    Wqkv_weight_key = key + ".weight"
-    Wq_weight = state_dict[Wqkv_weight_key][:model_dim, :]
-    Wk_weight = state_dict[Wqkv_weight_key][model_dim : 2 * model_dim, :]
-    Wv_weight = state_dict[Wqkv_weight_key][2 * model_dim :, :]
-
-    # (3 * model_dim)
-    Wqkv_bias_key = key + ".bias"
-    Wq_bias = state_dict[Wqkv_bias_key][:model_dim]
-    Wk_bias = state_dict[Wqkv_bias_key][model_dim : 2 * model_dim]
-    Wv_bias = state_dict[Wqkv_bias_key][2 * model_dim :]
-
-    out_key = key.replace("mixer.Wqkv", "self_attention")
-
-    return {
-        out_key + ".query_proj.weight": Wq_weight,
-        out_key + ".query_proj.bias": Wq_bias,
-        out_key + ".key_proj.weight": Wk_weight,
-        out_key + ".key_proj.bias": Wk_bias,
-        out_key + ".value_proj.weight": Wv_weight,
-        out_key + ".value_proj.bias": Wv_bias,
-    }
-
+import numpy as np

 def replace_key(key: str) -> str:
    if "wte.weight" in key:
@@ -36,10 +7,6 @@ def replace_key(key: str) -> str:

    if ".mlp" in key:
        key = key.replace(".mlp", "")
-
-    if ".mixer.out_proj" in key:
-        key = key.replace(".mixer", ".self_attention")
-
    return key


@@ -48,19 +15,8 @@ def convert():
        "microsoft/phi-2", torch_dtype="auto", trust_remote_code=True
    )
    state_dict = model.state_dict()
-    keys = list(state_dict.keys())
-
-    for key in keys:
-        if ".mixer.Wqkv.weight" not in key:
-            continue
-        key_stub = key.rstrip(".weight")
-        state_dict.update(split_attention_matrix(state_dict, key_stub))
-
-        del state_dict[key_stub + ".weight"]
-        del state_dict[key_stub + ".bias"]
-
    weights = {replace_key(k): v.numpy() for k, v in state_dict.items()}
-    numpy.savez("weights.npz", **weights)
+    np.savez("weights.npz", **weights)


 if __name__ == "__main__":