From 0e026e6a77ad4b514ead832762269280e7ecf98d Mon Sep 17 00:00:00 2001
From: Pawel Kowalski
Date: Wed, 13 Dec 2023 23:36:47 +0100
Subject: [PATCH] moved the weight squeeze to map_unet_weights, style check

---
 stable_diffusion/stable_diffusion/model_io.py | 27 ++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/stable_diffusion/stable_diffusion/model_io.py b/stable_diffusion/stable_diffusion/model_io.py
index f9c9af65..349e5d76 100644
--- a/stable_diffusion/stable_diffusion/model_io.py
+++ b/stable_diffusion/stable_diffusion/model_io.py
@@ -8,7 +8,7 @@ from huggingface_hub import hf_hub_download
 from safetensors import safe_open as safetensor_open
 
 import mlx.core as mx
-from mlx.utils import tree_unflatten, tree_flatten
+from mlx.utils import tree_unflatten
 
 from .clip import CLIPTextModel
 from .config import UNetConfig, CLIPTextModelConfig, AutoencoderConfig, DiffusionConfig
@@ -31,7 +31,7 @@ _MODELS = {
         "tokenizer_vocab": "tokenizer/vocab.json",
         "tokenizer_merges": "tokenizer/merges.txt",
     },
-    "nitrosocke/Ghibli-Diffusion": { 
+    "nitrosocke/Ghibli-Diffusion": {
         "unet_config": "unet/config.json",
         "unet": "unet/diffusion_pytorch_model.safetensors",
         "text_encoder_config": "text_encoder/config.json",
@@ -41,7 +41,7 @@ _MODELS = {
         "diffusion_config": "scheduler/scheduler_config.json",
         "tokenizer_vocab": "tokenizer/vocab.json",
         "tokenizer_merges": "tokenizer/merges.txt",
-    }
+    },
 }
 
 
@@ -87,6 +87,10 @@ def map_unet_weights(key, value):
     if "conv_shortcut.weight" in key:
         value = value.squeeze()
 
+    # Transform the weights from 1x1 convs to linear
+    if len(value.shape) == 4 and ("proj_in" in key or "proj_out" in key):
+        value = value.squeeze()
+
     if len(value.shape) == 4:
         value = value.transpose(0, 2, 3, 1)
 
@@ -165,23 +169,10 @@ def _flatten(params):
     return [(k, v) for p in params for (k, v) in p]
 
 
-def _match_shapes(model, weights):
-    #check whether the safetensor weights have the same shape as the model, if not reshape them
-    weight_shapes = {x[0]:x[1].shape for x in weights if isinstance(x[1], mx.array)}
-    arrays_model_shapes = {x[0]:x[1].shape for x in tree_flatten(model) if isinstance(x[1], mx.array)}
-    mismatched_keys = [k for k in weight_shapes if weight_shapes[k]!= arrays_model_shapes.get(k, weight_shapes[k])]
-    weights_dict = dict(weights)
-    for k in mismatched_keys:
-        weights_dict[k] = weights_dict[k].reshape(arrays_model_shapes[k])
-    weights = list(weights_dict.items())
-    return weights
-
-
 def _load_safetensor_weights(mapper, model, weight_file, float16: bool = False):
     dtype = np.float16 if float16 else np.float32
     with safetensor_open(weight_file, framework="numpy") as f:
         weights = _flatten([mapper(k, f.get_tensor(k).astype(dtype)) for k in f.keys()])
-        weights = _match_shapes(model, weights)
     model.update(tree_unflatten(weights))
 
 
@@ -208,7 +199,9 @@ def load_unet(key: str = _DEFAULT_MODEL, float16: bool = False):
         out_channels=config["out_channels"],
         block_out_channels=config["block_out_channels"],
         layers_per_block=[config["layers_per_block"]] * n_blocks,
-        num_attention_heads=[config["attention_head_dim"]] * n_blocks if isinstance(config["attention_head_dim"], int) else config["attention_head_dim"],
+        num_attention_heads=[config["attention_head_dim"]] * n_blocks
+        if isinstance(config["attention_head_dim"], int)
+        else config["attention_head_dim"],
         cross_attention_dim=[config["cross_attention_dim"]] * n_blocks,
         norm_num_groups=config["norm_num_groups"],
     )
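--
Not part of the patch, just a reviewer-facing sketch of why the new branch in
map_unet_weights is shape-correct. A 1x1 Conv2d kernel stored as
(out_channels, in_channels, 1, 1) carries exactly the parameters of a Linear
weight of shape (out_channels, in_channels), so squeeze() suffices; anything
still 4-D afterwards is a real spatial conv and is instead transposed to
MLX's channels-last layout. The 320-channel shapes below are assumed for
illustration only, not read from an actual checkpoint.

    import numpy as np

    # A proj_in/proj_out style 1x1 convolution kernel (assumed shape).
    conv_w = np.zeros((320, 320, 1, 1), dtype=np.float32)
    # Dropping the singleton spatial dims yields a Linear-compatible weight.
    assert conv_w.squeeze().shape == (320, 320)

    # A genuine 3x3 convolution kernel stays 4-D and is moved to
    # channels-last, mirroring value.transpose(0, 2, 3, 1) in the patch.
    conv3x3 = np.zeros((320, 320, 3, 3), dtype=np.float32)
    assert conv3x3.transpose(0, 2, 3, 1).shape == (320, 3, 3, 320)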