Skip to content

Commit 371f765

Browse files
authored
[Diffusers -> Original SD conversion] fix things (#6933)
* fix: bias loading bug
* fixes for SDXL
* apply changes to the conversion script to match single_file_utils.py
* do transpose to match the single file loading logic.
1 parent 75aee39 commit 371f765

File tree

3 files changed

+15
-3
lines changed

3 files changed

+15
-3
lines changed

scripts/convert_diffusers_to_original_sdxl.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,10 @@ def convert_unet_state_dict(unet_state_dict):
167167

168168
def reshape_weight_for_sd(w):
169169
# convert HF linear weights to SD conv2d weights
170-
return w.reshape(*w.shape, 1, 1)
170+
if not w.ndim == 1:
171+
return w.reshape(*w.shape, 1, 1)
172+
else:
173+
return w
171174

172175

173176
def convert_vae_state_dict(vae_state_dict):
@@ -321,11 +324,18 @@ def convert_openai_text_enc_state_dict(text_enc_dict):
321324
vae_state_dict = convert_vae_state_dict(vae_state_dict)
322325
vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()}
323326

327+
# Convert text encoder 1
324328
text_enc_dict = convert_openai_text_enc_state_dict(text_enc_dict)
325329
text_enc_dict = {"conditioner.embedders.0.transformer." + k: v for k, v in text_enc_dict.items()}
326330

331+
# Convert text encoder 2
327332
text_enc_2_dict = convert_openclip_text_enc_state_dict(text_enc_2_dict)
328333
text_enc_2_dict = {"conditioner.embedders.1.model." + k: v for k, v in text_enc_2_dict.items()}
334+
# We transpose the text projection weight with `.T.contiguous()` to match what's done in
335+
# https://github.com/huggingface/diffusers/blob/84905ca7287876b925b6bf8e9bb92fec21c78764/src/diffusers/loaders/single_file_utils.py#L1085
336+
text_enc_2_dict["conditioner.embedders.1.model.text_projection"] = text_enc_2_dict.pop(
337+
"conditioner.embedders.1.model.text_projection.weight"
338+
).T.contiguous()
329339

330340
# Put together new checkpoint
331341
state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict, **text_enc_2_dict}

scripts/convert_diffusers_to_original_stable_diffusion.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,10 @@ def convert_unet_state_dict(unet_state_dict):
170170

171171
def reshape_weight_for_sd(w):
172172
# convert HF linear weights to SD conv2d weights
173-
return w.reshape(*w.shape, 1, 1)
173+
if not w.ndim == 1:
174+
return w.reshape(*w.shape, 1, 1)
175+
else:
176+
return w
174177

175178

176179
def convert_vae_state_dict(vae_state_dict):

src/diffusers/loaders/single_file_utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1112,7 +1112,6 @@ def create_text_encoder_from_open_clip_checkpoint(
11121112
text_model_dict[diffusers_key + ".q_proj.bias"] = weight_value[:text_proj_dim]
11131113
text_model_dict[diffusers_key + ".k_proj.bias"] = weight_value[text_proj_dim : text_proj_dim * 2]
11141114
text_model_dict[diffusers_key + ".v_proj.bias"] = weight_value[text_proj_dim * 2 :]
1115-
11161115
else:
11171116
text_model_dict[diffusers_key] = checkpoint[key]
11181117

0 commit comments

Comments
 (0)