diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py
index e9b0ec535..e2bc3209d 100644
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -52,8 +52,9 @@ def convert_to_transformers(sd, prefix):
     sd = transformers_convert(sd, prefix, "vision_model.", 32)
     return sd
 
-def load_clipvision_from_sd(sd, prefix):
-    sd = convert_to_transformers(sd, prefix)
+def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
+    if convert_keys:
+        sd = convert_to_transformers(sd, prefix)
     if "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
     else:
diff --git a/comfy/sd.py b/comfy/sd.py
index 64c955311..15caf3603 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1015,7 +1015,7 @@ def load_checkpoint(config_path=None, ckpt_path=None, output_vae=True, output_cl
         class EmptyClass:
             pass
         clip_target = EmptyClass()
-        clip_target.params = clip_config["params"]
+        clip_target.params = clip_config.get("params", {})
         if clip_config["target"].endswith("FrozenOpenCLIPEmbedder"):
             clip_target.clip = sd2_clip.SD2ClipModel
             clip_target.tokenizer = sd2_clip.SD2Tokenizer
@@ -1049,7 +1049,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
 
     if model_config.clip_vision_prefix is not None:
         if output_clipvision:
-            clipvision = clip_vision.load_clipvision_from_sd(sd, model_config.clip_vision_prefix)
+            clipvision = clip_vision.load_clipvision_from_sd(sd, model_config.clip_vision_prefix, True)
 
     model = model_config.get_model(sd)
     model.load_model_weights(sd, "model.diffusion_model.")
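
A minimal usage sketch of the new signature (illustration only, not part of the patch; clip_vision_sd, checkpoint_sd, and vision_prefix are hypothetical placeholders for state dicts and a key prefix obtained elsewhere):

    from comfy import clip_vision

    # State dict already in transformers layout: key conversion is skipped
    # under the new defaults (prefix="", convert_keys=False).
    clip_vision_model = clip_vision.load_clipvision_from_sd(clip_vision_sd)

    # Full checkpoint state dict: convert the prefixed keys first, as
    # load_checkpoint_guess_config now requests by passing True for convert_keys.
    clip_vision_model = clip_vision.load_clipvision_from_sd(checkpoint_sd, vision_prefix, True)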