diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py
index 447b1ce4a..4e08b6c08 100644
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -122,9 +122,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd or "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +135,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
diff --git a/comfy/clip_vision_siglip2_base_512.json b/comfy/clip_vision_siglip2_base_512.json
new file mode 100644
index 000000000..4324857e4
--- /dev/null
+++ b/comfy/clip_vision_siglip2_base_512.json
@@ -0,0 +1,14 @@
+{
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "image_size": 512,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 12,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5]
+}
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 5a0bbc68c..1691731a8 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1321,6 +1321,7 @@ class HunyuanFoley(supported_models_base.BASE):
 
     def process_clip_state_dict(self, state_dict):
         state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
+        state_dict["logit_scale"] = torch.tensor(1.0)
         return state_dict
 
 class QwenImage(supported_models_base.BASE):
diff --git a/comfy_extras/nodes_hunyuan_foley.py b/comfy_extras/nodes_hunyuan_foley.py
index 649d79718..70c3d3d4e 100644
--- a/comfy_extras/nodes_hunyuan_foley.py
+++ b/comfy_extras/nodes_hunyuan_foley.py
@@ -42,8 +42,12 @@ class HunyuanFoleyConditioning(io.ComfyNode):
         )
 
     @classmethod
-    def execute(cls, video_encoding_1, video_encoding_2, text_encoding):
-        embeds = torch.cat([video_encoding_1, video_encoding_2, text_encoding], dim = 0)
+    def execute(cls, siglip_encoding_1, synchformer_encoding_2, text_encoding):
+
+        if isinstance(text_encoding, list):
+            text_encoding = text_encoding[0]
+
+        embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim = 0)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
         return io.NodeOutput(positive, negative)
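
Sanity-check sketch (not part of the patch): the snippet below mimics the config-selection logic that this diff adds to load_clipvision_from_sd() and shows which JSON it would pick for a SigLIP2-base-512 style state dict. The tensor shapes are assumptions derived from clip_vision_siglip2_base_512.json (hidden_size 768, 12 layers, image_size 512 with patch_size 16, i.e. (512/16)**2 = 1024 position embeddings); real checkpoint keys and shapes may differ.

# Standalone sketch, not ComfyUI code: reproduces the branch order after the patch.
import torch

# Hypothetical SigLIP2-base-512 state dict (shapes assumed from the new JSON config).
sd = {
    "vision_model.encoder.layers.11.layer_norm1.weight": torch.zeros(768),
    "vision_model.encoder.layers.0.layer_norm1.weight": torch.zeros(768),
    "vision_model.embeddings.position_embedding.weight": torch.zeros(1024, 768),
}

if "vision_model.encoder.layers.22.layer_norm1.weight" in sd or "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
    embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]  # 1024
    norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]   # 768
    if norm_weight == 1152:
        config = "clip_vision_siglip_384.json / clip_vision_siglip_512.json"  # SigLIP v1 branch
    elif embed_shape == 577:
        config = "clip_vision_config_vitl_336(_llava).json"                   # ViT-L 336 branch
    elif embed_shape == 1024 and norm_weight == 768:
        config = "clip_vision_siglip2_base_512.json"                          # branch added by this diff
    else:
        config = "clip_vision_config_vitl.json"
    print(config)  # -> clip_vision_siglip2_base_512.json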