clip vision base support + small fixes

Yousef Rafat 2025-10-01 23:44:02 +03:00
parent cc3a1389ad
commit 4241f106dc
4 changed files with 26 additions and 4 deletions

View File

@@ -122,9 +122,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd or "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +135,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
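Reviewer note: the added `or "vision_model.encoder.layers.11.layer_norm1.weight" in sd` clause catches 12-layer ViT-B checkpoints, which have no layer 22, and the extracted `norm_weight` lets the 768-wide base model share this branch with the existing 1152-wide SigLIP configs. Below is a minimal standalone sketch of that shape probe; the dummy state dict stands in for a real SigLIP2-base-512 checkpoint, so the tensor shapes are assumptions rather than loaded weights.

import torch

# Dummy state dict standing in for a SigLIP2-base-512 checkpoint (assumed shapes).
sd = {
    # 12-layer ViT-B: a layer 11 exists, a layer 22 does not
    "vision_model.encoder.layers.11.layer_norm1.weight": torch.empty(768),
    "vision_model.encoder.layers.0.layer_norm1.weight": torch.empty(768),
    # SigLIP-style models have no class token, so positions == patch count
    "vision_model.embeddings.position_embedding.weight": torch.empty(1024, 768),
}

embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
# embed_shape == 1024 with norm_weight == 768 routes to clip_vision_siglip2_base_512.json;
# norm_weight == 1152 would route to the existing siglip_384 / siglip_512 configs.
assert embed_shape == 1024 and norm_weight == 768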

View File

@@ -0,0 +1,14 @@
+{
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 768,
+  "image_size": 512,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-06,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 12,
+  "num_channels": 3,
+  "num_hidden_layers": 12,
+  "patch_size": 16,
+  "image_mean": [0.5, 0.5, 0.5],
+  "image_std": [0.5, 0.5, 0.5]
+}
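A quick arithmetic check in plain Python, using values copied from this config, shows why the new file matches the detection branch above:

# SigLIP-style ViTs use no class token, so the number of position embeddings
# equals the patch count, and hidden_size is what the layer_norm1 weights report.
image_size, patch_size, hidden_size = 512, 16, 768
num_positions = (image_size // patch_size) ** 2
assert num_positions == 1024  # matches the embed_shape == 1024 check
assert hidden_size == 768     # matches the norm_weight == 768 check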

View File

@@ -1321,6 +1321,7 @@ class HunyuanFoley(supported_models_base.BASE):
     def process_clip_state_dict(self, state_dict):
         state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
+        state_dict["logit_scale"] = torch.tensor(1.0)
         return state_dict

 class QwenImage(supported_models_base.BASE):
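The one-line `logit_scale` addition presumably keeps the text-encoder load path from stumbling over a key it expects but the source checkpoint does not carry; that reading is an inference, not something stated in the commit. A generic sketch of the pattern, with a made-up state dict:

import torch

def add_default_logit_scale(state_dict):
    # Mirrors the commit: provide a neutral logit_scale so downstream
    # CLIP-style loading finds the key it expects (1.0 taken from the diff above).
    state_dict["logit_scale"] = torch.tensor(1.0)
    return state_dict

sd = {"text_model.embeddings.position_embedding.weight": torch.empty(77, 768)}  # dummy entry
sd = add_default_logit_scale(sd)
assert float(sd["logit_scale"]) == 1.0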

View File

@@ -42,8 +42,12 @@ class HunyuanFoleyConditioning(io.ComfyNode):
         )

     @classmethod
-    def execute(cls, video_encoding_1, video_encoding_2, text_encoding):
-        embeds = torch.cat([video_encoding_1, video_encoding_2, text_encoding], dim = 0)
+    def execute(cls, siglip_encoding_1, synchformer_encoding_2, text_encoding):
+        if isinstance(text_encoding, list):
+            text_encoding = text_encoding[0]
+        embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim = 0)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
         return io.NodeOutput(positive, negative)
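For clarity, a self-contained sketch of what the renamed execute() now does, with made-up feature shapes (the real SigLIP, Synchformer, and text embedding sizes are not shown in this diff):

import torch

siglip_encoding_1 = torch.randn(8, 768)       # hypothetical SigLIP video features
synchformer_encoding_2 = torch.randn(8, 768)  # hypothetical Synchformer features
text_encoding = [torch.randn(4, 768)]         # some text encoders return a list; unwrap it first

if isinstance(text_encoding, list):
    text_encoding = text_encoding[0]

embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim=0)
positive = [[embeds, {}]]                     # ComfyUI conditioning format: [tensor, extra options]
negative = [[torch.zeros_like(embeds), {}]]   # zeroed embeddings as the negative branch
print(embeds.shape)                           # torch.Size([20, 768])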