diff --git a/comfy/clip_vision.py b/comfy/clip_vision.py
index 447b1ce4a..4e08b6c08 100644
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@@ -122,9 +122,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd or "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +135,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
diff --git a/comfy/clip_vision_siglip2_base_512.json b/comfy/clip_vision_siglip2_base_512.json
new file mode 100644
index 000000000..4324857e4
--- /dev/null
+++ b/comfy/clip_vision_siglip2_base_512.json
@@ -0,0 +1,14 @@
+{
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "image_size": 512,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 12,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5]
+}
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 5a0bbc68c..1691731a8 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1321,6 +1321,7 @@ class HunyuanFoley(supported_models_base.BASE):
 
     def process_clip_state_dict(self, state_dict):
         state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
+        state_dict["logit_scale"] = torch.tensor(1.0)
         return state_dict
 
 class QwenImage(supported_models_base.BASE):
diff --git a/comfy_extras/nodes_hunyuan_foley.py b/comfy_extras/nodes_hunyuan_foley.py
index 649d79718..70c3d3d4e 100644
--- a/comfy_extras/nodes_hunyuan_foley.py
+++ b/comfy_extras/nodes_hunyuan_foley.py
@@ -42,8 +42,12 @@ class HunyuanFoleyConditioning(io.ComfyNode):
         )
 
     @classmethod
-    def execute(cls, video_encoding_1, video_encoding_2, text_encoding):
-        embeds = torch.cat([video_encoding_1, video_encoding_2, text_encoding], dim = 0)
+    def execute(cls, siglip_encoding_1, synchformer_encoding_2, text_encoding):
+
+        if isinstance(text_encoding, list):
+            text_encoding = text_encoding[0]
+
+        embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim = 0)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
         return io.NodeOutput(positive, negative)
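
Sanity-check sketch (not part of the patch): the snippet below mimics the config-selection logic that this diff adds to load_clipvision_from_sd() and shows which JSON it would pick for a SigLIP2-base-512 style state dict. The tensor shapes are assumptions derived from clip_vision_siglip2_base_512.json (hidden_size 768, 12 layers, image_size 512 with patch_size 16, i.e. (512/16)**2 = 1024 position embeddings); real checkpoint keys and shapes may differ.

# Standalone sketch, not ComfyUI code: reproduces the branch order after the patch.
import torch

# Hypothetical SigLIP2-base-512 state dict (shapes assumed from the new JSON config).
sd = {
    "vision_model.encoder.layers.11.layer_norm1.weight": torch.zeros(768),
    "vision_model.encoder.layers.0.layer_norm1.weight": torch.zeros(768),
    "vision_model.embeddings.position_embedding.weight": torch.zeros(1024, 768),
}

if "vision_model.encoder.layers.22.layer_norm1.weight" in sd or "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
    embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]  # 1024
    norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]   # 768
    if norm_weight == 1152:
        config = "clip_vision_siglip_384.json / clip_vision_siglip_512.json"  # SigLIP v1 branch
    elif embed_shape == 577:
        config = "clip_vision_config_vitl_336(_llava).json"                   # ViT-L 336 branch
    elif embed_shape == 1024 and norm_weight == 768:
        config = "clip_vision_siglip2_base_512.json"                          # branch added by this diff
    else:
        config = "clip_vision_config_vitl.json"
    print(config)  # -> clip_vision_siglip2_base_512.json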