clip vision base support + small fixes

Yousef Rafat 2025-10-01 23:44:02 +03:00
parent cc3a1389ad
commit 4241f106dc
4 changed files with 26 additions and 4 deletions

View File

@@ -122,9 +122,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd or "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +135,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
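Reviewer note: the added `or "vision_model.encoder.layers.11.layer_norm1.weight" in sd` clause catches 12-layer ViT-B checkpoints, which have no layer 22, and the extracted `norm_weight` lets the 768-wide base model share this branch with the existing 1152-wide SigLIP configs. Below is a minimal standalone sketch of that shape probe; the dummy state dict stands in for a real SigLIP2-base-512 checkpoint, so the tensor shapes are assumptions rather than loaded weights.

import torch

# Dummy state dict standing in for a SigLIP2-base-512 checkpoint (assumed shapes).
sd = {
    # 12-layer ViT-B: a layer 11 exists, a layer 22 does not
    "vision_model.encoder.layers.11.layer_norm1.weight": torch.empty(768),
    "vision_model.encoder.layers.0.layer_norm1.weight": torch.empty(768),
    # SigLIP-style models have no class token, so positions == patch count
    "vision_model.embeddings.position_embedding.weight": torch.empty(1024, 768),
}

embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
# embed_shape == 1024 with norm_weight == 768 routes to clip_vision_siglip2_base_512.json;
# norm_weight == 1152 would route to the existing siglip_384 / siglip_512 configs.
assert embed_shape == 1024 and norm_weight == 768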

View File

@@ -0,0 +1,14 @@
+{
+  "hidden_act": "gelu_pytorch_tanh",
+  "hidden_size": 768,
+  "image_size": 512,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-06,
+  "model_type": "siglip_vision_model",
+  "num_attention_heads": 12,
+  "num_channels": 3,
+  "num_hidden_layers": 12,
+  "patch_size": 16,
+  "image_mean": [0.5, 0.5, 0.5],
+  "image_std": [0.5, 0.5, 0.5]
+}
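A quick arithmetic check in plain Python, using values copied from this config, shows why the new file matches the detection branch above:

# SigLIP-style ViTs use no class token, so the number of position embeddings
# equals the patch count, and hidden_size is what the layer_norm1 weights report.
image_size, patch_size, hidden_size = 512, 16, 768
num_positions = (image_size // patch_size) ** 2
assert num_positions == 1024  # matches the embed_shape == 1024 check
assert hidden_size == 768     # matches the norm_weight == 768 check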

View File

@@ -1321,6 +1321,7 @@ class HunyuanFoley(supported_models_base.BASE):
     def process_clip_state_dict(self, state_dict):
         state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
+        state_dict["logit_scale"] = torch.tensor(1.0)
         return state_dict

 class QwenImage(supported_models_base.BASE):
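The one-line `logit_scale` addition presumably keeps the text-encoder load path from stumbling over a key it expects but the source checkpoint does not carry; that reading is an inference, not something stated in the commit. A generic sketch of the pattern, with a made-up state dict:

import torch

def add_default_logit_scale(state_dict):
    # Mirrors the commit: provide a neutral logit_scale so downstream
    # CLIP-style loading finds the key it expects (1.0 taken from the diff above).
    state_dict["logit_scale"] = torch.tensor(1.0)
    return state_dict

sd = {"text_model.embeddings.position_embedding.weight": torch.empty(77, 768)}  # dummy entry
sd = add_default_logit_scale(sd)
assert float(sd["logit_scale"]) == 1.0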

View File

@@ -42,8 +42,12 @@ class HunyuanFoleyConditioning(io.ComfyNode):
         )

     @classmethod
-    def execute(cls, video_encoding_1, video_encoding_2, text_encoding):
-        embeds = torch.cat([video_encoding_1, video_encoding_2, text_encoding], dim = 0)
+    def execute(cls, siglip_encoding_1, synchformer_encoding_2, text_encoding):
+        if isinstance(text_encoding, list):
+            text_encoding = text_encoding[0]
+        embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim = 0)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
         return io.NodeOutput(positive, negative)
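For clarity, a self-contained sketch of what the renamed execute() now does, with made-up feature shapes (the real SigLIP, Synchformer, and text embedding sizes are not shown in this diff):

import torch

siglip_encoding_1 = torch.randn(8, 768)       # hypothetical SigLIP video features
synchformer_encoding_2 = torch.randn(8, 768)  # hypothetical Synchformer features
text_encoding = [torch.randn(4, 768)]         # some text encoders return a list; unwrap it first

if isinstance(text_encoding, list):
    text_encoding = text_encoding[0]

embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim=0)
positive = [[embeds, {}]]                     # ComfyUI conditioning format: [tensor, extra options]
negative = [[torch.zeros_like(embeds), {}]]   # zeroed embeddings as the negative branch
print(embeds.shape)                           # torch.Size([20, 768])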