Mirror of https://github.com/comfyanonymous/ComfyUI.git
clip vision base support + small fixes
This commit is contained in:
parent cc3a1389ad
commit 4241f106dc
@@ -122,9 +122,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd or "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +135,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")
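
The updated branch selects the new SigLIP 2 base config from the layer-0 layer-norm width (norm_weight) together with the position-embedding count (embed_shape). A minimal sketch of how one could check which branch a given checkpoint would take, assuming a local .safetensors file whose keys use the same un-prefixed vision_model.* names the function tests; the path is a placeholder, not part of the commit:

from safetensors.torch import load_file

# Placeholder path; any vision checkpoint with un-prefixed vision_model.* keys.
sd = load_file("clip_vision_model.safetensors")

embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]

if norm_weight == 1152:
    print("SigLIP family (embed_shape 729 selects the 384 config)")
elif embed_shape == 1024 and norm_weight == 768:
    print("SigLIP 2 base 512 (the config added by this commit)")
else:
    print(f"other config (embed_shape={embed_shape}, norm_weight={norm_weight})")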
comfy/clip_vision_siglip2_base_512.json (new file, 14 lines)

@@ -0,0 +1,14 @@
+{
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "image_size": 512,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 12,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5]
+}
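
For reference, the embed_shape == 1024 check in the loader matches this config's geometry: SigLIP-style vision towers carry one position embedding per patch (no class token), so a 512-pixel image with 16-pixel patches gives (512 // 16) ** 2 = 1024 positions, while hidden_size 768 is what the layer-norm width check (norm_weight == 768) sees. A tiny sanity check:

# Derived from the config above; "one position embedding per patch" is the usual
# SigLIP vision-tower layout (no class token).
image_size, patch_size, hidden_size = 512, 16, 768
num_positions = (image_size // patch_size) ** 2
assert num_positions == 1024      # matches the embed_shape == 1024 branch
assert hidden_size == 768         # matches the norm_weight == 768 branch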
@@ -1321,6 +1321,7 @@ class HunyuanFoley(supported_models_base.BASE):

     def process_clip_state_dict(self, state_dict):
         state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
+        state_dict["logit_scale"] = torch.tensor(1.0)
         return state_dict

 class QwenImage(supported_models_base.BASE):
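
Roughly, the added line pre-fills a logit_scale entry so the renamed text-encoder state dict also carries that key; the value is just a dummy scalar. A toy illustration of the two steps (this is not ComfyUI's own utils code, only a sketch of the behavior, and the "text_encoder." prefix is a made-up example):

import torch

def prefix_replace(sd, replace_prefix, filter_keys=False):
    # Sketch of a prefix-replace helper: rename keys that start with a listed
    # prefix; with filter_keys=True, drop keys that match no prefix.
    out = {}
    for k, v in sd.items():
        for old, new in replace_prefix.items():
            if k.startswith(old):
                out[new + k[len(old):]] = v
                break
        else:
            if not filter_keys:
                out[k] = v
    return out

sd = {"text_encoder.encoder.block.0.weight": torch.zeros(3), "unrelated.key": torch.zeros(1)}
sd = prefix_replace(sd, {"text_encoder.": "transformer."}, filter_keys=True)
sd["logit_scale"] = torch.tensor(1.0)  # dummy value for the expected key
print(sorted(sd))  # ['logit_scale', 'transformer.encoder.block.0.weight']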
@@ -42,8 +42,12 @@ class HunyuanFoleyConditioning(io.ComfyNode):
         )

     @classmethod
-    def execute(cls, video_encoding_1, video_encoding_2, text_encoding):
-        embeds = torch.cat([video_encoding_1, video_encoding_2, text_encoding], dim = 0)
+    def execute(cls, siglip_encoding_1, synchformer_encoding_2, text_encoding):
+
+        if isinstance(text_encoding, list):
+            text_encoding = text_encoding[0]
+
+        embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim = 0)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
         return io.NodeOutput(positive, negative)
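
A minimal sketch of what the updated execute body produces, with made-up tensor shapes (the commit only establishes that the two video encodings and the text encoding are concatenated along dim 0, and that the text encoding may arrive wrapped in a list):

import torch

# Hypothetical shapes, for illustration only.
siglip_encoding_1 = torch.randn(8, 768)
synchformer_encoding_2 = torch.randn(8, 768)
text_encoding = [torch.randn(4, 768)]            # list-wrapped, handled by the new check

if isinstance(text_encoding, list):
    text_encoding = text_encoding[0]

embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim=0)
positive = [[embeds, {}]]                         # conditioning as [tensor, options] pairs
negative = [[torch.zeros_like(embeds), {}]]
print(embeds.shape)                               # torch.Size([20, 768])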