Mirror of https://github.com/comfyanonymous/ComfyUI.git
Commit 4241f106dc: clip vision base support + small fixes
Parent: cc3a1389ad
@@ -122,9 +122,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_g.json")
     elif "vision_model.encoder.layers.30.layer_norm1.weight" in sd:
         json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_h.json")
-    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
+    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd or "vision_model.encoder.layers.11.layer_norm1.weight" in sd:
         embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
-        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
+        norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
+        if norm_weight == 1152:
             if embed_shape == 729:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
             elif embed_shape == 1024:
@@ -134,6 +135,8 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
             else:
                 json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336.json")
+        elif embed_shape == 1024 and norm_weight == 768:
+            json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_512.json")
         else:
             json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl.json")

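The new branch keys off two shapes in the vision state dict: the encoder hidden size (norm_weight) and the number of position embeddings (embed_shape). A minimal standalone sketch of that dispatch, reduced to the branches this commit touches (helper name and the trimmed branch set are illustrative only):

```python
import torch

def guess_clip_vision_config(sd):
    # Sketch of the shape-based dispatch in load_clipvision_from_sd after this commit.
    embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
    norm_weight = sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0]
    if norm_weight == 1152 and embed_shape == 729:
        return "clip_vision_siglip_384.json"
    if norm_weight == 768 and embed_shape == 1024:
        # 512px image / 16px patches -> 32 * 32 = 1024 patch positions (new branch)
        return "clip_vision_siglip2_base_512.json"
    return "clip_vision_config_vitl.json"  # fallback, as in the existing chain

# fake a SigLIP2-base-512 state dict with just the two shape-bearing tensors
sd = {
    "vision_model.embeddings.position_embedding.weight": torch.zeros(1024, 768),
    "vision_model.encoder.layers.0.layer_norm1.weight": torch.zeros(768),
}
print(guess_clip_vision_config(sd))  # clip_vision_siglip2_base_512.json
```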
comfy/clip_vision_siglip2_base_512.json (new file, 14 lines)
@@ -0,0 +1,14 @@
+{
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 768,
+    "image_size": 512,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-06,
+    "model_type": "siglip_vision_model",
+    "num_attention_heads": 12,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "patch_size": 16,
+    "image_mean": [0.5, 0.5, 0.5],
+    "image_std": [0.5, 0.5, 0.5]
+}
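The values in this config determine the vision token count and the input normalization. A quick sanity check in plain Python (run from the repository root; independent of ComfyUI's own loader):

```python
import json

with open("comfy/clip_vision_siglip2_base_512.json") as f:
    cfg = json.load(f)

# patch tokens produced by the encoder: (image_size / patch_size) ** 2
num_positions = (cfg["image_size"] // cfg["patch_size"]) ** 2
assert num_positions == 1024  # the embed_shape the loader now matches for this config

# pixels are normalized per channel with mean/std 0.5, i.e. mapped from [0, 1] to [-1, 1]
norm = lambda p: (p - cfg["image_mean"][0]) / cfg["image_std"][0]
print(num_positions, norm(0.0), norm(1.0))  # 1024 -1.0 1.0
```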
@@ -1321,6 +1321,7 @@ class HunyuanFoley(supported_models_base.BASE):

     def process_clip_state_dict(self, state_dict):
         state_dict = utils.state_dict_prefix_replace(state_dict, {k: "transformer." for k in self.text_encoder_key_prefix}, filter_keys=True)
+        state_dict["logit_scale"] = torch.tensor(1.0)
         return state_dict

 class QwenImage(supported_models_base.BASE):
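The added line supplies a logit_scale entry, presumably to satisfy a key the CLIP-style text encoder loader expects but that this checkpoint does not carry. A toy illustration of what this method does to a state dict; the prefix_replace helper below is a simplified stand-in for utils.state_dict_prefix_replace with filter_keys=True, and the example prefix is illustrative, not HunyuanFoley's actual text_encoder_key_prefix:

```python
import torch

def prefix_replace(state_dict, replace_prefix):
    # keep only keys under one of the given prefixes, rewriting the prefix
    out = {}
    for old, new in replace_prefix.items():
        for k, v in state_dict.items():
            if k.startswith(old):
                out[new + k[len(old):]] = v
    return out

sd = {"text_encoders.clip_l.text_model.embeddings.token_embedding.weight": torch.zeros(2, 2),
      "model.diffusion_model.some.weight": torch.zeros(1)}

sd = prefix_replace(sd, {"text_encoders.clip_l.": "transformer."})
sd["logit_scale"] = torch.tensor(1.0)  # checkpoint lacks it, so supply a neutral value
print(list(sd.keys()))
# ['transformer.text_model.embeddings.token_embedding.weight', 'logit_scale']
```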
@@ -42,8 +42,12 @@ class HunyuanFoleyConditioning(io.ComfyNode):
         )

     @classmethod
-    def execute(cls, video_encoding_1, video_encoding_2, text_encoding):
-        embeds = torch.cat([video_encoding_1, video_encoding_2, text_encoding], dim = 0)
+    def execute(cls, siglip_encoding_1, synchformer_encoding_2, text_encoding):
+        if isinstance(text_encoding, list):
+            text_encoding = text_encoding[0]
+
+        embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim = 0)
         positive = [[embeds, {}]]
         negative = [[torch.zeros_like(embeds), {}]]
         return io.NodeOutput(positive, negative)
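The rename makes clear which encoder produces each input, and the new guard unwraps a list-wrapped text encoding before concatenation. A self-contained sketch of that behaviour with toy tensors (shapes are illustrative only):

```python
import torch

def build_conditioning(siglip_encoding_1, synchformer_encoding_2, text_encoding):
    # the text encoding sometimes arrives wrapped in a list; unwrap the tensor first
    if isinstance(text_encoding, list):
        text_encoding = text_encoding[0]
    embeds = torch.cat([siglip_encoding_1, synchformer_encoding_2, text_encoding], dim=0)
    positive = [[embeds, {}]]
    negative = [[torch.zeros_like(embeds), {}]]
    return positive, negative

# toy shapes: inputs must agree on every dim except dim 0 for the concat to work
pos, neg = build_conditioning(torch.randn(8, 768), torch.randn(4, 768), [torch.randn(2, 768)])
print(pos[0][0].shape, neg[0][0].shape)  # torch.Size([14, 768]) torch.Size([14, 768])
```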