Merge branch 'master' into v3-improvements
This commit is contained in:
commit a54faaa480
@@ -119,6 +119,9 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly changes
 1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
    - Releases a new stable version (e.g., v0.7.0) roughly every week.
+   - Starting from v0.4.0 patch versions will be used for fixes backported onto the current stable release.
+   - Minor versions will be used for releases off the master branch.
+   - Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
    - Commits outside of the stable release tags may be very unstable and break many custom nodes.
    - Serves as the foundation for the desktop release
@@ -143,7 +143,7 @@ class IndexListContextHandler(ContextHandlerABC):
             # if multiple conds, split based on primary region
             if self.split_conds_to_windows and len(cond_in) > 1:
                 region = window.get_region_index(len(cond_in))
-                logging.info(f"Splitting conds to windows; using region {region} for window {window[0]}-{window[-1]} with center ratio {window.center_ratio:.3f}")
+                logging.info(f"Splitting conds to windows; using region {region} for window {window.index_list[0]}-{window.index_list[-1]} with center ratio {window.center_ratio:.3f}")
                 cond_in = [cond_in[region]]
         # cond object is a list containing a dict - outer list is irrelevant, so just loop through it
         for actual_cond in cond_in:
@@ -625,7 +625,7 @@ class NextDiT(nn.Module):
         if pooled is not None:
             pooled = self.clip_text_pooled_proj(pooled)
         else:
-            pooled = torch.zeros((1, self.clip_text_dim), device=x.device, dtype=x.dtype)
+            pooled = torch.zeros((x.shape[0], self.clip_text_dim), device=x.device, dtype=x.dtype)

         adaln_input = self.time_text_embed(torch.cat((t, pooled), dim=-1))
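Note on why the batch-size change matters (illustrative sketch, not part of the diff): torch.cat requires all non-concatenated dimensions to match, so a (1, clip_text_dim) placeholder breaks as soon as x carries more than one sample.

import torch

t = torch.zeros((4, 256))           # timestep embedding for a batch of 4
pooled_old = torch.zeros((1, 768))  # old placeholder: batch dimension hardcoded to 1
pooled_new = torch.zeros((4, 768))  # new placeholder: follows x.shape[0]

print(torch.cat((t, pooled_new), dim=-1).shape)  # torch.Size([4, 1024])
# torch.cat((t, pooled_old), dim=-1) raises RuntimeError: sizes must match except in the cat dimension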
@@ -1110,7 +1110,7 @@ class Lumina2(BaseModel):
         if 'num_tokens' not in out:
             out['num_tokens'] = comfy.conds.CONDConstant(cross_attn.shape[1])

-        clip_text_pooled = kwargs["pooled_output"] # Newbie
+        clip_text_pooled = kwargs.get("pooled_output", None) # NewBie
         if clip_text_pooled is not None:
             out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)
@@ -430,8 +430,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["rope_theta"] = 10000.0
         dit_config["ffn_dim_multiplier"] = 4.0
         ctd_weight = state_dict.get('{}clip_text_pooled_proj.0.weight'.format(key_prefix), None)
-        if ctd_weight is not None:
+        if ctd_weight is not None: # NewBie
             dit_config["clip_text_dim"] = ctd_weight.shape[0]
+            # NewBie also sets axes_lens = [1024, 512, 512] but it's not used in ComfyUI
     elif dit_config["dim"] == 3840: # Z image
         dit_config["n_heads"] = 30
         dit_config["n_kv_heads"] = 30
@@ -984,9 +984,6 @@ class CFGGuider:
         self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options)
         device = self.model_patcher.load_device

-        if denoise_mask is not None:
-            denoise_mask = comfy.sampler_helpers.prepare_mask(denoise_mask, noise.shape, device)
-
         noise = noise.to(device)
         latent_image = latent_image.to(device)
         sigmas = sigmas.to(device)
@@ -1013,6 +1010,24 @@ class CFGGuider:
         else:
             latent_shapes = [latent_image.shape]

+        if denoise_mask is not None:
+            if denoise_mask.is_nested:
+                denoise_masks = denoise_mask.unbind()
+                denoise_masks = denoise_masks[:len(latent_shapes)]
+            else:
+                denoise_masks = [denoise_mask]
+
+            for i in range(len(denoise_masks), len(latent_shapes)):
+                denoise_masks.append(torch.ones(latent_shapes[i]))
+
+            for i in range(len(denoise_masks)):
+                denoise_masks[i] = comfy.sampler_helpers.prepare_mask(denoise_masks[i], latent_shapes[i], self.model_patcher.load_device)
+
+            if len(denoise_masks) > 1:
+                denoise_mask, _ = comfy.utils.pack_latents(denoise_masks)
+            else:
+                denoise_mask = denoise_masks[0]
+
         self.conds = {}
         for k in self.original_conds:
             self.conds[k] = list(map(lambda a: a.copy(), self.original_conds[k]))
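A rough sketch of the new mask path above (pad_and_pack is a stand-in for comfy.utils.pack_latents, whose exact layout isn't shown in this diff): masks missing for some latents are filled with all-ones, each mask is prepared against its own latent shape, and multiple masks are flattened into one packed tensor.

import torch

def pad_and_pack(masks, latent_shapes):
    # Stand-in for comfy.utils.pack_latents: broadcast each mask to its
    # latent shape, then flatten and concatenate into a single tensor.
    for i in range(len(masks), len(latent_shapes)):
        masks.append(torch.ones(latent_shapes[i]))  # missing masks -> denoise everywhere
    packed = [m.expand(s).reshape(-1) for m, s in zip(masks, latent_shapes)]
    return torch.cat(packed)

shapes = [(1, 4, 8, 8), (1, 4, 16, 16)]
print(pad_and_pack([torch.zeros((1, 1, 8, 8))], shapes).shape)  # torch.Size([1280])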
comfy/sd.py | 20
@@ -55,6 +55,8 @@ import comfy.text_encoders.hunyuan_image
 import comfy.text_encoders.z_image
 import comfy.text_encoders.ovis
 import comfy.text_encoders.kandinsky5
+import comfy.text_encoders.jina_clip_2
+import comfy.text_encoders.newbie

 import comfy.model_patcher
 import comfy.lora
@@ -1008,6 +1010,7 @@ class CLIPType(Enum):
     OVIS = 21
     KANDINSKY5 = 22
     KANDINSKY5_IMAGE = 23
+    NEWBIE = 24


 def load_clip(ckpt_paths, embedding_directory=None, clip_type=CLIPType.STABLE_DIFFUSION, model_options={}):
@@ -1038,6 +1041,7 @@ class TEModel(Enum):
     MISTRAL3_24B_PRUNED_FLUX2 = 15
     QWEN3_4B = 16
     QWEN3_2B = 17
+    JINA_CLIP_2 = 18


 def detect_te_model(sd):
@@ -1047,6 +1051,8 @@ def detect_te_model(sd):
         return TEModel.CLIP_H
     if "text_model.encoder.layers.0.mlp.fc1.weight" in sd:
         return TEModel.CLIP_L
+    if "model.encoder.layers.0.mixer.Wqkv.weight" in sd:
+        return TEModel.JINA_CLIP_2
     if "encoder.block.23.layer.1.DenseReluDense.wi_1.weight" in sd:
         weight = sd["encoder.block.23.layer.1.DenseReluDense.wi_1.weight"]
         if weight.shape[-1] == 4096:
@@ -1207,6 +1213,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         elif te_model == TEModel.QWEN3_2B:
             clip_target.clip = comfy.text_encoders.ovis.te(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.ovis.OvisTokenizer
+        elif te_model == TEModel.JINA_CLIP_2:
+            clip_target.clip = comfy.text_encoders.jina_clip_2.JinaClip2TextModelWrapper
+            clip_target.tokenizer = comfy.text_encoders.jina_clip_2.JinaClip2TokenizerWrapper
         else:
             # clip_l
             if clip_type == CLIPType.SD3:
@@ -1262,6 +1271,17 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
         elif clip_type == CLIPType.KANDINSKY5_IMAGE:
             clip_target.clip = comfy.text_encoders.kandinsky5.te(**llama_detect(clip_data))
             clip_target.tokenizer = comfy.text_encoders.kandinsky5.Kandinsky5TokenizerImage
+        elif clip_type == CLIPType.NEWBIE:
+            clip_target.clip = comfy.text_encoders.newbie.te(**llama_detect(clip_data))
+            clip_target.tokenizer = comfy.text_encoders.newbie.NewBieTokenizer
+            if "model.layers.0.self_attn.q_norm.weight" in clip_data[0]:
+                clip_data_gemma = clip_data[0]
+                clip_data_jina = clip_data[1]
+            else:
+                clip_data_gemma = clip_data[1]
+                clip_data_jina = clip_data[0]
+            tokenizer_data["gemma_spiece_model"] = clip_data_gemma.get("spiece_model", None)
+            tokenizer_data["jina_spiece_model"] = clip_data_jina.get("spiece_model", None)
         else:
             clip_target.clip = sdxl_clip.SDXLClipModel
             clip_target.tokenizer = sdxl_clip.SDXLTokenizer
@@ -466,7 +466,7 @@ def load_embed(embedding_name, embedding_directory, embedding_size, embed_key=No
     return embed_out

 class SDTokenizer:
-    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, tokenizer_data={}, tokenizer_args={}):
+    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True, embedding_directory=None, embedding_size=768, embedding_key='clip_l', tokenizer_class=CLIPTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=True, min_length=None, pad_token=None, end_token=None, min_padding=None, pad_left=False, disable_weights=False, tokenizer_data={}, tokenizer_args={}):
         if tokenizer_path is None:
             tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
         self.tokenizer = tokenizer_class.from_pretrained(tokenizer_path, **tokenizer_args)
@@ -513,6 +513,8 @@ class SDTokenizer:
         self.embedding_size = embedding_size
         self.embedding_key = embedding_key

+        self.disable_weights = disable_weights
+
     def _try_get_embedding(self, embedding_name:str):
         '''
         Takes a potential embedding name and tries to retrieve it.
@@ -547,7 +549,7 @@ class SDTokenizer:
         min_padding = tokenizer_options.get("{}_min_padding".format(self.embedding_key), self.min_padding)

         text = escape_important(text)
-        if kwargs.get("disable_weights", False):
+        if kwargs.get("disable_weights", self.disable_weights):
             parsed_weights = [(text, 1.0)]
         else:
             parsed_weights = token_weights(text, 1.0)
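The pattern being introduced here, in isolation (minimal sketch with a toy weight parser, not ComfyUI code): a constructor-level default that individual tokenize calls can still override, where the fallback used to be hardcoded to False.

import re

def token_weights(text, base):
    # toy version of the "(text:weight)" prompt syntax
    m = re.fullmatch(r"\((.+):([\d.]+)\)", text)
    return [(m.group(1), float(m.group(2)))] if m else [(text, base)]

class WeightedTokenizerSketch:
    def __init__(self, disable_weights=False):
        self.disable_weights = disable_weights  # instance default (the new parameter)

    def parse(self, text, **kwargs):
        # A per-call kwarg wins; otherwise the instance default applies
        # (previously the fallback was a hardcoded False).
        if kwargs.get("disable_weights", self.disable_weights):
            return [(text, 1.0)]
        return token_weights(text, 1.0)

tok = WeightedTokenizerSketch(disable_weights=True)   # e.g. the Gemma3 tokenizer below
print(tok.parse("(cat:1.2)"))                         # [('(cat:1.2)', 1.0)], syntax ignored
print(tok.parse("(cat:1.2)", disable_weights=False))  # [('cat', 1.2)], per-call override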
comfy/text_encoders/jina_clip_2.py | 219 (new file)
@@ -0,0 +1,219 @@
+# Jina CLIP v2 and Jina Embeddings v3 both use their modified XLM-RoBERTa architecture. Reference implementation:
+# Jina CLIP v2 (both text and vision): https://huggingface.co/jinaai/jina-clip-implementation/blob/39e6a55ae971b59bea6e44675d237c99762e7ee2/modeling_clip.py
+# Jina XLM-RoBERTa (text only): http://huggingface.co/jinaai/xlm-roberta-flash-implementation/blob/2b6bc3f30750b3a9648fe9b63448c09920efe9be/modeling_xlm_roberta.py
+
+from dataclasses import dataclass
+
+import torch
+from torch import nn as nn
+from torch.nn import functional as F
+
+import comfy.model_management
+import comfy.ops
+from comfy import sd1_clip
+from .spiece_tokenizer import SPieceTokenizer
+
+
+class JinaClip2Tokenizer(sd1_clip.SDTokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer = tokenizer_data.get("spiece_model", None)
+        # The official NewBie uses max_length=8000, but Jina Embeddings v3 actually supports 8192
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=1024, embedding_key='jina_clip_2', tokenizer_class=SPieceTokenizer, has_start_token=True, has_end_token=True, pad_to_max_length=False, max_length=8192, min_length=1, pad_token=1, end_token=2, tokenizer_args={"add_bos": True, "add_eos": True}, tokenizer_data=tokenizer_data)
+
+    def state_dict(self):
+        return {"spiece_model": self.tokenizer.serialize_model()}
+
+
+class JinaClip2TokenizerWrapper(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, tokenizer=JinaClip2Tokenizer, name="jina_clip_2")
+
+
+# https://huggingface.co/jinaai/jina-embeddings-v3/blob/343dbf534c76fe845f304fa5c2d1fd87e1e78918/config.json
+@dataclass
+class XLMRobertaConfig:
+    vocab_size: int = 250002
+    type_vocab_size: int = 1
+    hidden_size: int = 1024
+    num_hidden_layers: int = 24
+    num_attention_heads: int = 16
+    rotary_emb_base: float = 20000.0
+    intermediate_size: int = 4096
+    hidden_act: str = "gelu"
+    hidden_dropout_prob: float = 0.1
+    attention_probs_dropout_prob: float = 0.1
+    layer_norm_eps: float = 1e-05
+    bos_token_id: int = 0
+    eos_token_id: int = 2
+    pad_token_id: int = 1
+
+
+class XLMRobertaEmbeddings(nn.Module):
+    def __init__(self, config, device=None, dtype=None, ops=None):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.word_embeddings = ops.Embedding(config.vocab_size, embed_dim, padding_idx=config.pad_token_id, device=device, dtype=dtype)
+        self.token_type_embeddings = ops.Embedding(config.type_vocab_size, embed_dim, device=device, dtype=dtype)
+
+    def forward(self, input_ids=None, embeddings=None):
+        if input_ids is not None and embeddings is None:
+            embeddings = self.word_embeddings(input_ids)
+
+        if embeddings is not None:
+            token_type_ids = torch.zeros(embeddings.shape[1], device=embeddings.device, dtype=torch.int32)
+            token_type_embeddings = self.token_type_embeddings(token_type_ids)
+            embeddings = embeddings + token_type_embeddings
+        return embeddings
+
+
+class RotaryEmbedding(nn.Module):
+    def __init__(self, dim, base, device=None):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self._seq_len_cached = 0
+        self._cos_cached = None
+        self._sin_cached = None
+
+    def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
+        if seqlen > self._seq_len_cached or self._cos_cached is None or self._cos_cached.device != device or self._cos_cached.dtype != dtype:
+            self._seq_len_cached = seqlen
+            t = torch.arange(seqlen, device=device, dtype=torch.float32)
+            freqs = torch.outer(t, self.inv_freq.to(device=t.device))
+            emb = torch.cat((freqs, freqs), dim=-1)
+            self._cos_cached = emb.cos().to(dtype)
+            self._sin_cached = emb.sin().to(dtype)
+
+    def forward(self, q, k):
+        batch, seqlen, heads, head_dim = q.shape
+        self._update_cos_sin_cache(seqlen, device=q.device, dtype=q.dtype)
+
+        cos = self._cos_cached[:seqlen].view(1, seqlen, 1, head_dim)
+        sin = self._sin_cached[:seqlen].view(1, seqlen, 1, head_dim)
+
+        def rotate_half(x):
+            size = x.shape[-1] // 2
+            x1, x2 = x[..., :size], x[..., size:]
+            return torch.cat((-x2, x1), dim=-1)
+
+        q_embed = (q * cos) + (rotate_half(q) * sin)
+        k_embed = (k * cos) + (rotate_half(k) * sin)
+        return q_embed, k_embed
+
+
+class MHA(nn.Module):
+    def __init__(self, config, device=None, dtype=None, ops=None):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = embed_dim // config.num_attention_heads
+
+        self.rotary_emb = RotaryEmbedding(self.head_dim, config.rotary_emb_base, device=device)
+        self.Wqkv = ops.Linear(embed_dim, 3 * embed_dim, device=device, dtype=dtype)
+        self.out_proj = ops.Linear(embed_dim, embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x, mask=None, optimized_attention=None):
+        qkv = self.Wqkv(x)
+        batch_size, seq_len, _ = qkv.shape
+        qkv = qkv.view(batch_size, seq_len, 3, self.num_heads, self.head_dim)
+        q, k, v = qkv.unbind(2)
+
+        q, k = self.rotary_emb(q, k)
+
+        # NHD -> HND
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        out = optimized_attention(q, k, v, heads=self.num_heads, mask=mask, skip_reshape=True)
+        return self.out_proj(out)
+
+
+class MLP(nn.Module):
+    def __init__(self, config, device=None, dtype=None, ops=None):
+        super().__init__()
+        self.fc1 = ops.Linear(config.hidden_size, config.intermediate_size, device=device, dtype=dtype)
+        self.activation = F.gelu
+        self.fc2 = ops.Linear(config.intermediate_size, config.hidden_size, device=device, dtype=dtype)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.activation(x)
+        x = self.fc2(x)
+        return x
+
+
+class Block(nn.Module):
+    def __init__(self, config, device=None, dtype=None, ops=None):
+        super().__init__()
+        self.mixer = MHA(config, device=device, dtype=dtype, ops=ops)
+        self.dropout1 = nn.Dropout(config.hidden_dropout_prob)
+        self.norm1 = ops.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, device=device, dtype=dtype)
+        self.mlp = MLP(config, device=device, dtype=dtype, ops=ops)
+        self.dropout2 = nn.Dropout(config.hidden_dropout_prob)
+        self.norm2 = ops.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, device=device, dtype=dtype)
+
+    def forward(self, hidden_states, mask=None, optimized_attention=None):
+        mixer_out = self.mixer(hidden_states, mask=mask, optimized_attention=optimized_attention)
+        hidden_states = self.norm1(self.dropout1(mixer_out) + hidden_states)
+        mlp_out = self.mlp(hidden_states)
+        hidden_states = self.norm2(self.dropout2(mlp_out) + hidden_states)
+        return hidden_states
+
+
+class XLMRobertaEncoder(nn.Module):
+    def __init__(self, config, device=None, dtype=None, ops=None):
+        super().__init__()
+        self.layers = nn.ModuleList([Block(config, device=device, dtype=dtype, ops=ops) for _ in range(config.num_hidden_layers)])
+
+    def forward(self, hidden_states, attention_mask=None):
+        optimized_attention = comfy.ldm.modules.attention.optimized_attention_for_device(hidden_states.device, mask=attention_mask is not None, small_input=True)
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, mask=attention_mask, optimized_attention=optimized_attention)
+        return hidden_states
+
+
+class XLMRobertaModel_(nn.Module):
+    def __init__(self, config, device=None, dtype=None, ops=None):
+        super().__init__()
+        self.embeddings = XLMRobertaEmbeddings(config, device=device, dtype=dtype, ops=ops)
+        self.emb_ln = ops.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, device=device, dtype=dtype)
+        self.emb_drop = nn.Dropout(config.hidden_dropout_prob)
+        self.encoder = XLMRobertaEncoder(config, device=device, dtype=dtype, ops=ops)
+
+    def forward(self, input_ids, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[]):
+        x = self.embeddings(input_ids=input_ids, embeddings=embeds)
+        x = self.emb_ln(x)
+        x = self.emb_drop(x)
+
+        mask = None
+        if attention_mask is not None:
+            mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, 1, attention_mask.shape[-1]))
+            mask = mask.masked_fill(mask.to(torch.bool), -torch.finfo(x.dtype).max)
+
+        sequence_output = self.encoder(x, attention_mask=mask)
+
+        # Mean pool, see https://huggingface.co/jinaai/jina-clip-implementation/blob/39e6a55ae971b59bea6e44675d237c99762e7ee2/hf_model.py
+        pooled_output = None
+        if attention_mask is None:
+            pooled_output = sequence_output.mean(dim=1)
+        else:
+            attention_mask = attention_mask.to(sequence_output.dtype)
+            pooled_output = (sequence_output * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=-1, keepdim=True)
+
+        # Intermediate output is not yet implemented, use None for placeholder
+        return sequence_output, None, pooled_output
+
+
+class XLMRobertaModel(nn.Module):
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        self.config = XLMRobertaConfig(**config_dict)
+        self.model = XLMRobertaModel_(self.config, device=device, dtype=dtype, ops=operations)
+        self.num_layers = self.config.num_hidden_layers
+
+    def get_input_embeddings(self):
+        return self.model.embeddings.word_embeddings
+
+    def set_input_embeddings(self, embeddings):
+        self.model.embeddings.word_embeddings = embeddings
+
+    def forward(self, *args, **kwargs):
+        return self.model(*args, **kwargs)
+
+
+class JinaClip2TextModel(sd1_clip.SDClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, textmodel_json_config={}, model_class=XLMRobertaModel, special_tokens={"start": 0, "end": 2, "pad": 1}, enable_attention_masks=True, return_attention_masks=True, model_options=model_options)
+
+
+class JinaClip2TextModelWrapper(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, clip_model=JinaClip2TextModel, name="jina_clip_2", model_options=model_options)
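Two pieces of the new file are easy to sanity-check in isolation (sketch, using the same shapes as the module above): the rotate-half form of rotary embeddings is the identity when cos=1 and sin=0, and the pooled output is a mask-weighted mean over the sequence.

import torch

def rotate_half(x):
    size = x.shape[-1] // 2
    x1, x2 = x[..., :size], x[..., size:]
    return torch.cat((-x2, x1), dim=-1)

q = torch.randn(1, 5, 16, 64)  # (batch, seqlen, heads, head_dim)
cos, sin = torch.ones(1, 5, 1, 64), torch.zeros(1, 5, 1, 64)
assert torch.equal(q * cos + rotate_half(q) * sin, q)  # zero rotation is a no-op

# Masked mean pooling as in XLMRobertaModel_.forward: padding positions
# contribute nothing, and the divisor counts only real tokens.
seq = torch.randn(2, 5, 1024)
mask = torch.tensor([[1., 1., 1., 0., 0.], [1., 1., 1., 1., 1.]])
pooled = (seq * mask.unsqueeze(-1)).sum(dim=1) / mask.sum(dim=-1, keepdim=True)
print(pooled.shape)  # torch.Size([2, 1024])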
@@ -3,7 +3,6 @@ import torch.nn as nn
 from dataclasses import dataclass
 from typing import Optional, Any
 import math
-import logging

 from comfy.ldm.modules.attention import optimized_attention_for_device
 import comfy.model_management
@@ -177,7 +176,7 @@ class Gemma3_4B_Config:
     num_key_value_heads: int = 4
     max_position_embeddings: int = 131072
     rms_norm_eps: float = 1e-6
-    rope_theta = [10000.0, 1000000.0]
+    rope_theta = [1000000.0, 10000.0]
     transformer_type: str = "gemma3"
     head_dim = 256
     rms_norm_add = True
@@ -186,8 +185,8 @@ class Gemma3_4B_Config:
     rope_dims = None
     q_norm = "gemma3"
     k_norm = "gemma3"
-    sliding_attention = [False, False, False, False, False, 1024]
-    rope_scale = [1.0, 8.0]
+    sliding_attention = [1024, 1024, 1024, 1024, 1024, False]
+    rope_scale = [8.0, 1.0]
     final_norm: bool = True

 class RMSNorm(nn.Module):
@@ -370,7 +369,7 @@ class TransformerBlockGemma2(nn.Module):
         self.pre_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)
         self.post_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=config.rms_norm_add, device=device, dtype=dtype)

-        if config.sliding_attention is not None: # TODO: implement. (Not that necessary since models are trained on less than 1024 tokens)
+        if config.sliding_attention is not None:
             self.sliding_attention = config.sliding_attention[index % len(config.sliding_attention)]
         else:
             self.sliding_attention = False
@@ -387,7 +386,12 @@ class TransformerBlockGemma2(nn.Module):
         if self.transformer_type == 'gemma3':
             if self.sliding_attention:
                 if x.shape[1] > self.sliding_attention:
-                    logging.warning("Warning: sliding attention not implemented, results may be incorrect")
+                    sliding_mask = torch.full((x.shape[1], x.shape[1]), float("-inf"), device=x.device, dtype=x.dtype)
+                    sliding_mask.tril_(diagonal=-self.sliding_attention)
+                    if attention_mask is not None:
+                        attention_mask = attention_mask + sliding_mask
+                    else:
+                        attention_mask = sliding_mask
                 freqs_cis = freqs_cis[1]
             else:
                 freqs_cis = freqs_cis[0]
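What the new mask computes, on a toy sequence (sketch): tril_(diagonal=-window) keeps -inf only at positions a query would reach by looking further than `window` tokens into the past, so adding it to the attention mask enforces the sliding window. It is additive with any existing causal mask; it only cuts off the distant past.

import torch

seq_len, window = 6, 2
sliding_mask = torch.full((seq_len, seq_len), float("-inf"))
sliding_mask.tril_(diagonal=-window)  # keep -inf where j <= i - window, zero elsewhere
print(sliding_mask[3])  # tensor([-inf, -inf, 0., 0., 0., 0.]) -> token 3 cannot see tokens 0 and 1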
@@ -14,7 +14,7 @@ class Gemma2BTokenizer(sd1_clip.SDTokenizer):
 class Gemma3_4BTokenizer(sd1_clip.SDTokenizer):
     def __init__(self, embedding_directory=None, tokenizer_data={}):
         tokenizer = tokenizer_data.get("spiece_model", None)
-        super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, tokenizer_data=tokenizer_data)
+        super().__init__(tokenizer, pad_with_end=False, embedding_size=2560, embedding_key='gemma3_4b', tokenizer_class=SPieceTokenizer, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, tokenizer_args={"add_bos": True, "add_eos": False}, disable_weights=True, tokenizer_data=tokenizer_data)

     def state_dict(self):
         return {"spiece_model": self.tokenizer.serialize_model()}
@@ -33,6 +33,11 @@ class Gemma2_2BModel(sd1_clip.SDClipModel):

 class Gemma3_4BModel(sd1_clip.SDClipModel):
     def __init__(self, device="cpu", layer="hidden", layer_idx=-2, dtype=None, attention_mask=True, model_options={}):
+        llama_quantization_metadata = model_options.get("llama_quantization_metadata", None)
+        if llama_quantization_metadata is not None:
+            model_options = model_options.copy()
+            model_options["quantization_metadata"] = llama_quantization_metadata
+
         super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_4B, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)

 class LuminaModel(sd1_clip.SD1ClipModel):
comfy/text_encoders/newbie.py | 62 (new file)
@@ -0,0 +1,62 @@
+import torch
+
+import comfy.model_management
+import comfy.text_encoders.jina_clip_2
+import comfy.text_encoders.lumina2
+
+
+class NewBieTokenizer:
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        self.gemma = comfy.text_encoders.lumina2.Gemma3_4BTokenizer(embedding_directory=embedding_directory, tokenizer_data={"spiece_model": tokenizer_data["gemma_spiece_model"]})
+        self.jina = comfy.text_encoders.jina_clip_2.JinaClip2Tokenizer(embedding_directory=embedding_directory, tokenizer_data={"spiece_model": tokenizer_data["jina_spiece_model"]})
+
+    def tokenize_with_weights(self, text:str, return_word_ids=False, **kwargs):
+        out = {}
+        out["gemma"] = self.gemma.tokenize_with_weights(text, return_word_ids, **kwargs)
+        out["jina"] = self.jina.tokenize_with_weights(text, return_word_ids, **kwargs)
+        return out
+
+    def untokenize(self, token_weight_pair):
+        raise NotImplementedError
+
+    def state_dict(self):
+        return {}
+
+
+class NewBieTEModel(torch.nn.Module):
+    def __init__(self, dtype_gemma=None, device="cpu", dtype=None, model_options={}):
+        super().__init__()
+        dtype_gemma = comfy.model_management.pick_weight_dtype(dtype_gemma, dtype, device)
+        self.gemma = comfy.text_encoders.lumina2.Gemma3_4BModel(device=device, dtype=dtype_gemma, model_options=model_options)
+        self.jina = comfy.text_encoders.jina_clip_2.JinaClip2TextModel(device=device, dtype=dtype, model_options=model_options)
+        self.dtypes = {dtype, dtype_gemma}
+
+    def set_clip_options(self, options):
+        self.gemma.set_clip_options(options)
+        self.jina.set_clip_options(options)
+
+    def reset_clip_options(self):
+        self.gemma.reset_clip_options()
+        self.jina.reset_clip_options()
+
+    def encode_token_weights(self, token_weight_pairs):
+        token_weight_pairs_gemma = token_weight_pairs["gemma"]
+        token_weight_pairs_jina = token_weight_pairs["jina"]
+
+        gemma_out, gemma_pooled, gemma_extra = self.gemma.encode_token_weights(token_weight_pairs_gemma)
+        jina_out, jina_pooled, jina_extra = self.jina.encode_token_weights(token_weight_pairs_jina)
+
+        return gemma_out, jina_pooled, gemma_extra
+
+    def load_sd(self, sd):
+        if "model.layers.0.self_attn.q_norm.weight" in sd:
+            return self.gemma.load_sd(sd)
+        else:
+            return self.jina.load_sd(sd)
+
+
+def te(dtype_llama=None, llama_quantization_metadata=None):
+    class NewBieTEModel_(NewBieTEModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            if llama_quantization_metadata is not None:
+                model_options = model_options.copy()
+                model_options["llama_quantization_metadata"] = llama_quantization_metadata
+            super().__init__(dtype_gemma=dtype_llama, device=device, dtype=dtype, model_options=model_options)
+    return NewBieTEModel_
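The wiring worth noticing in encode_token_weights above: NewBie conditions on the Gemma hidden states but takes its pooled vector from Jina, and Jina's own sequence output is discarded. A stub sketch of that routing (names are illustrative only):

class StubEncoder:
    def __init__(self, name):
        self.name = name

    def encode_token_weights(self, tokens):
        return f"{self.name}_sequence", f"{self.name}_pooled", {}

gemma, jina = StubEncoder("gemma"), StubEncoder("jina")
gemma_out, gemma_pooled, gemma_extra = gemma.encode_token_weights(None)
jina_out, jina_pooled, jina_extra = jina.encode_token_weights(None)
# Mirrors the return statement above: sequence from Gemma, pooled from Jina.
print((gemma_out, jina_pooled))  # ('gemma_sequence', 'jina_pooled')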
@@ -10,7 +10,7 @@ class Text2ImageTaskCreationRequest(BaseModel):
     size: str | None = Field(None)
     seed: int | None = Field(0, ge=0, le=2147483647)
     guidance_scale: float | None = Field(..., ge=1.0, le=10.0)
-    watermark: bool | None = Field(True)
+    watermark: bool | None = Field(False)


 class Image2ImageTaskCreationRequest(BaseModel):
@@ -21,7 +21,7 @@ class Image2ImageTaskCreationRequest(BaseModel):
     size: str | None = Field("adaptive")
     seed: int | None = Field(..., ge=0, le=2147483647)
     guidance_scale: float | None = Field(..., ge=1.0, le=10.0)
-    watermark: bool | None = Field(True)
+    watermark: bool | None = Field(False)


 class Seedream4Options(BaseModel):
@@ -37,7 +37,7 @@ class Seedream4TaskCreationRequest(BaseModel):
     seed: int = Field(..., ge=0, le=2147483647)
     sequential_image_generation: str = Field("disabled")
     sequential_image_generation_options: Seedream4Options = Field(Seedream4Options(max_images=15))
-    watermark: bool = Field(True)
+    watermark: bool = Field(False)


 class ImageTaskCreationResponse(BaseModel):
@@ -112,7 +112,7 @@ class ByteDanceImageNode(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip='Whether to add an "AI generated" watermark to the image',
                     optional=True,
                 ),
@@ -215,7 +215,7 @@ class ByteDanceImageEditNode(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip='Whether to add an "AI generated" watermark to the image',
                     optional=True,
                 ),
@@ -346,7 +346,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip='Whether to add an "AI generated" watermark to the image.',
                     optional=True,
                 ),
@@ -380,7 +380,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
         sequential_image_generation: str = "disabled",
         max_images: int = 1,
         seed: int = 0,
-        watermark: bool = True,
+        watermark: bool = False,
         fail_on_partial: bool = True,
     ) -> IO.NodeOutput:
         validate_string(prompt, strip_whitespace=True, min_length=1)
@@ -507,7 +507,7 @@ class ByteDanceTextToVideoNode(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip='Whether to add an "AI generated" watermark to the video.',
                     optional=True,
                 ),
@@ -617,7 +617,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip='Whether to add an "AI generated" watermark to the video.',
                     optional=True,
                 ),
@@ -739,7 +739,7 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip='Whether to add an "AI generated" watermark to the video.',
                     optional=True,
                 ),
@@ -862,7 +862,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip='Whether to add an "AI generated" watermark to the video.',
                     optional=True,
                 ),
@@ -858,7 +858,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
                     tooltip="A text prompt describing the video content. "
                     "This can include both positive and negative descriptions.",
                 ),
-                IO.Combo.Input("duration", options=["5", "10"]),
+                IO.Int.Input("duration", default=5, min=3, max=10, display_mode=IO.NumberDisplay.slider),
                 IO.Image.Input("first_frame"),
                 IO.Image.Input(
                     "end_frame",
@@ -897,6 +897,10 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
         validate_string(prompt, min_length=1, max_length=2500)
         if end_frame is not None and reference_images is not None:
             raise ValueError("The 'end_frame' input cannot be used simultaneously with 'reference_images'.")
+        if duration not in (5, 10) and end_frame is None and reference_images is None:
+            raise ValueError(
+                "Duration is only supported for 5 or 10 seconds if there is no end frame or reference images."
+            )
         validate_image_dimensions(first_frame, min_width=300, min_height=300)
         validate_image_aspect_ratio(first_frame, (1, 2.5), (2.5, 1))
         image_list: list[OmniParamImage] = [
@@ -23,10 +23,6 @@ UPSCALER_MODELS_MAP = {
     "Starlight (Astra) Fast": "slf-1",
     "Starlight (Astra) Creative": "slc-1",
 }
-UPSCALER_VALUES_MAP = {
-    "FullHD (1080p)": 1920,
-    "4K (2160p)": 3840,
-}


 class TopazImageEnhance(IO.ComfyNode):
@@ -214,7 +210,7 @@ class TopazVideoEnhance(IO.ComfyNode):
                 IO.Video.Input("video"),
                 IO.Boolean.Input("upscaler_enabled", default=True),
                 IO.Combo.Input("upscaler_model", options=list(UPSCALER_MODELS_MAP.keys())),
-                IO.Combo.Input("upscaler_resolution", options=list(UPSCALER_VALUES_MAP.keys())),
+                IO.Combo.Input("upscaler_resolution", options=["FullHD (1080p)", "4K (2160p)"]),
                 IO.Combo.Input(
                     "upscaler_creativity",
                     options=["low", "middle", "high"],
@@ -306,8 +302,33 @@ class TopazVideoEnhance(IO.ComfyNode):
             target_frame_rate = src_frame_rate
         filters = []
         if upscaler_enabled:
-            target_width = UPSCALER_VALUES_MAP[upscaler_resolution]
-            target_height = UPSCALER_VALUES_MAP[upscaler_resolution]
+            if "1080p" in upscaler_resolution:
+                target_pixel_p = 1080
+                max_long_side = 1920
+            else:
+                target_pixel_p = 2160
+                max_long_side = 3840
+            ar = src_width / src_height
+            if src_width >= src_height:
+                # Landscape or Square; Attempt to set height to target (e.g., 2160), calculate width
+                target_height = target_pixel_p
+                target_width = int(target_height * ar)
+                # Check if width exceeds standard bounds (for ultra-wide e.g., 21:9 ARs)
+                if target_width > max_long_side:
+                    target_width = max_long_side
+                    target_height = int(target_width / ar)
+            else:
+                # Portrait; Attempt to set width to target (e.g., 2160), calculate height
+                target_width = target_pixel_p
+                target_height = int(target_width / ar)
+                # Check if height exceeds standard bounds
+                if target_height > max_long_side:
+                    target_height = max_long_side
+                    target_width = int(target_height * ar)
+            if target_width % 2 != 0:
+                target_width += 1
+            if target_height % 2 != 0:
+                target_height += 1
             filters.append(
                 topaz_api.VideoEnhancementFilter(
                     model=UPSCALER_MODELS_MAP[upscaler_model],
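The resolution logic above, extracted into a standalone helper for illustration (hypothetical function name, same arithmetic as the hunk): the short side is driven toward 1080/2160, the long side is clamped to 1920/3840 for extreme aspect ratios, and both dimensions are rounded up to even values for the encoder.

def target_dims(src_width, src_height, resolution="4K (2160p)"):
    target_p, max_long = (1080, 1920) if "1080p" in resolution else (2160, 3840)
    ar = src_width / src_height
    if src_width >= src_height:  # landscape / square: fix height, derive width
        h = target_p
        w = int(h * ar)
        if w > max_long:         # clamp ultra-wide (e.g. 21:9) sources
            w = max_long
            h = int(w / ar)
    else:                        # portrait: fix width, derive height
        w = target_p
        h = int(w / ar)
        if h > max_long:
            h = max_long
            w = int(h * ar)
    # video codecs generally require even dimensions
    return w + (w % 2), h + (h % 2)

print(target_dims(1280, 720))   # (3840, 2160)
print(target_dims(2560, 1080))  # 21:9 source -> (3840, 1620)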
@@ -46,14 +46,14 @@ class Txt2ImageParametersField(BaseModel):
     n: int = Field(1, description="Number of images to generate.")  # we support only value=1
     seed: int = Field(..., ge=0, le=2147483647)
     prompt_extend: bool = Field(True)
-    watermark: bool = Field(True)
+    watermark: bool = Field(False)


 class Image2ImageParametersField(BaseModel):
     size: str | None = Field(None)
     n: int = Field(1, description="Number of images to generate.")  # we support only value=1
     seed: int = Field(..., ge=0, le=2147483647)
-    watermark: bool = Field(True)
+    watermark: bool = Field(False)


 class Text2VideoParametersField(BaseModel):
@@ -61,7 +61,7 @@ class Text2VideoParametersField(BaseModel):
     seed: int = Field(..., ge=0, le=2147483647)
     duration: int = Field(5, ge=5, le=15)
     prompt_extend: bool = Field(True)
-    watermark: bool = Field(True)
+    watermark: bool = Field(False)
     audio: bool = Field(False, description="Whether to generate audio automatically.")
     shot_type: str = Field("single")
@@ -71,7 +71,7 @@ class Image2VideoParametersField(BaseModel):
     seed: int = Field(..., ge=0, le=2147483647)
     duration: int = Field(5, ge=5, le=15)
     prompt_extend: bool = Field(True)
-    watermark: bool = Field(True)
+    watermark: bool = Field(False)
     audio: bool = Field(False, description="Whether to generate audio automatically.")
     shot_type: str = Field("single")
@@ -208,7 +208,7 @@ class WanTextToImageApi(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip="Whether to add an AI-generated watermark to the result.",
                     optional=True,
                 ),
@@ -234,7 +234,7 @@ class WanTextToImageApi(IO.ComfyNode):
         height: int = 1024,
         seed: int = 0,
         prompt_extend: bool = True,
-        watermark: bool = True,
+        watermark: bool = False,
     ):
         initial_response = await sync_op(
             cls,
@@ -327,7 +327,7 @@ class WanImageToImageApi(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip="Whether to add an AI-generated watermark to the result.",
                     optional=True,
                 ),
@@ -353,7 +353,7 @@ class WanImageToImageApi(IO.ComfyNode):
         # width: int = 1024,
         # height: int = 1024,
         seed: int = 0,
-        watermark: bool = True,
+        watermark: bool = False,
     ):
         n_images = get_number_of_images(image)
         if n_images not in (1, 2):
@@ -476,7 +476,7 @@ class WanTextToVideoApi(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip="Whether to add an AI-generated watermark to the result.",
                     optional=True,
                 ),
@@ -512,7 +512,7 @@ class WanTextToVideoApi(IO.ComfyNode):
         seed: int = 0,
         generate_audio: bool = False,
         prompt_extend: bool = True,
-        watermark: bool = True,
+        watermark: bool = False,
         shot_type: str = "single",
     ):
         if "480p" in size and model == "wan2.6-t2v":
@@ -637,7 +637,7 @@ class WanImageToVideoApi(IO.ComfyNode):
                 ),
                 IO.Boolean.Input(
                     "watermark",
-                    default=True,
+                    default=False,
                     tooltip="Whether to add an AI-generated watermark to the result.",
                     optional=True,
                 ),
@@ -674,7 +674,7 @@ class WanImageToVideoApi(IO.ComfyNode):
         seed: int = 0,
         generate_audio: bool = False,
         prompt_extend: bool = True,
-        watermark: bool = True,
+        watermark: bool = False,
         shot_type: str = "single",
     ):
         if get_number_of_images(image) != 1:
@@ -760,8 +760,12 @@ class SamplerCustom(io.ComfyNode):
         out = latent.copy()
         out["samples"] = samples
         if "x0" in x0_output:
+            x0_out = model.model.process_latent_out(x0_output["x0"].cpu())
+            if samples.is_nested:
+                latent_shapes = [x.shape for x in samples.unbind()]
+                x0_out = comfy.nested_tensor.NestedTensor(comfy.utils.unpack_latents(x0_out, latent_shapes))
             out_denoised = latent.copy()
-            out_denoised["samples"] = model.model.process_latent_out(x0_output["x0"].cpu())
+            out_denoised["samples"] = x0_out
         else:
             out_denoised = out
         return io.NodeOutput(out, out_denoised)
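The unpack step assumed here is the inverse of the pack_latents call in the CFGGuider hunk: the flat x0 prediction is sliced back into per-sample latents by recorded shape, then rewrapped as a nested tensor. A standalone sketch of that assumption (not ComfyUI's actual helper):

import math
import torch

def unpack_latents(packed, latent_shapes):
    # Assumed inverse of comfy.utils.pack_latents: slice the flat tensor
    # back into per-sample latents using the recorded shapes.
    out, offset = [], 0
    for shape in latent_shapes:
        n = math.prod(shape)
        out.append(packed[offset:offset + n].reshape(shape))
        offset += n
    return out

packed = torch.arange(24.0)  # pretend this is a packed x0 prediction
for t in unpack_latents(packed, [(1, 2, 2, 2), (1, 4, 2, 2)]):
    print(t.shape)  # torch.Size([1, 2, 2, 2]), then torch.Size([1, 4, 2, 2])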
@@ -948,8 +952,12 @@ class SamplerCustomAdvanced(io.ComfyNode):
         out = latent.copy()
         out["samples"] = samples
         if "x0" in x0_output:
+            x0_out = guider.model_patcher.model.process_latent_out(x0_output["x0"].cpu())
+            if samples.is_nested:
+                latent_shapes = [x.shape for x in samples.unbind()]
+                x0_out = comfy.nested_tensor.NestedTensor(comfy.utils.unpack_latents(x0_out, latent_shapes))
             out_denoised = latent.copy()
-            out_denoised["samples"] = guider.model_patcher.model.process_latent_out(x0_output["x0"].cpu())
+            out_denoised["samples"] = x0_out
         else:
             out_denoised = out
         return io.NodeOutput(out, out_denoised)
@@ -348,7 +348,7 @@ class ZImageControlPatch:
         if self.mask is None:
             mask_ = torch.zeros_like(inpaint_image_latent)[:, :1]
         else:
-            mask_ = comfy.utils.common_upscale(self.mask.view(self.mask.shape[0], -1, self.mask.shape[-2], self.mask.shape[-1]).mean(dim=1, keepdim=True), inpaint_image_latent.shape[-1], inpaint_image_latent.shape[-2], "nearest", "center")
+            mask_ = comfy.utils.common_upscale(self.mask.view(self.mask.shape[0], -1, self.mask.shape[-2], self.mask.shape[-1]).mean(dim=1, keepdim=True).to(device=inpaint_image_latent.device), inpaint_image_latent.shape[-1], inpaint_image_latent.shape[-2], "nearest", "center")

         if latent_image is None:
             latent_image = comfy.latent_formats.Flux().process_in(self.vae.encode(torch.ones_like(inpaint_image) * 0.5))
@@ -3,7 +3,9 @@ import comfy.utils
 import math
 from typing_extensions import override
 from comfy_api.latest import ComfyExtension, io

+import comfy.model_management
+import torch
 import nodes

 class TextEncodeQwenImageEdit(io.ComfyNode):
     @classmethod
@@ -104,12 +106,37 @@ class TextEncodeQwenImageEditPlus(io.ComfyNode):
         return io.NodeOutput(conditioning)


+class EmptyQwenImageLayeredLatentImage(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="EmptyQwenImageLayeredLatentImage",
+            display_name="Empty Qwen Image Layered Latent",
+            category="latent/qwen",
+            inputs=[
+                io.Int.Input("width", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("layers", default=3, min=0, max=nodes.MAX_RESOLUTION, step=1),
+                io.Int.Input("batch_size", default=1, min=1, max=4096),
+            ],
+            outputs=[
+                io.Latent.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, width, height, layers, batch_size=1) -> io.NodeOutput:
+        latent = torch.zeros([batch_size, 16, layers + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        return io.NodeOutput({"samples": latent})
+
+
 class QwenExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[io.ComfyNode]]:
         return [
             TextEncodeQwenImageEdit,
             TextEncodeQwenImageEditPlus,
+            EmptyQwenImageLayeredLatentImage,
         ]
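The shape arithmetic in execute, spelled out (the extra layer slot presumably holds the composed image, though the diff doesn't say): with the VAE's 8x spatial compression, a 640x640 request with 3 layers allocates a 16-channel latent with 4 layer slots.

import torch

width, height, layers, batch_size = 640, 640, 3, 1
latent = torch.zeros([batch_size, 16, layers + 1, height // 8, width // 8])
print(latent.shape)  # torch.Size([1, 16, 4, 80, 80])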
nodes.py | 4
@@ -970,7 +970,7 @@ class DualCLIPLoader:
     def INPUT_TYPES(s):
         return {"required": { "clip_name1": (folder_paths.get_filename_list("text_encoders"), ),
                               "clip_name2": (folder_paths.get_filename_list("text_encoders"), ),
-                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image"], ),
+                              "type": (["sdxl", "sd3", "flux", "hunyuan_video", "hidream", "hunyuan_image", "hunyuan_video_15", "kandinsky5", "kandinsky5_image", "newbie"], ),
                               },
                 "optional": {
                               "device": (["default", "cpu"], {"advanced": True}),
@@ -980,7 +980,7 @@ class DualCLIPLoader:

     CATEGORY = "advanced/loaders"

-    DESCRIPTION = "[Recipes]\n\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama\nhunyuan_image: qwen2.5vl 7b and byt5 small"
+    DESCRIPTION = "[Recipes]\n\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama\nhunyuan_image: qwen2.5vl 7b and byt5 small\nnewbie: gemma-3-4b-it, jina clip v2"

     def load_clip(self, clip_name1, clip_name2, type, device="default"):
         clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.34.9
-comfyui-workflow-templates==0.7.60
+comfyui-workflow-templates==0.7.62
 comfyui-embedded-docs==0.3.1
 torch
 torchsde