mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-17 21:39:45 +08:00
Merge branch 'master' into alexis/add_output_save_nodes
This commit is contained in:
commit
7ba2a90f41
15
comfy/sd.py
15
comfy/sd.py
@ -67,6 +67,7 @@ import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.qwen35
|
||||
import comfy.text_encoders.qwen3vl
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.gemma4
|
||||
import comfy.text_encoders.cogvideo
|
||||
@ -1353,6 +1354,8 @@ class TEModel(Enum):
|
||||
GEMMA_4_31B = 31
|
||||
T5_GEMMA = 32
|
||||
GPT_OSS_20B = 33
|
||||
QWEN3VL_4B = 34
|
||||
QWEN3VL_8B = 35
|
||||
|
||||
|
||||
def detect_te_model(sd):
|
||||
@ -1414,6 +1417,8 @@ def detect_te_model(sd):
|
||||
if weight.shape[0] == 5120:
|
||||
return TEModel.QWEN35_27B
|
||||
return TEModel.QWEN35_2B
|
||||
if "model.visual.deepstack_merger_list.0.norm.weight" in sd: # DeepStack is unique to Qwen3-VL
|
||||
return TEModel.QWEN3VL_4B if sd["model.visual.merger.linear_fc2.weight"].shape[0] == 2560 else TEModel.QWEN3VL_8B
|
||||
if "model.layers.0.post_attention_layernorm.weight" in sd:
|
||||
weight = sd['model.layers.0.post_attention_layernorm.weight']
|
||||
if 'model.layers.0.self_attn.q_norm.weight' in sd:
|
||||
@ -1612,6 +1617,16 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
qwen35_type = {TEModel.QWEN35_08B: "qwen35_08b", TEModel.QWEN35_2B: "qwen35_2b", TEModel.QWEN35_4B: "qwen35_4b", TEModel.QWEN35_9B: "qwen35_9b", TEModel.QWEN35_27B: "qwen35_27b"}[te_model]
|
||||
clip_target.clip = comfy.text_encoders.qwen35.te(**llama_detect(clip_data), model_type=qwen35_type)
|
||||
clip_target.tokenizer = comfy.text_encoders.qwen35.tokenizer(model_type=qwen35_type)
|
||||
elif te_model in (TEModel.QWEN3VL_4B, TEModel.QWEN3VL_8B):
|
||||
if clip_type == CLIPType.IDEOGRAM4 and te_model == TEModel.QWEN3VL_8B: # Ideogram4 reuses the full Qwen3-VL-8B (13-layer tap for conditioning + multimodal generate).
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer
|
||||
else:
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
qwen3vl_type = {TEModel.QWEN3VL_4B: "qwen3vl_4b", TEModel.QWEN3VL_8B: "qwen3vl_8b"}[te_model]
|
||||
clip_target.clip = comfy.text_encoders.qwen3vl.te(**llama_detect(clip_data), model_type=qwen3vl_type)
|
||||
clip_target.tokenizer = comfy.text_encoders.qwen3vl.tokenizer(model_type=qwen3vl_type)
|
||||
elif te_model == TEModel.QWEN3_06B:
|
||||
clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer
|
||||
|
||||
@ -9,6 +9,7 @@ import os
|
||||
from transformers import Qwen2Tokenizer
|
||||
|
||||
import comfy.text_encoders.llama
|
||||
import comfy.text_encoders.qwen3vl
|
||||
from comfy import sd1_clip
|
||||
|
||||
# Reference taps outputs of layers (0,3,...,35); comfy captures layer inputs, offset by +1.
|
||||
@ -77,3 +78,43 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
return Ideogram4TEModel_
|
||||
|
||||
|
||||
# Full Qwen3-VL-8B variant with vision
|
||||
|
||||
class Ideogram4Qwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
    """Qwen3-VL-8B clip model that taps the hidden states listed in IDEOGRAM4_TAP_LAYERS."""

    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}):
        # layer_idx is left as None because the layers to tap are given explicitly via `layer`.
        super().__init__(
            device=device,
            layer=IDEOGRAM4_TAP_LAYERS,
            layer_idx=None,
            dtype=dtype,
            attention_mask=attention_mask,
            model_options=model_options,
            model_type="qwen3vl_8b",
        )
|
||||
|
||||
|
||||
class Ideogram4Qwen3VLTEModel(sd1_clip.SD1ClipModel):
    """Text-encoder wrapper that flattens the per-layer taps into one wide feature axis."""

    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(
            device=device,
            dtype=dtype,
            name="qwen3vl_8b",
            clip_model=Ideogram4Qwen3VLClipModel,
            model_options=model_options,
        )

    def encode_token_weights(self, token_weight_pairs):
        """Encode tokens and merge the tap axis into the hidden axis.

        The parent returns a 4-D tensor unpacked here as (batch, taps, seq, hidden);
        per the original notes that is (B, n_taps=13, seq, 4096) in ascending layer order.
        The result is (batch, seq, hidden * taps), i.e. (B, seq, 4096*13 = 53248).
        """
        stacked, pooled, extra = super().encode_token_weights(token_weight_pairs)
        batch, taps, seq_len, hidden = stacked.shape
        # Move the tap axis last so that, after the reshape, the taps of one hidden
        # channel sit next to each other.
        tap_last = stacked.permute(0, 2, 3, 1)
        flattened = tap_last.reshape(batch, seq_len, hidden * taps)
        return flattened, pooled, extra
|
||||
|
||||
|
||||
class Ideogram4Qwen3VLTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
    """Qwen3-VL-8B tokenizer whose `thinking` flag defaults to True."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            model_type="qwen3vl_8b",
        )

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
        # Ideogram 4 conditions on the no-think template; with thinking=True as the
        # default, the empty think block that the qwen3vl tokenizer would append is left out.
        return super().tokenize_with_weights(
            text,
            return_word_ids=return_word_ids,
            llama_template=llama_template,
            images=images,
            prevent_empty_text=prevent_empty_text,
            thinking=thinking,
            **kwargs,
        )
|
||||
|
||||
|
||||
def te_qwen3vl(dtype_llama=None, llama_quantization_metadata=None):
    """Return an Ideogram4Qwen3VLTEModel subclass with dtype/quantization overrides baked in.

    dtype_llama, when given, replaces the dtype passed to the constructor.
    llama_quantization_metadata, when given, is stored under
    "quantization_metadata" in a copy of model_options.
    """

    class Ideogram4Qwen3VLTEModel_(Ideogram4Qwen3VLTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            effective_dtype = dtype if dtype_llama is None else dtype_llama
            options = model_options
            if llama_quantization_metadata is not None:
                # Copy first so the caller's dict (or the shared default) is never mutated.
                options = model_options.copy()
                options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=effective_dtype, model_options=options)

    return Ideogram4Qwen3VLTEModel_
|
||||
|
||||
@ -251,6 +251,19 @@ class Qwen3_8BConfig:
|
||||
lm_head: bool = True
|
||||
stop_tokens = [151643, 151645]
|
||||
|
||||
@dataclass
class Qwen3VL_8BConfig(Qwen3_8BConfig):
    """Qwen3-8B text config with the Qwen3-VL context length and rotary settings."""

    max_position_embeddings: int = 256 * 1024  # 262144
    rope_theta: float = 5e6
    # The two names below carry no annotation, so they are plain class attributes
    # rather than dataclass fields (they are not constructor arguments).
    rope_dims = [24, 20, 20]
    # Llama2_ reads this with getattr(config, "interleaved_mrope", False).
    interleaved_mrope = True
|
||||
|
||||
@dataclass
class Qwen3VL_4BConfig(Qwen3VL_8BConfig):
    """4B variant: same as the 8B Qwen3-VL config except for the widths and lm_head."""

    hidden_size: int = 2560
    intermediate_size: int = 9728
    # The 4B checkpoint ties its word embeddings, so there is no separate lm_head.
    lm_head: bool = False
|
||||
|
||||
@dataclass
|
||||
class Ovis25_2BConfig:
|
||||
vocab_size: int = 151936
|
||||
@ -703,7 +716,8 @@ class Llama2_(nn.Module):
|
||||
interleaved_mrope=getattr(self.config, "interleaved_mrope", False),
|
||||
device=device)
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None):
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True,
|
||||
dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None,deepstack_embeds=None, visual_pos_masks=None):
|
||||
if embeds is not None:
|
||||
x = embeds
|
||||
else:
|
||||
@ -767,6 +781,10 @@ class Llama2_(nn.Module):
|
||||
if current_kv is not None:
|
||||
next_key_values.append(current_kv)
|
||||
|
||||
# DeepStack: add per-layer visual features into the first len() decoder layers at image positions (Qwen3-VL)
|
||||
if deepstack_embeds is not None and i < len(deepstack_embeds):
|
||||
x[visual_pos_masks] = x[visual_pos_masks] + deepstack_embeds[i].to(x)
|
||||
|
||||
if i == intermediate_output:
|
||||
intermediate = x.clone()
|
||||
|
||||
@ -860,7 +878,7 @@ class BaseGenerate:
|
||||
torch.empty([batch, model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype), 0))
|
||||
return past_key_values
|
||||
|
||||
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0, initial_input_ids=None):
|
||||
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0, initial_input_ids=None, position_ids=None, deepstack_embeds=None, visual_pos_masks=None):
|
||||
device = embeds.device
|
||||
|
||||
if stop_tokens is None:
|
||||
@ -884,10 +902,18 @@ class BaseGenerate:
|
||||
generated_token_ids = []
|
||||
pbar = comfy.utils.ProgressBar(max_length)
|
||||
|
||||
# MRoPE: prefill uses explicit 3D position_ids, decode continues from the last position
|
||||
next_pos = int(position_ids[:, -1].max()) + 1 if position_ids is not None else None
|
||||
|
||||
# Generation loop
|
||||
current_input_ids = initial_input_ids
|
||||
for step in tqdm(range(max_length), desc="Generating tokens"):
|
||||
x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values, input_ids=current_input_ids)
|
||||
# DeepStack visual features are injected on the prefill only; gemma4's forward lacks these kwargs.
|
||||
extra = {}
|
||||
if step == 0 and deepstack_embeds is not None:
|
||||
extra["deepstack_embeds"] = deepstack_embeds
|
||||
extra["visual_pos_masks"] = visual_pos_masks
|
||||
x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values, input_ids=current_input_ids, position_ids=position_ids, **extra)
|
||||
logits = self.logits(x)[:, -1]
|
||||
next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample, presence_penalty=presence_penalty)
|
||||
token_id = next_token[0].item()
|
||||
@ -895,6 +921,9 @@ class BaseGenerate:
|
||||
|
||||
embeds = self.model.embed_tokens(next_token).to(execution_dtype)
|
||||
current_input_ids = next_token if initial_input_ids is not None else None
|
||||
if next_pos is not None: # advance MRoPE position for the next (decode) step
|
||||
position_ids = torch.tensor([[next_pos]], device=device)
|
||||
next_pos += 1
|
||||
pbar.update(1)
|
||||
|
||||
if token_id in stop_tokens:
|
||||
|
||||
@ -3,7 +3,6 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from dataclasses import dataclass, field
|
||||
import os
|
||||
import math
|
||||
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
@ -563,6 +562,8 @@ class Qwen35VisionModel(nn.Module):
|
||||
for _ in range(config["depth"])
|
||||
])
|
||||
self.merger = Qwen35VisionPatchMerger(self.hidden_size, self.spatial_merge_size, config["out_hidden_size"], device=device, dtype=dtype, ops=ops)
|
||||
self.deepstack_visual_indexes = [] # DeepStack, per-layer visual features (Qwen3-VL)
|
||||
self.deepstack_merger_list = None
|
||||
|
||||
def rot_pos_emb(self, grid_thw):
|
||||
merge_size = self.spatial_merge_size
|
||||
@ -664,9 +665,14 @@ class Qwen35VisionModel(nn.Module):
|
||||
).cumsum(dim=0, dtype=torch.int32)
|
||||
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
|
||||
optimized_attention = optimized_attention_for_device(x.device, mask=False, small_input=True)
|
||||
for blk in self.blocks:
|
||||
deepstack_features = []
|
||||
for layer_num, blk in enumerate(self.blocks):
|
||||
x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings, optimized_attention=optimized_attention)
|
||||
if self.deepstack_merger_list is not None and layer_num in self.deepstack_visual_indexes:
|
||||
deepstack_features.append(self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)](x))
|
||||
merged = self.merger(x)
|
||||
if self.deepstack_merger_list is not None:
|
||||
return merged, deepstack_features
|
||||
return merged
|
||||
|
||||
# Model Wrapper
|
||||
@ -690,30 +696,7 @@ class Qwen35(BaseLlama, BaseGenerate, torch.nn.Module):
|
||||
return None, None
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[], past_key_values=None):
|
||||
grid = None
|
||||
position_ids = None
|
||||
offset = 0
|
||||
for e in embeds_info:
|
||||
if e.get("type") == "image":
|
||||
grid = e.get("extra", None)
|
||||
start = e.get("index")
|
||||
if position_ids is None:
|
||||
position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
|
||||
position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
|
||||
end = e.get("size") + start
|
||||
len_max = int(grid.max()) // 2
|
||||
start_next = len_max + start
|
||||
position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
|
||||
position_ids[0, start:end] = start + offset
|
||||
max_d = int(grid[0][1]) // 2
|
||||
position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
|
||||
max_d = int(grid[0][2]) // 2
|
||||
position_ids[2, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
|
||||
offset += len_max - (end - start)
|
||||
|
||||
if grid is None:
|
||||
position_ids = None
|
||||
|
||||
position_ids = comfy.text_encoders.qwen_vl.qwen2vl_mrope_position_ids(embeds_info, embeds.shape[1], embeds.device)
|
||||
return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids, past_key_values=past_key_values)
|
||||
|
||||
def init_kv_cache(self, batch, max_cache_len, device, execution_dtype):
|
||||
|
||||
193
comfy/text_encoders/qwen3vl.py
Normal file
193
comfy/text_encoders/qwen3vl.py
Normal file
@ -0,0 +1,193 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers import Qwen2Tokenizer
|
||||
|
||||
from comfy import sd1_clip
|
||||
import comfy.text_encoders.qwen_vl
|
||||
from .qwen35 import Qwen35VisionModel
|
||||
from .llama import BaseLlama, BaseQwen3, BaseGenerate, Llama2_, Qwen3VL_4BConfig, Qwen3VL_8BConfig
|
||||
|
||||
|
||||
# Vision-tower hyper-parameters that differ between checkpoint sizes.
QWEN3VL_VISION = {
    "qwen3vl_4b": {
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "depth": 24,
        "deepstack_visual_indexes": [5, 11, 17],
    },
    "qwen3vl_8b": {
        "hidden_size": 1152,
        "intermediate_size": 4304,
        "depth": 27,
        "deepstack_visual_indexes": [8, 16, 24],
    },
}
# Vision-tower hyper-parameters shared by every checkpoint size.
QWEN3VL_VISION_COMMON = {
    "num_heads": 16,
    "patch_size": 16,
    "temporal_patch_size": 2,
    "in_channels": 3,
    "spatial_merge_size": 2,
    "num_position_embeddings": 2304,
}
|
||||
|
||||
# Text-decoder config class for each supported model_type string.
QWEN3VL_CONFIGS = {
    "qwen3vl_4b": Qwen3VL_4BConfig,
    "qwen3vl_8b": Qwen3VL_8BConfig,
}
|
||||
|
||||
|
||||
class Qwen3VLDeepstackMerger(nn.Module):
    """DeepStack merger: LayerNorm over the merged patch group, then a two-layer GELU MLP.

    Unlike the main merger, the norm here is "postshuffle": it runs on the
    already-merged vector of size hidden_size * spatial_merge_size**2.
    `ops` supplies the LayerNorm and Linear factories.
    """

    def __init__(self, hidden_size, spatial_merge_size, out_hidden_size, device=None, dtype=None, ops=None):
        super().__init__()
        merged_width = hidden_size * (spatial_merge_size ** 2)
        placement = {"device": device, "dtype": dtype}
        self.merge_dim = merged_width
        self.norm = ops.LayerNorm(merged_width, eps=1e-6, **placement)
        self.linear_fc1 = ops.Linear(merged_width, merged_width, **placement)
        self.linear_fc2 = ops.Linear(merged_width, out_hidden_size, **placement)

    def forward(self, x):
        # Regroup the input into rows of merge_dim values before normalizing.
        grouped = x.view(-1, self.merge_dim)
        normed = self.norm(grouped)
        activated = F.gelu(self.linear_fc1(normed))
        return self.linear_fc2(activated)
|
||||
|
||||
|
||||
class Qwen3VLVisionModel(Qwen35VisionModel):
    """Qwen3.5 vision tower plus one DeepStack merger per tapped vision layer."""

    def __init__(self, config, device=None, dtype=None, ops=None):
        super().__init__(config, device=device, dtype=dtype, ops=ops)
        tapped_layers = config["deepstack_visual_indexes"]
        self.deepstack_visual_indexes = tapped_layers
        # One merger per tapped layer, in the same order as the index list.
        self.deepstack_merger_list = nn.ModuleList(
            Qwen3VLDeepstackMerger(
                self.hidden_size,
                self.spatial_merge_size,
                config["out_hidden_size"],
                device=device,
                dtype=dtype,
                ops=ops,
            )
            for _ in tapped_layers
        )
|
||||
|
||||
|
||||
class Qwen3VL(BaseLlama, BaseQwen3, BaseGenerate, torch.nn.Module):
    """Qwen3-VL: a Llama2_ text decoder paired with a DeepStack-enabled vision tower."""

    # Selects the entry used in QWEN3VL_CONFIGS / QWEN3VL_VISION; subclasses override it.
    model_type = "qwen3vl_8b"

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        variant = self.model_type
        config = QWEN3VL_CONFIGS[variant](**config_dict)
        self.num_layers = config.num_hidden_layers
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)

        # Shared settings first, then the per-size ones; the vision output width
        # is set to the text hidden size.
        vision_config = dict(QWEN3VL_VISION_COMMON)
        vision_config.update(QWEN3VL_VISION[variant])
        vision_config["out_hidden_size"] = config.hidden_size
        self.visual = Qwen3VLVisionModel(vision_config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype

    def preprocess_embed(self, embed, device):
        """Run an image embed through the vision tower.

        Returns (merged_features, {"grid": ..., "deepstack": ...}) for image
        embeds and (None, None) for any other embed type.
        """
        if embed["type"] != "image":
            return None, None

        # Qwen3-VL normalizes to [-1, 1] (mean/std 0.5), unlike Qwen2.5-VL's CLIP normalization.
        half = [0.5, 0.5, 0.5]
        image, grid = comfy.text_encoders.qwen_vl.process_qwen2vl_images(embed["data"], patch_size=16, image_mean=half, image_std=[0.5, 0.5, 0.5])
        pixels = image.to(device, dtype=torch.float32)
        merged, deepstack = self.visual(pixels, grid)
        return merged, {"grid": grid, "deepstack": deepstack}

    def build_image_inputs(self, embeds, embeds_info):
        """Return (position_ids, visual_pos_masks, deepstack) for the prompt.

        All three are None when embeds_info holds no image entry.
        """
        image_entries = [info for info in embeds_info if info.get("type") == "image"]
        image_entries.sort(key=lambda info: info["index"])
        if len(image_entries) == 0:
            return None, None, None

        target_device = embeds.device
        seq_len = embeds.shape[1]
        position_ids = comfy.text_encoders.qwen_vl.qwen2vl_mrope_position_ids(embeds_info, seq_len, target_device)

        # DeepStack: boolean mask of image positions plus, per vision tap, the
        # features to inject there (concatenated across images in prompt order).
        visual_pos_masks = torch.zeros((1, seq_len), dtype=torch.bool, device=target_device)
        deepstack = None
        for entry in image_entries:
            first = entry["index"]
            last = entry["size"] + first
            visual_pos_masks[0, first:last] = True

            per_layer = entry["extra"]["deepstack"]
            if deepstack is not None:
                deepstack = [torch.cat([deepstack[layer], per_layer[layer]], dim=0) for layer in range(len(per_layer))]
            else:
                deepstack = list(per_layer)
        return position_ids, visual_pos_masks, deepstack
|
||||
|
||||
|
||||
def _make_qwen3vl_model(model_type):
    """Return a fresh Qwen3VL subclass whose class-level model_type is `model_type`."""
    selected_type = model_type

    class Qwen3VL_(Qwen3VL):
        # A differently named outer variable is needed here: inside a class body,
        # `model_type = model_type` would not see the enclosing function's argument.
        model_type = selected_type

    return Qwen3VL_
|
||||
|
||||
|
||||
class Qwen3VLClipModel(sd1_clip.SDClipModel):
    """SDClipModel wrapper around Qwen3-VL with a multimodal generate() entry point."""

    def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"):
        super().__init__(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config={},
            dtype=dtype,
            special_tokens={"pad": 151643},
            layer_norm_hidden_state=False,
            model_class=_make_qwen3vl_model(model_type),
            # The same flag both enables masking and asks for the masks to be returned.
            enable_attention_masks=attention_mask,
            return_attention_masks=attention_mask,
            model_options=model_options,
        )

    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=0.0):
        """Generate from tokenized input, passing image position ids and DeepStack features along."""
        if isinstance(tokens, dict):
            # Only the first entry of a tokenizer-output dict is used.
            tokens = next(iter(tokens.values()))

        # Strip the weights: keep only the first element of every token tuple.
        tokens_only = [[pair[0] for pair in row] for row in tokens]
        embeds, _, _, embeds_info = self.process_tokens(tokens_only, self.execution_device)
        position_ids, visual_pos_masks, deepstack = self.transformer.build_image_inputs(embeds, embeds_info)

        image_kwargs = {
            "position_ids": position_ids,
            "visual_pos_masks": visual_pos_masks,
            "deepstack_embeds": deepstack,
        }
        return self.transformer.generate(
            embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed,
            presence_penalty=presence_penalty,
            **image_kwargs,
        )
|
||||
|
||||
|
||||
class Qwen3VLTEModel(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper that builds a Qwen3VLClipModel for the chosen model_type."""

    def __init__(self, device="cpu", dtype=None, model_options={}, model_type="qwen3vl_8b"):
        def clip_model(**kw):
            # Forward whatever the parent passes and pin the model size.
            return Qwen3VLClipModel(**kw, model_type=model_type)

        super().__init__(
            device=device,
            dtype=dtype,
            name=model_type,
            clip_model=clip_model,
            model_options=model_options,
        )
|
||||
|
||||
|
||||
class Qwen3VLSDTokenizer(sd1_clip.SDTokenizer):
    """SDTokenizer set up with the tokenizer files in the "qwen25_tokenizer" directory next to this module."""

    def __init__(self, embedding_directory=None, tokenizer_data={}, embedding_size=4096, embedding_key="qwen3vl_8b"):
        module_dir = os.path.dirname(os.path.realpath(__file__))
        tokenizer_path = os.path.join(module_dir, "qwen25_tokenizer")
        super().__init__(
            tokenizer_path,
            pad_with_end=False,
            embedding_directory=embedding_directory,
            embedding_size=embedding_size,
            embedding_key=embedding_key,
            tokenizer_class=Qwen2Tokenizer,
            has_start_token=False,
            has_end_token=False,
            pad_to_max_length=False,
            max_length=99999999,
            min_length=1,
            pad_token=151643,
            tokenizer_data=tokenizer_data,
        )
|
||||
|
||||
|
||||
class Qwen3VLTokenizer(sd1_clip.SD1Tokenizer):
    """Chat-template tokenizer for Qwen3-VL that swaps <|image_pad|> tokens for image embeds."""

    def __init__(self, embedding_directory=None, tokenizer_data={}, model_type="qwen3vl_8b"):
        # The 4B model uses a 2560-wide embedding; every other model_type gets 4096.
        if model_type == "qwen3vl_4b":
            embedding_size = 2560
        else:
            embedding_size = 4096

        def tokenizer(*a, **kw):
            return Qwen3VLSDTokenizer(*a, **kw, embedding_size=embedding_size, embedding_key=model_type)

        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name=model_type, tokenizer=tokenizer)
        self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
        self.llama_template_images = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
        """Wrap `text` in the chat template, tokenize it, and attach images to image-pad tokens.

        Text that already starts with '<|im_start|>' is used verbatim. Otherwise
        the template is llama_template if given, else the plain or image template
        depending on whether any images are present.
        """
        batched_image = kwargs.get("image", None)
        if batched_image is not None and len(images) == 0:
            # Split a batched `image` kwarg into one single-item slice per image.
            images = [batched_image[i:i + 1] for i in range(batched_image.shape[0])]

        skip_template = text.startswith('<|im_start|>')
        if prevent_empty_text and text == '':
            text = ' '

        if skip_template:
            llama_text = text
        else:
            if llama_template is not None:
                template = llama_template
            elif len(images) == 0:
                template = self.llama_template
            else:
                template = self.llama_template_images
            # NOTE(review): the source this was reconstructed from had lost its
            # indentation. The multi-image expansion is assumed to apply to whichever
            # template was chosen, and the think block is assumed to be appended only
            # when a template is applied. Confirm both against the upstream file.
            if len(images) > 1:
                vision_block = "<|vision_start|><|image_pad|><|vision_end|>"
                # Repeat the first vision block once per image.
                template = template.replace(vision_block, vision_block * len(images), 1)
            llama_text = template.format(text)
            if not thinking:  # Qwen3 convention: empty think block suppresses reasoning
                llama_text += "<think>\n\n</think>\n\n"

        tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
        key_name = next(iter(tokens))
        embed_count = 0
        for row in tokens[key_name]:
            for pos, entry in enumerate(row):
                if entry[0] != 151655:  # only <|image_pad|> tokens are replaced
                    continue
                if len(images) > embed_count:
                    image_embed = {"type": "image", "data": images[embed_count], "original_type": "image"}
                    row[pos] = (image_embed,) + entry[1:]
                    embed_count += 1
        return tokens
|
||||
|
||||
|
||||
def tokenizer(model_type="qwen3vl_8b"):
    """Return a Qwen3VLTokenizer subclass with `model_type` fixed in its constructor."""
    bound_type = model_type

    class Qwen3VLTokenizer_(Qwen3VLTokenizer):
        def __init__(self, embedding_directory=None, tokenizer_data={}):
            super().__init__(
                embedding_directory=embedding_directory,
                tokenizer_data=tokenizer_data,
                model_type=bound_type,
            )

    return Qwen3VLTokenizer_
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None, model_type="qwen3vl_8b"):
    """Return a Qwen3VLTEModel subclass with dtype, quantization and model size baked in.

    dtype_llama, when given, replaces the dtype passed to the constructor.
    llama_quantization_metadata, when given, is stored under
    "quantization_metadata" in a copy of model_options.
    """

    class Qwen3VLTEModel_(Qwen3VLTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            effective_dtype = dtype if dtype_llama is None else dtype_llama
            options = model_options
            if llama_quantization_metadata is not None:
                # Copy first so the caller's dict (or the shared default) is never mutated.
                options = model_options.copy()
                options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=effective_dtype, model_options=options, model_type=model_type)

    return Qwen3VLTEModel_
|
||||
@ -88,6 +88,32 @@ def process_qwen2vl_images(
|
||||
return flatten_patches, image_grid_thw
|
||||
|
||||
|
||||
def qwen2vl_mrope_position_ids(embeds_info, seq_len, device):
    """Build the (3, seq_len) T/H/W MRoPE position ids for a prompt.

    Text positions run sequentially; each image span gets positions derived
    from its grid. An entry's `extra` is either the image grid_thw itself or
    a dict carrying it under "grid".

    Returns None when embeds_info contains no image entry.
    """
    pos = None
    # Running correction applied after each image: the positions the image
    # advances by (len_max) minus the number of tokens it occupies.
    shift = 0
    for info in embeds_info:
        if info.get("type") != "image":
            continue

        extra = info.get("extra", None)
        grid = extra["grid"] if isinstance(extra, dict) else extra
        start = info.get("index")
        if pos is None:
            # First image: allocate and number the text that precedes it.
            pos = torch.zeros((3, seq_len), device=device)
            pos[:, :start] = torch.arange(0, start, device=device)
        end = info.get("size") + start
        span = end - start

        # Everything after the image resumes at start + len_max (plus the running shift).
        len_max = int(grid.max()) // 2
        resume = len_max + start + shift
        pos[:, end:] = torch.arange(resume, resume + (seq_len - end), device=device)

        base = start + shift
        pos[0, start:end] = base  # first axis: one constant value across the span

        # Second axis: each value is repeated in a run (b, b, ..., b+1, b+1, ...).
        count = int(grid[0][1]) // 2
        runs = torch.arange(base, base + count, device=device).repeat_interleave(math.ceil(span / count))
        pos[1, start:end] = runs[:span]

        # Third axis: the whole range is tiled (b, b+1, ..., b, b+1, ...).
        count = int(grid[0][2]) // 2
        tiles = torch.arange(base, base + count, device=device).repeat(math.ceil(span / count))
        pos[2, start:end] = tiles[:span]

        shift += len_max - span
    return pos
|
||||
|
||||
|
||||
class VisionPatchEmbed(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@ -11,7 +11,7 @@ class TextEncodeAceStepAudio(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="TextEncodeAceStepAudio",
|
||||
category="model/conditioning",
|
||||
category="model/conditioning/ace",
|
||||
inputs=[
|
||||
IO.Clip.Input("clip"),
|
||||
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
@ -33,7 +33,7 @@ class TextEncodeAceStepAudio15(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="TextEncodeAceStepAudio1.5",
|
||||
category="model/conditioning",
|
||||
category="model/conditioning/ace",
|
||||
inputs=[
|
||||
IO.Clip.Input("clip"),
|
||||
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
@ -67,7 +67,7 @@ class EmptyAceStepLatentAudio(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="EmptyAceStepLatentAudio",
|
||||
display_name="Empty Ace Step 1.0 Latent Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ace",
|
||||
inputs=[
|
||||
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
|
||||
IO.Int.Input(
|
||||
@ -90,7 +90,7 @@ class EmptyAceStep15LatentAudio(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="EmptyAceStep1.5LatentAudio",
|
||||
display_name="Empty Ace Step 1.5 Latent Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ace",
|
||||
inputs=[
|
||||
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
|
||||
IO.Int.Input(
|
||||
@ -111,8 +111,8 @@ class ReferenceAudio(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="ReferenceTimbreAudio",
|
||||
display_name="Reference Audio",
|
||||
category="advanced/conditioning/audio",
|
||||
display_name="Set Reference Audio",
|
||||
category="model/conditioning",
|
||||
is_experimental=True,
|
||||
description="This node sets the reference audio for ace step 1.5",
|
||||
inputs=[
|
||||
|
||||
@ -16,7 +16,7 @@ class APG(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="APG",
|
||||
display_name="Adaptive Projected Guidance",
|
||||
category="model/sampling/custom_sampling",
|
||||
category="model/sampling/custom",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input(
|
||||
|
||||
@ -19,7 +19,7 @@ class EmptyARVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptyARVideoLatent",
|
||||
category="model/latent/video",
|
||||
category="model/latent/autoregressive",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=832, min=16, max=8192, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=8192, step=16),
|
||||
@ -85,7 +85,7 @@ class ARVideoI2V(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ARVideoI2V",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/autoregressive",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Vae.Input("vae"),
|
||||
|
||||
@ -16,7 +16,7 @@ class EmptyLatentAudio(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="EmptyLatentAudio",
|
||||
display_name="Empty Latent Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent",
|
||||
essentials_category="Audio",
|
||||
inputs=[
|
||||
IO.Float.Input("seconds", default=47.6, min=1.0, max=1000.0, step=0.1),
|
||||
@ -41,7 +41,7 @@ class ConditioningStableAudio(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="ConditioningStableAudio",
|
||||
category="model/conditioning",
|
||||
category="model/conditioning/stable audio",
|
||||
inputs=[
|
||||
IO.Conditioning.Input("positive"),
|
||||
IO.Conditioning.Input("negative"),
|
||||
@ -70,7 +70,7 @@ class VAEEncodeAudio(IO.ComfyNode):
|
||||
node_id="VAEEncodeAudio",
|
||||
search_aliases=["audio to latent"],
|
||||
display_name="VAE Encode Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
IO.Audio.Input("audio"),
|
||||
IO.Vae.Input("vae"),
|
||||
@ -115,7 +115,7 @@ class VAEDecodeAudio(IO.ComfyNode):
|
||||
node_id="VAEDecodeAudio",
|
||||
search_aliases=["latent to audio"],
|
||||
display_name="VAE Decode Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
IO.Latent.Input("samples"),
|
||||
IO.Vae.Input("vae"),
|
||||
@ -137,7 +137,7 @@ class VAEDecodeAudioTiled(IO.ComfyNode):
|
||||
node_id="VAEDecodeAudioTiled",
|
||||
search_aliases=["latent to audio"],
|
||||
display_name="VAE Decode Audio (Tiled)",
|
||||
category="model/latent/audio",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
IO.Latent.Input("samples"),
|
||||
IO.Vae.Input("vae"),
|
||||
|
||||
@ -39,9 +39,9 @@ class BerniniConditioning(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="BerniniConditioning",
|
||||
display_name="Bernini Conditioning",
|
||||
category="conditioning/video_models",
|
||||
category="model/conditioning/bernini",
|
||||
description="Conditioning node for Bernini in-context video/image conditioning. It can be used for the following tasks: t2v (text-to-video), v2v (video-to-video), rv2v (reference-guided video editing), r2v (reference-to-video), ads2v (insert image/video into video)."
|
||||
"Reference images injected as in-context tokens (r2v, rv2v) are encoded independently at their own native aspect ratio (long edge capped at ref_max_size)",
|
||||
"Reference images injected as in-context tokens (r2v, rv2v) are encoded independently at their own native aspect ratio (long edge capped at ref_max_size)",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -50,14 +50,11 @@ class BerniniConditioning(io.ComfyNode):
|
||||
io.Int.Input("height", default=480, min=16, max=8192, step=16),
|
||||
io.Int.Input("length", default=81, min=1, max=8192, step=4),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
io.Image.Input("source_video", optional=True, tooltip=(
|
||||
"Source video to edit or restyle (v2v, rv2v). Resized to width/height and trimmed to length.")),
|
||||
io.Image.Input("reference_video", optional=True, tooltip=(
|
||||
"Video to insert into the source video (ads2v).")),
|
||||
io.Image.Input("source_video", optional=True, tooltip=("Source video to edit or restyle (v2v, rv2v). Resized to width/height and trimmed to length.")),
|
||||
io.Image.Input("reference_video", optional=True, tooltip=("Video to insert into the source video (ads2v).")),
|
||||
io.Autogrow.Input("reference_images", optional=True,
|
||||
template=io.Autogrow.TemplatePrefix(
|
||||
input=io.Image.Input("reference_image", tooltip=(
|
||||
"Reference image injected as an in-context token (r2v, rv2v).")),
|
||||
input=io.Image.Input("reference_image", tooltip=("Reference image injected as an in-context token (r2v, rv2v).")),
|
||||
prefix="reference_image_", min=0, max=8)),
|
||||
io.Int.Input("ref_max_size", default=848, min=16, max=8192, step=16, optional=True, tooltip=(
|
||||
"Max size for the long edge of reference_video and reference_images. Resized with preserved aspect ratio and snapped to 16px.")),
|
||||
@ -70,10 +67,8 @@ class BerniniConditioning(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, positive, negative, vae, width, height, length, batch_size,
|
||||
source_video=None, reference_video=None, reference_images=None, ref_max_size=848) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8],
|
||||
device=comfy.model_management.intermediate_device())
|
||||
def execute(cls, positive, negative, vae, width, height, length, batch_size, source_video=None, reference_video=None, reference_images=None, ref_max_size=848) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
|
||||
# source_video (1), reference_video (2), reference_images (3, 4, ...).
|
||||
context = []
|
||||
@ -106,9 +101,7 @@ class BerniniConditioning(io.ComfyNode):
|
||||
class BerniniExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
BerniniConditioning,
|
||||
]
|
||||
return [BerniniConditioning,]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> BerniniExtension:
|
||||
|
||||
@ -153,7 +153,7 @@ class WanCameraEmbedding(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanCameraEmbedding",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/camera",
|
||||
inputs=[
|
||||
io.Combo.Input(
|
||||
"camera_pose",
|
||||
|
||||
@ -13,7 +13,7 @@ class EmptyChromaRadianceLatentImage(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="EmptyChromaRadianceLatentImage",
|
||||
category="model/latent/chroma_radiance",
|
||||
category="model/latent/chroma radiance",
|
||||
inputs=[
|
||||
io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -33,7 +33,7 @@ class ChromaRadianceOptions(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="ChromaRadianceOptions",
|
||||
category="model/patch/chroma_radiance",
|
||||
category="model/patch/chroma radiance",
|
||||
description="Allows setting advanced options for the Chroma Radiance model.",
|
||||
inputs=[
|
||||
io.Model.Input(id="model"),
|
||||
|
||||
@ -9,7 +9,8 @@ class CLIPTextEncodeSDXLRefiner(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeSDXLRefiner",
|
||||
category="advanced/conditioning",
|
||||
display_name="CLIP Text Encode (SDXL Refiner)",
|
||||
category="model/conditioning/stable diffusion",
|
||||
inputs=[
|
||||
io.Float.Input("ascore", default=6.0, min=0.0, max=1000.0, step=0.01),
|
||||
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),
|
||||
@ -30,7 +31,8 @@ class CLIPTextEncodeSDXL(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeSDXL",
|
||||
category="advanced/conditioning",
|
||||
display_name="CLIP Text Encode (SDXL)",
|
||||
category="model/conditioning/stable diffusion",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),
|
||||
|
||||
@ -66,6 +66,7 @@ class WanContextWindowsManualNode(ContextWindowsManualNode):
|
||||
schema.node_id = "WanContextWindowsManual"
|
||||
schema.display_name = "WAN Context Windows (Manual)"
|
||||
schema.description = "Manually set context windows for WAN-like models (dim=2)."
|
||||
schema.category="model/patch/wan"
|
||||
schema.inputs = [
|
||||
io.Model.Input("model", tooltip="The model to apply context windows to during sampling."),
|
||||
io.Int.Input("context_length", min=1, max=nodes.MAX_RESOLUTION, step=4, default=81, tooltip="The length of the context window.", advanced=True),
|
||||
|
||||
@ -9,6 +9,8 @@ class SetUnionControlNetType(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SetUnionControlNetType",
|
||||
search_aliases=["set controlnet type", "union controlnet type"],
|
||||
display_name="Set Union ControlNet Type",
|
||||
category="model/conditioning/controlnet",
|
||||
inputs=[
|
||||
io.ControlNet.Input("control_net"),
|
||||
@ -39,6 +41,7 @@ class ControlNetInpaintingAliMamaApply(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="ControlNetInpaintingAliMamaApply",
|
||||
search_aliases=["masked controlnet"],
|
||||
display_name="Apply ControlNet Inpainting (AliMama)",
|
||||
category="model/conditioning/controlnet",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
|
||||
@ -13,7 +13,7 @@ class EmptyCosmosLatentVideo(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="EmptyCosmosLatentVideo",
|
||||
category="model/latent/video",
|
||||
category="model/latent/cosmos",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -45,7 +45,7 @@ class CosmosImageToVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CosmosImageToVideoLatent",
|
||||
category="model/conditioning/inpaint",
|
||||
category="model/conditioning/cosmos",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -88,7 +88,7 @@ class CosmosPredict2ImageToVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CosmosPredict2ImageToVideoLatent",
|
||||
category="model/conditioning/inpaint",
|
||||
category="model/conditioning/cosmos",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
|
||||
@ -729,7 +729,7 @@ class SamplerCustom(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SamplerCustom",
|
||||
category="model/sampling/custom_sampling",
|
||||
category="model/sampling/custom",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Boolean.Input("add_noise", default=True, advanced=True),
|
||||
@ -1015,7 +1015,7 @@ class SamplerCustomAdvanced(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SamplerCustomAdvanced",
|
||||
category="model/sampling/custom_sampling",
|
||||
category="model/sampling/custom",
|
||||
inputs=[
|
||||
io.Noise.Input("noise"),
|
||||
io.Guider.Input("guider"),
|
||||
@ -1143,7 +1143,7 @@ class CFGOverride(io.ComfyNode):
|
||||
display_name="CFG Override",
|
||||
description="Override cfg to a fixed value over a [start, end] percent (sigma) range. "
|
||||
"With multiple overrides, the one nearest the sampler wins on overlap.",
|
||||
category="sampling/custom_sampling",
|
||||
category="model/sampling/guiders",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input("cfg", default=1.0, min=0.0, max=100.0, step=0.1, round=0.01),
|
||||
|
||||
@ -363,7 +363,7 @@ class EasyCacheNode(io.ComfyNode):
|
||||
node_id="EasyCache",
|
||||
display_name="EasyCache",
|
||||
description="Native EasyCache implementation.",
|
||||
category="advanced/debug/model",
|
||||
category="advanced/debug",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
io.Model.Input("model", tooltip="The model to add EasyCache to."),
|
||||
@ -496,7 +496,7 @@ class LazyCacheNode(io.ComfyNode):
|
||||
node_id="LazyCache",
|
||||
display_name="LazyCache",
|
||||
description="A homebrew version of EasyCache - even 'easier' version of EasyCache to implement. Overall works worse than EasyCache, but better in some rare cases AND universal compatibility with everything in ComfyUI.",
|
||||
category="advanced/debug/model",
|
||||
category="advanced/debug",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
io.Model.Input("model", tooltip="The model to add LazyCache to."),
|
||||
|
||||
@ -8,7 +8,8 @@ class ReferenceLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ReferenceLatent",
|
||||
category="advanced/conditioning/edit_models",
|
||||
display_name="Set Reference Latent",
|
||||
category="model/conditioning",
|
||||
description="This node sets the guiding latent for an edit model. If the model supports it you can chain multiple to set multiple reference images.",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
|
||||
@ -13,7 +13,7 @@ class CLIPTextEncodeFlux(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeFlux",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
|
||||
@ -40,7 +40,7 @@ class EmptyFlux2LatentImage(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="EmptyFlux2LatentImage",
|
||||
display_name="Empty Flux 2 Latent",
|
||||
category="model/latent",
|
||||
category="model/latent/flux",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -61,7 +61,7 @@ class FluxGuidance(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="FluxGuidance",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.Float.Input("guidance", default=3.5, min=0.0, max=100.0, step=0.1),
|
||||
@ -84,7 +84,7 @@ class FluxDisableGuidance(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="FluxDisableGuidance",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
description="This node completely disables the guidance embed on Flux and Flux like models",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
@ -128,7 +128,7 @@ class FluxKontextImageScale(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="FluxKontextImageScale",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
description="This node resizes the image to one that is more optimal for flux kontext.",
|
||||
inputs=[
|
||||
io.Image.Input("image"),
|
||||
@ -156,7 +156,7 @@ class FluxKontextMultiReferenceLatentMethod(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="FluxKontextMultiReferenceLatentMethod",
|
||||
display_name="Edit Model Reference Method",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.Combo.Input(
|
||||
|
||||
@ -11,8 +11,9 @@ class QuadrupleCLIPLoader(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="QuadrupleCLIPLoader",
|
||||
category="advanced/loaders",
|
||||
description="[Recipes]\n\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct",
|
||||
display_name="Load CLIP (Quadruple)",
|
||||
category="model/loaders",
|
||||
description="Recipes:\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct",
|
||||
inputs=[
|
||||
io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")),
|
||||
io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")),
|
||||
@ -38,8 +39,9 @@ class CLIPTextEncodeHiDream(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeHiDream",
|
||||
display_name="CLIP Text Encode (HiDream)",
|
||||
search_aliases=["hidream prompt"],
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/hidream",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
|
||||
|
||||
@ -14,7 +14,7 @@ class EmptyHiDreamO1LatentImage(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="EmptyHiDreamO1LatentImage",
|
||||
display_name="Empty HiDream-O1 Latent Image",
|
||||
category="model/latent/image",
|
||||
category="model/latent/hidream",
|
||||
description=(
|
||||
"Empty pixel-space latent for HiDream-O1-Image. The model was "
|
||||
"trained at ~4 megapixels; lower resolutions go off-distribution "
|
||||
@ -47,7 +47,7 @@ class HiDreamO1ReferenceImages(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HiDreamO1ReferenceImages",
|
||||
display_name="HiDream-O1 Reference Images",
|
||||
category="model/conditioning/image",
|
||||
category="model/conditioning/hidream",
|
||||
description=(
|
||||
"Attach 1-10 reference images to conditioning, one for edit instruction "
|
||||
"or multiple for subject-driven personalization."
|
||||
@ -117,7 +117,7 @@ class HiDreamO1PatchSeamSmoothing(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HiDreamO1PatchSeamSmoothing",
|
||||
display_name="HiDream-O1 Patch Seam Smoothing",
|
||||
category="advanced/model",
|
||||
category="model/patch/hidream",
|
||||
is_experimental=True,
|
||||
description=(
|
||||
"Average the model output across multiple shifted patch-grid "
|
||||
|
||||
@ -14,7 +14,8 @@ class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeHunyuanDiT",
|
||||
category="advanced/conditioning",
|
||||
display_name="CLIP Text Encode (Hunyuan Image)",
|
||||
category="model/conditioning/hunyuan image",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("bert", multiline=True, dynamic_prompts=True),
|
||||
@ -41,7 +42,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="EmptyHunyuanLatentVideo",
|
||||
display_name="Empty HunyuanVideo 1.0 Latent",
|
||||
category="model/latent/video",
|
||||
category="model/latent/hunyuan video",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -67,6 +68,7 @@ class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo):
|
||||
schema = super().define_schema()
|
||||
schema.node_id = "EmptyHunyuanVideo15Latent"
|
||||
schema.display_name = "Empty HunyuanVideo 1.5 Latent"
|
||||
schema.category = "model/latent/hunyuan video"
|
||||
return schema
|
||||
|
||||
@classmethod
|
||||
@ -81,7 +83,7 @@ class HunyuanVideo15ImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="HunyuanVideo15ImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -132,7 +134,7 @@ class HunyuanVideo15SuperResolution(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HunyuanVideo15SuperResolution",
|
||||
display_name="Hunyuan Video 1.5 Super Resolution",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -227,7 +229,7 @@ class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HunyuanVideo15LatentUpscaleWithModel",
|
||||
display_name="Hunyuan Video 15 Latent Upscale With Model",
|
||||
category="model/latent",
|
||||
category="model/latent/hunyuan video",
|
||||
inputs=[
|
||||
io.LatentUpscaleModel.Input("model"),
|
||||
io.Latent.Input("samples"),
|
||||
@ -276,7 +278,7 @@ class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TextEncodeHunyuanVideo_ImageToVideo",
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.ClipVisionOutput.Input("clip_vision_output"),
|
||||
@ -308,7 +310,7 @@ class HunyuanImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="HunyuanImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Vae.Input("vae"),
|
||||
@ -359,7 +361,7 @@ class EmptyHunyuanImageLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptyHunyuanImageLatent",
|
||||
category="model/latent",
|
||||
category="model/latent/hunyuan image",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||
io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||
@ -384,7 +386,7 @@ class HunyuanRefinerLatent(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HunyuanRefinerLatent",
|
||||
display_name="Hunyuan Latent Refiner",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
|
||||
@ -12,7 +12,7 @@ class EmptyLatentHunyuan3Dv2(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="EmptyLatentHunyuan3Dv2",
|
||||
category="model/latent/3d",
|
||||
category="model/latent/hunyuan 3d",
|
||||
inputs=[
|
||||
IO.Int.Input("resolution", default=3072, min=1, max=8192),
|
||||
IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
|
||||
@ -35,7 +35,7 @@ class Hunyuan3Dv2Conditioning(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="Hunyuan3Dv2Conditioning",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/hunyuan 3d",
|
||||
inputs=[
|
||||
IO.ClipVisionOutput.Input("clip_vision_output"),
|
||||
],
|
||||
@ -60,7 +60,7 @@ class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="Hunyuan3Dv2ConditioningMultiView",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/hunyuan 3d",
|
||||
inputs=[
|
||||
IO.ClipVisionOutput.Input("front", optional=True),
|
||||
IO.ClipVisionOutput.Input("left", optional=True),
|
||||
@ -97,7 +97,7 @@ class VAEDecodeHunyuan3D(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="VAEDecodeHunyuan3D",
|
||||
category="model/latent/3d",
|
||||
category="model/latent/hunyuan 3d",
|
||||
inputs=[
|
||||
IO.Latent.Input("samples"),
|
||||
IO.Vae.Input("vae"),
|
||||
|
||||
@ -38,7 +38,7 @@ class Ideogram4Scheduler(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="Ideogram4Scheduler",
|
||||
display_name="Ideogram 4 Scheduler",
|
||||
category="sampling/custom_sampling/schedulers",
|
||||
category="model/sampling/schedulers",
|
||||
inputs=[
|
||||
io.Int.Input("steps", default=20, min=1, max=200),
|
||||
io.Int.Input("width", default=1024, min=256, max=8192, step=16),
|
||||
|
||||
@ -13,7 +13,7 @@ class Kandinsky5ImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="Kandinsky5ImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/kandinsky",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -71,7 +71,7 @@ class NormalizeVideoLatentStart(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="NormalizeVideoLatentStart",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning",
|
||||
description="Normalizes the initial frames of a video latent to match the mean and standard deviation of subsequent reference frames. Helps reduce differences between the starting frames and the rest of the video.",
|
||||
inputs=[
|
||||
io.Latent.Input("latent"),
|
||||
@ -104,8 +104,9 @@ class CLIPTextEncodeKandinsky5(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeKandinsky5",
|
||||
display_name="CLIP Text Encode (Kandinsky 5)",
|
||||
search_aliases=["kandinsky prompt"],
|
||||
category="advanced/conditioning/kandinsky5",
|
||||
category="model/conditioning/kandinsky",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
|
||||
|
||||
@ -262,6 +262,7 @@ class LatentBatch(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="LatentBatch",
|
||||
search_aliases=["combine latents", "merge latents", "join latents"],
|
||||
display_name="Batch Latents (DEPRECATED)",
|
||||
category="model/latent/batch",
|
||||
is_deprecated=True,
|
||||
inputs=[
|
||||
@ -447,6 +448,7 @@ class ReplaceVideoLatentFrames(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ReplaceVideoLatentFrames",
|
||||
display_name="Replace Video Latent Frames",
|
||||
category="model/latent/batch",
|
||||
inputs=[
|
||||
io.Latent.Input("destination", tooltip="The destination latent where frames will be replaced."),
|
||||
|
||||
@ -25,7 +25,7 @@ class GetICLoRAParameters(io.ComfyNode):
|
||||
display_name="Get IC-LoRA Parameters",
|
||||
description="Extracts IC-LoRA parameters from the safetensors metadata of a LoRA-loaded "
|
||||
"model and outputs them for LTXVAddGuide (eg. reference_downscale_factor).",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
search_aliases=["ic-lora", "ic lora", "iclora", "downscale factor", "reference downscale"],
|
||||
inputs=[
|
||||
io.Model.Input(
|
||||
@ -62,7 +62,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptyLTXVLatentVideo",
|
||||
category="model/latent/video/ltxv",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||
io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||
@ -86,7 +86,7 @@ class LTXVImgToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVImgToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -131,7 +131,7 @@ class LTXVImgToVideoInplace(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVImgToVideoInplace",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Image.Input("image"),
|
||||
@ -251,7 +251,7 @@ class LTXVAddGuide(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVAddGuide",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -498,7 +498,7 @@ class LTXVCropGuides(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVCropGuides",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -542,7 +542,7 @@ class LTXVConditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVConditioning",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -566,7 +566,7 @@ class ModelSamplingLTXV(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ModelSamplingLTXV",
|
||||
category="advanced/model",
|
||||
category="model/patch/ltxv",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01),
|
||||
@ -746,7 +746,7 @@ class LTXVConcatAVLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVConcatAVLatent",
|
||||
category="model/latent/video/ltxv",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Latent.Input("video_latent"),
|
||||
io.Latent.Input("audio_latent"),
|
||||
@ -781,7 +781,7 @@ class LTXVSeparateAVLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVSeparateAVLatent",
|
||||
category="model/latent/video/ltxv",
|
||||
category="model/latent/ltxv",
|
||||
description="LTXV Separate AV Latent",
|
||||
inputs=[
|
||||
io.Latent.Input("av_latent"),
|
||||
@ -814,7 +814,7 @@ class LTXVReferenceAudio(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="LTXVReferenceAudio",
|
||||
display_name="LTXV Reference Audio (ID-LoRA)",
|
||||
category="model/conditioning/audio",
|
||||
category="model/conditioning/ltxv",
|
||||
description="Set reference audio for ID-LoRA speaker identity transfer. Encodes a reference audio clip into the conditioning and optionally patches the model with identity guidance (extra forward pass without reference, amplifying the speaker identity effect).",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
|
||||
@ -40,7 +40,7 @@ class LTXVAudioVAEEncode(VAEEncodeAudio):
|
||||
return io.Schema(
|
||||
node_id="LTXVAudioVAEEncode",
|
||||
display_name="LTXV Audio VAE Encode",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Audio.Input("audio", tooltip="The audio to be encoded."),
|
||||
io.Vae.Input(
|
||||
@ -63,7 +63,7 @@ class LTXVAudioVAEDecode(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="LTXVAudioVAEDecode",
|
||||
display_name="LTXV Audio VAE Decode",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Latent.Input("samples", tooltip="The latent to be decoded."),
|
||||
io.Vae.Input(
|
||||
@ -96,7 +96,7 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="LTXVEmptyLatentAudio",
|
||||
display_name="LTXV Empty Latent Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Int.Input(
|
||||
"frames_number",
|
||||
@ -168,9 +168,9 @@ class LTXAVTextEncoderLoader(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="LTXAVTextEncoderLoader",
|
||||
display_name="LTXV Audio Text Encoder Loader",
|
||||
category="advanced/loaders",
|
||||
description="[Recipes]\n\nltxav: gemma 3 12B",
|
||||
display_name="Load LTXV Audio Text Encoder",
|
||||
category="model/loaders",
|
||||
description="Recipes:\nltxav: gemma 3 12B",
|
||||
inputs=[
|
||||
io.Combo.Input(
|
||||
"text_encoder",
|
||||
|
||||
@ -13,7 +13,7 @@ class LTXVLatentUpsampler(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="LTXVLatentUpsampler",
|
||||
category="model/latent/video",
|
||||
category="model/latent/ltxv",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
IO.Latent.Input("samples"),
|
||||
|
||||
@ -9,7 +9,7 @@ class RenormCFG(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="RenormCFG",
|
||||
category="advanced/model",
|
||||
category="model/patch",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input("cfg_trunc", default=100, min=0.0, max=100.0, step=0.01, advanced=True),
|
||||
@ -80,8 +80,8 @@ class CLIPTextEncodeLumina2(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeLumina2",
|
||||
search_aliases=["lumina prompt"],
|
||||
display_name="CLIP Text Encode for Lumina2",
|
||||
category="model/conditioning",
|
||||
display_name="CLIP Text Encode (Lumina 2)",
|
||||
category="model/conditioning/lumina",
|
||||
description="Encodes a system prompt and a user prompt using a CLIP model into an embedding "
|
||||
"that can be used to guide the diffusion model towards generating specific images.",
|
||||
inputs=[
|
||||
|
||||
@ -53,6 +53,7 @@ class LatentCompositeMasked(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="LatentCompositeMasked",
|
||||
search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"],
|
||||
display_name="Latent Composite Masked",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
IO.Latent.Input("destination"),
|
||||
|
||||
@ -10,7 +10,7 @@ class EmptyMochiLatentVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptyMochiLatentVideo",
|
||||
category="model/latent/video",
|
||||
category="model/latent/mochi",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
|
||||
@ -59,7 +59,7 @@ class ModelSamplingDiscrete:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, sampling, zsnr):
|
||||
m = model.clone()
|
||||
@ -97,7 +97,7 @@ class ModelSamplingStableCascade:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch/stable cascade"
|
||||
|
||||
def patch(self, model, shift):
|
||||
m = model.clone()
|
||||
@ -123,7 +123,7 @@ class ModelSamplingSD3:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch/stable diffusion"
|
||||
|
||||
def patch(self, model, shift, multiplier=1000):
|
||||
m = model.clone()
|
||||
@ -150,6 +150,7 @@ class ModelSamplingAuraFlow(ModelSamplingSD3):
|
||||
}}
|
||||
|
||||
FUNCTION = "patch_aura"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch_aura(self, model, shift):
|
||||
return self.patch(model, shift, multiplier=1.0)
|
||||
@ -167,7 +168,7 @@ class ModelSamplingFlux:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch/flux"
|
||||
|
||||
def patch(self, model, max_shift, base_shift, width, height):
|
||||
m = model.clone()
|
||||
@ -202,7 +203,7 @@ class ModelSamplingContinuousEDM:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, sampling, sigma_max, sigma_min):
|
||||
m = model.clone()
|
||||
@ -247,7 +248,7 @@ class ModelSamplingContinuousV:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, sampling, sigma_max, sigma_min):
|
||||
m = model.clone()
|
||||
@ -273,7 +274,7 @@ class RescaleCFG:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, multiplier):
|
||||
def rescale_cfg(args):
|
||||
@ -314,7 +315,7 @@ class ModelNoiseScale:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, noise_scale):
|
||||
m = model.clone()
|
||||
@ -337,7 +338,7 @@ class ModelComputeDtype:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/debug/model"
|
||||
CATEGORY = "advanced/debug"
|
||||
|
||||
def patch(self, model, dtype):
|
||||
m = model.clone()
|
||||
|
||||
@ -21,7 +21,7 @@ class ModelMergeSimple:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, model1, model2, ratio):
|
||||
m = model1.clone()
|
||||
@ -40,7 +40,7 @@ class ModelSubtract:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, model1, model2, multiplier):
|
||||
m = model1.clone()
|
||||
@ -58,7 +58,7 @@ class ModelAdd:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, model1, model2):
|
||||
m = model1.clone()
|
||||
@ -78,7 +78,7 @@ class CLIPMergeSimple:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, clip1, clip2, ratio):
|
||||
m = clip1.clone()
|
||||
@ -101,7 +101,7 @@ class CLIPSubtract:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, clip1, clip2, multiplier):
|
||||
m = clip1.clone()
|
||||
@ -123,7 +123,7 @@ class CLIPAdd:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, clip1, clip2):
|
||||
m = clip1.clone()
|
||||
@ -147,7 +147,7 @@ class ModelMergeBlocks:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, model1, model2, **kwargs):
|
||||
m = model1.clone()
|
||||
@ -242,7 +242,7 @@ class CheckpointSave:
|
||||
FUNCTION = "save"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def save(self, model, clip, vae, filename_prefix, prompt=None, extra_pnginfo=None):
|
||||
save_checkpoint(model, clip=clip, vae=vae, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)
|
||||
@ -261,7 +261,7 @@ class CLIPSave:
|
||||
FUNCTION = "save"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def save(self, clip, filename_prefix, prompt=None, extra_pnginfo=None):
|
||||
prompt_info = ""
|
||||
@ -318,7 +318,7 @@ class VAESave:
|
||||
FUNCTION = "save"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def save(self, vae, filename_prefix, prompt=None, extra_pnginfo=None):
|
||||
full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
|
||||
@ -353,7 +353,7 @@ class ModelSave:
|
||||
FUNCTION = "save"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def save(self, model, filename_prefix, prompt=None, extra_pnginfo=None):
|
||||
save_checkpoint(model, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import comfy_extras.nodes_model_merging
|
||||
|
||||
class ModelMergeSD1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
arg_dict = { "model1": ("MODEL",),
|
||||
@ -27,7 +27,7 @@ class ModelMergeSD1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
|
||||
|
||||
class ModelMergeSDXL(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -53,7 +53,7 @@ class ModelMergeSDXL(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeSD3_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -77,7 +77,7 @@ class ModelMergeSD3_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
|
||||
|
||||
class ModelMergeAuraflow(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -104,7 +104,7 @@ class ModelMergeAuraflow(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeFlux1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -130,7 +130,7 @@ class ModelMergeFlux1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeSD35_Large(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -153,7 +153,7 @@ class ModelMergeSD35_Large(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeMochiPreview(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -175,7 +175,7 @@ class ModelMergeMochiPreview(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeLTXV(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -197,7 +197,7 @@ class ModelMergeLTXV(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeCosmos7B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -221,7 +221,7 @@ class ModelMergeCosmos7B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeCosmos14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -245,7 +245,7 @@ class ModelMergeCosmos14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeWAN2_1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
DESCRIPTION = "1.3B model has 30 blocks, 14B model has 40 blocks. Image to video model has the extra img_emb."
|
||||
|
||||
@classmethod
|
||||
@ -269,7 +269,7 @@ class ModelMergeWAN2_1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeCosmosPredict2_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -292,7 +292,7 @@ class ModelMergeCosmosPredict2_2B(comfy_extras.nodes_model_merging.ModelMergeBlo
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeCosmosPredict2_14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -315,7 +315,7 @@ class ModelMergeCosmosPredict2_14B(comfy_extras.nodes_model_merging.ModelMergeBl
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeQwenImage(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
|
||||
@ -232,7 +232,7 @@ class ModelPatchLoader:
|
||||
FUNCTION = "load_model_patch"
|
||||
EXPERIMENTAL = True
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
def load_model_patch(self, name):
|
||||
model_patch_path = folder_paths.get_full_path_or_raise("model_patches", name)
|
||||
@ -479,7 +479,7 @@ class QwenImageDiffsynthControlnet:
|
||||
FUNCTION = "diffsynth_controlnet"
|
||||
EXPERIMENTAL = True
|
||||
|
||||
CATEGORY = "advanced/loaders/qwen"
|
||||
CATEGORY = "model/patch/qwen"
|
||||
|
||||
def diffsynth_controlnet(self, model, model_patch, vae, image=None, strength=1.0, inpaint_image=None, mask=None):
|
||||
model_patched = model.clone()
|
||||
@ -512,7 +512,7 @@ class ZImageFunControlnet(QwenImageDiffsynthControlnet):
|
||||
},
|
||||
"optional": {"image": ("IMAGE",), "inpaint_image": ("IMAGE",), "mask": ("MASK",)}}
|
||||
|
||||
CATEGORY = "advanced/loaders/zimage"
|
||||
CATEGORY = "model/patch/z-image"
|
||||
|
||||
class UsoStyleProjectorPatch:
|
||||
def __init__(self, model_patch, encoded_image):
|
||||
@ -675,3 +675,11 @@ NODE_CLASS_MAPPINGS = {
|
||||
"USOStyleReference": USOStyleReference,
|
||||
"SUPIRApply": SUPIRApply,
|
||||
}
|
||||
|
||||
NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"ModelPatchLoader": "Load Model Patch",
|
||||
"QwenImageDiffsynthControlnet": "Apply Qwen Image DiffSynth ControlNet",
|
||||
"ZImageFunControlnet": "Apply Z-Image Fun ControlNet",
|
||||
"USOStyleReference": "Apply USO Style Reference",
|
||||
"SUPIRApply": "Apply SUPIR Patch",
|
||||
}
|
||||
|
||||
@ -14,10 +14,8 @@ class PiDConditioning(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="PiDConditioning",
|
||||
display_name="PiD Conditioning",
|
||||
category="advanced/conditioning",
|
||||
description=(
|
||||
"Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling"
|
||||
),
|
||||
category="model/conditioning",
|
||||
description=("Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling"),
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Latent.Input("latent", tooltip="latent (from VAEEncode or a KSampler)."),
|
||||
|
||||
@ -7,8 +7,9 @@ class CLIPTextEncodePixArtAlpha(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodePixArtAlpha",
|
||||
display_name="CLIP Text Encode (PixArt Alpha)",
|
||||
search_aliases=["pixart prompt"],
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/pixart",
|
||||
description="Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma.",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),
|
||||
|
||||
@ -616,7 +616,7 @@ class BatchLatentsNode(io.ComfyNode):
|
||||
node_id="BatchLatentsNode",
|
||||
search_aliases=["combine latents", "stack latents", "merge latents"],
|
||||
display_name="Batch Latents",
|
||||
category="model/latent",
|
||||
category="model/latent/batch",
|
||||
inputs=[
|
||||
io.Autogrow.Input("latents", template=autogrow_template)
|
||||
],
|
||||
|
||||
@ -12,7 +12,7 @@ class TextEncodeQwenImageEdit(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TextEncodeQwenImageEdit",
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/qwen image",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("prompt", multiline=True, dynamic_prompts=True),
|
||||
@ -55,7 +55,7 @@ class TextEncodeQwenImageEditPlus(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TextEncodeQwenImageEditPlus",
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/qwen image",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("prompt", multiline=True, dynamic_prompts=True),
|
||||
|
||||
@ -123,7 +123,7 @@ class WanSCAILToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanSCAILToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/scail",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -257,18 +257,16 @@ class SCAIL2ColoredMask(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="SCAIL2ColoredMask",
|
||||
display_name="Create SCAIL-2 Colored Mask",
|
||||
category="conditioning/video_models/scail",
|
||||
category="model/conditioning/wan/scail",
|
||||
inputs=[
|
||||
SAM3TrackData.Input("driving_track_data", tooltip="SAM3 track of the driving pose video. Will be rendered into the pose_video_mask output."),
|
||||
SAM3TrackData.Input("ref_track_data", optional=True,
|
||||
tooltip="SAM3 track of the reference image."),
|
||||
io.String.Input("object_indices", default="",
|
||||
tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
|
||||
SAM3TrackData.Input("ref_track_data", optional=True, tooltip="SAM3 track of the reference image."),
|
||||
io.String.Input("object_indices", default="", tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
|
||||
io.Combo.Input("sort_by", options=["none", "left_to_right", "area"], default="left_to_right",
|
||||
tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). left_to_right = leftmost object (by first-frame centroid) gets the first color; area = biggest object (by first-frame mask area) gets the first color; none = keep SAM3's order."),
|
||||
tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). left_to_right = leftmost object (by first-frame centroid) gets the first color; area = biggest object (by first-frame mask area) gets the first color; none = keep SAM3's order."),
|
||||
io.Boolean.Input("replacement_mode", default=False,
|
||||
tooltip="False = Animation Mode (pose_video_mask has black background, reference_image_mask has white background). "
|
||||
"True = Replacement Mode (pose_video_mask has white background, reference_image_mask has black background)."),
|
||||
tooltip="False = Animation Mode (pose_video_mask has black background, reference_image_mask has white background). "
|
||||
"True = Replacement Mode (pose_video_mask has white background, reference_image_mask has black background)."),
|
||||
],
|
||||
outputs=[
|
||||
io.Image.Output("pose_video_mask"),
|
||||
|
||||
@ -13,8 +13,9 @@ class TripleCLIPLoader(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TripleCLIPLoader",
|
||||
category="advanced/loaders",
|
||||
description="[Recipes]\n\nsd3: clip-l, clip-g, t5",
|
||||
display_name="Load CLIP (Triple)",
|
||||
category="model/loaders",
|
||||
description="Recipes:\nsd3: clip-l, clip-g, t5",
|
||||
inputs=[
|
||||
io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")),
|
||||
io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")),
|
||||
@ -41,7 +42,7 @@ class EmptySD3LatentImage(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptySD3LatentImage",
|
||||
category="model/latent/sd3",
|
||||
category="model/latent/stable diffusion",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -66,7 +67,8 @@ class CLIPTextEncodeSD3(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeSD3",
|
||||
search_aliases=["sd3 prompt"],
|
||||
category="advanced/conditioning",
|
||||
display_name="CLIP Text Encode (SD3)",
|
||||
category="model/conditioning/stable diffusion",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
|
||||
|
||||
@ -96,8 +96,12 @@ class KeypointDraw:
|
||||
# Body connections - matching DWPose limbSeq (1-indexed, converted to 0-indexed)
|
||||
self.body_limbSeq = [
|
||||
[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
|
||||
[10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
|
||||
[1, 16], [16, 18]
|
||||
[10, 11], [2, 12], [12, 13], [13, 14]
|
||||
]
|
||||
|
||||
# Head connections (1-indexed, converted to 0-indexed)
|
||||
self.head_edges = [
|
||||
[2, 1], [1, 15], [15, 17], [1, 16], [16, 18]
|
||||
]
|
||||
|
||||
# Colors matching DWPose
|
||||
@ -215,7 +219,7 @@ class KeypointDraw:
|
||||
return unique_pts if len(unique_pts) > 1 else [[center[0], center[1]], [center[0], center[1]]]
|
||||
|
||||
def draw_wholebody_keypoints(self, canvas, keypoints, scores=None, threshold=0.3,
|
||||
draw_body=True, draw_feet=True, draw_face=True, draw_hands=True, stick_width=4, face_point_size=3):
|
||||
draw_body=True, draw_head=True, draw_feet=True, draw_face=True, draw_hands=True, stick_width=4, face_point_size=3):
|
||||
"""
|
||||
Draw wholebody keypoints (134 keypoints after processing) in DWPose style.
|
||||
|
||||
@ -237,9 +241,17 @@ class KeypointDraw:
|
||||
"""
|
||||
H, W, C = canvas.shape
|
||||
|
||||
# Draw body limbs
|
||||
if draw_body and len(keypoints) >= 18:
|
||||
for i, limb in enumerate(self.body_limbSeq):
|
||||
# Draw body limbs & head connections
|
||||
if (draw_body or draw_head) and len(keypoints) >= 18:
|
||||
colorIndexOffset = 0
|
||||
edges = []
|
||||
if draw_body:
|
||||
edges += self.body_limbSeq
|
||||
else:
|
||||
colorIndexOffset += len(self.body_limbSeq)
|
||||
if draw_head:
|
||||
edges += self.head_edges
|
||||
for i, limb in enumerate(edges):
|
||||
# Convert from 1-indexed to 0-indexed
|
||||
idx1, idx2 = limb[0] - 1, limb[1] - 1
|
||||
|
||||
@ -262,11 +274,17 @@ class KeypointDraw:
|
||||
|
||||
polygon = self.draw.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stick_width), int(angle), 0, 360, 1)
|
||||
|
||||
self.draw.fillConvexPoly(canvas, polygon, self.colors[i % len(self.colors)])
|
||||
self.draw.fillConvexPoly(canvas, polygon, self.colors[(i + colorIndexOffset) % len(self.colors)])
|
||||
|
||||
# Draw body keypoints
|
||||
if draw_body and len(keypoints) >= 18:
|
||||
# Draw body & head keypoints
|
||||
if (draw_body or draw_head) and len(keypoints) >= 18:
|
||||
head_keypoints = {0, 14, 15, 16, 17} # nose, eyes, ears
|
||||
neck_point = 1
|
||||
for i in range(18):
|
||||
if not draw_head and i in head_keypoints:
|
||||
continue
|
||||
if not draw_body and i not in head_keypoints and i != neck_point:
|
||||
continue
|
||||
if scores is not None and scores[i] < threshold:
|
||||
continue
|
||||
x, y = int(keypoints[i][0]), int(keypoints[i][1])
|
||||
@ -365,6 +383,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
|
||||
io.Int.Input("stick_width", default=4, min=1, max=10, step=1),
|
||||
io.Int.Input("face_point_size", default=3, min=1, max=10, step=1),
|
||||
io.Float.Input("score_threshold", default=0.3, min=0.0, max=1.0, step=0.01),
|
||||
io.Boolean.Input("draw_head", default=True),
|
||||
],
|
||||
outputs=[
|
||||
io.Image.Output(),
|
||||
@ -372,7 +391,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, keypoints, draw_body, draw_hands, draw_face, draw_feet, stick_width, face_point_size, score_threshold) -> io.NodeOutput:
|
||||
def execute(cls, keypoints, draw_body, draw_hands, draw_face, draw_feet, stick_width, face_point_size, score_threshold, draw_head) -> io.NodeOutput:
|
||||
if not keypoints:
|
||||
return io.NodeOutput(torch.zeros((1, 64, 64, 3), dtype=torch.float32))
|
||||
height = keypoints[0]["canvas_height"]
|
||||
@ -405,7 +424,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
|
||||
canvas = drawer.draw_wholebody_keypoints(
|
||||
canvas, kp, sc,
|
||||
threshold=score_threshold,
|
||||
draw_body=draw_body, draw_feet=draw_feet,
|
||||
draw_body=draw_body, draw_head=draw_head, draw_feet=draw_feet,
|
||||
draw_face=draw_face, draw_hands=draw_hands,
|
||||
stick_width=stick_width, face_point_size=face_point_size,
|
||||
)
|
||||
|
||||
@ -9,7 +9,7 @@ class SD_4XUpscale_Conditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SD_4XUpscale_Conditioning",
|
||||
category="model/conditioning/upscale_diffusion",
|
||||
category="model/conditioning/stable diffusion upscaler",
|
||||
inputs=[
|
||||
io.Image.Input("images"),
|
||||
io.Conditioning.Input("positive"),
|
||||
|
||||
@ -27,7 +27,7 @@ class StableZero123_Conditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableZero123_Conditioning",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/stable zero123",
|
||||
inputs=[
|
||||
io.ClipVision.Input("clip_vision"),
|
||||
io.Image.Input("init_image"),
|
||||
@ -65,7 +65,7 @@ class StableZero123_Conditioning_Batched(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableZero123_Conditioning_Batched",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/stable zero123",
|
||||
inputs=[
|
||||
io.ClipVision.Input("clip_vision"),
|
||||
io.Image.Input("init_image"),
|
||||
@ -112,7 +112,7 @@ class SV3D_Conditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SV3D_Conditioning",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/stable video 3d",
|
||||
inputs=[
|
||||
io.ClipVision.Input("clip_vision"),
|
||||
io.Image.Input("init_image"),
|
||||
|
||||
@ -29,7 +29,7 @@ class StableCascade_EmptyLatentImage(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableCascade_EmptyLatentImage",
|
||||
category="model/latent/stable_cascade",
|
||||
category="model/latent/stable cascade",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8),
|
||||
io.Int.Input("height", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8),
|
||||
@ -58,7 +58,7 @@ class StableCascade_StageC_VAEEncode(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableCascade_StageC_VAEEncode",
|
||||
category="model/latent/stable_cascade",
|
||||
category="model/latent/stable cascade",
|
||||
inputs=[
|
||||
io.Image.Input("image"),
|
||||
io.Vae.Input("vae"),
|
||||
@ -93,7 +93,7 @@ class StableCascade_StageB_Conditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableCascade_StageB_Conditioning",
|
||||
category="model/conditioning/stable_cascade",
|
||||
category="model/conditioning/stable cascade",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.Latent.Input("stage_c"),
|
||||
|
||||
@ -35,7 +35,7 @@ class TextGenerate(io.ComfyNode):
|
||||
io.Image.Input("image", optional=True),
|
||||
io.Image.Input("video", optional=True, tooltip="Video frames as image batch. Assumed to be 24 FPS; subsampled to 1 FPS internally."),
|
||||
io.Audio.Input("audio", optional=True),
|
||||
io.Int.Input("max_length", default=256, min=1, max=2048),
|
||||
io.Int.Input("max_length", default=512, min=1, max=32768),
|
||||
io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
|
||||
io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."),
|
||||
io.Boolean.Input("use_default_template", optional=True, default=True, tooltip="Use the built in system prompt/template if the model has one.", advanced=True),
|
||||
|
||||
@ -1367,7 +1367,7 @@ class SaveLoRA(io.ComfyNode):
|
||||
node_id="SaveLoRA",
|
||||
search_aliases=["export lora"],
|
||||
display_name="Save LoRA Weights",
|
||||
category="advanced/model_merging",
|
||||
category="model/merging",
|
||||
is_experimental=True,
|
||||
is_output_node=True,
|
||||
inputs=[
|
||||
|
||||
@ -41,7 +41,7 @@ class SVD_img2vid_Conditioning:
|
||||
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "model/conditioning/video_models"
|
||||
CATEGORY = "model/conditioning/stable video"
|
||||
|
||||
def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level):
|
||||
output = clip_vision.encode_image(init_image)
|
||||
@ -108,7 +108,7 @@ class VideoTriangleCFGGuidance:
|
||||
return (m, )
|
||||
|
||||
class ImageOnlyCheckpointSave(comfy_extras.nodes_model_merging.CheckpointSave):
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -138,7 +138,7 @@ class ConditioningSetAreaPercentageVideo:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, width, height, temporal, x, y, z, strength):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", temporal, height, width, z, y, x),
|
||||
@ -160,4 +160,5 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"ImageOnlyCheckpointLoader": "Load Checkpoint Image Only (img2vid model)",
|
||||
"VideoLinearCFGGuidance": "Video Linear CFG Guidance",
|
||||
"VideoTriangleCFGGuidance": "Video Triangle CFG Guidance",
|
||||
"ConditioningSetAreaPercentageVideo": "Conditioning (Set Area with Percentage for Video)",
|
||||
}
|
||||
|
||||
@ -175,7 +175,7 @@ class VOIDInpaintConditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDInpaintConditioning",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/void",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -288,7 +288,7 @@ class VOIDWarpedNoise(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDWarpedNoise",
|
||||
category="model/latent/video",
|
||||
category="model/latent/void",
|
||||
inputs=[
|
||||
OpticalFlow.Input(
|
||||
"optical_flow",
|
||||
@ -393,7 +393,7 @@ class VOIDWarpedNoiseSource(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDWarpedNoiseSource",
|
||||
category="model/sampling/noise",
|
||||
category="model/latent/void",
|
||||
inputs=[
|
||||
io.Latent.Input("warped_noise",
|
||||
tooltip="Warped noise latent from VOIDWarpedNoise"),
|
||||
|
||||
@ -18,7 +18,7 @@ class WanImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -66,7 +66,7 @@ class WanFunControlToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanFunControlToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/fun control",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -119,7 +119,7 @@ class Wan22FunControlToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="Wan22FunControlToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/fun control",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -184,7 +184,7 @@ class WanFirstLastFrameToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanFirstLastFrameToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -256,7 +256,7 @@ class WanFunInpaintToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanFunInpaintToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/fun inpaint",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -288,7 +288,7 @@ class WanVaceToVideo(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="WanVaceToVideo",
|
||||
search_aliases=["video conditioning", "video control"],
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/vace",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -375,7 +375,8 @@ class TrimVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TrimVideoLatent",
|
||||
category="model/latent/video",
|
||||
display_name="Trim Video Latent",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
io.Latent.Input("samples"),
|
||||
io.Int.Input("trim_amount", default=0, min=0, max=99999),
|
||||
@ -398,7 +399,7 @@ class WanCameraImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanCameraImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/camera",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -452,7 +453,7 @@ class WanPhantomSubjectToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanPhantomSubjectToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/phantom subject",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -707,7 +708,7 @@ class WanTrackToVideo(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="WanTrackToVideo",
|
||||
search_aliases=["motion tracking", "trajectory video", "point tracking", "keypoint animation"],
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -951,7 +952,7 @@ class WanSoundImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanSoundImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/sound",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -984,7 +985,7 @@ class WanSoundImageToVideoExtend(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanSoundImageToVideoExtend",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/sound",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -1046,7 +1047,7 @@ class WanHuMoImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanHuMoImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/humo",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -1112,7 +1113,7 @@ class WanAnimateToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanAnimateToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/animate",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -1252,7 +1253,7 @@ class Wan22ImageToVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="Wan22ImageToVideoLatent",
|
||||
category="model/conditioning/inpaint",
|
||||
category="model/conditioning/wan",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=1280, min=32, max=nodes.MAX_RESOLUTION, step=32),
|
||||
@ -1302,7 +1303,7 @@ class WanInfiniteTalkToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanInfiniteTalkToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/infinite talk",
|
||||
inputs=[
|
||||
io.DynamicCombo.Input("mode", options=[
|
||||
io.DynamicCombo.Option("single_speaker", []),
|
||||
|
||||
@ -713,7 +713,7 @@ class WanDancerEncodeAudio(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanDancerEncodeAudio",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/dancer",
|
||||
inputs=[
|
||||
io.Audio.Input("audio"),
|
||||
io.Int.Input("video_frames", default=149, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||
@ -787,7 +787,7 @@ class WanDancerVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanDancerVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/dancer",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
|
||||
@ -247,7 +247,7 @@ class WanMoveVisualizeTracks(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanMoveVisualizeTracks",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Image.Input("images"),
|
||||
io.Tracks.Input("tracks", optional=True),
|
||||
@ -283,7 +283,7 @@ class WanMoveTracksFromCoords(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanMoveTracksFromCoords",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.String.Input("track_coords", force_input=True, default="[]", optional=True),
|
||||
io.Mask.Input("track_mask", optional=True),
|
||||
@ -325,7 +325,8 @@ class GenerateTracks(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="GenerateTracks",
|
||||
search_aliases=["motion paths", "camera movement", "trajectory"],
|
||||
category="model/conditioning/video_models",
|
||||
display_name="Generate Video Tracks",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=832, min=16, max=4096, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=4096, step=16),
|
||||
@ -434,7 +435,7 @@ class WanMoveConcatTrack(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanMoveConcatTrack",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Tracks.Input("tracks_1"),
|
||||
io.Tracks.Input("tracks_2", optional=True),
|
||||
@ -463,7 +464,7 @@ class WanMoveTrackToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanMoveTrackToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
|
||||
@ -10,7 +10,7 @@ class TextEncodeZImageOmni(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TextEncodeZImageOmni",
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/z-image",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
|
||||
4
main.py
4
main.py
@ -127,6 +127,10 @@ def apply_custom_paths():
|
||||
for config_path in itertools.chain(*args.extra_model_paths_config):
|
||||
utils.extra_config.load_extra_path_config(config_path)
|
||||
|
||||
# --base-directory
|
||||
if args.base_directory:
|
||||
logging.info(f"Setting base directory to: {folder_paths.base_path}")
|
||||
|
||||
# --output-directory, --input-directory, --user-directory
|
||||
if args.output_directory:
|
||||
output_dir = os.path.abspath(args.output_directory)
|
||||
|
||||
58
nodes.py
58
nodes.py
@ -87,7 +87,7 @@ class ConditioningCombine:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "combine"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
SEARCH_ALIASES = ["combine", "merge conditioning", "combine prompts", "merge prompts", "mix prompts", "add prompt"]
|
||||
|
||||
def combine(self, conditioning_1, conditioning_2):
|
||||
@ -104,7 +104,7 @@ class ConditioningAverage :
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "addWeighted"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def addWeighted(self, conditioning_to, conditioning_from, conditioning_to_strength):
|
||||
out = []
|
||||
@ -143,7 +143,7 @@ class ConditioningConcat:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "concat"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def concat(self, conditioning_to, conditioning_from):
|
||||
out = []
|
||||
@ -176,7 +176,7 @@ class ConditioningSetArea:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, width, height, x, y, strength):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"area": (height // 8, width // 8, y // 8, x // 8),
|
||||
@ -197,7 +197,7 @@ class ConditioningSetAreaPercentage:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, width, height, x, y, strength):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", height, width, y, x),
|
||||
@ -214,7 +214,7 @@ class ConditioningSetAreaStrength:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, strength):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"strength": strength})
|
||||
@ -234,7 +234,7 @@ class ConditioningSetMask:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, mask, set_cond_area, strength):
|
||||
set_area_to_bounds = False
|
||||
@ -257,7 +257,7 @@ class ConditioningZeroOut:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "zero_out"
|
||||
|
||||
CATEGORY = "advanced/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def zero_out(self, conditioning):
|
||||
c = []
|
||||
@ -283,11 +283,10 @@ class ConditioningSetTimestepRange:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "set_range"
|
||||
|
||||
CATEGORY = "advanced/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def set_range(self, conditioning, start, end):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"start_percent": start,
|
||||
"end_percent": end})
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"start_percent": start, "end_percent": end})
|
||||
return (c, )
|
||||
|
||||
class VAEDecode:
|
||||
@ -389,7 +388,7 @@ class VAEEncodeForInpaint:
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "model/latent/inpaint"
|
||||
CATEGORY = "model/latent"
|
||||
|
||||
def encode(self, vae, pixels, mask, grow_mask_by=6):
|
||||
downscale_ratio = vae.spacial_compression_encode()
|
||||
@ -438,7 +437,7 @@ class InpaintModelConditioning:
|
||||
RETURN_NAMES = ("positive", "negative", "latent")
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "model/conditioning/inpaint"
|
||||
CATEGORY = "model/conditioning"
|
||||
|
||||
def encode(self, positive, negative, pixels, vae, mask, noise_mask=True):
|
||||
x = (pixels.shape[1] // 8) * 8
|
||||
@ -579,7 +578,7 @@ class CheckpointLoader:
|
||||
RETURN_TYPES = ("MODEL", "CLIP", "VAE")
|
||||
FUNCTION = "load_checkpoint"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
DEPRECATED = True
|
||||
|
||||
def load_checkpoint(self, config_name, ckpt_name):
|
||||
@ -625,8 +624,9 @@ class DiffusersLoader:
|
||||
return {"required": {"model_path": (paths,), }}
|
||||
RETURN_TYPES = ("MODEL", "CLIP", "VAE")
|
||||
FUNCTION = "load_checkpoint"
|
||||
DEPRECATED = True
|
||||
|
||||
CATEGORY = "advanced/loaders/deprecated"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
def load_checkpoint(self, model_path, output_vae=True, output_clip=True):
|
||||
for search_path in folder_paths.get_folder_paths("diffusers"):
|
||||
@ -952,7 +952,7 @@ class UNETLoader:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "load_unet"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
def load_unet(self, unet_name, weight_dtype):
|
||||
model_options = {}
|
||||
@ -980,9 +980,9 @@ class CLIPLoader:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "load_clip"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b\n pixeldit: gemma 2 2B elm"
|
||||
DESCRIPTION = "Recipes:\nsd: clip-l\nstable cascade: clip-g\nsd3: t5 xxl / clip-g / clip-l\nstable audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\nhidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b\npixeldit: gemma 2 2B elm"
|
||||
|
||||
def load_clip(self, clip_name, type="stable_diffusion", device="default"):
|
||||
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
|
||||
@ -1008,9 +1008,9 @@ class DualCLIPLoader:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "load_clip"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama\nhunyuan_image: qwen2.5vl 7b and byt5 small\nnewbie: gemma-3-4b-it, jina clip v2"
|
||||
DESCRIPTION = "Recipes:\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama\nhunyuan_image: qwen2.5vl 7b and byt5 small\nnewbie: gemma-3-4b-it, jina clip v2"
|
||||
|
||||
def load_clip(self, clip_name1, clip_name2, type, device="default"):
|
||||
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
|
||||
@ -1091,7 +1091,7 @@ class StyleModelApply:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "apply_stylemodel"
|
||||
|
||||
CATEGORY = "model/conditioning/style_model"
|
||||
CATEGORY = "model/conditioning"
|
||||
|
||||
def apply_stylemodel(self, conditioning, style_model, clip_vision_output, strength, strength_type):
|
||||
cond = style_model.get_cond(clip_vision_output).flatten(start_dim=0, end_dim=1).unsqueeze(dim=0)
|
||||
@ -1521,13 +1521,11 @@ class LatentCrop:
|
||||
class SetLatentNoiseMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "samples": ("LATENT",),
|
||||
"mask": ("MASK",),
|
||||
}}
|
||||
return {"required": { "samples": ("LATENT",), "mask": ("MASK",), }}
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "set_mask"
|
||||
|
||||
CATEGORY = "model/latent/inpaint"
|
||||
CATEGORY = "model/latent"
|
||||
|
||||
def set_mask(self, samples, mask):
|
||||
s = samples.copy()
|
||||
@ -2053,7 +2051,7 @@ NODE_CLASS_MAPPINGS = {
|
||||
"ImageBatch": ImageBatch,
|
||||
"ImagePadForOutpaint": ImagePadForOutpaint,
|
||||
"EmptyImage": EmptyImage,
|
||||
"ConditioningAverage": ConditioningAverage ,
|
||||
"ConditioningAverage": ConditioningAverage,
|
||||
"ConditioningCombine": ConditioningCombine,
|
||||
"ConditioningConcat": ConditioningConcat,
|
||||
"ConditioningSetArea": ConditioningSetArea,
|
||||
@ -2109,6 +2107,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"LoraLoader": "Load LoRA (Model and CLIP)",
|
||||
"LoraLoaderModelOnly": "Load LoRA",
|
||||
"CLIPLoader": "Load CLIP",
|
||||
"DualCLIPLoader": "Load CLIP (Dual)",
|
||||
"ControlNetLoader": "Load ControlNet Model",
|
||||
"DiffControlNetLoader": "Load ControlNet Model (diff)",
|
||||
"StyleModelLoader": "Load Style Model",
|
||||
@ -2116,6 +2115,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"UNETLoader": "Load Diffusion Model",
|
||||
"unCLIPCheckpointLoader": "Load unCLIP Checkpoint",
|
||||
"GLIGENLoader": "Load GLIGEN Model",
|
||||
"DiffusersLoader": "Load Diffusers Model (DEPRECATED)",
|
||||
# Conditioning
|
||||
"CLIPVisionEncode": "CLIP Vision Encode",
|
||||
"StyleModelApply": "Apply Style Model",
|
||||
@ -2123,12 +2123,16 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"CLIPSetLastLayer": "CLIP Set Last Layer",
|
||||
"ConditioningCombine": "Conditioning (Combine)",
|
||||
"ConditioningAverage ": "Conditioning (Average)",
|
||||
"ConditioningAverage": "Conditioning (Average)",
|
||||
"ConditioningConcat": "Conditioning (Concat)",
|
||||
"ConditioningSetArea": "Conditioning (Set Area)",
|
||||
"ConditioningSetAreaPercentage": "Conditioning (Set Area with Percentage)",
|
||||
"ConditioningSetAreaStrength": "Conditioning (Set Area Strength)",
|
||||
"ConditioningSetMask": "Conditioning (Set Mask)",
|
||||
"ControlNetApply": "Apply ControlNet (DEPRECATED)",
|
||||
"ControlNetApplyAdvanced": "Apply ControlNet",
|
||||
"GLIGENTextBoxApply": "Apply GLIGEN Text Box",
|
||||
"ConditioningZeroOut": "Conditioning Zero Out",
|
||||
# Latent
|
||||
"VAEEncodeForInpaint": "VAE Encode (for Inpainting)",
|
||||
"SetLatentNoiseMask": "Set Latent Noise Mask",
|
||||
@ -2142,7 +2146,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"LatentUpscaleBy": "Upscale Latent By",
|
||||
"LatentComposite": "Latent Composite",
|
||||
"LatentBlend": "Latent Blend",
|
||||
"LatentFromBatch" : "Latent From Batch",
|
||||
"LatentFromBatch" : "Get Latent From Batch",
|
||||
"RepeatLatentBatch": "Repeat Latent Batch",
|
||||
# Image
|
||||
"EmptyImage": "Empty Image",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user