diff --git a/.github/workflows/detect-unreviewed-merge.yml b/.github/workflows/detect-unreviewed-merge.yml new file mode 100644 index 000000000..4fabecb94 --- /dev/null +++ b/.github/workflows/detect-unreviewed-merge.yml @@ -0,0 +1,24 @@ +name: Detect Unreviewed Merge + +# SOC 2 compliance — reusable workflow lives in Comfy-Org/github-workflows, +# tracking issues are filed in Comfy-Org/unreviewed-merges. + +on: + push: + branches: [master] + +concurrency: + group: detect-unreviewed-merge-${{ github.sha }} + cancel-in-progress: false + +permissions: + contents: read + pull-requests: read + +jobs: + detect: + uses: Comfy-Org/github-workflows/.github/workflows/detect-unreviewed-merge.yml@4d9cb6b87f953bb7cd69954280e1465fb9bd2040 # v1 + with: + approval-mode: latest-per-reviewer + secrets: + UNREVIEWED_MERGES_TOKEN: ${{ secrets.UNREVIEWED_MERGES_TOKEN }} diff --git a/comfy/bg_removal_model.py b/comfy/bg_removal_model.py index 6dec65e63..c772c5f6a 100644 --- a/comfy/bg_removal_model.py +++ b/comfy/bg_removal_model.py @@ -55,12 +55,7 @@ class BackgroundRemovalModel(): out = torch.nn.functional.interpolate(out, size=(H, W), mode="bicubic", antialias=False) mask = out.sigmoid().to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype()) - if mask.ndim == 3: - mask = mask.unsqueeze(0) - if mask.shape[1] != 1: - mask = mask.movedim(-1, 1) - - return mask + return mask.squeeze(1) # (B, 1, H, W) -> (B, H, W) def load_background_removal_model(sd): diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 9bda414d1..a4cabcc65 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -149,6 +149,7 @@ parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=Non parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.") parser.add_argument("--disable-dynamic-vram", action="store_true", help="Disable dynamic VRAM and use estimate based model loading.") 
parser.add_argument("--enable-dynamic-vram", action="store_true", help="Enable dynamic VRAM on systems where it's not enabled by default.") +parser.add_argument("--fast-disk", action="store_true", help="Prefer disk-backed dynamic loading and offload over unpinned RAM. Can be faster for users with fast NVME disks.") parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.") diff --git a/comfy/float.py b/comfy/float.py index 184b3d6d0..3c82d6359 100644 --- a/comfy/float.py +++ b/comfy/float.py @@ -1,5 +1,20 @@ +import logging + import torch +_CK_STOCHASTIC_ROUNDING_AVAILABLE = False +try: + import comfy_kitchen as ck + _ck_stochastic_rounding_fp8 = ck.stochastic_rounding_fp8 + _CK_STOCHASTIC_ROUNDING_AVAILABLE = True +except (AttributeError, ImportError): + logging.warning("comfy_kitchen does not support stochastic FP8 rounding, please update comfy_kitchen.") + +if not _CK_STOCHASTIC_ROUNDING_AVAILABLE: + def _ck_stochastic_rounding_fp8(value, rng, dtype): + raise NotImplementedError("comfy_kitchen does not support stochastic FP8 rounding") + + def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None): mantissa_scaled = torch.where( normal_mask, @@ -57,6 +72,10 @@ def stochastic_rounding(value, dtype, seed=0): if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2: generator = torch.Generator(device=value.device) generator.manual_seed(seed) + if _CK_STOCHASTIC_ROUNDING_AVAILABLE: + rng = torch.randint(0, 256, value.size(), dtype=torch.uint8, layout=value.layout, device=value.device, generator=generator) + return _ck_stochastic_rounding_fp8(value, rng, dtype) + output = torch.empty_like(value, dtype=dtype) num_slices = max(1, (value.numel() / (4096 * 4096))) slice_size = max(1, round(value.shape[0] / num_slices)) diff --git 
a/comfy/latent_formats.py b/comfy/latent_formats.py index 75d459b59..12a934d71 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -799,13 +799,15 @@ class ZImagePixelSpace(ChromaRadiance): """ pass - class HiDreamO1Pixel(ChromaRadiance): """Pixel-space latent format for HiDream-O1. No VAE — model patches/unpatches raw RGB internally with patch_size=32. """ pass +class PixelDiTPixel(ChromaRadiance): + pass + class CogVideoX(LatentFormat): """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b). diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py index a6258b755..c28be5b49 100644 --- a/comfy/ldm/audio/dit.py +++ b/comfy/ldm/audio/dit.py @@ -433,11 +433,11 @@ class Attention(nn.Module): if self.differential: q, q_diff = q.unbind(dim=1) k, k_diff = k.unbind(dim=1) - out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options) - out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, transformer_options=transformer_options) + out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options) + out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options) out = out - out_diff else: - out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options) + out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options) out = self.to_out(out) diff --git a/comfy/ldm/audio/vae_sa3.py b/comfy/ldm/audio/vae_sa3.py index 276846444..8be36d6ee 100644 --- a/comfy/ldm/audio/vae_sa3.py +++ b/comfy/ldm/audio/vae_sa3.py @@ -138,11 +138,11 @@ class Attention(nn.Module): k_diff = _apply_rotary_pos_emb(k_diff.float(), freqs).to(k_dtype) if self.differential: - out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True) - - optimized_attention(q_diff, k_diff, v, h, 
mask=mask, skip_reshape=True)) + out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False) + - optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True, low_precision_attention=False)) del q, k, v, q_diff, k_diff else: - out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True) + out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False) del q, k, v return self.to_out(out) diff --git a/comfy/ldm/cosmos/predict2.py b/comfy/ldm/cosmos/predict2.py index 2268bff38..671fe834d 100644 --- a/comfy/ldm/cosmos/predict2.py +++ b/comfy/ldm/cosmos/predict2.py @@ -14,15 +14,7 @@ from torchvision import transforms import comfy.patcher_extension from comfy.ldm.modules.attention import optimized_attention import comfy.ldm.common_dit - -def apply_rotary_pos_emb( - t: torch.Tensor, - freqs: torch.Tensor, -) -> torch.Tensor: - t_ = t.reshape(*t.shape[:-1], 2, -1).movedim(-2, -1).unsqueeze(-2).float() - t_out = freqs[..., 0] * t_[..., 0] + freqs[..., 1] * t_[..., 1] - t_out = t_out.movedim(-1, -2).reshape(*t.shape).type_as(t) - return t_out +import comfy.quant_ops # ---------------------- Feed Forward Network ----------------------- @@ -173,8 +165,7 @@ class Attention(nn.Module): k = self.k_norm(k) v = self.v_norm(v) if self.is_selfattn and rope_emb is not None: # only apply to self-attention! 
- q = apply_rotary_pos_emb(q, rope_emb) - k = apply_rotary_pos_emb(k, rope_emb) + q, k = comfy.quant_ops.ck.apply_rope_split_half(q, k, rope_emb) return q, k, v q, k, v = apply_norm_and_rotary_pos_emb(q, k, v, rope_emb) diff --git a/comfy/ldm/ernie/model.py b/comfy/ldm/ernie/model.py index eba661aec..f158ca1d2 100644 --- a/comfy/ldm/ernie/model.py +++ b/comfy/ldm/ernie/model.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from comfy.ldm.modules.attention import optimized_attention import comfy.model_management +import comfy.quant_ops def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: assert dim % 2 == 0 @@ -19,15 +20,6 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor: out = torch.stack([torch.cos(out), torch.sin(out)], dim=0) return out.to(dtype=torch.float32, device=pos.device) -def apply_rotary_emb(x_in: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: - rot_dim = freqs_cis.shape[-1] - x, x_pass = x_in[..., :rot_dim], x_in[..., rot_dim:] - cos_ = freqs_cis[0] - sin_ = freqs_cis[1] - x1, x2 = x.chunk(2, dim=-1) - x_rotated = torch.cat((-x2, x1), dim=-1) - return torch.cat((x * cos_ + x_rotated * sin_, x_pass), dim=-1) - class ErnieImageEmbedND3(nn.Module): def __init__(self, dim: int, theta: int, axes_dim: tuple): super().__init__() @@ -37,8 +29,16 @@ class ErnieImageEmbedND3(nn.Module): def forward(self, ids: torch.Tensor) -> torch.Tensor: emb = torch.cat([rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(3)], dim=-1) - emb = emb.unsqueeze(3) # [2, B, S, 1, head_dim//2] - return torch.stack([emb, emb], dim=-1).reshape(*emb.shape[:-1], -1) # [B, S, 1, head_dim] + cos_ = emb[0] + sin_ = emb[1] + N = cos_.shape[-1] + half = N // 2 + cos_top = cos_[..., :half].repeat_interleave(2, dim=-1) + sin_top = sin_[..., :half].repeat_interleave(2, dim=-1) + cos_bot = cos_[..., half:].repeat_interleave(2, dim=-1) + sin_bot = sin_[..., half:].repeat_interleave(2, dim=-1) + rot = torch.stack([cos_top, -sin_top, sin_bot, 
cos_bot], dim=-1) + return rot.reshape(*rot.shape[:-1], 2, 2).unsqueeze(2) class ErnieImagePatchEmbedDynamic(nn.Module): def __init__(self, in_channels: int, embed_dim: int, patch_size: int, operations, device=None, dtype=None): @@ -115,8 +115,7 @@ class ErnieImageAttention(nn.Module): key = self.norm_k(key) if image_rotary_emb is not None: - query = apply_rotary_emb(query, image_rotary_emb) - key = apply_rotary_emb(key, image_rotary_emb) + query, key = comfy.quant_ops.ck.apply_rope_split_half(query, key, image_rotary_emb) q_flat = query.reshape(B, S, -1) k_flat = key.reshape(B, S, -1) @@ -274,7 +273,7 @@ class ErnieImageModel(nn.Module): image_ids = image_ids.view(1, N_img, 3).expand(B, -1, -1) - rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)).to(x.dtype) + rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)) del image_ids, text_ids sample = self.time_proj(timesteps).to(dtype) diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py index 0dc8fe789..9ab3c463c 100644 --- a/comfy/ldm/modules/diffusionmodules/mmdit.py +++ b/comfy/ldm/modules/diffusionmodules/mmdit.py @@ -211,7 +211,7 @@ class TimestepEmbedder(nn.Module): Embeds scalar timesteps into vector representations. 
""" - def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None): + def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None, max_period=10000): super().__init__() if output_size is None: output_size = hidden_size @@ -221,9 +221,10 @@ class TimestepEmbedder(nn.Module): operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device), ) self.frequency_embedding_size = frequency_embedding_size + self.max_period = max_period def forward(self, t, dtype, **kwargs): - t_freq = timestep_embedding(t, self.frequency_embedding_size).to(dtype) + t_freq = timestep_embedding(t, self.frequency_embedding_size, max_period=self.max_period).to(dtype) t_emb = self.mlp(t_freq) return t_emb diff --git a/comfy/ldm/pixeldit/model.py b/comfy/ldm/pixeldit/model.py new file mode 100644 index 000000000..b044b9b29 --- /dev/null +++ b/comfy/ldm/pixeldit/model.py @@ -0,0 +1,239 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ldm.common_dit +import comfy.patcher_extension +from comfy.ldm.flux.math import apply_rope, rope +from comfy.ldm.hidream.model import FeedForwardSwiGLU +from comfy.ldm.modules.attention import optimized_attention +from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder + +from .modules import ( + FinalLayer, + PatchTokenEmbedder, + PiTBlock, + PixelTokenEmbedder, + apply_adaln_, + precompute_freqs_cis_2d, +) + + +class MMDiTJointAttention(nn.Module): + """Joint MMDiT attention with separate Q/K/V/proj for image and text streams. + + RoPE is applied to each stream before concatenation so each stream uses its own + 2D/1D positional encoding. Concat order is [text, image] (text first). 
+ """ + def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None): + super().__init__() + assert dim % num_heads == 0 + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.qkv_x = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + self.qkv_y = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + + self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + + self.proj_x = operations.Linear(dim, dim, dtype=dtype, device=device) + self.proj_y = operations.Linear(dim, dim, dtype=dtype, device=device) + + def forward(self, x, y, pos_img, pos_txt=None, attn_mask=None, transformer_options={}): + B, Nx, _ = x.shape + _, Ny, _ = y.shape + H = self.num_heads + D = self.head_dim + + qkv_x = self.qkv_x(x).reshape(B, Nx, 3, H, D).permute(2, 0, 3, 1, 4) + qx, kx, vx = qkv_x.unbind(0) + qx = self.q_norm_x(qx) + kx = self.k_norm_x(kx) + + qkv_y = self.qkv_y(y).reshape(B, Ny, 3, H, D).permute(2, 0, 3, 1, 4) + qy, ky, vy = qkv_y.unbind(0) + qy = self.q_norm_y(qy) + ky = self.k_norm_y(ky) + + qx, kx = apply_rope(qx, kx, pos_img[None, None]) + if pos_txt is not None: + qy, ky = apply_rope(qy, ky, pos_txt[None, None]) + + q_joint = torch.cat([qy, qx], dim=2) + k_joint = torch.cat([ky, kx], dim=2) + v_joint = torch.cat([vy, vx], dim=2) + + out_joint = optimized_attention( + q_joint, k_joint, v_joint, H, + mask=attn_mask, skip_reshape=True, skip_output_reshape=True, + transformer_options=transformer_options, + ) + + out_y = out_joint[:, :, :Ny, :].transpose(1, 2).reshape(B, Ny, H * D) + out_x = out_joint[:, :, Ny:, :].transpose(1, 2).reshape(B, Nx, H * D) + + return self.proj_x(out_x), 
self.proj_y(out_y) + + +class MMDiTBlockT2I(nn.Module): + def __init__(self, hidden_size, groups, mlp_ratio=4.0, dtype=None, device=None, operations=None): + super().__init__() + self.norm_x1 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.norm_y1 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.attn = MMDiTJointAttention(hidden_size, num_heads=groups, qkv_bias=False, dtype=dtype, device=device, operations=operations) + self.norm_x2 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.norm_y2 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.mlp_x = FeedForwardSwiGLU(hidden_size, mlp_hidden_dim, multiple_of=1, dtype=dtype, device=device, operations=operations) + self.mlp_y = FeedForwardSwiGLU(hidden_size, mlp_hidden_dim, multiple_of=1, dtype=dtype, device=device, operations=operations) + self.adaLN_modulation_img = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device)) + self.adaLN_modulation_txt = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device)) + + def forward(self, x, y, c, pos_img, pos_txt=None, attn_mask=None, transformer_options={}): + shift_msa_x, scale_msa_x, gate_msa_x, shift_mlp_x, scale_mlp_x, gate_mlp_x = self.adaLN_modulation_img(c).chunk(6, dim=-1) + shift_msa_y, scale_msa_y, gate_msa_y, shift_mlp_y, scale_mlp_y, gate_mlp_y = self.adaLN_modulation_txt(c).chunk(6, dim=-1) + + x_norm = apply_adaln_(self.norm_x1(x), shift_msa_x, scale_msa_x) + y_norm = apply_adaln_(self.norm_y1(y), shift_msa_y, scale_msa_y) + attn_x, attn_y = self.attn(x_norm, y_norm, pos_img, pos_txt, attn_mask, transformer_options=transformer_options) + x = torch.addcmul(x, gate_msa_x, attn_x) + y = torch.addcmul(y, gate_msa_y, attn_y) + + x = torch.addcmul(x, gate_mlp_x, self.mlp_x(apply_adaln_(self.norm_x2(x), shift_mlp_x, 
scale_mlp_x))) + y = torch.addcmul(y, gate_mlp_y, self.mlp_y(apply_adaln_(self.norm_y2(y), shift_mlp_y, scale_mlp_y))) + return x, y + + +class PixDiT_T2I(nn.Module): + """PixelDiT T2I model. Hardcoded for the released 1024px Stage-3 checkpoint + (also runs at 512px when fed the appropriate latent size and flow_shift). + + Forward: + x: [B, 3, H, W] pixel-space input (no VAE) + timesteps:[B] in [0, 1000] (ComfyUI flow sampling convention) + context: [B, Ltxt, 2304] Gemma-2-2b-it hidden states (chi_prompt prepended) + Returns flow-matching velocity [B, 3, H, W]. + """ + def __init__( + self, + in_channels=3, + num_groups=24, + hidden_size=1536, + pixel_hidden_size=16, + pixel_attn_hidden_size=1152, + pixel_num_groups=16, + patch_depth=14, + pixel_depth=2, + patch_size=16, + txt_embed_dim=2304, + txt_max_length=300, + use_text_rope=True, + text_rope_theta=10000.0, + image_model=None, + dtype=None, + device=None, + operations=None, + pixel_mlp_chunks=2, + ): + super().__init__() + self.dtype = dtype + self.in_channels = in_channels + self.out_channels = in_channels + self.hidden_size = hidden_size + self.num_groups = num_groups + self.patch_depth = patch_depth + self.pixel_depth = pixel_depth + self.patch_size = patch_size + self.pixel_hidden_size = pixel_hidden_size + self.pixel_attn_hidden_size = pixel_attn_hidden_size + self.pixel_num_groups = pixel_num_groups + self.txt_embed_dim = txt_embed_dim + self.txt_max_length = txt_max_length + self.use_text_rope = use_text_rope + self.text_rope_theta = text_rope_theta + + self.pixel_embedder = PixelTokenEmbedder(self.in_channels, self.pixel_hidden_size, dtype=dtype, device=device, operations=operations) + self.s_embedder = PatchTokenEmbedder(self.in_channels * self.patch_size ** 2, self.hidden_size, bias=True, dtype=dtype, device=device, operations=operations) + self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device, operations=operations, max_period=10) + self.y_embedder = 
PatchTokenEmbedder(self.txt_embed_dim, self.hidden_size, bias=True, use_norm=True, dtype=dtype, device=device, operations=operations) + self.y_pos_embedding = nn.Parameter(torch.empty(1, self.txt_max_length, self.hidden_size, dtype=dtype, device=device)) + + self.patch_blocks = nn.ModuleList([ + MMDiTBlockT2I(self.hidden_size, self.num_groups, + dtype=dtype, device=device, operations=operations) + for _ in range(self.patch_depth) + ]) + self.pixel_blocks = nn.ModuleList([ + PiTBlock( + self.pixel_hidden_size, + self.hidden_size, + patch_size=self.patch_size, + num_heads=self.num_groups, + attn_hidden_size=self.pixel_attn_hidden_size, + attn_num_heads=self.pixel_num_groups, + dtype=dtype, device=device, operations=operations, + mlp_chunks=pixel_mlp_chunks, + ) + for _ in range(self.pixel_depth) + ]) + + self.final_layer = FinalLayer(self.pixel_hidden_size, self.out_channels, dtype=dtype, device=device, operations=operations) + + def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts): + return precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width, device=device, dtype=dtype, **rope_opts) + + def _fetch_text_pos(self, length, device, dtype): + return rope(torch.arange(length, dtype=torch.float32, device=device).reshape(1, -1), self.hidden_size // self.num_groups, self.text_rope_theta).squeeze(0).to(dtype=dtype) + + def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options), + ).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs) + + def _pre_patch_block(self, s, i, **kwargs): + """Hook for subclasses to inject per-block state into the patch stream (e.g. 
PiD's LQ gate).""" + return s + + def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs): + H_orig, W_orig = x.shape[2], x.shape[3] + x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) + B, _, H, W = x.shape + Hs = H // self.patch_size + Ws = W // self.patch_size + L = Hs * Ws + + pos_img = self._fetch_patch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {})) + x_patches = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2) + + t_emb = self.t_embedder(timesteps.view(-1), x.dtype).view(B, -1, self.hidden_size) + + if context is None or context.dim() != 3: + raise ValueError("PixDiT_T2I requires context (text embeddings) of shape [B, L, D]") + Ltxt = min(context.shape[1], self.txt_max_length) + y = context[:, :Ltxt, :] + y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size) + y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb) # y_pos_embedding is a raw nn.Parameter + + condition = F.silu(t_emb) + pos_txt = self._fetch_text_pos(Ltxt, x.device, x.dtype) if self.use_text_rope else None + + s = self.s_embedder(x_patches) + for i, blk in enumerate(self.patch_blocks): + s = self._pre_patch_block(s, i, **kwargs) + s, y_emb = blk(s, y_emb, condition, pos_img, pos_txt, None, transformer_options=transformer_options) + s = F.silu(t_emb + s) + + s_cond = s.view(B * L, self.hidden_size) + x_pixels = self.pixel_embedder(x, patch_size=self.patch_size) + for blk in self.pixel_blocks: + x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask=None, transformer_options=transformer_options) + + x_pixels = self.final_layer(x_pixels) + C_out = self.out_channels + P2 = self.patch_size * self.patch_size + x_pixels = x_pixels.view(B, L, P2, C_out).permute(0, 3, 2, 1).reshape(B, C_out * P2, L) + out = F.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size) + return out[:, :, :H_orig, :W_orig] diff --git 
a/comfy/ldm/pixeldit/modules.py b/comfy/ldm/pixeldit/modules.py new file mode 100644 index 000000000..4b1e538c7 --- /dev/null +++ b/comfy/ldm/pixeldit/modules.py @@ -0,0 +1,187 @@ +import torch +import torch.nn as nn + +from comfy.ldm.flux.math import apply_rope, rope +from comfy.ldm.modules.attention import optimized_attention +from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, get_1d_sincos_pos_embed_from_grid_torch + + +def apply_adaln_(x, shift, scale): + return x.addcmul_(x, scale).add_(shift) + + +def precompute_freqs_cis_2d(dim, height, width, theta=10000.0, scale=16.0, + ref_grid_h=None, ref_grid_w=None, + scale_x=1.0, scale_y=1.0, shift_x=0.0, shift_y=0.0, + device=None, dtype=torch.float32, **kwargs): + """2D RoPE with x/y axis frequencies interleaved at stride 2 across head dim. + + rope_options: + scale_x / scale_y multiply the position range (RoPE extrapolation). + shift_x / shift_y offset the position origin (tiled / regional inference). + With ref_grid_h/w set, also applies NTK-aware per-axis theta scaling + (rope_mode='ntk_aware'): theta_axis = theta * (current/ref)^(dim_axis/(dim_axis-2)). + Returns Flux-format rotation matrices of shape [H*W, dim/2, 2, 2]. + Layout of head-dim pairs: [x_0, y_0, x_1, y_1, ..., x_{dim/4-1}, y_{dim/4-1}]. 
+ """ + dim_axis = dim // 2 + if ref_grid_h is not None and dim_axis > 2: + h_ntk = (height / ref_grid_h) ** (dim_axis / (dim_axis - 2)) + w_ntk = (width / ref_grid_w) ** (dim_axis / (dim_axis - 2)) + else: + h_ntk = w_ntk = 1.0 + + x_lin = torch.linspace(shift_x, scale * scale_x + shift_x, width, device=device) + y_lin = torch.linspace(shift_y, scale * scale_y + shift_y, height, device=device) + y_grid, x_grid = torch.meshgrid(y_lin, x_lin, indexing="ij") + x_rope = rope(x_grid.reshape(1, -1), dim_axis, theta * w_ntk).squeeze(0) + y_rope = rope(y_grid.reshape(1, -1), dim_axis, theta * h_ntk).squeeze(0) + out = torch.stack([x_rope, y_rope], dim=2).reshape(height * width, dim // 2, 2, 2) + return out.to(dtype=dtype) + + +def get_2d_sincos_pos_embed(embed_dim, height, width, device=None, dtype=torch.float32): + """Standard 2D sin/cos absolute positional embedding (ViT-style). + + first half encodes W-coordinates, second half H. + """ + assert embed_dim % 4 == 0 + grid_h = torch.arange(height, dtype=torch.float32, device=device) + grid_w = torch.arange(width, dtype=torch.float32, device=device) + grid_y, grid_x = torch.meshgrid(grid_h, grid_w, indexing="ij") + emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_x.reshape(-1), device=device) + emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_y.reshape(-1), device=device) + return torch.cat([emb_w, emb_h], dim=1).to(dtype=dtype) + + +class RotaryAttention(nn.Module): + """Single-stream self-attention with rotary positional encoding (used inside PiTBlock).""" + def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None): + super().__init__() + assert dim % num_heads == 0 + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm = 
operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.proj = operations.Linear(dim, dim, dtype=dtype, device=device) + + def forward(self, x, pos, mask=None, transformer_options={}): + B, N, C = x.shape + H = self.num_heads + D = self.head_dim + qkv = self.qkv(x).reshape(B, N, 3, H, D).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + q, k = apply_rope(self.q_norm(q), self.k_norm(k), pos[None, None]) + x = optimized_attention(q, k, v, H, mask=mask, skip_reshape=True, transformer_options=transformer_options) + return self.proj(x) + + +class FinalLayer(nn.Module): + def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None): + super().__init__() + self.norm = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device) + + def forward(self, x): + return self.linear(self.norm(x)) + + +class PatchTokenEmbedder(nn.Module): + """Linear projection used both for patchified-image tokens and text-feature tokens.""" + def __init__(self, in_chans, embed_dim, use_norm=False, bias=True, dtype=None, device=None, operations=None): + super().__init__() + self.proj = operations.Linear(in_chans, embed_dim, bias=bias, dtype=dtype, device=device) + self.norm = operations.RMSNorm(embed_dim, eps=1e-6, dtype=dtype, device=device) if use_norm else nn.Identity() + + def forward(self, x): + return self.norm(self.proj(x)) + + +class PixelTokenEmbedder(nn.Module): + """Pixel-level embedder: lifts each RGB pixel to hidden_size and packs into per-patch sequences.""" + def __init__(self, in_channels, hidden_size_output, dtype=None, device=None, operations=None): + super().__init__() + self.in_channels = in_channels + self.hidden_size_output = hidden_size_output + self.proj = operations.Linear(self.in_channels, self.hidden_size_output, bias=True, dtype=dtype, device=device) + + def forward(self, inputs, patch_size): + B, _, H, W = 
inputs.shape + Hs, Ws = H // patch_size, W // patch_size + P2 = patch_size * patch_size + x = inputs.permute(0, 2, 3, 1).contiguous() + x = self.proj(x) + pos_full = get_2d_sincos_pos_embed(self.hidden_size_output, H, W, device=x.device, dtype=x.dtype).view(H, W, self.hidden_size_output) + x = x + pos_full.unsqueeze(0) + x = x.view(B, Hs, patch_size, Ws, patch_size, self.hidden_size_output) + return x.permute(0, 1, 3, 2, 4, 5).reshape(B * Hs * Ws, P2, self.hidden_size_output) + + +class PiTBlock(nn.Module): + """Pixel-level transformer block. + + Compresses each patch's P^2 pixel tokens → 1 attention token via a linear, + runs global self-attention across patches with 2D RoPE, then expands back to P^2 tokens. + Conditioning is per-pixel adaLN from the patch-level features. + """ + def __init__(self, pixel_hidden_size, patch_hidden_size, patch_size, num_heads, mlp_ratio=4.0, + attn_hidden_size=None, attn_num_heads=None, dtype=None, device=None, operations=None, mlp_chunks=1): + super().__init__() + self.pixel_dim = pixel_hidden_size + self.context_dim = patch_hidden_size + self.attn_dim = attn_hidden_size if attn_hidden_size is not None else patch_hidden_size + self.num_heads = attn_num_heads if attn_num_heads is not None else num_heads + assert self.attn_dim % self.num_heads == 0 + + p2 = patch_size * patch_size + self.compress_to_attn = operations.Linear(p2 * self.pixel_dim, self.attn_dim, bias=True, dtype=dtype, device=device) + self.expand_from_attn = operations.Linear(self.attn_dim, p2 * self.pixel_dim, bias=True, dtype=dtype, device=device) + + self.norm1 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device) + self.attn = RotaryAttention(self.attn_dim, num_heads=self.num_heads, qkv_bias=False, dtype=dtype, device=device, operations=operations) + self.norm2 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device) + self.mlp = Mlp(self.pixel_dim, hidden_features=int(self.pixel_dim * mlp_ratio), dtype=dtype, device=device, 
operations=operations) + + self.adaLN_modulation_msa = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device) + self.adaLN_modulation_mlp = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device) + + self._rope_fn = precompute_freqs_cis_2d + self.mlp_chunks = max(1, int(mlp_chunks)) + + def _fetch_pos(self, height, width, device, dtype, **rope_opts): + return self._rope_fn(self.attn_dim // self.num_heads, height, width, device=device, dtype=dtype, **rope_opts) + + def forward(self, x, s_cond, image_height, image_width, patch_size, mask=None, transformer_options={}): + BL, P2, _ = x.shape + Hs, Ws = image_height // patch_size, image_width // patch_size + L = Hs * Ws + B = BL // L + + # Attention path uses only msa params; compute, use, free before mlp params allocate. + msa_params = self.adaLN_modulation_msa(s_cond).view(BL, P2, 3 * self.pixel_dim) + shift_msa, scale_msa, gate_msa = msa_params.chunk(3, dim=-1) + + x_norm = apply_adaln_(self.norm1(x), shift_msa, scale_msa) + x_flat = x_norm.view(BL, P2 * self.pixel_dim) + + x_comp = self.compress_to_attn(x_flat).view(B, L, self.attn_dim) + pos_comp = self._fetch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {})) + attn_out = self.attn(x_comp, pos_comp, mask=mask, transformer_options=transformer_options) + attn_flat = self.expand_from_attn(attn_out.view(B * L, self.attn_dim)) + attn_exp = attn_flat.view(BL, P2, self.pixel_dim) + x = torch.addcmul(x, gate_msa, attn_exp) + del msa_params, shift_msa, scale_msa, gate_msa + + mlp_params = self.adaLN_modulation_mlp(s_cond).view(BL, P2, 3 * self.pixel_dim) + shift_mlp, scale_mlp, gate_mlp = mlp_params.chunk(3, dim=-1) + gate_mlp = gate_mlp.contiguous() # detach from mlp_params so the del below frees shift+scale storage before the MLP + mlp_input = apply_adaln_(self.norm2(x), shift_mlp, scale_mlp) + del mlp_params, shift_mlp, scale_mlp + + # MLP in chunks 
since the peak memory usage is huge here + chunk_size = (BL + self.mlp_chunks - 1) // self.mlp_chunks + for s in range(0, BL, chunk_size): + e = min(s + chunk_size, BL) + x[s:e].addcmul_(gate_mlp[s:e], self.mlp(mlp_input[s:e])) + return x diff --git a/comfy/ldm/pixeldit/pid.py b/comfy/ldm/pixeldit/pid.py new file mode 100644 index 000000000..21b73907a --- /dev/null +++ b/comfy/ldm/pixeldit/pid.py @@ -0,0 +1,227 @@ +"""PiD — Pixel Diffusion Decoder. Decodes a Flux/SD3/Flux2/Z-Image latent +directly to a 4x-upscaled image in 4 distilled flow-matching steps. PixDiT_T2I +body + LQ projection branch injected before each MMDiT patch block. +""" + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .model import PixDiT_T2I +from .modules import precompute_freqs_cis_2d + + +class SigmaAwareGatePerTokenPerDim(nn.Module): + """gate = sigmoid(content_proj(cat[x, lq]) - exp(log_alpha) * sigma); out = x + gate * lq. + + Trained init gives ~0.88 gate at sigma=0, ~0.05 at sigma=1. + """ + + def __init__(self, dim: int, dtype=None, device=None, operations=None): + super().__init__() + self.content_proj = operations.Linear(dim * 2, dim, dtype=dtype, device=device) + self.log_alpha = nn.Parameter(torch.empty((), dtype=dtype, device=device)) + + def forward(self, x: torch.Tensor, lq: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor: + content_logit = self.content_proj(torch.cat([x, lq], dim=-1)) + # log_alpha is a raw nn.Parameter -> doesn't auto-cast under dynamic VRAM. 
+ log_alpha = self.log_alpha.to(device=x.device, dtype=torch.float32) + sigma_offset = -log_alpha.exp() * sigma.float().view(-1, 1, 1) + gate = torch.sigmoid(content_logit + sigma_offset) + return x + (gate * lq).to(x.dtype) + + +class ResBlock(nn.Module): + """Pre-activation ResNet block: GN -> SiLU -> Conv -> GN -> SiLU -> Conv + skip.""" + + def __init__(self, channels: int, num_groups: int = 4, dtype=None, device=None, operations=None): + super().__init__() + self.block = nn.Sequential( + operations.GroupNorm(num_groups, channels, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device), + operations.GroupNorm(num_groups, channels, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.block(x) + + +class LQProjection2D(nn.Module): + """LQ latent -> per-block patch-aligned features for controlnet-style injection.""" + + def __init__( + self, + latent_channels: int, + hidden_dim: int = 512, + out_dim: int = 1536, + patch_size: int = 16, + sr_scale: int = 4, + latent_spatial_down_factor: int = 8, + num_res_blocks: int = 4, + num_outputs: int = 7, + interval: int = 2, + dtype=None, device=None, operations=None, + ): + super().__init__() + self.latent_channels = latent_channels + self.hidden_dim = hidden_dim + self.out_dim = out_dim + self.patch_size = patch_size + self.sr_scale = sr_scale + self.latent_spatial_down_factor = latent_spatial_down_factor + self.num_outputs = num_outputs + self.interval = interval + + z_to_patch_ratio = (sr_scale * latent_spatial_down_factor) / patch_size + self.z_to_patch_ratio = z_to_patch_ratio + if z_to_patch_ratio >= 1: + self.latent_fold_factor = 0 + latent_proj_in_ch = latent_channels + else: + fold_factor = int(1 / z_to_patch_ratio) + assert fold_factor * z_to_patch_ratio == 1.0 + 
self.latent_fold_factor = fold_factor + latent_proj_in_ch = latent_channels * fold_factor * fold_factor + + layers = [ + operations.Conv2d(latent_proj_in_ch, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device), + ] + for _ in range(num_res_blocks): + layers.append(ResBlock(hidden_dim, dtype=dtype, device=device, operations=operations)) + self.latent_proj = nn.Sequential(*layers) + + self.output_heads = nn.ModuleList( + [operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device) for _ in range(num_outputs)] + ) + self.gate_modules = nn.ModuleList( + [SigmaAwareGatePerTokenPerDim(out_dim, dtype=dtype, device=device, operations=operations) + for _ in range(num_outputs)] + ) + + def is_gate_active(self, block_idx: int) -> bool: + return block_idx % self.interval == 0 + + def output_index(self, block_idx: int) -> int: + return block_idx // self.interval + + def gate(self, x: torch.Tensor, lq_feature: torch.Tensor, sigma: torch.Tensor, out_idx: int) -> torch.Tensor: + return self.gate_modules[out_idx](x, lq_feature, sigma) + + def _align_latent_to_patch_grid(self, lq_latent: torch.Tensor, pH: int, pW: int) -> torch.Tensor: + B, z_dim = lq_latent.shape[:2] + if self.z_to_patch_ratio >= 1: + if lq_latent.shape[2] != pH or lq_latent.shape[3] != pW: + z_aligned = F.interpolate(lq_latent, size=(pH, pW), mode="nearest") + else: + z_aligned = lq_latent + else: + f = self.latent_fold_factor + zH_expected, zW_expected = pH * f, pW * f + if lq_latent.shape[2] != zH_expected or lq_latent.shape[3] != zW_expected: + lq_latent = F.interpolate(lq_latent, size=(zH_expected, zW_expected), mode="nearest") + z_aligned = lq_latent.reshape(B, z_dim, pH, f, pW, f).permute(0, 1, 3, 5, 2, 4) + z_aligned = z_aligned.reshape(B, z_dim * f * f, pH, pW) + return self.latent_proj(z_aligned) + + def forward(self, lq_latent: torch.Tensor, target_pH: int, target_pW: int) 
-> List[torch.Tensor]: + feat = self._align_latent_to_patch_grid(lq_latent, target_pH, target_pW) + B, C, H, W = feat.shape + tokens = feat.permute(0, 2, 3, 1).contiguous().view(B, H * W, C) + return [head(tokens) for head in self.output_heads] + + +class PidNet(PixDiT_T2I): + """PixDiT_T2I + LQ injection (one sigma-gated feature inserted before each patch block).""" + + def __init__( + self, + lq_latent_channels: int = 16, + lq_hidden_dim: int = 512, + lq_num_res_blocks: int = 4, + lq_interval: int = 2, + sr_scale: int = 4, + latent_spatial_down_factor: int = 8, + rope_ref_h: int = 1024, # NTK ref resolution in PIXEL units: 1024px / patch=16 -> grid_ref=64. + rope_ref_w: int = 1024, + image_model=None, + dtype=None, device=None, operations=None, + **pixdit_kwargs, + ): + super().__init__(dtype=dtype, device=device, operations=operations, **pixdit_kwargs) + + self.rope_ref_grid_h = rope_ref_h // self.patch_size + self.rope_ref_grid_w = rope_ref_w // self.patch_size + + # Parent's PiTBlocks were built with plain RoPE — swap in NTK-aware. 
+ def _pit_rope_fn(head_dim, h, w, device=None, dtype=torch.float32, **rope_opts): + return precompute_freqs_cis_2d(head_dim, h, w, ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, device=device, dtype=dtype, **rope_opts) + for blk in self.pixel_blocks: + blk._rope_fn = _pit_rope_fn + + num_lq_outputs = (self.patch_depth + lq_interval - 1) // lq_interval + self.lq_proj = LQProjection2D( + latent_channels=lq_latent_channels, + hidden_dim=lq_hidden_dim, + out_dim=self.hidden_size, + patch_size=self.patch_size, + sr_scale=sr_scale, + latent_spatial_down_factor=latent_spatial_down_factor, + num_res_blocks=lq_num_res_blocks, + num_outputs=num_lq_outputs, + interval=lq_interval, + dtype=dtype, + device=device, + operations=operations, + ) + + def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts): + return precompute_freqs_cis_2d( + self.hidden_size // self.num_groups, + height, width, + ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, + device=device, dtype=dtype, **rope_opts, + ) + + def _pre_patch_block(self, s, i, pid_lq_features, pid_degrade_sigma, **kwargs): + if not self.lq_proj.is_gate_active(i): + return s + out_idx = self.lq_proj.output_index(i) + if out_idx >= len(pid_lq_features): + return s + return self.lq_proj.gate(s, pid_lq_features[out_idx], pid_degrade_sigma, out_idx) + + def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, lq_latent=None, degrade_sigma=None, **kwargs): + if lq_latent is None: + raise ValueError("PidNet requires lq_latent — attach via PiDConditioning") + expected_c = self.lq_proj.latent_channels + if lq_latent.shape[1] != expected_c: + raise ValueError( + f"Input latent has {lq_latent.shape[1]} channels, this model variant expects {expected_c}. " + f"Flux1/SD3 = 16 channels, Flux2 = 128 channels." + ) + B = x.shape[0] + # Match the backbone's pad_to_patch_size (round up) so the LQ grid lines up with the patch stream. 
+ Hs = -(-x.shape[2] // self.patch_size) + Ws = -(-x.shape[3] // self.patch_size) + + degrade_sigma = degrade_sigma.to(device=x.device, dtype=torch.float32).reshape(-1) + if degrade_sigma.numel() == 1 and B > 1: + degrade_sigma = degrade_sigma.expand(B).contiguous() + + lq_features = self.lq_proj(lq_latent=lq_latent.to(x), target_pH=Hs, target_pW=Ws) + + return super()._forward( + x, timesteps, + context=context, attention_mask=attention_mask, + transformer_options=transformer_options, + pid_lq_features=lq_features, + pid_degrade_sigma=degrade_sigma, + **kwargs, + ) diff --git a/comfy/ldm/qwen_image/model.py b/comfy/ldm/qwen_image/model.py index 0862f72f7..3462d8108 100644 --- a/comfy/ldm/qwen_image/model.py +++ b/comfy/ldm/qwen_image/model.py @@ -51,15 +51,6 @@ class FeedForward(nn.Module): return hidden_states -def apply_rotary_emb(x, freqs_cis): - if x.shape[1] == 0: - return x - - t_ = x.reshape(*x.shape[:-1], -1, 1, 2) - t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1] - return t_out.reshape(*x.shape) - - class QwenTimestepProjEmbeddings(nn.Module): def __init__(self, embedding_dim, pooled_projection_dim, use_additional_t_cond=False, dtype=None, device=None, operations=None): super().__init__() diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 962addb27..e032b7dcd 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -4,6 +4,7 @@ import dataclasses import torch from typing import NamedTuple +import comfy_aimdo.host_buffer from comfy.quant_ops import QuantizedTensor @@ -17,21 +18,18 @@ class TensorFileSlice(NamedTuple): def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None): if isinstance(tensor, QuantizedTensor): - if not isinstance(destination, QuantizedTensor): - return False - if tensor._layout_cls != destination._layout_cls: - return False - - if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream, + if not 
read_tensor_file_slice_into(tensor._qdata, + destination._qdata if destination is not None else None, stream=stream, destination2=(destination2._qdata if destination2 is not None else None)): return False - dst_orig_dtype = destination._params.orig_dtype - destination._params.copy_from(tensor._params, non_blocking=False) - destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype) + if destination is not None: + dst_orig_dtype = destination._params.orig_dtype + destination._params.copy_from(tensor._params, non_blocking=False) + destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype) if destination2 is not None: dst_orig_dtype = destination2._params.orig_dtype - destination2._params.copy_from(destination._params, non_blocking=True) + destination2._params.copy_from(destination._params if destination is not None else tensor._params, non_blocking=True) destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype) return True @@ -39,10 +37,15 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N if info is None: return False + if destination is not None and destination.device.type != "cpu" and destination2 is None: + destination2 = destination + destination = None + file_obj = info.file_ref - if (destination.device.type != "cpu" - or file_obj is None - or destination.numel() * destination.element_size() < info.size + if (file_obj is None + or (destination is None and destination2 is None) + or (destination is not None and (destination.device.type != "cpu" or destination.numel() * destination.element_size() < info.size)) + or (destination2 is not None and (destination2.device.type == "cpu" or destination2.numel() * destination2.element_size() < info.size)) or tensor.numel() * tensor.element_size() != info.size or tensor.storage_offset() != 0 or not tensor.is_contiguous()): @@ -51,6 +54,14 @@ def read_tensor_file_slice_into(tensor, destination, 
stream=None, destination2=N if info.size == 0: return True + if destination is None: + stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0 + comfy_aimdo.host_buffer.read_file_to_device(file_obj, info.offset, info.size, + stream_ptr, destination2.data_ptr(), + destination2.device.index, + mark_cold=False) + return True + hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None) if hostbuf is not None: stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0 @@ -63,6 +74,9 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N device=None if destination2 is None else destination2.device.index) return True + if not hasattr(file_obj, "seek") or not hasattr(file_obj, "readinto"): + return False + buf_type = ctypes.c_ubyte * info.size view = memoryview(buf_type.from_address(destination.data_ptr())) diff --git a/comfy/model_base.py b/comfy/model_base.py index d10e90399..99e120f31 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -50,6 +50,8 @@ import comfy.ldm.hunyuan3d.model import comfy.ldm.hidream.model import comfy.ldm.chroma.model import comfy.ldm.chroma_radiance.model +import comfy.ldm.pixeldit.model +import comfy.ldm.pixeldit.pid import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 import comfy.ldm.qwen_image.model @@ -1519,6 +1521,53 @@ class ZImagePixelSpace(Lumina2): BaseModel.__init__(self, model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiTPixelSpace) self.memory_usage_factor_conds = ("ref_latents",) + +class PixelDiTT2I(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, + unet_model=comfy.ldm.pixeldit.model.PixDiT_T2I) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + out["attention_mask"] = 
comfy.conds.CONDRegular(attention_mask) + return out + + +class PiD(PixelDiTT2I): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + BaseModel.__init__(self, model_config, model_type, device=device, + unet_model=comfy.ldm.pixeldit.pid.PidNet) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + lq_latent = kwargs.get("lq_latent", None) + if lq_latent is not None: + out["lq_latent"] = comfy.conds.CONDRegular(lq_latent) + degrade_sigma = kwargs.get("degrade_sigma", None) + if degrade_sigma is not None: + out["degrade_sigma"] = comfy.conds.CONDRegular(degrade_sigma) + return out + + def resize_cond_for_context_window(self, cond_key, cond_value, window, x_in, device, retain_index_list=[]): + if cond_key == "lq_latent" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor): + lq = cond_value.cond + dim = window.dim + if dim >= lq.ndim: + return None + lq_proj = self.diffusion_model.lq_proj + ratio = lq_proj.sr_scale * lq_proj.latent_spatial_down_factor + # Map x window indices -> lq indices (deduplicated, sorted, in-bounds). 
+ lq_size = lq.size(dim) + lq_indices = sorted({i // ratio for i in window.index_list if 0 <= i // ratio < lq_size}) + if not lq_indices: + return None + idx = tuple([slice(None)] * dim + [lq_indices]) + return cond_value._copy_with(lq[idx].to(device)) + return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list) + + class WAN21(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 2b0b98cd8..f0db7d388 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -463,6 +463,23 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["extra_per_block_abs_pos_emb_type"] = "learnable" return dit_config + # PiD (Pixel Diffusion Decoder). Must check BEFORE plain PixelDiT_T2I. 
+ _lq_w_key = '{}lq_proj.latent_proj.0.weight'.format(key_prefix) + if _lq_w_key in state_dict_keys: + in_ch = int(state_dict[_lq_w_key].shape[1]) + _gate_prefix = '{}lq_proj.gate_modules.'.format(key_prefix) + num_gates = len({k[len(_gate_prefix):].split('.')[0] + for k in state_dict_keys if k.startswith(_gate_prefix)}) + dit_config = {"image_model": "pid", + "lq_latent_channels": in_ch, + "latent_spatial_down_factor": 16 if in_ch >= 64 else 8} + if num_gates > 0: + dit_config["lq_interval"] = (14 + num_gates - 1) // num_gates + return dit_config + + if '{}core.pixel_embedder.proj.weight'.format(key_prefix) in state_dict_keys: # PixelDiT T2I + return {"image_model": "pixeldit_t2i"} + if '{}cap_embedder.1.weight'.format(key_prefix) in state_dict_keys and '{}noise_refiner.0.attention.k_norm.weight'.format(key_prefix) in state_dict_keys: # Lumina 2 dit_config = {} dit_config["image_model"] = "lumina2" diff --git a/comfy/model_management.py b/comfy/model_management.py index b01c4d7fa..dfd58bf1b 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -641,14 +641,17 @@ def free_pins(size, evict_active=False): return freed_total def ensure_pin_budget(size, evict_active=False): - shortfall = size + comfy.memory_management.RAM_CACHE_HEADROOM / 2 - psutil.virtual_memory().available + if args.fast_disk: + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + else: + shortfall = size + max(comfy.memory_management.RAM_CACHE_HEADROOM / 2, 2048 * 1024 ** 2) - psutil.virtual_memory().available if shortfall <= 0: return True to_free = shortfall + PIN_PRESSURE_HYSTERESIS return free_pins(to_free, evict_active=evict_active) >= shortfall -def ensure_pin_registerable(size, evict_active=False): +def ensure_pin_registerable(size, evict_active=True): shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY if MAX_PINNED_MEMORY <= 0: return False @@ -658,10 +661,17 @@ def ensure_pin_registerable(size, evict_active=False): shortfall += 
REGISTERABLE_PIN_HYSTERESIS for loaded_model in reversed(current_loaded_models): model = loaded_model.model - if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + if model is not None and model.is_dynamic() and not model.model.dynamic_pins[model.load_device]["active"]: shortfall -= model.unregister_inactive_pins(shortfall) if shortfall <= 0: return True + if evict_active: + for loaded_model in current_loaded_models: + model = loaded_model.model + if model is not None and model.is_dynamic() and model.model.dynamic_pins[model.load_device]["active"]: + shortfall -= model.unregister_inactive_pins(shortfall) + if shortfall <= 0: + return True return shortfall <= REGISTERABLE_PIN_HYSTERESIS class LoadedModel: @@ -803,9 +813,9 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins for x in can_unload_sorted: i = x[-1] memory_to_free = 1e32 - if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None): + if not DISABLE_SMART_MEMORY or device is None: memory_to_free = 0 if device is None else memory_required - get_free_memory(device) - if for_dynamic: + if current_loaded_models[i].model.is_dynamic() and for_dynamic: #don't actually unload dynamic models for the sake of other dynamic models #as that works on-demand. 
memory_required -= current_loaded_models[i].model.loaded_size() @@ -817,6 +827,10 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins for i in sorted(unloaded_model, reverse=True): unloaded_models.append(current_loaded_models.pop(i)) + if not for_dynamic and pins_required > 0: + ensure_pin_budget(pins_required) + ensure_pin_registerable(pins_required) + if len(unloaded_model) > 0: soft_empty_cache() elif device is not None: @@ -879,15 +893,19 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu model_to_unload.model_finalizer.detach() total_memory_required = {} + total_pins_required = {} for loaded_model in models_to_load: device = loaded_model.device total_memory_required[device] = total_memory_required.get(device, 0) + loaded_model.model_memory_required(device) + if not loaded_model.model.is_dynamic(): + total_pins_required[device] = total_pins_required.get(device, 0) + loaded_model.model_memory() for device in total_memory_required: if device != torch.device("cpu"): free_memory(total_memory_required[device] * 1.1 + extra_mem, device, - for_dynamic=free_for_dynamic) + for_dynamic=free_for_dynamic, + pins_required=total_pins_required.get(device, 0)) for device in total_memory_required: if device != torch.device("cpu"): @@ -1283,7 +1301,6 @@ STREAM_CAST_BUFFERS = {} LARGEST_CASTED_WEIGHT = (None, 0) STREAM_AIMDO_CAST_BUFFERS = {} LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) -STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 @@ -1326,42 +1343,13 @@ def get_aimdo_cast_buffer(offload_stream, device): STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer return cast_buffer -def get_pin_buffer(offload_stream): - pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) - if pin_buffer is None: - pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0, 0, pinned_hostbuf_size(8 * 1024**3), mark_cold=False) - STREAM_PIN_BUFFERS[offload_stream] = pin_buffer - elif offload_stream is not 
None: - event = getattr(pin_buffer, "_comfy_event", None) - if event is not None: - event.synchronize() - delattr(pin_buffer, "_comfy_event") - return pin_buffer - -def resize_pin_buffer(pin_buffer, size): - global TOTAL_PINNED_MEMORY - old_size = pin_buffer.size - if size <= old_size: - return True - growth = size - old_size - comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) - ensure_pin_budget(growth, evict_active=True) - ensure_pin_registerable(growth, evict_active=True) - try: - pin_buffer.extend(size=size, reallocate=True) - except RuntimeError: - return False - TOTAL_PINNED_MEMORY += pin_buffer.size - old_size - return True - def reset_cast_buffers(): - global TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT LARGEST_CASTED_WEIGHT = (None, 0) LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) - for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS): + for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS): if offload_stream is not None: offload_stream.synchronize() synchronize() @@ -1370,20 +1358,24 @@ def reset_cast_buffers(): mmap_obj.bounce() DIRTY_MMAPS.clear() - for pin_buffer in STREAM_PIN_BUFFERS.values(): - TOTAL_PINNED_MEMORY -= pin_buffer.size - TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY) - for loaded_model in current_loaded_models: model = loaded_model.model if model is not None and model.is_dynamic(): - model.model.dynamic_pins[model.load_device]["active"] = False + pin_state = model.model.dynamic_pins[model.load_device] + + if pin_state["active"]: + *_, buckets = pin_state["weights"] + for size, bucket in list(buckets.items()): + bucket[:] = [ entry for entry in bucket if entry[-1] is not None ] + if not bucket: + del buckets[size] + + pin_state["active"] = False model.partially_unload_ram(1e30, subsets=[ "patches" ]) - model.model.dynamic_pins[model.load_device]["patches"] = 
(comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0]) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0], [0], {}) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() - STREAM_PIN_BUFFERS.clear() soft_empty_cache() def get_offload_stream(device): @@ -1436,7 +1428,7 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None): if hasattr(wf_context, "as_context"): wf_context = wf_context.as_context(stream) - dest_views = comfy.memory_management.interpret_gathered_like(tensors, r) + dest_views = comfy.memory_management.interpret_gathered_like(tensors, r) if r is not None else [None] * len(tensors) dest2_views = comfy.memory_management.interpret_gathered_like(tensors, r2) if r2 is not None else None with wf_context: for tensor in tensors: @@ -1448,9 +1440,10 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None): continue storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() mark_mmap_dirty(storage) - dest_view.copy_(tensor, non_blocking=non_blocking) + if dest_view is not None: + dest_view.copy_(tensor, non_blocking=non_blocking) if dest2_view is not None: - dest2_view.copy_(dest_view, non_blocking=non_blocking) + dest2_view.copy_(tensor if dest_view is None else dest_view, non_blocking=non_blocking) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None): @@ -1723,6 +1716,13 @@ def is_device_xpu(device): def is_device_cuda(device): return is_device_type(device, 'cuda') +def set_torch_device(device): + """Set the current device for the given torch device. 
Supports CUDA and XPU.""" + if is_device_cuda(device): + torch.cuda.set_device(device) + elif is_device_xpu(device): + torch.xpu.set_device(device) + def is_directml_enabled(): global directml_enabled if directml_enabled: diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 00a15fa63..b716a69e2 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1721,8 +1721,8 @@ class ModelPatcherDynamic(ModelPatcher): """ if device not in self.model.dynamic_pins: self.model.dynamic_pins[device] = { - "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]), - "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]), + "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0], [0], {}), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0], [0], {}), "hostbufs_initialized": False, "failed": False, "active": False, @@ -1799,8 +1799,8 @@ class ModelPatcherDynamic(ModelPatcher): pin_state = self.model.dynamic_pins[self.load_device] if not pin_state["hostbufs_initialized"]: hostbuf_size = comfy.model_management.pinned_hostbuf_size(self.model_size()) - pin_state["weights"] = (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024, hostbuf_size), [], [-1], [0]) - pin_state["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, hostbuf_size), [], [-1], [0]) + pin_state["weights"] = (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024, hostbuf_size), [], [-1], [0], [0], {}) + pin_state["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, hostbuf_size), [], [-1], [0], [0], {}) pin_state["hostbufs_initialized"] = True pin_state["failed"] = False pin_state["active"] = True @@ -1942,18 +1942,16 @@ class ModelPatcherDynamic(ModelPatcher): return freed def loaded_ram_size(self): - return (self.model.dynamic_pins[self.load_device]["weights"][0].size + - self.model.dynamic_pins[self.load_device]["patches"][0].size) + return 
(self.model.dynamic_pins[self.load_device]["weights"][0].size) def pinned_memory_size(self): - return (self.model.dynamic_pins[self.load_device]["weights"][3][0] + - self.model.dynamic_pins[self.load_device]["patches"][3][0]) + return (self.model.dynamic_pins[self.load_device]["weights"][3][0]) def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] for subset in subsets: - hostbuf, stack, stack_split, pinned_size = pin_state[subset] + hostbuf, stack, stack_split, pinned_size, *_ = pin_state[subset] split = stack_split[0] while split >= 0: module, offset = stack[split] @@ -1978,10 +1976,12 @@ class ModelPatcherDynamic(ModelPatcher): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] for subset in subsets: - hostbuf, stack, stack_split, pinned_size = pin_state[subset] + hostbuf, stack, stack_split, pinned_size, *_ = pin_state[subset] while len(stack) > 0: module, offset = stack.pop() size = module._pin.numel() * module._pin.element_size() + module._pin_balancer_entry[-1] = None + del module._pin_balancer_entry del module._pin hostbuf.truncate(offset, do_unregister=module._pin_registered) stack_split[0] = min(stack_split[0], len(stack) - 1) diff --git a/comfy/model_prefetch.py b/comfy/model_prefetch.py index 72e11dec6..aa6d22d77 100644 --- a/comfy/model_prefetch.py +++ b/comfy/model_prefetch.py @@ -1,4 +1,5 @@ import comfy_aimdo.model_vbar +import comfy.memory_management import comfy.model_management import comfy.ops @@ -50,7 +51,17 @@ def prefetch_queue_pop(queue, device, module): if hasattr(s, "_v"): comfy_modules.append(s) + registerable_size = 0 + for s in comfy_modules: + registerable_size += comfy.memory_management.vram_aligned_size([s.weight, s.bias]) + for param_key in ("weight", "bias"): + lowvram_fn = getattr(s, param_key + "_lowvram_function", None) + if lowvram_fn is not None: + registerable_size += lowvram_fn.memory_required() + offload_stream = 
comfy.ops.cast_modules_with_vbar(comfy_modules, None, device, None, True) + if not comfy.model_management.args.fast_disk: + comfy.model_management.ensure_pin_registerable(registerable_size) comfy.model_management.sync_stream(device, offload_stream) queue[0] = (offload_stream, (prefetch, comfy_modules)) diff --git a/comfy/multigpu.py b/comfy/multigpu.py index e7f5b3d6f..bb9d334d3 100644 --- a/comfy/multigpu.py +++ b/comfy/multigpu.py @@ -17,7 +17,7 @@ class MultiGPUThreadPool: """Persistent thread pool for multi-GPU work distribution. Maintains one worker thread per extra GPU device. Each thread calls - torch.cuda.set_device() once at startup so that compiled kernel caches + set_torch_device() once at startup so that compiled kernel caches (inductor/triton) stay warm across diffusion steps. """ @@ -37,7 +37,7 @@ class MultiGPUThreadPool: def _worker_loop(self, device: torch.device, work_q: queue.Queue, result_q: queue.Queue): try: - torch.cuda.set_device(device) + comfy.model_management.set_torch_device(device) except Exception as e: logging.error(f"MultiGPUThreadPool: failed to set device {device}: {e}") while True: diff --git a/comfy/ops.py b/comfy/ops.py index 56445be8d..119177c37 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -76,8 +76,6 @@ except: cast_to = comfy.model_management.cast_to #TODO: remove once no more references -STREAM_PIN_BUFFER_HEADROOM = 8 * 1024 * 1024 - def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -94,9 +92,6 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin offload_stream = None cast_buffer = None cast_buffer_offset = 0 - stream_pin_hostbuf = None - stream_pin_offset = 0 - stream_pin_queue = [] def ensure_offload_stream(module, required_size, check_largest): nonlocal offload_stream @@ -130,22 +125,6 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, 
non_blockin cast_buffer_offset += buffer_size return buffer - def get_stream_pin_buffer_offset(buffer_size): - nonlocal stream_pin_hostbuf - nonlocal stream_pin_offset - - if buffer_size == 0 or offload_stream is None: - return None - - if stream_pin_hostbuf is None: - stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) - if stream_pin_hostbuf is None: - return None - - offset = stream_pin_offset - stream_pin_offset += buffer_size - return offset - for s in comfy_modules: signature = comfy_aimdo.model_vbar.vbar_fault(s._v) resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature) @@ -184,12 +163,18 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream, xfer_dest2=None): if xfer_source is not None: if getattr(xfer_source, "is_lowvram_patch", False): - xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) - else: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) + if xfer_dest is not None: + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + xfer_source = [ xfer_dest ] + xfer_dest = xfer_dest2 + xfer_dest2 = None + elif xfer_dest2 is not None: + xfer_source.prepare(xfer_dest2, stream, copy=True, commit=False) + return + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream, r2=xfer_dest2) def handle_pin(m, pin, source, dest, subset="weights", size=None): if pin is not None: @@ -198,19 +183,7 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin if signature is None: comfy.pinned_memory.pin_memory(m, subset=subset, size=size) pin = comfy.pinned_memory.get_pin(m, subset=subset) - if pin is not None: - if isinstance(source, list): - 
comfy.model_management.cast_to_gathered(source, pin, non_blocking=non_blocking, stream=offload_stream, r2=dest) - else: - cast_maybe_lowvram_patch(source, pin, None) - cast_maybe_lowvram_patch([ pin ], dest, offload_stream) - return - if pin is None: - pin_offset = get_stream_pin_buffer_offset(size) - if pin_offset is not None: - stream_pin_queue.append((source, pin_offset, size, dest)) - return - cast_maybe_lowvram_patch(source, dest, offload_stream) + cast_maybe_lowvram_patch(source, pin, offload_stream, xfer_dest2=dest) handle_pin(s, pin, xfer_source, xfer_dest, size=dest_size) @@ -232,23 +205,6 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin prefetch["needs_cast"] = needs_cast s._prefetch = prefetch - if stream_pin_offset > 0: - if stream_pin_hostbuf.size < stream_pin_offset: - if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): - for xfer_source, _, _, xfer_dest in stream_pin_queue: - cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) - return offload_stream - stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf) - stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf - for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: - pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] - if isinstance(xfer_source, list): - comfy.model_management.cast_to_gathered(xfer_source, pin, non_blocking=non_blocking, stream=offload_stream, r2=xfer_dest) - else: - cast_maybe_lowvram_patch(xfer_source, pin, None) - comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) - stream_pin_hostbuf._comfy_event = offload_stream.record_event() - return offload_stream diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 0e8f573ba..ffe12e0dc 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -1,17 +1,55 @@ +import bisect + import 
comfy.model_management import comfy.memory_management +import comfy.utils import comfy_aimdo.host_buffer import comfy_aimdo.torch import torch from comfy.cli_args import args +def _add_to_bucket(module, buckets, size, priority): + bucket = buckets.setdefault(size, []) + entry = [-priority, 0, module] + entry[1] = id(entry) + bisect.insort(bucket, entry) + module._pin_balancer_entry = entry + +def _steal_pin(module, stack, buckets, size, priority): + bucket = buckets.get(size) + if bucket is None: + return False + + while bucket and bucket[-1][-1] is None: + bucket.pop() + if not bucket: + del buckets[size] + return False + + if priority <= -bucket[-1][0]: + return False + + *_, victim = bucket.pop() + module._pin = victim._pin + module._pin_registered = victim._pin_registered + module._pin_stack_index = victim._pin_stack_index + stack[module._pin_stack_index] = (module, stack[module._pin_stack_index][1]) + + victim._pin_registered = False + del victim._pin + del victim._pin_stack_index + del victim._pin_balancer_entry + + _add_to_bucket(module, buckets, size, priority) + return True + def get_pin(module, subset="weights"): pin = getattr(module, "_pin", None) if pin is None or module._pin_registered or args.disable_pinned_memory: return pin - _, _, stack_split, pinned_size = module._pin_state[subset] + _, _, stack_split, pinned_size, *_ = module._pin_state[subset] size = pin.nbytes comfy.model_management.ensure_pin_registerable(size) @@ -31,26 +69,30 @@ def pin_memory(module, subset="weights", size=None): return pin = get_pin(module, subset) - if pin is not None or pin_state["failed"]: + if pin is not None: return - hostbuf, stack, stack_split, pinned_size = pin_state[subset] + hostbuf, stack, stack_split, pinned_size, counter, buckets = pin_state[subset] if size is None: size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size - registerable_size = size + max(0, hostbuf.size - pinned_size[0]) + registerable_size = size 
+ priority = getattr(module, "_pin_balancer_priority", None) + + if priority is None: + priority = comfy.utils.bit_reverse_range(counter[0], 16) + counter[0] += 1 + module._pin_balancer_priority = priority comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) if (not comfy.model_management.ensure_pin_budget(size) or not comfy.model_management.ensure_pin_registerable(registerable_size)): - pin_state["failed"] = True - return False + return _steal_pin(module, stack, buckets, size, priority) try: hostbuf.extend(size=size) except RuntimeError: - pin_state["failed"] = True - return False + return _steal_pin(module, stack, buckets, size, priority) module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] module._pin.untyped_storage()._comfy_hostbuf = hostbuf @@ -60,4 +102,5 @@ def pin_memory(module, subset="weights", size=None): stack_split[0] = max(stack_split[0], module._pin_stack_index) comfy.model_management.TOTAL_PINNED_MEMORY += size pinned_size[0] += size + _add_to_bucket(module, buckets, size, priority) return True diff --git a/comfy/samplers.py b/comfy/samplers.py index e31277f7b..25c5a855f 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -464,10 +464,7 @@ def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: t def _handle_batch(device: torch.device, batch_tuple: tuple[comfy.hooks.HookGroup, tuple], results: list[thread_result]): try: - # TODO: non-NVIDIA support -- guard with `if device.type == "cuda":` once - # we extend multigpu QA beyond CUDA. Unconditional call crashes on - # XPU/NPU/MPS/CPU/DirectML backends. 
- torch.cuda.set_device(device) + comfy.model_management.set_torch_device(device) model_current: BaseModel = model_options["multigpu_clones"][device].model # run every hooked_to_run separately with torch.no_grad(): diff --git a/comfy/sd.py b/comfy/sd.py index beb782310..30b877b85 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -49,6 +49,7 @@ import comfy.text_encoders.lt import comfy.text_encoders.hunyuan_video import comfy.text_encoders.cosmos import comfy.text_encoders.lumina2 +import comfy.text_encoders.pixeldit import comfy.text_encoders.wan import comfy.text_encoders.hidream import comfy.text_encoders.ace @@ -1285,6 +1286,7 @@ class CLIPType(Enum): LONGCAT_IMAGE = 26 COGVIDEOX = 27 LENS = 28 + PIXELDIT = 29 @@ -1528,8 +1530,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.tokenizer = variant.tokenizer tokenizer_data["tokenizer_json"] = clip_data[0].get("tokenizer_json", None) elif te_model == TEModel.GEMMA_2_2B: - clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) - clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer + if clip_type == CLIPType.PIXELDIT: + clip_target.clip = comfy.text_encoders.pixeldit.pixeldit_te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.pixeldit.PixelDiTGemma2Tokenizer + else: + clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) elif te_model == TEModel.GEMMA_3_4B: clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b") diff --git a/comfy/supported_models.py b/comfy/supported_models.py index e451892e9..00941da53 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -30,6 +30,7 @@ import comfy.text_encoders.longcat_image import comfy.text_encoders.ernie import comfy.text_encoders.cogvideo import 
comfy.text_encoders.hidream_o1 +import comfy.text_encoders.pixeldit from . import supported_models_base from . import latent_formats @@ -844,6 +845,8 @@ class Lens(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.Flux2 + memory_usage_factor = 4.0 + supported_inference_dtypes = [torch.bfloat16, torch.float32] # fp16 causes NaNs vae_key_prefix = ["vae."] @@ -1201,6 +1204,72 @@ class ZImagePixelSpace(ZImage): def get_model(self, state_dict, prefix="", device=None): return model_base.ZImagePixelSpace(self, device=device) +class PixelDiTT2I(supported_models_base.BASE): + unet_config = { + "image_model": "pixeldit_t2i", + } + + unet_extra_config = {} + + sampling_settings = { + "shift": 4.0, # 1024px stage 3 default; 2.0 for 512px + } + + latent_format = latent_formats.PixelDiTPixel + memory_usage_factor = 0.04 + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def get_model(self, state_dict, prefix="", device=None): + return model_base.PixelDiTT2I(self, device=device) + + def process_unet_state_dict(self, state_dict): + # pixel_dim from pixel_embedder.proj.weight = (pixel_dim, in_channels); p2 derived per-weight from total // (6 * pixel_dim). + pixel_dim = next(v for k, v in state_dict.items() if k.endswith("pixel_embedder.proj.weight")).shape[0] + + out = {} + marker = ".adaLN_modulation.0." + for k, v in state_dict.items(): + if k.startswith("_repa_projector") or k.startswith("net_ema."): + continue + if k.startswith("core."): + k = k[len("core."):] + elif k.startswith("net."): + k = k[len("net."):] + if "pixel_blocks." 
in k and marker in k: + # Split into msa (chunks 0-2) and mlp (chunks 3-5) for the two-Linear PiTBlock to reduce peak VRAM + p2 = v.shape[0] // (6 * pixel_dim) + trail = v.shape[1:] # () for bias, (in_dim,) for weight + vv = v.view(p2, 6, pixel_dim, *trail) + base, suffix = k.split(marker) + out[f"{base}.adaLN_modulation_msa.{suffix}"] = vv[:, 0:3].reshape(3 * p2 * pixel_dim, *trail).contiguous() + out[f"{base}.adaLN_modulation_mlp.{suffix}"] = vv[:, 3:6].reshape(3 * p2 * pixel_dim, *trail).contiguous() + else: + out[k] = v + return out + + def clip_target(self, state_dict={}): + return supported_models_base.ClipTarget( + comfy.text_encoders.pixeldit.PixelDiTGemma2Tokenizer, + comfy.text_encoders.pixeldit.PixelDiTGemma2TE, + ) + +class PiD(PixelDiTT2I): + unet_config = { + "image_model": "pid", + } + + sampling_settings = { + "shift": 1.5, # close approximation of the original distill 4 steps [0.999, 0.866, 0.634, 0.342, 0] + } + + memory_usage_factor = 0.04 + + def get_model(self, state_dict, prefix="", device=None): + return model_base.PiD(self, device=device) + class WAN21_T2V(supported_models_base.BASE): unet_config = { "image_model": "wan2.1", @@ -2111,6 +2180,8 @@ models = [ CosmosI2VPredict2, ZImagePixelSpace, ZImage, + PiD, + PixelDiTT2I, Lumina2, WAN22_T2V, WAN21_CausalAR_T2V, diff --git a/comfy/text_encoders/pixeldit.py b/comfy/text_encoders/pixeldit.py new file mode 100644 index 000000000..3539711e4 --- /dev/null +++ b/comfy/text_encoders/pixeldit.py @@ -0,0 +1,104 @@ +import torch + +from comfy import sd1_clip +from .lumina2 import Gemma2BTokenizer, LuminaModel +import comfy.text_encoders.llama + + +class PixelDiTGemma2_2BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + llama_quantization_metadata = model_options.get("llama_quantization_metadata", None) + if llama_quantization_metadata is not None: + model_options = model_options.copy() + 
model_options["quantization_metadata"] = llama_quantization_metadata + + super().__init__( + device=device, layer=layer, layer_idx=layer_idx, + textmodel_json_config={}, dtype=dtype, + special_tokens={"start": 2, "pad": 0}, + layer_norm_hidden_state=False, + model_class=comfy.text_encoders.llama.Gemma2_2B, + enable_attention_masks=attention_mask, + return_attention_masks=attention_mask, + model_options=model_options, + ) + + +_PIXELDIT_CHI_PROMPT = ( + 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions ' + "suitable for image generation. Evaluate the level of detail in the user prompt:\n" + "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, " + "and spatial relationships to create vivid and concrete scenes.\n" + "- If the prompt is already detailed, refine and enhance the existing details slightly without " + "overcomplicating.\n" + "Here are examples of how to transform or refine prompts:\n" + "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, " + "sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.\n" + "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring " + "glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus " + "passing by towering glass skyscrapers.\n" + "Please generate only the enhanced description for the prompt below and avoid including any " + "additional commentary or evaluations:\n" + "User Prompt: " +) + +_PIXELDIT_MAX_LENGTH = 300 +_PIXELDIT_CHI_PROMPT_DETECT_PREFIX = 'Given a user prompt, generate an "Enhanced prompt"' + + +class PixelDiTGemma2Tokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data=None): + if tokenizer_data is None: + tokenizer_data = {} + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, + 
name="gemma2_2b", tokenizer=Gemma2BTokenizer) + + def tokenize_with_weights(self, text, return_word_ids=False, **kwargs): + if not text.strip(): + return super().tokenize_with_weights("", return_word_ids=return_word_ids, disable_weights=True, min_length=_PIXELDIT_MAX_LENGTH) + + chi_token_count = len(self.gemma2_2b.tokenizer(_PIXELDIT_CHI_PROMPT)["input_ids"]) + combined = text if text.startswith(_PIXELDIT_CHI_PROMPT_DETECT_PREFIX) else _PIXELDIT_CHI_PROMPT + text + max_length_all = chi_token_count + _PIXELDIT_MAX_LENGTH - 2 + out = super().tokenize_with_weights(combined, return_word_ids=return_word_ids, + disable_weights=True, min_length=max_length_all) + out["gemma2_2b"] = [out["gemma2_2b"][0][:max_length_all]] + return out + + def untokenize(self, token_weight_pair): + return self.gemma2_2b.untokenize(token_weight_pair) + + def state_dict(self): + return self.gemma2_2b.state_dict() + + +class PixelDiTGemma2TE(LuminaModel): + # PixelDiT's select_index: keep BOS + last 299 embeddings of the padded sequence. 
+ def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="gemma2_2b", + clip_model=PixelDiTGemma2_2BModel, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + result = super().encode_token_weights(token_weight_pairs) + cond, pooled = result[0], result[1] + extra = result[2] if len(result) > 2 else None + if cond.shape[1] > _PIXELDIT_MAX_LENGTH: + cond = torch.cat([cond[:, :1], cond[:, -(_PIXELDIT_MAX_LENGTH - 1):]], dim=1) + if extra is not None and "attention_mask" in extra: + am = extra["attention_mask"] + extra["attention_mask"] = torch.cat([am[..., :1], am[..., -(_PIXELDIT_MAX_LENGTH - 1):]], dim=-1) + if extra is not None: + return cond, pooled, extra + return cond, pooled + + +def pixeldit_te(dtype_llama=None, llama_quantization_metadata=None): + class PixelDiTTE_(PixelDiTGemma2TE): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_quantization_metadata is not None: + model_options = model_options.copy() + model_options["llama_quantization_metadata"] = llama_quantization_metadata + if dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=model_options) + return PixelDiTTE_ diff --git a/comfy/utils.py b/comfy/utils.py index 49ae12b06..09d783fff 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -85,9 +85,9 @@ _TYPES = { def load_safetensors(ckpt): import comfy_aimdo.model_mmap - f = open(ckpt, "rb", buffering=0) file_lock = threading.Lock() model_mmap = comfy_aimdo.model_mmap.ModelMMAP(ckpt) + f = model_mmap.get_file_handle() file_size = os.path.getsize(ckpt) mv = memoryview((ctypes.c_uint8 * file_size).from_address(model_mmap.get())) @@ -1452,3 +1452,10 @@ def deepcopy_list_dict(obj, memo=None): memo[obj_id] = res return res + +def bit_reverse_range(index, bits): + result = 0 + for _ in range(bits): + result = (result << 1) | (index & 1) + index >>= 1 + return result diff --git 
a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index e0a585b10..294ad425e 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -5,7 +5,7 @@ from comfy_api.internal.singleton import ProxiedSingleton from comfy_api.internal.async_to_sync import create_sync_class from ._input import ImageInput, AudioInput, MaskInput, LatentInput, VideoInput from ._input_impl import VideoFromFile, VideoFromComponents -from ._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL, File3D +from ._util import VideoCodec, VideoContainer, VideoComponents, MESH, VOXEL, SPLAT, File3D from . import _io_public as io from . import _ui_public as ui from comfy_execution.utils import get_executing_context @@ -143,6 +143,7 @@ class Types: VideoComponents = VideoComponents MESH = MESH VOXEL = VOXEL + SPLAT = SPLAT File3D = File3D diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py index 5ed968960..a3aa508ce 100644 --- a/comfy_api/latest/_io.py +++ b/comfy_api/latest/_io.py @@ -28,7 +28,7 @@ if TYPE_CHECKING: from comfy_api.internal import (_ComfyNodeInternal, _NodeOutputInternal, classproperty, copy_class, first_real_override, is_class, prune_dict, shallow_clone_class) from comfy_execution.graph_utils import ExecutionBlocker -from ._util import MESH, VOXEL, SVG as _SVG, File3D +from ._util import MESH, VOXEL, SPLAT, SVG as _SVG, File3D class FolderType(str, Enum): @@ -684,6 +684,10 @@ class Voxel(ComfyTypeIO): class Mesh(ComfyTypeIO): Type = MESH +@comfytype(io_type="SPLAT") +class Splat(ComfyTypeIO): + Type = SPLAT + @comfytype(io_type="FILE_3D") class File3DAny(ComfyTypeIO): @@ -727,6 +731,30 @@ class File3DUSDZ(ComfyTypeIO): Type = File3D +@comfytype(io_type="FILE_3D_PLY") +class File3DPLY(ComfyTypeIO): + """PLY format 3D file - point cloud or Gaussian splat.""" + Type = File3D + + +@comfytype(io_type="FILE_3D_SPLAT") +class File3DSPLAT(ComfyTypeIO): + """SPLAT format 3D file - 3D Gaussian splat.""" + Type = File3D + + 
+@comfytype(io_type="FILE_3D_SPZ") +class File3DSPZ(ComfyTypeIO): + """SPZ format 3D file - compressed 3D Gaussian splat.""" + Type = File3D + + +@comfytype(io_type="FILE_3D_KSPLAT") +class File3DKSPLAT(ComfyTypeIO): + """KSPLAT format 3D file - 3D Gaussian splat.""" + Type = File3D + + @comfytype(io_type="HOOKS") class Hooks(ComfyTypeIO): if TYPE_CHECKING: @@ -762,14 +790,32 @@ class Accumulation(ComfyTypeIO): @comfytype(io_type="LOAD3D_CAMERA") class Load3DCamera(ComfyTypeIO): class CameraInfo(TypedDict): - position: dict[str, float | int] - target: dict[str, float | int] - zoom: int - cameraType: str + # Coordinate system: right-handed, Y-up, camera looks down -Z + position: dict[str, float | int] # scene units + target: dict[str, float | int] # scene units; OrbitControls focus point + zoom: float | int # dimensionless, 1 = 100% + cameraType: str # 'perspective' | 'orthographic' + quaternion: NotRequired[dict[str, float | int]] # normalized, dimensionless; camera world rotation + fov: NotRequired[float | int] # degrees, vertical FOV (perspective only) + aspect: NotRequired[float | int] # width / height (perspective only) + near: NotRequired[float | int] # scene units + far: NotRequired[float | int] # scene units + frustum: NotRequired[dict[str, float | int]] # orthographic only: {left, right, top, bottom} in scene units Type = CameraInfo +@comfytype(io_type="LOAD3D_MODEL_INFO") +class Load3DModelInfo(ComfyTypeIO): + class Model3DTransform(TypedDict): + # Coordinate system: right-handed, Y-up, world space + position: dict[str, float | int] # scene units + quaternion: dict[str, float | int] # normalized, dimensionless; world rotation + scale: dict[str, float | int] # dimensionless multiplier + + Type = list[Model3DTransform] + + @comfytype(io_type="LOAD_3D") class Load3D(ComfyTypeIO): """3D models are stored as a dictionary.""" @@ -779,6 +825,7 @@ class Load3D(ComfyTypeIO): normal: str camera_info: Load3DCamera.CameraInfo recording: NotRequired[str] + 
model_3d_info: NotRequired[list[Load3DModelInfo.Model3DTransform]] Type = Model3DDict @@ -2277,6 +2324,7 @@ __all__ = [ "LossMap", "Voxel", "Mesh", + "Splat", "File3DAny", "File3DGLB", "File3DGLTF", @@ -2284,6 +2332,10 @@ __all__ = [ "File3DOBJ", "File3DSTL", "File3DUSDZ", + "File3DPLY", + "File3DSPLAT", + "File3DSPZ", + "File3DKSPLAT", "Hooks", "HookKeyframes", "TimestepsRange", @@ -2291,6 +2343,7 @@ __all__ = [ "FlowControl", "Accumulation", "Load3DCamera", + "Load3DModelInfo", "Load3D", "Load3DAnimation", "Photomaker", diff --git a/comfy_api/latest/_ui.py b/comfy_api/latest/_ui.py index e238cdf3c..6592f6b1d 100644 --- a/comfy_api/latest/_ui.py +++ b/comfy_api/latest/_ui.py @@ -452,6 +452,16 @@ class PreviewUI3D(_UIOutput): return {"result": [self.model_file, self.camera_info, self.bg_image_path]} +class PreviewUI3DAdvanced(_UIOutput): + def __init__(self, model_file, camera_info, model_3d_info): + self.model_file = model_file + self.camera_info = camera_info + self.model_3d_info = model_3d_info + + def as_dict(self): + return {"result": [self.model_file, self.camera_info, self.model_3d_info]} + + class PreviewText(_UIOutput): def __init__(self, value: str, **kwargs): self.value = value @@ -471,5 +481,6 @@ __all__ = [ "PreviewAudio", "PreviewVideo", "PreviewUI3D", + "PreviewUI3DAdvanced", "PreviewText", ] diff --git a/comfy_api/latest/_util/__init__.py b/comfy_api/latest/_util/__init__.py index 115baf392..b27f5a97e 100644 --- a/comfy_api/latest/_util/__init__.py +++ b/comfy_api/latest/_util/__init__.py @@ -1,5 +1,5 @@ from .video_types import VideoContainer, VideoCodec, VideoComponents -from .geometry_types import VOXEL, MESH, File3D +from .geometry_types import VOXEL, MESH, SPLAT, File3D from .image_types import SVG __all__ = [ @@ -9,6 +9,7 @@ __all__ = [ "VideoComponents", "VOXEL", "MESH", + "SPLAT", "File3D", "SVG", ] diff --git a/comfy_api/latest/_util/geometry_types.py b/comfy_api/latest/_util/geometry_types.py index cdde60b10..84a18d69a 100644 --- 
a/comfy_api/latest/_util/geometry_types.py +++ b/comfy_api/latest/_util/geometry_types.py @@ -11,13 +11,32 @@ class VOXEL: self.data = data +class SPLAT: + """A batch of 3D Gaussian splats in render-ready (activated, world-space) form. + + Tensors are (B, N, ...) and zero-padded to a common N across the batch; `counts` (B,) holds the + real per-item lengths (None when rows are uniform and no slicing is needed). SH coefficients are + stored as (B, N, K, 3) with K = (sh_degree + 1)**2; the DC (diffuse) term is sh[..., 0, :]. + """ + + def __init__(self, positions: torch.Tensor, scales: torch.Tensor, rotations: torch.Tensor, + opacities: torch.Tensor, sh: torch.Tensor, counts: torch.Tensor | None = None): + self.positions = positions # (B, N, 3) world-space centers + self.scales = scales # (B, N, 3) linear (positive) per-axis std + self.rotations = rotations # (B, N, 4) quaternion wxyz (normalized) + self.opacities = opacities # (B, N, 1) in [0, 1] + self.sh = sh # (B, N, K, 3) spherical-harmonic color coefficients + self.counts = counts # (B,) real lengths, or None + + class MESH: def __init__(self, vertices: torch.Tensor, faces: torch.Tensor, uvs: torch.Tensor | None = None, vertex_colors: torch.Tensor | None = None, texture: torch.Tensor | None = None, vertex_counts: torch.Tensor | None = None, - face_counts: torch.Tensor | None = None): + face_counts: torch.Tensor | None = None, + unlit: bool = False): assert (vertex_counts is None) == (face_counts is None), \ "vertex_counts and face_counts must be provided together (both or neither)" @@ -30,6 +49,8 @@ class MESH: # these hold the real per-item lengths (B,). None means rows are uniform and no slicing is needed. self.vertex_counts = vertex_counts self.face_counts = face_counts + # Render flat / emissive (no scene lighting) when saved, e.g. for gaussian-splat-derived meshes. 
+ self.unlit = unlit class File3D: diff --git a/comfy_api_nodes/apis/beeble.py b/comfy_api_nodes/apis/beeble.py new file mode 100644 index 000000000..90175b214 --- /dev/null +++ b/comfy_api_nodes/apis/beeble.py @@ -0,0 +1,32 @@ +from pydantic import BaseModel, Field + + +class CreateSwitchXRequest(BaseModel): + generation_type: str = Field(...) + source_uri: str = Field(...) + alpha_mode: str = Field(...) + prompt: str | None = Field(None, max_length=2000) + reference_image_uri: str | None = Field(None) + alpha_uri: str | None = Field(None) + max_resolution: int = Field(1080) + callback_url: str | None = Field(None) + idempotency_key: str | None = Field(None, max_length=256, min_length=1) + + +class SwitchXOutputUrls(BaseModel): + render: str | None = Field(None) + source: str | None = Field(None) + alpha: str | None = Field(None) + + +class SwitchXStatusResponse(BaseModel): + id: str = Field(...) + status: str = Field(...) + progress: int | None = Field(None) + generation_type: str | None = Field(None) + alpha_mode: str | None = Field(None) + output: SwitchXOutputUrls | None = Field(None) + error: str | None = Field(None) + created_at: str | None = Field(None) + modified_at: str | None = Field(None) + completed_at: str | None = Field(None) diff --git a/comfy_api_nodes/apis/bytedance.py b/comfy_api_nodes/apis/bytedance.py index 03f4c445b..47f24586c 100644 --- a/comfy_api_nodes/apis/bytedance.py +++ b/comfy_api_nodes/apis/bytedance.py @@ -158,8 +158,9 @@ class SeedanceCreateAssetResponse(BaseModel): class SeedanceVirtualLibraryCreateAssetRequest(BaseModel): - url: str = Field(..., description="Publicly accessible URL of the image asset to upload.") + url: str = Field(..., description="Publicly accessible URL of the asset to upload.") hash: str = Field(..., description="Dedup key. Re-submitting the same hash returns the existing asset id.") + asset_type: str | None = Field(None, description="BytePlus asset type. 
Defaults to Image server-side when omitted.") # Dollars per 1K tokens, keyed by (model_id, has_video_input). diff --git a/comfy_api_nodes/apis/krea.py b/comfy_api_nodes/apis/krea.py new file mode 100644 index 000000000..6e294a3b7 --- /dev/null +++ b/comfy_api_nodes/apis/krea.py @@ -0,0 +1,46 @@ +"""Pydantic models for the Krea image-generation API.""" + +from pydantic import BaseModel, Field + + +class KreaMoodboard(BaseModel): + id: str = Field(...) + strength: float = Field(default=0.35, ge=-0.5, le=1.5) + + +class KreaImageStyleReference(BaseModel): + strength: float = Field(..., ge=-2.0, le=2.0) + url: str | None = Field(default=None) + + +class KreaGenerateImageRequest(BaseModel): + prompt: str = Field(...) + aspect_ratio: str = Field(...) + resolution: str = Field(...) + seed: int | None = Field(default=None) + creativity: str = Field(default="medium") + moodboards: list[KreaMoodboard] | None = Field(default=None) + image_style_references: list[KreaImageStyleReference] | None = Field(default=None) + + +class KreaJobResult(BaseModel): + urls: list[str] | None = Field(default=None) + style_id: str | None = Field(default=None) + + +class KreaJob(BaseModel): + job_id: str = Field(...) + status: str = Field(...) + created_at: str = Field(...) + completed_at: str | None = Field(default=None) + result: KreaJobResult | None = Field(default=None) + + +class KreaAssetResponse(BaseModel): + id: str = Field(...) + image_url: str = Field(...) + uploaded_at: str = Field(...) 
+ width: float | None = Field(default=None) + height: float | None = Field(default=None) + size_bytes: float | None = Field(default=None) + mime_type: str | None = Field(default=None) diff --git a/comfy_api_nodes/apis/tripo.py b/comfy_api_nodes/apis/tripo.py index bce6b0e89..7ac81d42c 100644 --- a/comfy_api_nodes/apis/tripo.py +++ b/comfy_api_nodes/apis/tripo.py @@ -1,25 +1,25 @@ from enum import Enum -from typing import Optional, Any +from typing import Any from pydantic import BaseModel, Field, RootModel class TripoModelVersion(str, Enum): - v3_1_20260211 = 'v3.1-20260211' - v3_0_20250812 = 'v3.0-20250812' - v2_5_20250123 = 'v2.5-20250123' - v2_0_20240919 = 'v2.0-20240919' - v1_4_20240625 = 'v1.4-20240625' + v3_1_20260211 = "v3.1-20260211" + v3_0_20250812 = "v3.0-20250812" + v2_5_20250123 = "v2.5-20250123" + v2_0_20240919 = "v2.0-20240919" + v1_4_20240625 = "v1.4-20240625" class TripoGeometryQuality(str, Enum): - standard = 'standard' - detailed = 'detailed' + standard = "standard" + detailed = "detailed" class TripoTextureQuality(str, Enum): - standard = 'standard' - detailed = 'detailed' + standard = "standard" + detailed = "detailed" class TripoStyle(str, Enum): @@ -33,6 +33,7 @@ class TripoStyle(str, Enum): ANCIENT_BRONZE = "ancient_bronze" NONE = "None" + class TripoTaskType(str, Enum): TEXT_TO_MODEL = "text_to_model" IMAGE_TO_MODEL = "image_to_model" @@ -45,26 +46,27 @@ class TripoTaskType(str, Enum): STYLIZE_MODEL = "stylize_model" CONVERT_MODEL = "convert_model" + class TripoTextureAlignment(str, Enum): ORIGINAL_IMAGE = "original_image" GEOMETRY = "geometry" + class TripoOrientation(str, Enum): ALIGN_IMAGE = "align_image" DEFAULT = "default" + class TripoOutFormat(str, Enum): GLB = "glb" FBX = "fbx" -class TripoTopology(str, Enum): - BIP = "bip" - QUAD = "quad" class TripoSpec(str, Enum): MIXAMO = "mixamo" TRIPO = "tripo" + class TripoAnimation(str, Enum): IDLE = "preset:idle" WALK = "preset:walk" @@ -83,11 +85,6 @@ class TripoAnimation(str, Enum): 
SERPENTINE_MARCH = "preset:serpentine:march" AQUATIC_MARCH = "preset:aquatic:march" -class TripoStylizeStyle(str, Enum): - LEGO = "lego" - VOXEL = "voxel" - VORONOI = "voronoi" - MINECRAFT = "minecraft" class TripoConvertFormat(str, Enum): GLTF = "GLTF" @@ -97,6 +94,7 @@ class TripoConvertFormat(str, Enum): STL = "STL" _3MF = "3MF" + class TripoTextureFormat(str, Enum): BMP = "BMP" DPX = "DPX" @@ -108,6 +106,7 @@ class TripoTextureFormat(str, Enum): TIFF = "TIFF" WEBP = "WEBP" + class TripoTaskStatus(str, Enum): QUEUED = "queued" RUNNING = "running" @@ -118,183 +117,223 @@ class TripoTaskStatus(str, Enum): BANNED = "banned" EXPIRED = "expired" + class TripoFbxPreset(str, Enum): BLENDER = "blender" MIXAMO = "mixamo" _3DSMAX = "3dsmax" + class TripoFileTokenReference(BaseModel): - type: Optional[str] = Field(None, description='The type of the reference') + type: str | None = Field(None, description="The type of the reference") file_token: str + class TripoUrlReference(BaseModel): - type: Optional[str] = Field(None, description='The type of the reference') + type: str | None = Field(None, description="The type of the reference") url: str + class TripoObjectStorage(BaseModel): bucket: str key: str + class TripoObjectReference(BaseModel): type: str object: TripoObjectStorage + class TripoFileEmptyReference(BaseModel): pass + class TripoFileReference(RootModel): root: TripoFileTokenReference | TripoUrlReference | TripoObjectReference | TripoFileEmptyReference -class TripoGetStsTokenRequest(BaseModel): - format: str = Field(..., description='The format of the image') class TripoTextToModelRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.TEXT_TO_MODEL, description='Type of task') - prompt: str = Field(..., description='The text prompt describing the model to generate', max_length=1024) - negative_prompt: Optional[str] = Field(None, description='The negative text prompt', max_length=1024) - model_version: Optional[TripoModelVersion] = 
TripoModelVersion.v2_5_20250123 - face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to') - texture: Optional[bool] = Field(True, description='Whether to apply texture to the generated model') - pbr: Optional[bool] = Field(True, description='Whether to apply PBR to the generated model') - image_seed: Optional[int] = Field(None, description='The seed for the text') - model_seed: Optional[int] = Field(None, description='The seed for the model') - texture_seed: Optional[int] = Field(None, description='The seed for the texture') - texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard - geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard - style: Optional[TripoStyle] = None - auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model') - quad: Optional[bool] = Field(False, description='Whether to apply quad to the generated model') + type: TripoTaskType = Field(TripoTaskType.TEXT_TO_MODEL, description="Type of task") + prompt: str = Field(..., description="The text prompt describing the model to generate", max_length=1024) + negative_prompt: str | None = Field(None, description="The negative text prompt", max_length=1024) + model_version: TripoModelVersion | None = TripoModelVersion.v2_5_20250123 + face_limit: int | None = Field(None, description="The number of faces to limit the generation to") + texture: bool | None = Field(True, description="Whether to apply texture to the generated model") + pbr: bool | None = Field(True, description="Whether to apply PBR to the generated model") + image_seed: int | None = Field(None, description="The seed for the text") + model_seed: int | None = Field(None, description="The seed for the model") + texture_seed: int | None = Field(None, description="The seed for the texture") + texture_quality: TripoTextureQuality | None = TripoTextureQuality.standard + geometry_quality: TripoGeometryQuality | None = 
TripoGeometryQuality.standard + style: TripoStyle | None = None + auto_size: bool | None = Field(False, description="Whether to auto-size the model") + quad: bool | None = Field(False, description="Whether to apply quad to the generated model") + class TripoImageToModelRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.IMAGE_TO_MODEL, description='Type of task') - file: TripoFileReference = Field(..., description='The file reference to convert to a model') - model_version: Optional[TripoModelVersion] = Field(None, description='The model version to use for generation') - face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to') - texture: Optional[bool] = Field(True, description='Whether to apply texture to the generated model') - pbr: Optional[bool] = Field(True, description='Whether to apply PBR to the generated model') - model_seed: Optional[int] = Field(None, description='The seed for the model') - texture_seed: Optional[int] = Field(None, description='The seed for the texture') - texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard - geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard - texture_alignment: Optional[TripoTextureAlignment] = Field(TripoTextureAlignment.ORIGINAL_IMAGE, description='The texture alignment method') - style: Optional[TripoStyle] = Field(None, description='The style to apply to the generated model') - auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model') - orientation: Optional[TripoOrientation] = TripoOrientation.DEFAULT - quad: Optional[bool] = Field(False, description='Whether to apply quad to the generated model') + type: TripoTaskType = Field(TripoTaskType.IMAGE_TO_MODEL, description="Type of task") + file: TripoFileReference = Field(..., description="The file reference to convert to a model") + model_version: TripoModelVersion | None = Field(None, description="The model version to use for 
generation") + face_limit: int | None = Field(None, description="The number of faces to limit the generation to") + texture: bool | None = Field(True, description="Whether to apply texture to the generated model") + pbr: bool | None = Field(True, description="Whether to apply PBR to the generated model") + model_seed: int | None = Field(None, description="The seed for the model") + texture_seed: int | None = Field(None, description="The seed for the texture") + texture_quality: TripoTextureQuality | None = TripoTextureQuality.standard + geometry_quality: TripoGeometryQuality | None = TripoGeometryQuality.standard + texture_alignment: TripoTextureAlignment | None = Field( + TripoTextureAlignment.ORIGINAL_IMAGE, description="The texture alignment method" + ) + style: TripoStyle | None = Field(None, description="The style to apply to the generated model") + auto_size: bool | None = Field(False, description="Whether to auto-size the model") + orientation: TripoOrientation | None = TripoOrientation.DEFAULT + quad: bool | None = Field(False, description="Whether to apply quad to the generated model") + class TripoMultiviewToModelRequest(BaseModel): type: TripoTaskType = TripoTaskType.MULTIVIEW_TO_MODEL - files: list[TripoFileReference] = Field(..., description='The file references to convert to a model') - model_version: Optional[TripoModelVersion] = Field(None, description='The model version to use for generation') - orthographic_projection: Optional[bool] = Field(False, description='Whether to use orthographic projection') - face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to') - texture: Optional[bool] = Field(True, description='Whether to apply texture to the generated model') - pbr: Optional[bool] = Field(True, description='Whether to apply PBR to the generated model') - model_seed: Optional[int] = Field(None, description='The seed for the model') - texture_seed: Optional[int] = Field(None, description='The seed for 
the texture') - texture_quality: Optional[TripoTextureQuality] = TripoTextureQuality.standard - geometry_quality: Optional[TripoGeometryQuality] = TripoGeometryQuality.standard - texture_alignment: Optional[TripoTextureAlignment] = TripoTextureAlignment.ORIGINAL_IMAGE - auto_size: Optional[bool] = Field(False, description='Whether to auto-size the model') - orientation: Optional[TripoOrientation] = Field(TripoOrientation.DEFAULT, description='The orientation for the model') - quad: Optional[bool] = Field(False, description='Whether to apply quad to the generated model') + files: list[TripoFileReference] = Field(..., description="The file references to convert to a model") + model_version: TripoModelVersion | None = Field(None, description="The model version to use for generation") + orthographic_projection: bool | None = Field(False, description="Whether to use orthographic projection") + face_limit: int | None = Field(None, description="The number of faces to limit the generation to") + texture: bool | None = Field(True, description="Whether to apply texture to the generated model") + pbr: bool | None = Field(True, description="Whether to apply PBR to the generated model") + model_seed: int | None = Field(None, description="The seed for the model") + texture_seed: int | None = Field(None, description="The seed for the texture") + texture_quality: TripoTextureQuality | None = TripoTextureQuality.standard + geometry_quality: TripoGeometryQuality | None = TripoGeometryQuality.standard + texture_alignment: TripoTextureAlignment | None = TripoTextureAlignment.ORIGINAL_IMAGE + auto_size: bool | None = Field(False, description="Whether to auto-size the model") + orientation: TripoOrientation | None = Field(TripoOrientation.DEFAULT, description="The orientation for the model") + quad: bool | None = Field(False, description="Whether to apply quad to the generated model") + class TripoTextureModelRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.TEXTURE_MODEL, 
description='Type of task') - original_model_task_id: str = Field(..., description='The task ID of the original model') - texture: Optional[bool] = Field(True, description='Whether to apply texture to the model') - pbr: Optional[bool] = Field(True, description='Whether to apply PBR to the model') - model_seed: Optional[int] = Field(None, description='The seed for the model') - texture_seed: Optional[int] = Field(None, description='The seed for the texture') - texture_quality: Optional[TripoTextureQuality] = Field(None, description='The quality of the texture') - texture_alignment: Optional[TripoTextureAlignment] = Field(TripoTextureAlignment.ORIGINAL_IMAGE, description='The texture alignment method') + type: TripoTaskType = Field(TripoTaskType.TEXTURE_MODEL, description="Type of task") + original_model_task_id: str = Field(..., description="The task ID of the original model") + texture: bool | None = Field(True, description="Whether to apply texture to the model") + pbr: bool | None = Field(True, description="Whether to apply PBR to the model") + model_seed: int | None = Field(None, description="The seed for the model") + texture_seed: int | None = Field(None, description="The seed for the texture") + texture_quality: TripoTextureQuality | None = Field(None, description="The quality of the texture") + texture_alignment: TripoTextureAlignment | None = Field( + TripoTextureAlignment.ORIGINAL_IMAGE, description="The texture alignment method" + ) + class TripoRefineModelRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.REFINE_MODEL, description='Type of task') - draft_model_task_id: str = Field(..., description='The task ID of the draft model') + type: TripoTaskType = Field(TripoTaskType.REFINE_MODEL, description="Type of task") + draft_model_task_id: str = Field(..., description="The task ID of the draft model") -class TripoAnimatePrerigcheckRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.ANIMATE_PRERIGCHECK, description='Type of task') - 
original_model_task_id: str = Field(..., description='The task ID of the original model') class TripoAnimateRigRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.ANIMATE_RIG, description='Type of task') - original_model_task_id: str = Field(..., description='The task ID of the original model') - out_format: Optional[TripoOutFormat] = Field(TripoOutFormat.GLB, description='The output format') - spec: Optional[TripoSpec] = Field(TripoSpec.TRIPO, description='The specification for rigging') + type: TripoTaskType = Field(TripoTaskType.ANIMATE_RIG, description="Type of task") + original_model_task_id: str = Field(..., description="The task ID of the original model") + out_format: TripoOutFormat | None = Field(TripoOutFormat.GLB, description="The output format") + spec: TripoSpec | None = Field(TripoSpec.TRIPO, description="The specification for rigging") + class TripoAnimateRetargetRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.ANIMATE_RETARGET, description='Type of task') - original_model_task_id: str = Field(..., description='The task ID of the original model') - animation: TripoAnimation = Field(..., description='The animation to apply') - out_format: Optional[TripoOutFormat] = Field(TripoOutFormat.GLB, description='The output format') - bake_animation: Optional[bool] = Field(True, description='Whether to bake the animation') + type: TripoTaskType = Field(TripoTaskType.ANIMATE_RETARGET, description="Type of task") + original_model_task_id: str = Field(..., description="The task ID of the original model") + animation: TripoAnimation = Field(..., description="The animation to apply") + out_format: TripoOutFormat | None = Field(TripoOutFormat.GLB, description="The output format") + bake_animation: bool | None = Field(True, description="Whether to bake the animation") -class TripoStylizeModelRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.STYLIZE_MODEL, description='Type of task') - style: TripoStylizeStyle = Field(..., 
description='The style to apply to the model') - original_model_task_id: str = Field(..., description='The task ID of the original model') - block_size: Optional[int] = Field(80, description='The block size for stylization') class TripoConvertModelRequest(BaseModel): - type: TripoTaskType = Field(TripoTaskType.CONVERT_MODEL, description='Type of task') - format: TripoConvertFormat = Field(..., description='The format to convert to') - original_model_task_id: str = Field(..., description='The task ID of the original model') - quad: Optional[bool] = Field(None, description='Whether to apply quad to the model') - force_symmetry: Optional[bool] = Field(None, description='Whether to force symmetry') - face_limit: Optional[int] = Field(None, description='The number of faces to limit the conversion to') - flatten_bottom: Optional[bool] = Field(None, description='Whether to flatten the bottom of the model') - flatten_bottom_threshold: Optional[float] = Field(None, description='The threshold for flattening the bottom') - texture_size: Optional[int] = Field(None, description='The size of the texture') - texture_format: Optional[TripoTextureFormat] = Field(TripoTextureFormat.JPEG, description='The format of the texture') - pivot_to_center_bottom: Optional[bool] = Field(None, description='Whether to pivot to the center bottom') - scale_factor: Optional[float] = Field(None, description='The scale factor for the model') - with_animation: Optional[bool] = Field(None, description='Whether to include animations') - pack_uv: Optional[bool] = Field(None, description='Whether to pack the UVs') - bake: Optional[bool] = Field(None, description='Whether to bake the model') - part_names: Optional[list[str]] = Field(None, description='The names of the parts to include') - fbx_preset: Optional[TripoFbxPreset] = Field(None, description='The preset for the FBX export') - export_vertex_colors: Optional[bool] = Field(None, description='Whether to export the vertex colors') - export_orientation: 
Optional[TripoOrientation] = Field(None, description='The orientation for the export') - animate_in_place: Optional[bool] = Field(None, description='Whether to animate in place') + type: TripoTaskType = Field(TripoTaskType.CONVERT_MODEL, description="Type of task") + format: TripoConvertFormat = Field(..., description="The format to convert to") + original_model_task_id: str = Field(..., description="The task ID of the original model") + quad: bool | None = Field(None, description="Whether to apply quad to the model") + force_symmetry: bool | None = Field(None, description="Whether to force symmetry") + face_limit: int | None = Field(None, description="The number of faces to limit the conversion to") + flatten_bottom: bool | None = Field(None, description="Whether to flatten the bottom of the model") + flatten_bottom_threshold: float | None = Field(None, description="The threshold for flattening the bottom") + texture_size: int | None = Field(None, description="The size of the texture") + texture_format: TripoTextureFormat | None = Field(TripoTextureFormat.JPEG, description="The format of the texture") + pivot_to_center_bottom: bool | None = Field(None, description="Whether to pivot to the center bottom") + scale_factor: float | None = Field(None, description="The scale factor for the model") + with_animation: bool | None = Field(None, description="Whether to include animations") + pack_uv: bool | None = Field(None, description="Whether to pack the UVs") + bake: bool | None = Field(None, description="Whether to bake the model") + part_names: list[str] | None = Field(None, description="The names of the parts to include") + fbx_preset: TripoFbxPreset | None = Field(None, description="The preset for the FBX export") + export_vertex_colors: bool | None = Field(None, description="Whether to export the vertex colors") + export_orientation: TripoOrientation | None = Field(None, description="The orientation for the export") + animate_in_place: bool | None = Field(None, 
description="Whether to animate in place") + + +class TripoP1CommonRequest(BaseModel): + """Fields supported by Tripo P1 across all input types.""" + + model_version: str = Field("P1-20260311") + model_seed: int | None = Field(None, description="Random seed for geometry generation") + face_limit: int | None = Field(None, ge=48, le=20000, description="Target face count (48-20000)") + texture: bool | None = Field(None, description="Enable texturing; pbr=True forces this true") + pbr: bool | None = Field(None, description="Enable PBR maps; when true, texture is also enabled") + texture_seed: int | None = Field(None, description="Random seed for texture generation") + texture_quality: str | None = Field(None, description='"standard" or "detailed"') + auto_size: bool | None = Field(None, description="Scale to real-world meters") + compress: str | None = Field(None, description='Only "geometry" is supported') + export_uv: bool | None = Field(None, description="Perform UV unwrapping during generation") + + +class TripoP1TextToModelRequest(TripoP1CommonRequest): + type: str = "text_to_model" + prompt: str = Field(..., max_length=1024) + negative_prompt: str | None = Field(None, max_length=255) + image_seed: int | None = None + + +class TripoP1ImageToModelRequest(TripoP1CommonRequest): + type: str = "image_to_model" + file: TripoFileReference + enable_image_autofix: bool | None = None + texture_alignment: str | None = Field(None, description='"original_image" or "geometry"') + orientation: str | None = Field(None, description='"default" or "align_image"; needs texture=true') + + +class TripoP1MultiviewToModelRequest(TripoP1CommonRequest): + """P1 multiview generation. + + Tripo requires `files` to be exactly four entries in [front, left, back, right] order with `{}` + (TripoFileEmptyReference) for omitted slots; front is required and at least two images total must be provided. 
+ """ + + type: str = "multiview_to_model" + files: list[TripoFileReference] + texture_alignment: str | None = None + orientation: str | None = None class TripoTaskOutput(BaseModel): - model: Optional[str] = Field(None, description='URL to the model') - base_model: Optional[str] = Field(None, description='URL to the base model') - pbr_model: Optional[str] = Field(None, description='URL to the PBR model') - rendered_image: Optional[str] = Field(None, description='URL to the rendered image') - riggable: Optional[bool] = Field(None, description='Whether the model is riggable') + model: str | None = Field(None, description="URL to the model") + base_model: str | None = Field(None, description="URL to the base model") + pbr_model: str | None = Field(None, description="URL to the PBR model") + rendered_image: str | None = Field(None, description="URL to the rendered image") + riggable: bool | None = Field(None, description="Whether the model is riggable") + class TripoTask(BaseModel): - task_id: str = Field(..., description='The task ID') - type: Optional[str] = Field(None, description='The type of task') - status: Optional[TripoTaskStatus] = Field(None, description='The status of the task') - input: Optional[dict[str, Any]] = Field(None, description='The input parameters for the task') - output: Optional[TripoTaskOutput] = Field(None, description='The output of the task') - progress: Optional[int] = Field(None, description='The progress of the task', ge=0, le=100) - create_time: Optional[int] = Field(None, description='The creation time of the task') - running_left_time: Optional[int] = Field(None, description='The estimated time left for the task') - queue_position: Optional[int] = Field(None, description='The position in the queue') + task_id: str = Field(..., description="The task ID") + type: str | None = Field(None, description="The type of task") + status: TripoTaskStatus | None = Field(None, description="The status of the task") + input: dict[str, Any] | None = 
Field(None, description="The input parameters for the task") + output: TripoTaskOutput | None = Field(None, description="The output of the task") + progress: int | None = Field(None, description="The progress of the task", ge=0, le=100) + create_time: int | None = Field(None, description="The creation time of the task") + running_left_time: int | None = Field(None, description="The estimated time left for the task") + queue_position: int | None = Field(None, description="The position in the queue") consumed_credit: int | None = Field(None) + class TripoTaskResponse(BaseModel): - code: int = Field(0, description='The response code') - data: TripoTask = Field(..., description='The task data') + code: int = Field(0, description="The response code") + data: TripoTask = Field(..., description="The task data") -class TripoGeneralResponse(BaseModel): - code: int = Field(0, description='The response code') - data: dict[str, str] = Field(..., description='The task ID data') - -class TripoBalanceData(BaseModel): - balance: float = Field(..., description='The account balance') - frozen: float = Field(..., description='The frozen balance') - -class TripoBalanceResponse(BaseModel): - code: int = Field(0, description='The response code') - data: TripoBalanceData = Field(..., description='The balance data') class TripoErrorResponse(BaseModel): - code: int = Field(..., description='The error code') - message: str = Field(..., description='The error message') - suggestion: str = Field(..., description='The suggestion for fixing the error') + code: int = Field(..., description="The error code") + message: str = Field(..., description="The error message") + suggestion: str = Field(..., description="The suggestion for fixing the error") diff --git a/comfy_api_nodes/nodes_anthropic.py b/comfy_api_nodes/nodes_anthropic.py index 42ec5708f..7805c96ce 100644 --- a/comfy_api_nodes/nodes_anthropic.py +++ b/comfy_api_nodes/nodes_anthropic.py @@ -155,7 +155,7 @@ class ClaudeNode(IO.ComfyNode): 
return IO.Schema( node_id="ClaudeNode", display_name="Anthropic Claude", - category="api node/text/Anthropic", + category="text/partner/Anthropic", essentials_category="Text Generation", description="Generate text responses with Anthropic's Claude models. " "Provide a text prompt and optionally one or more images for multimodal context.", diff --git a/comfy_api_nodes/nodes_beeble.py b/comfy_api_nodes/nodes_beeble.py new file mode 100644 index 000000000..f1082884c --- /dev/null +++ b/comfy_api_nodes/nodes_beeble.py @@ -0,0 +1,404 @@ +from fractions import Fraction + +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input, InputImpl, Types +from comfy_api_nodes.apis.beeble import ( + CreateSwitchXRequest, + SwitchXStatusResponse, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + bytesio_to_image_tensor, + convert_mask_to_image, + download_url_as_bytesio, + download_url_to_image_tensor, + download_url_to_video_output, + downscale_image_tensor, + downscale_video_to_max_pixels, + poll_op, + sync_op, + upload_image_to_comfyapi, + upload_video_to_comfyapi, + validate_string, + validate_video_frame_count, +) + +_MAX_PIXELS = 2_770_000 +_MAX_FRAMES = 240 +_MAX_PROMPT_LEN = 2000 + + +def _validate_inputs(prompt: str | None, reference_image: Input.Image | None) -> str | None: + """Beeble requires at least one of prompt or reference_image. 
Returns the cleaned prompt.""" + cleaned = prompt.strip() if prompt else "" + if not cleaned and reference_image is None: + raise ValueError("At least one of 'prompt' or 'reference_image' must be provided.") + if cleaned: + validate_string(cleaned, strip_whitespace=False, max_length=_MAX_PROMPT_LEN) + return cleaned or None + + +async def _upload_mask_as_image( + cls: type[IO.ComfyNode], + mask: Input.Image, + *, + wait_label: str, +) -> str: + """Encode a single-frame MASK (H, W) or (1, H, W) as a PNG and upload.""" + if mask.dim() == 2: + mask = mask.unsqueeze(0) + image = convert_mask_to_image(mask[:1]) + return await upload_image_to_comfyapi( + cls, + image, + mime_type="image/png", + wait_label=wait_label, + total_pixels=_MAX_PIXELS, + ) + + +async def _upload_mask_batch_as_video( + cls: type[IO.ComfyNode], + mask: Input.Image, + *, + frame_rate: Fraction, + source_frame_count: int, + wait_label: str, +) -> str: + """Encode a MASK batch (N, H, W) as a grayscale H.264 MP4 at frame_rate and upload. + + The matte is always downscaled to the pixel budget so it stays within Beeble's limit and + keeps the same dimensions as the (similarly downscaled) source — both use the same algorithm + from the same starting dimensions, and downscaling is a no-op when already within budget. + """ + if mask.dim() == 2: + mask = mask.unsqueeze(0) + if mask.shape[0] != source_frame_count: + raise ValueError( + f"Custom alpha video frame count ({mask.shape[0]}) does not match the " + f"source video frame count ({source_frame_count}). The Beeble API requires " + "one mask per source frame." 
+ ) + images = downscale_image_tensor(convert_mask_to_image(mask), _MAX_PIXELS) + alpha_video = InputImpl.VideoFromComponents(Types.VideoComponents(images=images, audio=None, frame_rate=frame_rate)) + return await upload_video_to_comfyapi(cls, alpha_video, wait_label=wait_label) + + +def _alpha_mode_input(*, video: bool) -> IO.DynamicCombo.Input: + """Build the alpha_mode DynamicCombo with mode-specific extra inputs.""" + select_keyframe_tooltip = ( + "First-frame keyframe mask. Beeble propagates this across the video." if video else "Grayscale keyframe mask." + ) + custom_tooltip = ( + "Per-frame grayscale mask covering the entire video. " + "Must have the same frame count as the source. " + "Connect a MASK output from SAM3_TrackToMask or similar." + if video + else "Grayscale mask to apply." + ) + return IO.DynamicCombo.Input( + "alpha_mode", + tooltip=( + "Controls how SwitchX decides what to keep vs. regenerate. " + "'auto' isolates the main subject automatically. " + "'fill' regenerates the entire frame while preserving geometry. " + "'select' propagates a first-frame keyframe across the clip. " + "'custom' uses a per-frame alpha matte you provide." + ), + options=[ + IO.DynamicCombo.Option("auto", []), + IO.DynamicCombo.Option("fill", []), + IO.DynamicCombo.Option( + "select", + [IO.Mask.Input("alpha_keyframe", tooltip=select_keyframe_tooltip)], + ), + IO.DynamicCombo.Option( + "custom", + [IO.Mask.Input("alpha_mask", tooltip=custom_tooltip)], + ), + ], + ) + + +def _common_inputs(*, source: IO.Input, video: bool) -> list[IO.Input]: + return [ + source, + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip=( + "Text description of the desired output (max 2000 chars). " + "At least one of 'prompt' or 'reference_image' is required." + ), + ), + IO.Image.Input( + "reference_image", + optional=True, + tooltip=( + "Reference image whose look (background, lighting, costume) the result " + "should adopt. 
At least one of 'reference_image' or 'prompt' is required." + ), + ), + _alpha_mode_input(video=video), + IO.Combo.Input( + "max_resolution", + options=["1080p", "720p"], + default="1080p", + tooltip="Maximum output resolution.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip=( + "Seed controls whether the node should re-run; " "results are non-deterministic regardless of seed." + ), + ), + ] + + +async def _submit_and_poll( + cls: type[IO.ComfyNode], + request: CreateSwitchXRequest, +) -> SwitchXStatusResponse: + initial = await sync_op( + cls, + ApiEndpoint(path="/proxy/beeble/v1/switchx/generations", method="POST"), + response_model=SwitchXStatusResponse, + data=request, + ) + return await poll_op( + cls, + ApiEndpoint(path=f"/proxy/beeble/v1/switchx/generations/{initial.id}"), + response_model=SwitchXStatusResponse, + status_extractor=lambda r: r.status, + progress_extractor=lambda r: r.progress, + ) + + +def _require_output_url(response: SwitchXStatusResponse, name: str) -> str: + if response.output is None or getattr(response.output, name) is None: + raise RuntimeError(f"Beeble job {response.id} completed without a {name!r} output URL.") + return getattr(response.output, name) + + +def _alpha_url(response: SwitchXStatusResponse, mode: str) -> str | None: + """URL of the alpha matte, or None when the mode produces no separate matte. + + 'fill' selects the whole frame, so Beeble writes no alpha asset even though the status + response still returns a (dangling) signed URL for it — fetching it 403s with S3 + AccessDenied. The other three modes ('auto', 'custom', 'select') all produce a real, + downloadable matte. 
+ """ + if mode == "fill" or response.output is None: + return None + return response.output.alpha + + +class BeebleSwitchXVideoEdit(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="BeebleSwitchXVideoEdit", + display_name="Beeble SwitchX Video Edit", + category="video/partner/Beeble", + description=( + "Edit a video with Beeble SwitchX. Switches anything in the scene (background, " + "lighting, costume) while preserving the original subject's pixels and motion. " + "Provide a reference image and/or text prompt to describe the new look. " + "Max 240 frames, max ~2.77MP per frame." + ), + inputs=_common_inputs(source=IO.Video.Input("video"), video=True), + outputs=[ + IO.Video.Output(display_name="video"), + IO.Video.Output( + display_name="alpha", + tooltip="The alpha matte Beeble used. Empty for 'fill' mode, which has no separate matte.", + ), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["max_resolution"]), + expr=""" + ( + $rate := widgets.max_resolution = "1080p" ? 
0.429 : 0.143; + {"type":"usd","usd": $rate, "format":{"suffix":"/30 frames"}} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + video: Input.Video, + prompt: str, + alpha_mode: dict, + max_resolution: str, + seed: int, + reference_image: Input.Image | None = None, + ) -> IO.NodeOutput: + cleaned_prompt = _validate_inputs(prompt, reference_image) + + validate_video_frame_count(video, max_frame_count=_MAX_FRAMES) + video = downscale_video_to_max_pixels(video, _MAX_PIXELS) + + mode = alpha_mode["alpha_mode"] + alpha_uri: str | None = None + if mode == "select": + alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_keyframe"], wait_label="Uploading keyframe") + elif mode == "custom": + alpha_uri = await _upload_mask_batch_as_video( + cls, + alpha_mode["alpha_mask"], + frame_rate=video.get_frame_rate(), + source_frame_count=video.get_frame_count(), + wait_label="Uploading alpha video", + ) + + source_uri = await upload_video_to_comfyapi(cls, video, wait_label="Uploading source") + reference_uri: str | None = None + if reference_image is not None: + reference_uri = await upload_image_to_comfyapi( + cls, + reference_image, + mime_type="image/png", + wait_label="Uploading reference", + total_pixels=_MAX_PIXELS, + ) + + request = CreateSwitchXRequest( + generation_type="video", + source_uri=source_uri, + alpha_mode=mode, + prompt=cleaned_prompt, + reference_image_uri=reference_uri, + alpha_uri=alpha_uri, + max_resolution=1080 if max_resolution == "1080p" else 720, + ) + response = await _submit_and_poll(cls, request) + + render = await download_url_to_video_output(_require_output_url(response, "render")) + alpha = None + if (alpha_url := _alpha_url(response, mode)) is not None: + alpha = await download_url_to_video_output(alpha_url) + return IO.NodeOutput(render, alpha) + + +class BeebleSwitchXImageEdit(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="BeebleSwitchXImageEdit", + 
display_name="Beeble SwitchX Image Edit", + category="image/partner/Beeble", + description=( + "Edit a single image with Beeble SwitchX. Switches anything in the scene " + "(background, lighting, costume) while preserving the original subject's pixels. " + "Provide a reference image and/or text prompt to describe the new look. " + "Max ~2.77MP." + ), + inputs=_common_inputs(source=IO.Image.Input("image"), video=False), + outputs=[ + IO.Image.Output(display_name="image"), + IO.Mask.Output( + display_name="alpha", + tooltip="The alpha matte Beeble used. Empty for 'fill' mode, which has no separate matte.", + ), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["max_resolution"]), + expr=""" + ( + $rate := widgets.max_resolution = "1080p" ? 0.429 : 0.143; + {"type":"usd","usd": $rate} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + prompt: str, + alpha_mode: dict, + max_resolution: str, + seed: int, + reference_image: Input.Image | None = None, + ) -> IO.NodeOutput: + cleaned_prompt = _validate_inputs(prompt, reference_image) + + image = downscale_image_tensor(image, _MAX_PIXELS) + + mode = alpha_mode["alpha_mode"] + alpha_uri: str | None = None + if mode == "select": + alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_keyframe"], wait_label="Uploading keyframe") + elif mode == "custom": + alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_mask"], wait_label="Uploading alpha") + + source_uri = await upload_image_to_comfyapi( + cls, + image, + mime_type="image/png", + wait_label="Uploading source", + total_pixels=None, + ) + reference_uri: str | None = None + if reference_image is not None: + reference_uri = await upload_image_to_comfyapi( + cls, + reference_image, + mime_type="image/png", + wait_label="Uploading reference", + total_pixels=_MAX_PIXELS, + ) 
+ + request = CreateSwitchXRequest( + generation_type="image", + source_uri=source_uri, + alpha_mode=mode, + prompt=cleaned_prompt, + reference_image_uri=reference_uri, + alpha_uri=alpha_uri, + max_resolution=1080 if max_resolution == "1080p" else 720, + ) + response = await _submit_and_poll(cls, request) + + render = await download_url_to_image_tensor(_require_output_url(response, "render")) + alpha_mask = None + if (alpha_url := _alpha_url(response, mode)) is not None: + alpha_image = bytesio_to_image_tensor(await download_url_as_bytesio(alpha_url), mode="L") + alpha_mask = alpha_image.squeeze(-1) if alpha_image.dim() == 4 else alpha_image + return IO.NodeOutput(render, alpha_mask) + + +class BeebleExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + BeebleSwitchXVideoEdit, + BeebleSwitchXImageEdit, + ] + + +async def comfy_entrypoint() -> BeebleExtension: + return BeebleExtension() diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py index 3f0ce29d8..f1a5dc5f0 100644 --- a/comfy_api_nodes/nodes_bfl.py +++ b/comfy_api_nodes/nodes_bfl.py @@ -42,7 +42,7 @@ class FluxProUltraImageNode(IO.ComfyNode): return IO.Schema( node_id="FluxProUltraImageNode", display_name="Flux 1.1 [pro] Ultra Image", - category="api node/image/BFL", + category="image/partner/BFL", description="Generates images using Flux Pro 1.1 Ultra via api based on prompt and resolution.", inputs=[ IO.String.Input( @@ -160,7 +160,7 @@ class FluxKontextProImageNode(IO.ComfyNode): return IO.Schema( node_id=cls.NODE_ID, display_name=cls.DISPLAY_NAME, - category="api node/image/BFL", + category="image/partner/BFL", description="Edits images using Flux.1 Kontext [pro] via api based on prompt and aspect ratio.", inputs=[ IO.String.Input( @@ -282,7 +282,7 @@ class FluxProExpandNode(IO.ComfyNode): return IO.Schema( node_id="FluxProExpandNode", display_name="Flux.1 Expand Image", - category="api node/image/BFL", + 
category="image/partner/BFL", description="Outpaints image based on prompt.", inputs=[ IO.Image.Input("image"), @@ -419,7 +419,7 @@ class FluxProFillNode(IO.ComfyNode): return IO.Schema( node_id="FluxProFillNode", display_name="Flux.1 Fill Image", - category="api node/image/BFL", + category="image/partner/BFL", description="Inpaints image based on mask and prompt.", inputs=[ IO.Image.Input("image"), @@ -545,7 +545,7 @@ class Flux2ProImageNode(IO.ComfyNode): return IO.Schema( node_id=cls.NODE_ID, display_name=cls.DISPLAY_NAME, - category="api node/image/BFL", + category="image/partner/BFL", description="Generates images synchronously based on prompt and resolution.", inputs=[ IO.String.Input( @@ -716,7 +716,7 @@ class Flux2ImageNode(IO.ComfyNode): return IO.Schema( node_id="Flux2ImageNode", display_name="Flux.2 Image", - category="api node/image/BFL", + category="image/partner/BFL", description="Generate images via Flux.2 [pro] or Flux.2 [max] from a prompt and optional reference images.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_bria.py b/comfy_api_nodes/nodes_bria.py index 4044ee3ea..53e763210 100644 --- a/comfy_api_nodes/nodes_bria.py +++ b/comfy_api_nodes/nodes_bria.py @@ -31,7 +31,7 @@ class BriaImageEditNode(IO.ComfyNode): return IO.Schema( node_id="BriaImageEditNode", display_name="Bria FIBO Image Edit", - category="api node/image/Bria", + category="image/partner/Bria", description="Edit images using Bria latest model", inputs=[ IO.Combo.Input("model", options=["FIBO"]), @@ -169,7 +169,7 @@ class BriaRemoveImageBackground(IO.ComfyNode): return IO.Schema( node_id="BriaRemoveImageBackground", display_name="Bria Remove Image Background", - category="api node/image/Bria", + category="image/partner/Bria", description="Remove the background from an image using Bria RMBG 2.0.", inputs=[ IO.Image.Input("image"), @@ -245,7 +245,7 @@ class BriaRemoveVideoBackground(IO.ComfyNode): return IO.Schema( node_id="BriaRemoveVideoBackground", 
display_name="Bria Remove Video Background", - category="api node/video/Bria", + category="video/partner/Bria", description="Remove the background from a video using Bria. ", inputs=[ IO.Video.Input("video"), diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py index e08fc0b01..3711bac1d 100644 --- a/comfy_api_nodes/nodes_bytedance.py +++ b/comfy_api_nodes/nodes_bytedance.py @@ -2,11 +2,12 @@ import hashlib import logging import math import re +from io import BytesIO import torch from typing_extensions import override -from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api.latest import IO, ComfyExtension, Input, Types from comfy_api_nodes.apis.bytedance import ( RECOMMENDED_PRESETS, RECOMMENDED_PRESETS_SEEDREAM_4, @@ -43,6 +44,7 @@ from comfy_api_nodes.util import ( ApiEndpoint, download_url_to_image_tensor, download_url_to_video_output, + downscale_image_tensor_by_max_side, downscale_video_to_max_pixels, get_number_of_images, image_tensor_pair_to_batch, @@ -121,6 +123,14 @@ def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: st ) +def _prepare_seedance_image(image: Input.Image) -> Input.Image: + """Auto-downscale a Seedance image input to the per-side limits, then validate it.""" + validate_image_aspect_ratio(image, (2, 5), (5, 2), strict=False) # 0.4 to 2.5 + image = downscale_image_tensor_by_max_side(image, max_side=6000) + validate_image_dimensions(image, min_width=300, min_height=300, max_width=6000, max_height=6000) + return image + + async def _resolve_reference_assets( cls: type[IO.ComfyNode], asset_ids: list[str], @@ -308,6 +318,26 @@ async def _seedance_virtual_library_upload_image_asset( return f"asset://{create_resp.asset_id}" +async def _seedance_virtual_library_upload_video_asset( + cls: type[IO.ComfyNode], + video: Input.Video, + *, + wait_label: str = "Uploading video", +) -> str: + buf = BytesIO() + video.save_to(buf, format=Types.VideoContainer.MP4, 
codec=Types.VideoCodec.H264) + video_hash = hashlib.sha256(buf.getbuffer()).hexdigest() + public_url = await upload_video_to_comfyapi(cls, video, wait_label=wait_label) + create_resp = await sync_op( + cls, + ApiEndpoint(path="/proxy/seedance/virtual-library/assets", method="POST"), + response_model=SeedanceCreateAssetResponse, + data=SeedanceVirtualLibraryCreateAssetRequest(url=public_url, hash=video_hash, asset_type="Video"), + ) + await _wait_for_asset_active(cls, create_resp.asset_id, group_id="virtual-library") + return f"asset://{create_resp.asset_id}" + + def _seedance2_price_extractor(model_id: str, has_video_input: bool): """Returns a price_extractor closure for Seedance 2.0 poll_op.""" rate = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input)) @@ -338,7 +368,7 @@ class ByteDanceImageNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceImageNode", display_name="ByteDance Image", - category="api node/image/ByteDance", + category="image/partner/ByteDance", description="Generate images using ByteDance models via api based on prompt", inputs=[ IO.Combo.Input("model", options=["seedream-3-0-t2i-250415"]), @@ -462,7 +492,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceSeedreamNode", display_name="ByteDance Seedream 4.5 & 5.0", - category="api node/image/ByteDance", + category="image/partner/ByteDance", description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.", inputs=[ IO.Combo.Input( @@ -724,7 +754,7 @@ class ByteDanceSeedreamNodeV2(IO.ComfyNode): return IO.Schema( node_id="ByteDanceSeedreamNodeV2", display_name="ByteDance Seedream 4.5 & 5.0", - category="api node/image/ByteDance", + category="image/partner/ByteDance", description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.", inputs=[ IO.String.Input( @@ -890,7 +920,7 @@ class ByteDanceTextToVideoNode(IO.ComfyNode): return IO.Schema( 
node_id="ByteDanceTextToVideoNode", display_name="ByteDance Text to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using ByteDance models via api based on prompt", inputs=[ IO.Combo.Input( @@ -1018,7 +1048,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceImageToVideoNode", display_name="ByteDance Image to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using ByteDance models via api based on image and prompt", inputs=[ IO.Combo.Input( @@ -1155,7 +1185,7 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceFirstLastFrameNode", display_name="ByteDance First-Last-Frame to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using prompt and first and last frames.", inputs=[ IO.Combo.Input( @@ -1303,7 +1333,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceImageReferenceNode", display_name="ByteDance Reference Images to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using prompt and reference images.", inputs=[ IO.Combo.Input( @@ -1546,7 +1576,7 @@ class ByteDance2TextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ByteDance2TextToVideoNode", display_name="ByteDance Seedance 2.0 Text to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using Seedance 2.0 models based on a text prompt.", inputs=[ IO.DynamicCombo.Input( @@ -1647,7 +1677,7 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="ByteDance2FirstLastFrameNode", display_name="ByteDance Seedance 2.0 First-Last-Frame to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate video using Seedance 2.0 from a 
first frame image and optional last frame image.", inputs=[ IO.DynamicCombo.Input( @@ -1760,6 +1790,11 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode): if last_frame is not None and last_frame_asset_id: raise ValueError("Provide only one of last_frame or last_frame_asset_id, not both.") + if first_frame is not None: + first_frame = _prepare_seedance_image(first_frame) + if last_frame is not None: + last_frame = _prepare_seedance_image(last_frame) + asset_ids_to_resolve = [a for a in (first_frame_asset_id, last_frame_asset_id) if a] image_assets: dict[str, str] = {} if asset_ids_to_resolve: @@ -1866,7 +1901,7 @@ def _seedance2_reference_inputs(resolutions: list[str], default_ratio: str = "16 ), IO.Boolean.Input( "auto_downscale", - default=False, + default=True, optional=True, tooltip="Automatically downscale reference videos that exceed the model's pixel budget " "for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.", @@ -1909,7 +1944,7 @@ class ByteDance2ReferenceNode(IO.ComfyNode): return IO.Schema( node_id="ByteDance2ReferenceNode", display_name="ByteDance Seedance 2.0 Reference to Video", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description="Generate, edit, or extend video using Seedance 2.0 with reference images, " "videos, and audio. Supports multimodal reference, video editing, and video extension.", inputs=[ @@ -2034,6 +2069,9 @@ class ByteDance2ReferenceNode(IO.ComfyNode): f"(audios={len(reference_audios)}, audio assets={len(reference_audio_assets)}). Maximum is 3." 
) + for key in reference_images: + reference_images[key] = _prepare_seedance_image(reference_images[key]) + model_id = SEEDANCE_MODELS[model["model"]] has_video_input = total_videos > 0 @@ -2106,7 +2144,7 @@ class ByteDance2ReferenceNode(IO.ComfyNode): content.append( TaskVideoContent( video_url=TaskVideoContentUrl( - url=await upload_video_to_comfyapi( + url=await _seedance_virtual_library_upload_video_asset( cls, reference_videos[key], wait_label=f"Uploading video {i}", @@ -2203,7 +2241,7 @@ class ByteDanceCreateImageAsset(IO.ComfyNode): return IO.Schema( node_id="ByteDanceCreateImageAsset", display_name="ByteDance Create Image Asset", - category="api node/image/ByteDance", + category="image/partner/ByteDance", description=( "Create a Seedance 2.0 personal image asset. Uploads the input image and " "registers it in the given asset group. If group_id is empty, runs a real-person " @@ -2270,7 +2308,7 @@ class ByteDanceCreateVideoAsset(IO.ComfyNode): return IO.Schema( node_id="ByteDanceCreateVideoAsset", display_name="ByteDance Create Video Asset", - category="api node/video/ByteDance", + category="video/partner/ByteDance", description=( "Create a Seedance 2.0 personal video asset. Uploads the input video and " "registers it in the given asset group. If group_id is empty, runs a real-person " diff --git a/comfy_api_nodes/nodes_bytedance_llm.py b/comfy_api_nodes/nodes_bytedance_llm.py index fa7fe370a..007cac45f 100644 --- a/comfy_api_nodes/nodes_bytedance_llm.py +++ b/comfy_api_nodes/nodes_bytedance_llm.py @@ -144,7 +144,7 @@ class ByteDanceSeedNode(IO.ComfyNode): return IO.Schema( node_id="ByteDanceSeedNode", display_name="ByteDance Seed", - category="api node/text/ByteDance", + category="text/partner/ByteDance", essentials_category="Text Generation", description="Generate text responses with ByteDance's Seed 2.0 models. 
" "Provide a text prompt and optionally one or more images or videos for multimodal context.", diff --git a/comfy_api_nodes/nodes_elevenlabs.py b/comfy_api_nodes/nodes_elevenlabs.py index e452daf77..37eeb2601 100644 --- a/comfy_api_nodes/nodes_elevenlabs.py +++ b/comfy_api_nodes/nodes_elevenlabs.py @@ -69,7 +69,7 @@ class ElevenLabsSpeechToText(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsSpeechToText", display_name="ElevenLabs Speech to Text", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Transcribe audio to text. " "Supports automatic language detection, speaker diarization, and audio event tagging.", inputs=[ @@ -210,7 +210,7 @@ class ElevenLabsVoiceSelector(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsVoiceSelector", display_name="ElevenLabs Voice Selector", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Select a predefined ElevenLabs voice for text-to-speech generation.", inputs=[ IO.Combo.Input( @@ -239,7 +239,7 @@ class ElevenLabsTextToSpeech(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsTextToSpeech", display_name="ElevenLabs Text to Speech", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Convert text to speech.", inputs=[ IO.Custom(ELEVENLABS_VOICE).Input( @@ -414,7 +414,7 @@ class ElevenLabsAudioIsolation(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsAudioIsolation", display_name="ElevenLabs Voice Isolation", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Remove background noise from audio, isolating vocals or speech.", inputs=[ IO.Audio.Input( @@ -459,7 +459,7 @@ class ElevenLabsTextToSoundEffects(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsTextToSoundEffects", display_name="ElevenLabs Text to Sound Effects", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Generate sound effects from text 
descriptions.", inputs=[ IO.String.Input( @@ -555,7 +555,7 @@ class ElevenLabsInstantVoiceClone(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsInstantVoiceClone", display_name="ElevenLabs Instant Voice Clone", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Create a cloned voice from audio samples. " "Provide 1-8 audio recordings of the voice to clone.", inputs=[ @@ -658,7 +658,7 @@ class ElevenLabsSpeechToSpeech(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsSpeechToSpeech", display_name="ElevenLabs Speech to Speech", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Transform speech from one voice to another while preserving the original content and emotion.", inputs=[ IO.Custom(ELEVENLABS_VOICE).Input( @@ -793,7 +793,7 @@ class ElevenLabsTextToDialogue(IO.ComfyNode): return IO.Schema( node_id="ElevenLabsTextToDialogue", display_name="ElevenLabs Text to Dialogue", - category="api node/audio/ElevenLabs", + category="audio/partner/ElevenLabs", description="Generate multi-speaker dialogue from text. Each dialogue entry has its own text and voice.", inputs=[ IO.Float.Input( diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py index d18c958a8..3cfd541b2 100644 --- a/comfy_api_nodes/nodes_gemini.py +++ b/comfy_api_nodes/nodes_gemini.py @@ -300,7 +300,7 @@ class GeminiNode(IO.ComfyNode): return IO.Schema( node_id="GeminiNode", display_name="Google Gemini", - category="api node/text/Gemini", + category="text/partner/Gemini", description="Generate text responses with Google's Gemini AI model. 
" "You can provide multiple types of inputs (text, images, audio, video) " "as context for generating more relevant and meaningful responses.", @@ -541,7 +541,7 @@ class GeminiInputFiles(IO.ComfyNode): return IO.Schema( node_id="GeminiInputFiles", display_name="Gemini Input Files", - category="api node/text/Gemini", + category="text/partner/Gemini", description="Loads and prepares input files to include as inputs for Gemini LLM nodes. " "The files will be read by the Gemini model when generating a response. " "The contents of the text file count toward the token limit. " @@ -598,7 +598,7 @@ class GeminiImage(IO.ComfyNode): return IO.Schema( node_id="GeminiImageNode", display_name="Nano Banana (Google Gemini Image)", - category="api node/image/Gemini", + category="image/partner/Gemini", description="Edit images synchronously via Google API.", inputs=[ IO.String.Input( @@ -731,7 +731,7 @@ class GeminiImage2(IO.ComfyNode): return IO.Schema( node_id="GeminiImage2Node", display_name="Nano Banana Pro (Google Gemini Image)", - category="api node/image/Gemini", + category="image/partner/Gemini", description="Generate or edit images synchronously via Google Vertex API.", inputs=[ IO.String.Input( @@ -869,7 +869,7 @@ class GeminiNanoBanana2(IO.ComfyNode): return IO.Schema( node_id="GeminiNanoBanana2", display_name="Nano Banana 2", - category="api node/image/Gemini", + category="image/partner/Gemini", description="Generate or edit images synchronously via Google Vertex API.", inputs=[ IO.String.Input( @@ -1085,7 +1085,7 @@ class GeminiNanoBanana2V2(IO.ComfyNode): return IO.Schema( node_id="GeminiNanoBanana2V2", display_name="Nano Banana 2", - category="api node/image/Gemini", + category="image/partner/Gemini", description="Generate or edit images synchronously via Google Vertex API.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_grok.py b/comfy_api_nodes/nodes_grok.py index a103f24ee..a41da42f3 100644 --- a/comfy_api_nodes/nodes_grok.py +++ 
b/comfy_api_nodes/nodes_grok.py @@ -49,7 +49,7 @@ class GrokImageNode(IO.ComfyNode): return IO.Schema( node_id="GrokImageNode", display_name="Grok Image", - category="api node/image/Grok", + category="image/partner/Grok", description="Generate images using Grok based on a text prompt", inputs=[ IO.Combo.Input( @@ -58,7 +58,6 @@ class GrokImageNode(IO.ComfyNode): "grok-imagine-image-quality", "grok-imagine-image-pro", "grok-imagine-image", - "grok-imagine-image-beta", ], ), IO.String.Input( @@ -224,7 +223,7 @@ class GrokImageEditNode(IO.ComfyNode): return IO.Schema( node_id="GrokImageEditNode", display_name="Grok Image Edit", - category="api node/image/Grok", + category="image/partner/Grok", description="Modify an existing image based on a text prompt", inputs=[ IO.Combo.Input( @@ -233,7 +232,6 @@ class GrokImageEditNode(IO.ComfyNode): "grok-imagine-image-quality", "grok-imagine-image-pro", "grok-imagine-image", - "grok-imagine-image-beta", ], ), IO.Image.Input("image", display_name="images"), @@ -366,7 +364,7 @@ class GrokImageEditNodeV2(IO.ComfyNode): return IO.Schema( node_id="GrokImageEditNodeV2", display_name="Grok Image Edit", - category="api node/image/Grok", + category="image/partner/Grok", description="Modify an existing image based on a text prompt", inputs=[ IO.String.Input( @@ -503,10 +501,10 @@ class GrokVideoNode(IO.ComfyNode): return IO.Schema( node_id="GrokVideoNode", display_name="Grok Video", - category="api node/video/Grok", + category="video/partner/Grok", description="Generate video from a prompt or an image", inputs=[ - IO.Combo.Input("model", options=["grok-imagine-video", "grok-imagine-video-beta"]), + IO.Combo.Input("model", options=["grok-imagine-video"]), IO.String.Input( "prompt", multiline=True, @@ -576,8 +574,6 @@ class GrokVideoNode(IO.ComfyNode): seed: int, image: Input.Image | None = None, ) -> IO.NodeOutput: - if model == "grok-imagine-video-beta": - model = "grok-imagine-video" image_url = None if image is not None: if 
get_number_of_images(image) != 1: @@ -615,10 +611,10 @@ class GrokVideoEditNode(IO.ComfyNode): return IO.Schema( node_id="GrokVideoEditNode", display_name="Grok Video Edit", - category="api node/video/Grok", + category="video/partner/Grok", description="Edit an existing video based on a text prompt.", inputs=[ - IO.Combo.Input("model", options=["grok-imagine-video", "grok-imagine-video-beta"]), + IO.Combo.Input("model", options=["grok-imagine-video"]), IO.String.Input( "prompt", multiline=True, @@ -693,7 +689,7 @@ class GrokVideoReferenceNode(IO.ComfyNode): return IO.Schema( node_id="GrokVideoReferenceNode", display_name="Grok Reference-to-Video", - category="api node/video/Grok", + category="video/partner/Grok", description="Generate video guided by reference images as style and content references.", inputs=[ IO.String.Input( @@ -826,7 +822,7 @@ class GrokVideoExtendNode(IO.ComfyNode): return IO.Schema( node_id="GrokVideoExtendNode", display_name="Grok Video Extend", - category="api node/video/Grok", + category="video/partner/Grok", description="Extend an existing video with a seamless continuation based on a text prompt.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_hitpaw.py b/comfy_api_nodes/nodes_hitpaw.py index bca5170e4..22e679c29 100644 --- a/comfy_api_nodes/nodes_hitpaw.py +++ b/comfy_api_nodes/nodes_hitpaw.py @@ -71,7 +71,7 @@ class HitPawGeneralImageEnhance(IO.ComfyNode): return IO.Schema( node_id="HitPawGeneralImageEnhance", display_name="HitPaw General Image Enhance", - category="api node/image/HitPaw", + category="image/partner/HitPaw", description="Upscale low-resolution images to super-resolution, eliminate artifacts and noise. 
" f"Maximum output: {MAX_MP_GENERATIVE} megapixels.", inputs=[ @@ -201,7 +201,7 @@ class HitPawVideoEnhance(IO.ComfyNode): return IO.Schema( node_id="HitPawVideoEnhance", display_name="HitPaw Video Enhance", - category="api node/video/HitPaw", + category="video/partner/HitPaw", description="Upscale low-resolution videos to high resolution, eliminate artifacts and noise. " "Prices shown are per second of video.", inputs=[ diff --git a/comfy_api_nodes/nodes_hunyuan3d.py b/comfy_api_nodes/nodes_hunyuan3d.py index 5fc31bccd..826a3bd2d 100644 --- a/comfy_api_nodes/nodes_hunyuan3d.py +++ b/comfy_api_nodes/nodes_hunyuan3d.py @@ -123,7 +123,7 @@ class TencentTextToModelNode(IO.ComfyNode): return IO.Schema( node_id="TencentTextToModelNode", display_name="Hunyuan3D: Text to Model", - category="api node/3d/Tencent", + category="3d/partner/Tencent", essentials_category="3D", inputs=[ IO.Combo.Input( @@ -242,7 +242,7 @@ class TencentImageToModelNode(IO.ComfyNode): return IO.Schema( node_id="TencentImageToModelNode", display_name="Hunyuan3D: Image(s) to Model", - category="api node/3d/Tencent", + category="3d/partner/Tencent", essentials_category="3D", inputs=[ IO.Combo.Input( @@ -415,7 +415,7 @@ class TencentModelTo3DUVNode(IO.ComfyNode): return IO.Schema( node_id="TencentModelTo3DUVNode", display_name="Hunyuan3D: Model to UV", - category="api node/3d/Tencent", + category="3d/partner/Tencent", description="Perform UV unfolding on a 3D model to generate UV texture. 
" "Input model must have less than 30000 faces.", inputs=[ @@ -505,7 +505,7 @@ class Tencent3DTextureEditNode(IO.ComfyNode): return IO.Schema( node_id="Tencent3DTextureEditNode", display_name="Hunyuan3D: 3D Texture Edit", - category="api node/3d/Tencent", + category="3d/partner/Tencent", description="After inputting the 3D model, perform 3D model texture redrawing.", inputs=[ IO.MultiType.Input( @@ -594,7 +594,7 @@ class Tencent3DPartNode(IO.ComfyNode): return IO.Schema( node_id="Tencent3DPartNode", display_name="Hunyuan3D: 3D Part", - category="api node/3d/Tencent", + category="3d/partner/Tencent", description="Automatically perform component identification and generation based on the model structure.", inputs=[ IO.MultiType.Input( @@ -666,7 +666,7 @@ class TencentSmartTopologyNode(IO.ComfyNode): return IO.Schema( node_id="TencentSmartTopologyNode", display_name="Hunyuan3D: Smart Topology", - category="api node/3d/Tencent", + category="3d/partner/Tencent", description="Perform smart retopology on a 3D model. 
" "Supports GLB/OBJ formats; max 200MB; recommended for high-poly models.", inputs=[ diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py index 97c3609bd..edd9b9435 100644 --- a/comfy_api_nodes/nodes_ideogram.py +++ b/comfy_api_nodes/nodes_ideogram.py @@ -234,7 +234,7 @@ class IdeogramV1(IO.ComfyNode): return IO.Schema( node_id="IdeogramV1", display_name="Ideogram V1", - category="api node/image/Ideogram", + category="image/partner/Ideogram", description="Generates images using the Ideogram V1 model.", inputs=[ IO.String.Input( @@ -360,7 +360,7 @@ class IdeogramV2(IO.ComfyNode): return IO.Schema( node_id="IdeogramV2", display_name="Ideogram V2", - category="api node/image/Ideogram", + category="image/partner/Ideogram", description="Generates images using the Ideogram V2 model.", inputs=[ IO.String.Input( @@ -526,7 +526,7 @@ class IdeogramV3(IO.ComfyNode): return IO.Schema( node_id="IdeogramV3", display_name="Ideogram V3", - category="api node/image/Ideogram", + category="image/partner/Ideogram", description="Generates images using the Ideogram V3 model. 
" "Supports both regular image generation from text prompts and image editing with mask.", inputs=[ diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py index 7586f1816..9925ec548 100644 --- a/comfy_api_nodes/nodes_kling.py +++ b/comfy_api_nodes/nodes_kling.py @@ -642,7 +642,7 @@ class KlingCameraControls(IO.ComfyNode): return IO.Schema( node_id="KlingCameraControls", display_name="Kling Camera Controls", - category="api node/video/Kling", + category="video/partner/Kling", description="Allows specifying configuration options for Kling Camera Controls and motion control effects.", inputs=[ IO.Combo.Input("camera_control_type", options=KlingCameraControlType), @@ -762,7 +762,7 @@ class KlingTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingTextToVideoNode", display_name="Kling Text to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Kling Text to Video Node", inputs=[ IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -849,7 +849,7 @@ class OmniProTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProTextToVideoNode", display_name="Kling 3.0 Omni Text to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Use text prompts to generate videos with the latest Kling model.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]), @@ -998,7 +998,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProFirstLastFrameNode", display_name="Kling 3.0 Omni First-Last-Frame to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Use a start frame, an optional end frame, or reference images with the latest Kling model.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]), @@ -1205,7 +1205,7 @@ class OmniProImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProImageToVideoNode", 
display_name="Kling 3.0 Omni Image to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Use up to 7 reference images to generate a video with the latest Kling model.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]), @@ -1374,7 +1374,7 @@ class OmniProVideoToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProVideoToVideoNode", display_name="Kling 3.0 Omni Video to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Use a video and up to 4 reference images to generate a video with the latest Kling model.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]), @@ -1485,7 +1485,7 @@ class OmniProEditVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProEditVideoNode", display_name="Kling 3.0 Omni Edit Video", - category="api node/video/Kling", + category="video/partner/Kling", essentials_category="Video Generation", description="Edit an existing video with the latest model from Kling.", inputs=[ @@ -1593,7 +1593,7 @@ class OmniProImageNode(IO.ComfyNode): return IO.Schema( node_id="KlingOmniProImageNode", display_name="Kling 3.0 Omni Image", - category="api node/image/Kling", + category="image/partner/Kling", description="Create or edit images with the latest model from Kling.", inputs=[ IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-image-o1"]), @@ -1721,7 +1721,7 @@ class KlingCameraControlT2VNode(IO.ComfyNode): return IO.Schema( node_id="KlingCameraControlT2VNode", display_name="Kling Text to Video (Camera Control)", - category="api node/video/Kling", + category="video/partner/Kling", description="Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. 
Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text.", inputs=[ IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -1783,7 +1783,7 @@ class KlingImage2VideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingImage2VideoNode", display_name="Kling Image(First Frame) to Video", - category="api node/video/Kling", + category="video/partner/Kling", inputs=[ IO.Image.Input("start_frame", tooltip="The reference image used to generate the video."), IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -1882,7 +1882,7 @@ class KlingCameraControlI2VNode(IO.ComfyNode): return IO.Schema( node_id="KlingCameraControlI2VNode", display_name="Kling Image to Video (Camera Control)", - category="api node/video/Kling", + category="video/partner/Kling", description="Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image.", inputs=[ IO.Image.Input( @@ -1953,7 +1953,7 @@ class KlingStartEndFrameNode(IO.ComfyNode): return IO.Schema( node_id="KlingStartEndFrameNode", display_name="Kling Start-End Frame to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last.", inputs=[ IO.Image.Input( @@ -2047,7 +2047,7 @@ class KlingVideoExtendNode(IO.ComfyNode): return IO.Schema( node_id="KlingVideoExtendNode", display_name="Kling Video Extend", - category="api node/video/Kling", + category="video/partner/Kling", description="Kling Video Extend Node. Extend videos made by other Kling nodes. 
The video_id is created by using other Kling Nodes.", inputs=[ IO.String.Input( @@ -2128,7 +2128,7 @@ class KlingDualCharacterVideoEffectNode(IO.ComfyNode): return IO.Schema( node_id="KlingDualCharacterVideoEffectNode", display_name="Kling Dual Character Video Effects", - category="api node/video/Kling", + category="video/partner/Kling", description="Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite.", inputs=[ IO.Image.Input("image_left", tooltip="Left side image"), @@ -2218,7 +2218,7 @@ class KlingSingleImageVideoEffectNode(IO.ComfyNode): return IO.Schema( node_id="KlingSingleImageVideoEffectNode", display_name="Kling Video Effects", - category="api node/video/Kling", + category="video/partner/Kling", description="Achieve different special effects when generating a video based on the effect_scene.", inputs=[ IO.Image.Input( @@ -2291,7 +2291,7 @@ class KlingLipSyncAudioToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingLipSyncAudioToVideoNode", display_name="Kling Lip Sync Video with Audio", - category="api node/video/Kling", + category="video/partner/Kling", essentials_category="Video Generation", description="Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", inputs=[ @@ -2343,7 +2343,7 @@ class KlingLipSyncTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingLipSyncTextToVideoNode", display_name="Kling Lip Sync Video with Text", - category="api node/video/Kling", + category="video/partner/Kling", description="Kling Lip Sync Text to Video Node. 
Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.", inputs=[ IO.Video.Input("video"), @@ -2411,7 +2411,7 @@ class KlingVirtualTryOnNode(IO.ComfyNode): return IO.Schema( node_id="KlingVirtualTryOnNode", display_name="Kling Virtual Try On", - category="api node/image/Kling", + category="image/partner/Kling", description="Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background.", inputs=[ IO.Image.Input("human_image"), @@ -2478,7 +2478,7 @@ class KlingImageGenerationNode(IO.ComfyNode): return IO.Schema( node_id="KlingImageGenerationNode", display_name="Kling 3.0 Image", - category="api node/image/Kling", + category="image/partner/Kling", description="Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.", inputs=[ IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"), @@ -2615,7 +2615,7 @@ class TextToVideoWithAudio(IO.ComfyNode): return IO.Schema( node_id="KlingTextToVideoWithAudio", display_name="Kling 2.6 Text to Video with Audio", - category="api node/video/Kling", + category="video/partner/Kling", inputs=[ IO.Combo.Input("model_name", options=["kling-v2-6"]), IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."), @@ -2683,7 +2683,7 @@ class ImageToVideoWithAudio(IO.ComfyNode): return IO.Schema( node_id="KlingImageToVideoWithAudio", display_name="Kling 2.6 Image(First Frame) to Video with Audio", - category="api node/video/Kling", + category="video/partner/Kling", inputs=[ IO.Combo.Input("model_name", options=["kling-v2-6"]), IO.Image.Input("start_frame"), @@ -2753,7 +2753,7 @@ class MotionControl(IO.ComfyNode): return IO.Schema( node_id="KlingMotionControl", display_name="Kling Motion Control", - 
category="api node/video/Kling", + category="video/partner/Kling", inputs=[ IO.String.Input("prompt", multiline=True), IO.Image.Input("reference_image"), @@ -2854,7 +2854,7 @@ class KlingVideoNode(IO.ComfyNode): return IO.Schema( node_id="KlingVideoNode", display_name="Kling 3.0 Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Generate videos with Kling V3. " "Supports text-to-video and image-to-video with optional storyboard multi-prompt and audio generation.", inputs=[ @@ -3077,7 +3077,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="KlingFirstLastFrameNode", display_name="Kling 3.0 First-Last-Frame to Video", - category="api node/video/Kling", + category="video/partner/Kling", description="Generate videos with Kling V3 using first and last frames.", inputs=[ IO.String.Input("prompt", multiline=True, default=""), @@ -3202,7 +3202,7 @@ class KlingAvatarNode(IO.ComfyNode): return IO.Schema( node_id="KlingAvatarNode", display_name="Kling Avatar 2.0", - category="api node/video/Kling", + category="video/partner/Kling", description="Generate broadcast-style digital human videos from a single photo and an audio file.", inputs=[ IO.Image.Input( diff --git a/comfy_api_nodes/nodes_krea.py b/comfy_api_nodes/nodes_krea.py new file mode 100644 index 000000000..be04a272b --- /dev/null +++ b/comfy_api_nodes/nodes_krea.py @@ -0,0 +1,290 @@ +"""Krea image-generation nodes.""" + +import re + +from typing_extensions import override + +from comfy_api.latest import IO, ComfyExtension, Input +from comfy_api_nodes.apis.krea import ( + KreaAssetResponse, + KreaGenerateImageRequest, + KreaImageStyleReference, + KreaJob, + KreaMoodboard, +) +from comfy_api_nodes.util import ( + ApiEndpoint, + download_url_to_image_tensor, + poll_op, + sync_op, + tensor_to_bytesio, + validate_string, +) + + +class KreaIO: + STYLE_REF = "KREA_STYLE_REF" + + +async def _upload_image_to_krea_assets(cls: type[IO.ComfyNode], image: 
Input.Image) -> str: + """Upload an image to Krea's /assets endpoint and return the Krea-hosted image URL.""" + img_io = tensor_to_bytesio(image, total_pixels=2048 * 2048, mime_type="image/png") + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/krea/assets", method="POST"), + response_model=KreaAssetResponse, + files=[("file", (img_io.name, img_io, "image/png"))], + content_type="multipart/form-data", + max_retries=1, + wait_label="Uploading reference", + ) + return response.image_url + + +_MODEL_MEDIUM = "Krea 2 Medium" +_MODEL_LARGE = "Krea 2 Large" +_MODEL_ENDPOINTS: dict[str, str] = { + _MODEL_MEDIUM: "/proxy/krea/generate/image/krea/krea-2/medium", + _MODEL_LARGE: "/proxy/krea/generate/image/krea/krea-2/large", +} + +_ASPECT_RATIOS = ["1:1", "4:3", "3:2", "16:9", "2.35:1", "4:5", "2:3", "9:16"] +_RESOLUTIONS = ["1K"] +_CREATIVITY_LEVELS = ["raw", "low", "medium", "high"] +_KREA_QUEUED_STATUSES = ["backlogged", "queued", "scheduled"] + +_UUID_RE = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$") + + +def _krea_model_inputs() -> list: + """Nested inputs shared by both Krea 2 Medium and Large under the DynamicCombo.""" + return [ + IO.Combo.Input( + "aspect_ratio", + options=_ASPECT_RATIOS, + tooltip="Output aspect ratio.", + ), + IO.Combo.Input( + "resolution", + options=_RESOLUTIONS, + tooltip="Resolution scale.", + ), + IO.Combo.Input( + "creativity", + options=_CREATIVITY_LEVELS, + default="medium", + tooltip="Prompt interpretation strength: raw stays closest to the prompt; high is most creative.", + ), + IO.String.Input( + "moodboard_id", + default="", + tooltip="Optional Krea moodboard UUID (e.g. from the Krea website). " + "Leave empty to disable. 
Only one moodboard is supported per request.", + optional=True, + ), + IO.Float.Input( + "moodboard_strength", + default=0.35, + min=-0.5, + max=1.5, + step=0.05, + tooltip="Moodboard influence; ignored when moodboard_id is empty.", + optional=True, + ), + IO.Custom(KreaIO.STYLE_REF).Input( + "style_reference", + optional=True, + tooltip="Optional chain of style references (max 10) from Krea 2 Style Reference nodes.", + ), + ] + + +class Krea2ImageNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Krea2ImageNode", + display_name="Krea 2 Image", + category="image/partner/Krea", + description=( + "Generate images via Krea 2 — pick Medium (expressive illustrations) or " + "Large (expressive photorealism). Supports an optional moodboard and up " + "to 10 chained image style references." + ), + inputs=[ + IO.String.Input( + "prompt", + multiline=True, + default="", + tooltip="Text prompt for the image.", + ), + IO.DynamicCombo.Input( + "model", + options=[ + IO.DynamicCombo.Option(_MODEL_MEDIUM, _krea_model_inputs()), + IO.DynamicCombo.Option(_MODEL_LARGE, _krea_model_inputs()), + ], + tooltip="Krea 2 Medium is best for expressive illustrations; " + "Krea 2 Large is best for expressive photorealism.", + ), + IO.Int.Input( + "seed", + default=0, + min=0, + max=2147483647, + control_after_generate=True, + tooltip="Random seed for reproducibility.", + ), + ], + outputs=[IO.Image.Output()], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends( + widgets=["model", "model.moodboard_id"], + inputs=["model.style_reference"], + ), + expr=""" + ( + $isLarge := widgets.model = "krea 2 large"; + $hasMoodboard := $length($lookup(widgets, "model.moodboard_id")) > 0; + $hasStyle := $lookup(inputs, "model.style_reference").connected; + $usd := $hasMoodboard + ? ($isLarge ? 
0.07 : 0.04) + : ($hasStyle + ? ($isLarge ? 0.065 : 0.035) + : ($isLarge ? 0.06 : 0.03)); + {"type":"usd","usd": $usd} + ) + """, + ), + ) + + @classmethod + async def execute( + cls, + prompt: str, + model: dict, + seed: int, + ) -> IO.NodeOutput: + validate_string(prompt, strip_whitespace=False, min_length=1) + + model_choice = model["model"] + endpoint_path = _MODEL_ENDPOINTS.get(model_choice) + if endpoint_path is None: + raise ValueError(f"Unknown Krea 2 model: {model_choice!r}") + + moodboards: list[KreaMoodboard] | None = None + mb_id = (model.get("moodboard_id") or "").strip() + if mb_id: + if not _UUID_RE.match(mb_id): + raise ValueError(f"moodboard_id must be a UUID (received {mb_id!r}); copy it from the Krea website.") + mb_strength = model.get("moodboard_strength") + moodboards = [KreaMoodboard(id=mb_id, strength=0.35 if mb_strength is None else float(mb_strength))] + + style_reference = model.get("style_reference") + image_style_references: list[KreaImageStyleReference] | None = None + if style_reference: + if len(style_reference) > 10: + raise ValueError(f"Krea 2 accepts at most 10 image_style_references; received {len(style_reference)}.") + image_style_references = [ + KreaImageStyleReference(url=ref["url"], strength=float(ref["strength"])) for ref in style_reference + ] + initial = await sync_op( + cls, + ApiEndpoint(path=endpoint_path, method="POST"), + response_model=KreaJob, + data=KreaGenerateImageRequest( + prompt=prompt, + aspect_ratio=model["aspect_ratio"], + resolution=model["resolution"], + seed=seed, + creativity=model["creativity"], + moodboards=moodboards, + image_style_references=image_style_references, + ), + ) + job = await poll_op( + cls, + ApiEndpoint(path=f"/proxy/krea/jobs/{initial.job_id}", method="GET"), + response_model=KreaJob, + status_extractor=lambda r: r.status, + queued_statuses=_KREA_QUEUED_STATUSES, + ) + if not job.result or not job.result.urls: + raise RuntimeError(f"Krea 2 job {job.job_id} completed without any image 
URLs.") + image = await download_url_to_image_tensor(job.result.urls[0]) + return IO.NodeOutput(image) + + +class Krea2StyleReferenceNode(IO.ComfyNode): + + @classmethod + def define_schema(cls) -> IO.Schema: + return IO.Schema( + node_id="Krea2StyleReferenceNode", + display_name="Krea 2 Style Reference", + category="image/partner/Krea", + description=( + "Add an image style reference to a Krea 2 generation. Chain multiple Krea 2 " + "Style Reference nodes (max 10) and feed the final `style_reference` output " + "into Krea 2 Image. Each image is uploaded to ComfyAPI storage and passed as URL." + ), + inputs=[ + IO.Image.Input( + "image", + tooltip="Reference image whose style influences the generation.", + ), + IO.Float.Input( + "strength", + default=1.0, + min=-2.0, + max=2.0, + step=0.05, + tooltip="Reference strength; negative values invert the style influence.", + ), + IO.Custom(KreaIO.STYLE_REF).Input( + "style_reference", + optional=True, + tooltip="Optional incoming chain of style references; this node appends one more.", + ), + ], + outputs=[IO.Custom(KreaIO.STYLE_REF).Output(display_name="style_reference")], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + strength: float, + style_reference: list[dict] | None = None, + ) -> IO.NodeOutput: + chain: list[dict] = list(style_reference) if style_reference else [] + if len(chain) >= 10: + raise ValueError("Krea 2 accepts at most 10 image_style_references in one generation.") + url = await _upload_image_to_krea_assets(cls, image) + chain.append({"url": url, "strength": float(strength)}) + return IO.NodeOutput(chain) + + +class KreaExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [ + Krea2ImageNode, + Krea2StyleReferenceNode, + ] + + +async def comfy_entrypoint() -> KreaExtension: + return KreaExtension() diff --git 
a/comfy_api_nodes/nodes_ltxv.py b/comfy_api_nodes/nodes_ltxv.py index 0a219af96..01791d354 100644 --- a/comfy_api_nodes/nodes_ltxv.py +++ b/comfy_api_nodes/nodes_ltxv.py @@ -50,7 +50,7 @@ class TextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="LtxvApiTextToVideo", display_name="LTXV Text To Video", - category="api node/video/LTXV", + category="video/partner/LTXV", description="Professional-quality videos with customizable duration and resolution.", inputs=[ IO.Combo.Input("model", options=list(MODELS_MAP.keys())), @@ -127,7 +127,7 @@ class ImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="LtxvApiImageToVideo", display_name="LTXV Image To Video", - category="api node/video/LTXV", + category="video/partner/LTXV", description="Professional-quality videos with customizable duration and resolution based on start image.", inputs=[ IO.Image.Input("image", tooltip="First frame to be used for the video."), diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py index d92a7c382..08ae9904c 100644 --- a/comfy_api_nodes/nodes_luma.py +++ b/comfy_api_nodes/nodes_luma.py @@ -46,7 +46,7 @@ class LumaReferenceNode(IO.ComfyNode): return IO.Schema( node_id="LumaReferenceNode", display_name="Luma Reference", - category="api node/image/Luma", + category="image/partner/Luma", description="Holds an image and weight for use with Luma Generate Image node.", inputs=[ IO.Image.Input( @@ -85,7 +85,7 @@ class LumaConceptsNode(IO.ComfyNode): return IO.Schema( node_id="LumaConceptsNode", display_name="Luma Concepts", - category="api node/video/Luma", + category="video/partner/Luma", description="Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes.", inputs=[ IO.Combo.Input( @@ -134,7 +134,7 @@ class LumaImageGenerationNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageNode", display_name="Luma Text to Image", - category="api node/image/Luma", + category="image/partner/Luma", description="Generates images synchronously based on 
prompt and aspect ratio.", inputs=[ IO.String.Input( @@ -278,7 +278,7 @@ class LumaImageModifyNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageModifyNode", display_name="Luma Image to Image", - category="api node/image/Luma", + category="image/partner/Luma", description="Modifies images synchronously based on prompt and aspect ratio.", inputs=[ IO.Image.Input( @@ -371,7 +371,7 @@ class LumaTextToVideoGenerationNode(IO.ComfyNode): return IO.Schema( node_id="LumaVideoNode", display_name="Luma Text to Video", - category="api node/video/Luma", + category="video/partner/Luma", description="Generates videos synchronously based on prompt and output_size.", inputs=[ IO.String.Input( @@ -472,7 +472,7 @@ class LumaImageToVideoGenerationNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageToVideoNode", display_name="Luma Image to Video", - category="api node/video/Luma", + category="video/partner/Luma", description="Generates videos synchronously based on prompt, input images, and output_size.", inputs=[ IO.String.Input( @@ -724,7 +724,7 @@ class LumaImageNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageNode2", display_name="Luma UNI-1 Image", - category="api node/image/Luma", + category="image/partner/Luma", description="Generate images from text using the Luma UNI-1 model.", inputs=[ IO.String.Input( @@ -853,7 +853,7 @@ class LumaImageEditNode(IO.ComfyNode): return IO.Schema( node_id="LumaImageEditNode2", display_name="Luma UNI-1 Image Edit", - category="api node/image/Luma", + category="image/partner/Luma", description="Edit an existing image with a text prompt using the Luma UNI-1 model.", inputs=[ IO.Image.Input( diff --git a/comfy_api_nodes/nodes_magnific.py b/comfy_api_nodes/nodes_magnific.py index 38b881fea..a6aeb194a 100644 --- a/comfy_api_nodes/nodes_magnific.py +++ b/comfy_api_nodes/nodes_magnific.py @@ -61,7 +61,7 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode): return IO.Schema( node_id="MagnificImageUpscalerCreativeNode", 
display_name="Magnific Image Upscale (Creative)", - category="api node/image/Magnific", + category="image/partner/Magnific", description="Prompt‑guided enhancement, stylization, and 2x/4x/8x/16x upscaling. " "Maximum output: 25.3 megapixels.", inputs=[ @@ -240,7 +240,7 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode): return IO.Schema( node_id="MagnificImageUpscalerPreciseV2Node", display_name="Magnific Image Upscale (Precise V2)", - category="api node/image/Magnific", + category="image/partner/Magnific", description="High-fidelity upscaling with fine control over sharpness, grain, and detail. " "Maximum output: 10060×10060 pixels.", inputs=[ @@ -400,7 +400,7 @@ class MagnificImageStyleTransferNode(IO.ComfyNode): return IO.Schema( node_id="MagnificImageStyleTransferNode", display_name="Magnific Image Style Transfer", - category="api node/image/Magnific", + category="image/partner/Magnific", description="Transfer the style from a reference image to your input image.", inputs=[ IO.Image.Input("image", tooltip="The image to apply style transfer to."), @@ -549,7 +549,7 @@ class MagnificImageRelightNode(IO.ComfyNode): return IO.Schema( node_id="MagnificImageRelightNode", display_name="Magnific Image Relight", - category="api node/image/Magnific", + category="image/partner/Magnific", description="Relight an image with lighting adjustments and optional reference-based light transfer.", inputs=[ IO.Image.Input("image", tooltip="The image to relight."), @@ -789,7 +789,7 @@ class MagnificImageSkinEnhancerNode(IO.ComfyNode): return IO.Schema( node_id="MagnificImageSkinEnhancerNode", display_name="Magnific Image Skin Enhancer", - category="api node/image/Magnific", + category="image/partner/Magnific", description="Skin enhancement for portraits with multiple processing modes.", inputs=[ IO.Image.Input("image", tooltip="The portrait image to enhance."), diff --git a/comfy_api_nodes/nodes_meshy.py b/comfy_api_nodes/nodes_meshy.py index 3cf577f4a..4fb670404 100644 --- 
a/comfy_api_nodes/nodes_meshy.py +++ b/comfy_api_nodes/nodes_meshy.py @@ -33,7 +33,7 @@ class MeshyTextToModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyTextToModelNode", display_name="Meshy: Text to Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", inputs=[ IO.Combo.Input("model", options=["latest"]), IO.String.Input("prompt", multiline=True, default=""), @@ -145,7 +145,7 @@ class MeshyRefineNode(IO.ComfyNode): return IO.Schema( node_id="MeshyRefineNode", display_name="Meshy: Refine Draft Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", description="Refine a previously created draft model.", inputs=[ IO.Combo.Input("model", options=["latest"]), @@ -240,7 +240,7 @@ class MeshyImageToModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyImageToModelNode", display_name="Meshy: Image to Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", inputs=[ IO.Combo.Input("model", options=["latest"]), IO.Image.Input("image"), @@ -405,7 +405,7 @@ class MeshyMultiImageToModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyMultiImageToModelNode", display_name="Meshy: Multi-Image to Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", inputs=[ IO.Combo.Input("model", options=["latest"]), IO.Autogrow.Input( @@ -575,7 +575,7 @@ class MeshyRigModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyRigModelNode", display_name="Meshy: Rig Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", description="Provides a rigged character in standard formats. 
" "Auto-rigging is currently not suitable for untextured meshes, non-humanoid assets, " "or humanoid assets with unclear limb and body structure.", @@ -656,7 +656,7 @@ class MeshyAnimateModelNode(IO.ComfyNode): return IO.Schema( node_id="MeshyAnimateModelNode", display_name="Meshy: Animate Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", description="Apply a specific animation action to a previously rigged character.", inputs=[ IO.Custom("MESHY_RIGGED_TASK_ID").Input("rig_task_id"), @@ -722,7 +722,7 @@ class MeshyTextureNode(IO.ComfyNode): return IO.Schema( node_id="MeshyTextureNode", display_name="Meshy: Texture Model", - category="api node/3d/Meshy", + category="3d/partner/Meshy", inputs=[ IO.Combo.Input("model", options=["latest"]), IO.Custom("MESHY_TASK_ID").Input("meshy_task_id"), diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py index b5d0b461f..338584148 100644 --- a/comfy_api_nodes/nodes_minimax.py +++ b/comfy_api_nodes/nodes_minimax.py @@ -101,7 +101,7 @@ class MinimaxTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="MinimaxTextToVideoNode", display_name="MiniMax Text to Video", - category="api node/video/MiniMax", + category="video/partner/MiniMax", description="Generates videos synchronously based on a prompt, and optional parameters.", inputs=[ IO.String.Input( @@ -163,7 +163,7 @@ class MinimaxImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="MinimaxImageToVideoNode", display_name="MiniMax Image to Video", - category="api node/video/MiniMax", + category="video/partner/MiniMax", description="Generates videos synchronously based on an image and prompt, and optional parameters.", inputs=[ IO.Image.Input( @@ -230,7 +230,7 @@ class MinimaxSubjectToVideoNode(IO.ComfyNode): return IO.Schema( node_id="MinimaxSubjectToVideoNode", display_name="MiniMax Subject to Video", - category="api node/video/MiniMax", + category="video/partner/MiniMax", description="Generates videos synchronously based on 
an image and prompt, and optional parameters.", inputs=[ IO.Image.Input( @@ -294,7 +294,7 @@ class MinimaxHailuoVideoNode(IO.ComfyNode): return IO.Schema( node_id="MinimaxHailuoVideoNode", display_name="MiniMax Hailuo Video", - category="api node/video/MiniMax", + category="video/partner/MiniMax", description="Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py index a5a188634..48c739dfe 100644 --- a/comfy_api_nodes/nodes_openai.py +++ b/comfy_api_nodes/nodes_openai.py @@ -99,7 +99,7 @@ class OpenAIDalle2(IO.ComfyNode): return IO.Schema( node_id="OpenAIDalle2", display_name="OpenAI DALL·E 2", - category="api node/image/OpenAI", + category="image/partner/OpenAI", description="Generates images synchronously via OpenAI's DALL·E 2 endpoint.", inputs=[ IO.String.Input( @@ -249,7 +249,7 @@ class OpenAIDalle3(IO.ComfyNode): return IO.Schema( node_id="OpenAIDalle3", display_name="OpenAI DALL·E 3", - category="api node/image/OpenAI", + category="image/partner/OpenAI", description="Generates images synchronously via OpenAI's DALL·E 3 endpoint.", inputs=[ IO.String.Input( @@ -371,7 +371,7 @@ class OpenAIGPTImage1(IO.ComfyNode): return IO.Schema( node_id="OpenAIGPTImage1", display_name="OpenAI GPT Image 2", - category="api node/image/OpenAI", + category="image/partner/OpenAI", description="Generates images synchronously via OpenAI's GPT Image endpoint.", is_deprecated=True, inputs=[ @@ -695,7 +695,7 @@ class OpenAIGPTImageNodeV2(IO.ComfyNode): return IO.Schema( node_id="OpenAIGPTImageNodeV2", display_name="OpenAI GPT Image 2", - category="api node/image/OpenAI", + category="image/partner/OpenAI", description="Generates images via OpenAI's GPT Image endpoint.", inputs=[ IO.String.Input( @@ -962,7 +962,7 @@ class OpenAIChatNode(IO.ComfyNode): return IO.Schema( node_id="OpenAIChatNode", display_name="OpenAI ChatGPT", - category="api 
node/text/OpenAI", + category="text/partner/OpenAI", essentials_category="Text Generation", description="Generate text responses from an OpenAI model.", inputs=[ @@ -1201,7 +1201,7 @@ class OpenAIInputFiles(IO.ComfyNode): return IO.Schema( node_id="OpenAIInputFiles", display_name="OpenAI ChatGPT Input Files", - category="api node/text/OpenAI", + category="text/partner/OpenAI", description="Loads and prepares input files (text, pdf, etc.) to include as inputs for the OpenAI Chat Node. The files will be read by the OpenAI model when generating a response. 🛈 TIP: Can be chained together with other OpenAI Input File nodes.", inputs=[ IO.Combo.Input( @@ -1248,7 +1248,7 @@ class OpenAIChatConfig(IO.ComfyNode): return IO.Schema( node_id="OpenAIChatConfig", display_name="OpenAI ChatGPT Advanced Options", - category="api node/text/OpenAI", + category="text/partner/OpenAI", description="Allows specifying advanced configuration options for the OpenAI Chat Nodes.", inputs=[ IO.Combo.Input( diff --git a/comfy_api_nodes/nodes_openrouter.py b/comfy_api_nodes/nodes_openrouter.py index 031301870..d2ebbef0d 100644 --- a/comfy_api_nodes/nodes_openrouter.py +++ b/comfy_api_nodes/nodes_openrouter.py @@ -265,7 +265,7 @@ class OpenRouterLLMNode(IO.ComfyNode): return IO.Schema( node_id="OpenRouterLLMNode", display_name="OpenRouter LLM", - category="api node/text/OpenRouter", + category="text/partner/OpenRouter", essentials_category="Text Generation", description=( "Generate text responses through OpenRouter. 
Routes to a curated set of popular " diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py index e17a24ae7..3861cfedd 100644 --- a/comfy_api_nodes/nodes_pixverse.py +++ b/comfy_api_nodes/nodes_pixverse.py @@ -53,7 +53,7 @@ class PixverseTemplateNode(IO.ComfyNode): return IO.Schema( node_id="PixverseTemplateNode", display_name="PixVerse Template", - category="api node/video/PixVerse", + category="video/partner/PixVerse", inputs=[ IO.Combo.Input("template", options=list(pixverse_templates.keys())), ], @@ -74,7 +74,7 @@ class PixverseTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="PixverseTextToVideoNode", display_name="PixVerse Text to Video", - category="api node/video/PixVerse", + category="video/partner/PixVerse", description="Generates videos based on prompt and output_size.", inputs=[ IO.String.Input( @@ -192,7 +192,7 @@ class PixverseImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="PixverseImageToVideoNode", display_name="PixVerse Image to Video", - category="api node/video/PixVerse", + category="video/partner/PixVerse", description="Generates videos based on prompt and output_size.", inputs=[ IO.Image.Input("image"), @@ -310,7 +310,7 @@ class PixverseTransitionVideoNode(IO.ComfyNode): return IO.Schema( node_id="PixverseTransitionVideoNode", display_name="PixVerse Transition Video", - category="api node/video/PixVerse", + category="video/partner/PixVerse", description="Generates videos based on prompt and output_size.", inputs=[ IO.Image.Input("first_frame"), diff --git a/comfy_api_nodes/nodes_quiver.py b/comfy_api_nodes/nodes_quiver.py index 3269c0afe..ad045a7ef 100644 --- a/comfy_api_nodes/nodes_quiver.py +++ b/comfy_api_nodes/nodes_quiver.py @@ -62,7 +62,7 @@ class QuiverTextToSVGNode(IO.ComfyNode): return IO.Schema( node_id="QuiverTextToSVGNode", display_name="Quiver Text to SVG", - category="api node/image/Quiver", + category="image/partner/Quiver", description="Generate an SVG from a text prompt using 
Quiver AI.", inputs=[ IO.String.Input( @@ -177,7 +177,7 @@ class QuiverImageToSVGNode(IO.ComfyNode): return IO.Schema( node_id="QuiverImageToSVGNode", display_name="Quiver Image to SVG", - category="api node/image/Quiver", + category="image/partner/Quiver", description="Vectorize a raster image into SVG using Quiver AI.", inputs=[ IO.Image.Input( diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py index c60cfbc4a..07387821d 100644 --- a/comfy_api_nodes/nodes_recraft.py +++ b/comfy_api_nodes/nodes_recraft.py @@ -178,7 +178,7 @@ class RecraftColorRGBNode(IO.ComfyNode): return IO.Schema( node_id="RecraftColorRGB", display_name="Recraft Color RGB", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Create Recraft Color by choosing specific RGB values.", inputs=[ IO.Int.Input("r", default=0, min=0, max=255, tooltip="Red value of color."), @@ -204,7 +204,7 @@ class RecraftControlsNode(IO.ComfyNode): return IO.Schema( node_id="RecraftControls", display_name="Recraft Controls", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Create Recraft Controls for customizing Recraft generation.", inputs=[ IO.Custom(RecraftIO.COLOR).Input("colors", optional=True), @@ -228,7 +228,7 @@ class RecraftStyleV3RealisticImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftStyleV3RealisticImage", display_name="Recraft Style - Realistic Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Select realistic_image style and optional substyle.", inputs=[ IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), @@ -253,7 +253,7 @@ class RecraftStyleV3DigitalIllustrationNode(RecraftStyleV3RealisticImageNode): return IO.Schema( node_id="RecraftStyleV3DigitalIllustration", display_name="Recraft Style - Digital Illustration", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Select realistic_image style 
and optional substyle.", inputs=[ IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), @@ -272,7 +272,7 @@ class RecraftStyleV3VectorIllustrationNode(RecraftStyleV3RealisticImageNode): return IO.Schema( node_id="RecraftStyleV3VectorIllustrationNode", display_name="Recraft Style - Realistic Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Select realistic_image style and optional substyle.", inputs=[ IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)), @@ -291,7 +291,7 @@ class RecraftStyleV3LogoRasterNode(RecraftStyleV3RealisticImageNode): return IO.Schema( node_id="RecraftStyleV3LogoRaster", display_name="Recraft Style - Logo Raster", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Select realistic_image style and optional substyle.", inputs=[ IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE, include_none=False)), @@ -308,7 +308,7 @@ class RecraftStyleInfiniteStyleLibrary(IO.ComfyNode): return IO.Schema( node_id="RecraftStyleV3InfiniteStyleLibrary", display_name="Recraft Style - Infinite Style Library", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Choose style based on preexisting UUID from Recraft's Infinite Style Library.", inputs=[ IO.String.Input("style_id", default="", tooltip="UUID of style from Infinite Style Library."), @@ -331,7 +331,7 @@ class RecraftCreateStyleNode(IO.ComfyNode): return IO.Schema( node_id="RecraftCreateStyleNode", display_name="Recraft Create Style", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Create a custom style from reference images. " "Upload 1-5 images to use as style references. 
" "Total size of all images is limited to 5 MB.", @@ -400,7 +400,7 @@ class RecraftTextToImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftTextToImageNode", display_name="Recraft Text to Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Generates images synchronously based on prompt and resolution.", inputs=[ IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."), @@ -512,7 +512,7 @@ class RecraftImageToImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftImageToImageNode", display_name="Recraft Image to Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Modify image based on prompt and strength.", inputs=[ IO.Image.Input("image"), @@ -630,7 +630,7 @@ class RecraftImageInpaintingNode(IO.ComfyNode): return IO.Schema( node_id="RecraftImageInpaintingNode", display_name="Recraft Image Inpainting", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Modify image based on prompt and mask.", inputs=[ IO.Image.Input("image"), @@ -732,7 +732,7 @@ class RecraftTextToVectorNode(IO.ComfyNode): return IO.Schema( node_id="RecraftTextToVectorNode", display_name="Recraft Text to Vector", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Generates SVG synchronously based on prompt and resolution.", inputs=[ IO.String.Input("prompt", default="", tooltip="Prompt for the image generation.", multiline=True), @@ -832,7 +832,7 @@ class RecraftVectorizeImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftVectorizeImageNode", display_name="Recraft Vectorize Image", - category="api node/image/Recraft", + category="image/partner/Recraft", essentials_category="Image Tools", description="Generates SVG synchronously from an input image.", inputs=[ @@ -876,7 +876,7 @@ class RecraftReplaceBackgroundNode(IO.ComfyNode): return IO.Schema( node_id="RecraftReplaceBackgroundNode", 
display_name="Recraft Replace Background", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Replace background on image, based on provided prompt.", inputs=[ IO.Image.Input("image"), @@ -963,7 +963,7 @@ class RecraftRemoveBackgroundNode(IO.ComfyNode): return IO.Schema( node_id="RecraftRemoveBackgroundNode", display_name="Recraft Remove Background", - category="api node/image/Recraft", + category="image/partner/Recraft", essentials_category="Image Tools", description="Remove background from image, and return processed image and mask.", inputs=[ @@ -1012,7 +1012,7 @@ class RecraftCrispUpscaleNode(IO.ComfyNode): return IO.Schema( node_id="RecraftCrispUpscaleNode", display_name="Recraft Crisp Upscale Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Upscale image synchronously.\n" "Enhances a given raster image using ‘crisp upscale’ tool, " "increasing image resolution, making the image sharper and cleaner.", @@ -1058,7 +1058,7 @@ class RecraftCreativeUpscaleNode(RecraftCrispUpscaleNode): return IO.Schema( node_id="RecraftCreativeUpscaleNode", display_name="Recraft Creative Upscale Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Upscale image synchronously.\n" "Enhances a given raster image using ‘creative upscale’ tool, " "boosting resolution with a focus on refining small details and faces.", @@ -1086,7 +1086,7 @@ class RecraftV4TextToImageNode(IO.ComfyNode): return IO.Schema( node_id="RecraftV4TextToImageNode", display_name="Recraft V4 Text to Image", - category="api node/image/Recraft", + category="image/partner/Recraft", description="Generates images using Recraft V4 or V4 Pro models.", inputs=[ IO.String.Input( @@ -1210,7 +1210,7 @@ class RecraftV4TextToVectorNode(IO.ComfyNode): return IO.Schema( node_id="RecraftV4TextToVectorNode", display_name="Recraft V4 Text to Vector", - category="api node/image/Recraft", + 
category="image/partner/Recraft", description="Generates SVG using Recraft V4 or V4 Pro models.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_reve.py b/comfy_api_nodes/nodes_reve.py index a87395394..2b15eadd7 100644 --- a/comfy_api_nodes/nodes_reve.py +++ b/comfy_api_nodes/nodes_reve.py @@ -109,7 +109,7 @@ class ReveImageCreateNode(IO.ComfyNode): return IO.Schema( node_id="ReveImageCreateNode", display_name="Reve Image Create", - category="api node/image/Reve", + category="image/partner/Reve", description="Generate images from text descriptions using Reve.", inputs=[ IO.String.Input( @@ -200,7 +200,7 @@ class ReveImageEditNode(IO.ComfyNode): return IO.Schema( node_id="ReveImageEditNode", display_name="Reve Image Edit", - category="api node/image/Reve", + category="image/partner/Reve", description="Edit images using natural language instructions with Reve.", inputs=[ IO.Image.Input("image", tooltip="The image to edit."), @@ -300,7 +300,7 @@ class ReveImageRemixNode(IO.ComfyNode): return IO.Schema( node_id="ReveImageRemixNode", display_name="Reve Image Remix", - category="api node/image/Reve", + category="image/partner/Reve", description="Combine reference images with text prompts to create new images using Reve.", inputs=[ IO.Autogrow.Input( diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py index 2df5a3e13..e14955661 100644 --- a/comfy_api_nodes/nodes_rodin.py +++ b/comfy_api_nodes/nodes_rodin.py @@ -230,7 +230,7 @@ class Rodin3D_Regular(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Regular", display_name="Rodin 3D Generate - Regular Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -289,7 +289,7 @@ class Rodin3D_Detail(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Detail", display_name="Rodin 3D Generate - Detail Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", 
description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -348,7 +348,7 @@ class Rodin3D_Smooth(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Smooth", display_name="Rodin 3D Generate - Smooth Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -406,7 +406,7 @@ class Rodin3D_Sketch(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Sketch", display_name="Rodin 3D Generate - Sketch Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -468,7 +468,7 @@ class Rodin3D_Gen2(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Gen2", display_name="Rodin 3D Generate - Gen-2 Generate", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("Images"), @@ -941,7 +941,7 @@ class Rodin3D_Gen25_Image(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Gen25_Image", display_name="Rodin 3D Gen-2.5 - Image to 3D", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=( "Generate a 3D model from 1-5 reference images via Rodin Gen-2.5. " "Pick a mode (Fast / Regular / Extreme-High) to tune quality vs. cost." @@ -1035,7 +1035,7 @@ class Rodin3D_Gen25_Text(IO.ComfyNode): return IO.Schema( node_id="Rodin3D_Gen25_Text", display_name="Rodin 3D Gen-2.5 - Text to 3D", - category="api node/3d/Rodin", + category="3d/partner/Rodin", description=( "Generate a 3D model from a text prompt via Rodin Gen-2.5. " "Pick a mode (Fast / Regular / Extreme-High) to tune quality vs. cost." 
diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py index 573170ba2..7357c733e 100644 --- a/comfy_api_nodes/nodes_runway.py +++ b/comfy_api_nodes/nodes_runway.py @@ -140,7 +140,7 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode): return IO.Schema( node_id="RunwayImageToVideoNodeGen3a", display_name="Runway Image to Video (Gen3a Turbo)", - category="api node/video/Runway", + category="video/partner/Runway", description="Generate a video from a single starting frame using Gen3a Turbo model. " "Before diving in, review these best practices to ensure that " "your input selections will set your generation up for success: " @@ -234,7 +234,7 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode): return IO.Schema( node_id="RunwayImageToVideoNodeGen4", display_name="Runway Image to Video (Gen4 Turbo)", - category="api node/video/Runway", + category="video/partner/Runway", description="Generate a video from a single starting frame using Gen4 Turbo model. " "Before diving in, review these best practices to ensure that " "your input selections will set your generation up for success: " @@ -329,7 +329,7 @@ class RunwayFirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="RunwayFirstLastFrameNode", display_name="Runway First-Last-Frame to Video", - category="api node/video/Runway", + category="video/partner/Runway", description="Upload first and last keyframes, draft a prompt, and generate a video. " "More complex transitions, such as cases where the Last frame is completely different " "from the First frame, may benefit from the longer 10s duration. " @@ -440,7 +440,7 @@ class RunwayTextToImageNode(IO.ComfyNode): return IO.Schema( node_id="RunwayTextToImageNode", display_name="Runway Text to Image", - category="api node/image/Runway", + category="image/partner/Runway", description="Generate an image from a text prompt using Runway's Gen 4 model. 
" "You can also include reference image to guide the generation.", inputs=[ diff --git a/comfy_api_nodes/nodes_sonilo.py b/comfy_api_nodes/nodes_sonilo.py index 5518f5902..bc31a0074 100644 --- a/comfy_api_nodes/nodes_sonilo.py +++ b/comfy_api_nodes/nodes_sonilo.py @@ -34,7 +34,7 @@ class SoniloVideoToMusic(IO.ComfyNode): return IO.Schema( node_id="SoniloVideoToMusic", display_name="Sonilo Video to Music", - category="api node/audio/Sonilo", + category="audio/partner/Sonilo", description="Generate music from video content using Sonilo's AI model. " "Analyzes the video and creates matching music.", inputs=[ @@ -99,7 +99,7 @@ class SoniloTextToMusic(IO.ComfyNode): return IO.Schema( node_id="SoniloTextToMusic", display_name="Sonilo Text to Music", - category="api node/audio/Sonilo", + category="audio/partner/Sonilo", description="Generate music from a text prompt using Sonilo's AI model. " "Leave duration at 0 to let the model infer it from the prompt.", inputs=[ diff --git a/comfy_api_nodes/nodes_sora.py b/comfy_api_nodes/nodes_sora.py index c1d485188..83cfca495 100644 --- a/comfy_api_nodes/nodes_sora.py +++ b/comfy_api_nodes/nodes_sora.py @@ -34,7 +34,7 @@ class OpenAIVideoSora2(IO.ComfyNode): return IO.Schema( node_id="OpenAIVideoSora2", display_name="OpenAI Sora - Video (DEPRECATED)", - category="api node/video/Sora", + category="video/partner/Sora", description=( "OpenAI video and audio generation.\n\n" "DEPRECATION NOTICE: OpenAI will stop serving the Sora v2 API in September 2026. 
" diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py index 906d8ff35..a1753d647 100644 --- a/comfy_api_nodes/nodes_stability.py +++ b/comfy_api_nodes/nodes_stability.py @@ -62,7 +62,7 @@ class StabilityStableImageUltraNode(IO.ComfyNode): return IO.Schema( node_id="StabilityStableImageUltraNode", display_name="Stability AI Stable Image Ultra", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.String.Input( @@ -197,7 +197,7 @@ class StabilityStableImageSD_3_5Node(IO.ComfyNode): return IO.Schema( node_id="StabilityStableImageSD_3_5Node", display_name="Stability AI Stable Diffusion 3.5 Image", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.String.Input( @@ -354,7 +354,7 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode): return IO.Schema( node_id="StabilityUpscaleConservativeNode", display_name="Stability AI Upscale Conservative", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("image"), @@ -457,7 +457,7 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode): return IO.Schema( node_id="StabilityUpscaleCreativeNode", display_name="Stability AI Upscale Creative", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("image"), @@ -578,7 +578,7 @@ class StabilityUpscaleFastNode(IO.ComfyNode): return IO.Schema( node_id="StabilityUpscaleFastNode", display_name="Stability AI Upscale Fast", - category="api node/image/Stability AI", + category="image/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Image.Input("image"), @@ -630,7 +630,7 @@ class StabilityTextToAudio(IO.ComfyNode): return IO.Schema( node_id="StabilityTextToAudio", display_name="Stability 
AI Text To Audio", - category="api node/audio/Stability AI", + category="audio/partner/Stability AI", essentials_category="Audio", description=cleandoc(cls.__doc__ or ""), inputs=[ @@ -708,7 +708,7 @@ class StabilityAudioToAudio(IO.ComfyNode): return IO.Schema( node_id="StabilityAudioToAudio", display_name="Stability AI Audio To Audio", - category="api node/audio/Stability AI", + category="audio/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Combo.Input( @@ -802,7 +802,7 @@ class StabilityAudioInpaint(IO.ComfyNode): return IO.Schema( node_id="StabilityAudioInpaint", display_name="Stability AI Audio Inpaint", - category="api node/audio/Stability AI", + category="audio/partner/Stability AI", description=cleandoc(cls.__doc__ or ""), inputs=[ IO.Combo.Input( diff --git a/comfy_api_nodes/nodes_topaz.py b/comfy_api_nodes/nodes_topaz.py index e79c16d3c..d0906ee44 100644 --- a/comfy_api_nodes/nodes_topaz.py +++ b/comfy_api_nodes/nodes_topaz.py @@ -52,7 +52,7 @@ class TopazImageEnhance(IO.ComfyNode): return IO.Schema( node_id="TopazImageEnhance", display_name="Topaz Image Enhance", - category="api node/image/Topaz", + category="image/partner/Topaz", description="Industry-standard upscaling and image enhancement.", inputs=[ IO.Combo.Input("model", options=["Reimagine"]), @@ -235,7 +235,7 @@ class TopazVideoEnhance(IO.ComfyNode): return IO.Schema( node_id="TopazVideoEnhance", display_name="Topaz Video Enhance (Legacy)", - category="api node/video/Topaz", + category="video/partner/Topaz", description="Breathe new life into video with powerful upscaling and recovery technology.", inputs=[ IO.Video.Input("video"), @@ -475,7 +475,7 @@ class TopazVideoEnhanceV2(IO.ComfyNode): return IO.Schema( node_id="TopazVideoEnhanceV2", display_name="Topaz Video Enhance", - category="api node/video/Topaz", + category="video/partner/Topaz", description="Breathe new life into video with powerful upscaling and recovery technology.", inputs=[ IO.Video.Input("video"), 
diff --git a/comfy_api_nodes/nodes_tripo.py b/comfy_api_nodes/nodes_tripo.py index d6501dee4..4820e26c1 100644 --- a/comfy_api_nodes/nodes_tripo.py +++ b/comfy_api_nodes/nodes_tripo.py @@ -11,6 +11,9 @@ from comfy_api_nodes.apis.tripo import ( TripoModelVersion, TripoMultiviewToModelRequest, TripoOrientation, + TripoP1ImageToModelRequest, + TripoP1MultiviewToModelRequest, + TripoP1TextToModelRequest, TripoRefineModelRequest, TripoStyle, TripoTaskResponse, @@ -80,7 +83,7 @@ class TripoTextToModelNode(IO.ComfyNode): return IO.Schema( node_id="TripoTextToModelNode", display_name="Tripo: Text to Model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.String.Input("prompt", multiline=True), IO.String.Input("negative_prompt", multiline=True, optional=True), @@ -93,10 +96,22 @@ class TripoTextToModelNode(IO.ComfyNode): IO.Int.Input("image_seed", default=42, optional=True, advanced=True), IO.Int.Input("model_seed", default=42, optional=True, advanced=True), IO.Int.Input("texture_seed", default=42, optional=True, advanced=True), - IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True, advanced=True), + IO.Combo.Input( + "texture_quality", + default="standard", + options=["standard", "detailed"], + optional=True, + advanced=True, + ), IO.Int.Input("face_limit", default=-1, min=-1, max=2000000, optional=True, advanced=True), IO.Boolean.Input("quad", default=False, optional=True, advanced=True), - IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True, advanced=True), + IO.Combo.Input( + "geometry_quality", + default="standard", + options=["standard", "detailed"], + optional=True, + advanced=True, + ), ], outputs=[ IO.String.Output(display_name="model_file"), # for backward compatibility only @@ -195,7 +210,7 @@ class TripoImageToModelNode(IO.ComfyNode): return IO.Schema( node_id="TripoImageToModelNode", display_name="Tripo: Image to Model", - 
category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Image.Input("image"), IO.Combo.Input( @@ -209,16 +224,36 @@ class TripoImageToModelNode(IO.ComfyNode): IO.Boolean.Input("pbr", default=True, optional=True), IO.Int.Input("model_seed", default=42, optional=True, advanced=True), IO.Combo.Input( - "orientation", options=TripoOrientation, default=TripoOrientation.DEFAULT, optional=True, advanced=True + "orientation", + options=TripoOrientation, + default=TripoOrientation.DEFAULT, + optional=True, + advanced=True, ), IO.Int.Input("texture_seed", default=42, optional=True, advanced=True), - IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True, advanced=True), IO.Combo.Input( - "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True, advanced=True + "texture_quality", + default="standard", + options=["standard", "detailed"], + optional=True, + advanced=True, + ), + IO.Combo.Input( + "texture_alignment", + default="original_image", + options=["original_image", "geometry"], + optional=True, + advanced=True, ), IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True, advanced=True), IO.Boolean.Input("quad", default=False, optional=True, advanced=True), - IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True, advanced=True), + IO.Combo.Input( + "geometry_quality", + default="standard", + options=["standard", "detailed"], + optional=True, + advanced=True, + ), ], outputs=[ IO.String.Output(display_name="model_file"), # for backward compatibility only @@ -323,7 +358,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode): return IO.Schema( node_id="TripoMultiviewToModelNode", display_name="Tripo: Multiview to Model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Image.Input("image"), IO.Image.Input("image_left", optional=True), @@ -346,13 +381,35 @@ class 
TripoMultiviewToModelNode(IO.ComfyNode): IO.Boolean.Input("pbr", default=True, optional=True), IO.Int.Input("model_seed", default=42, optional=True, advanced=True), IO.Int.Input("texture_seed", default=42, optional=True, advanced=True), - IO.Combo.Input("texture_quality", default="standard", options=["standard", "detailed"], optional=True, advanced=True), IO.Combo.Input( - "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True, advanced=True + "texture_quality", + default="standard", + options=["standard", "detailed"], + optional=True, + advanced=True, + ), + IO.Combo.Input( + "texture_alignment", + default="original_image", + options=["original_image", "geometry"], + optional=True, + advanced=True, ), IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True, advanced=True), - IO.Boolean.Input("quad", default=False, optional=True, advanced=True, tooltip="This parameter is deprecated and does nothing."), - IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True, advanced=True), + IO.Boolean.Input( + "quad", + default=False, + optional=True, + advanced=True, + tooltip="This parameter is deprecated and does nothing.", + ), + IO.Combo.Input( + "geometry_quality", + default="standard", + options=["standard", "detailed"], + optional=True, + advanced=True, + ), ], outputs=[ IO.String.Output(display_name="model_file"), # for backward compatibility only @@ -461,15 +518,25 @@ class TripoTextureNode(IO.ComfyNode): return IO.Schema( node_id="TripoTextureNode", display_name="Tripo: Texture model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Custom("MODEL_TASK_ID").Input("model_task_id"), IO.Boolean.Input("texture", default=True, optional=True), IO.Boolean.Input("pbr", default=True, optional=True), IO.Int.Input("texture_seed", default=42, optional=True, advanced=True), - IO.Combo.Input("texture_quality", default="standard", 
options=["standard", "detailed"], optional=True, advanced=True), IO.Combo.Input( - "texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True, advanced=True + "texture_quality", + default="standard", + options=["standard", "detailed"], + optional=True, + advanced=True, + ), + IO.Combo.Input( + "texture_alignment", + default="original_image", + options=["original_image", "geometry"], + optional=True, + advanced=True, ), ], outputs=[ @@ -528,7 +595,7 @@ class TripoRefineNode(IO.ComfyNode): return IO.Schema( node_id="TripoRefineNode", display_name="Tripo: Refine Draft model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", description="Refine a draft model created by v1.4 Tripo models only.", inputs=[ IO.Custom("MODEL_TASK_ID").Input("model_task_id", tooltip="Must be a v1.4 Tripo model"), @@ -568,7 +635,7 @@ class TripoRigNode(IO.ComfyNode): return IO.Schema( node_id="TripoRigNode", display_name="Tripo: Rig model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[IO.Custom("MODEL_TASK_ID").Input("original_model_task_id")], outputs=[ IO.String.Output(display_name="model_file"), # for backward compatibility only @@ -605,7 +672,7 @@ class TripoRetargetNode(IO.ComfyNode): return IO.Schema( node_id="TripoRetargetNode", display_name="Tripo: Retarget rigged model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ IO.Custom("RIG_TASK_ID").Input("original_model_task_id"), IO.Combo.Input( @@ -626,7 +693,7 @@ class TripoRetargetNode(IO.ComfyNode): "preset:hexapod:walk", "preset:octopod:walk", "preset:serpentine:march", - "preset:aquatic:march" + "preset:aquatic:march", ], ), ], @@ -670,7 +737,7 @@ class TripoConversionNode(IO.ComfyNode): return IO.Schema( node_id="TripoConversionNode", display_name="Tripo: Convert model", - category="api node/3d/Tripo", + category="3d/partner/Tripo", inputs=[ 
IO.Custom("MODEL_TASK_ID,RIG_TASK_ID,RETARGET_TASK_ID").Input("original_model_task_id"), IO.Combo.Input("format", options=["GLTF", "USDZ", "FBX", "OBJ", "STL", "3MF"]), @@ -817,7 +884,7 @@ class TripoConversionNode(IO.ComfyNode): # Parse part_names from comma-separated string to list part_names_list = None if part_names and part_names.strip(): - part_names_list = [name.strip() for name in part_names.split(',') if name.strip()] + part_names_list = [name.strip() for name in part_names.split(",") if name.strip()] response = await sync_op( cls, @@ -848,6 +915,373 @@ class TripoConversionNode(IO.ComfyNode): return await poll_until_finished(cls, response, average_duration=30) +def _p1_price_expr(*, geometry_credits: int, textured_credits: int, detailed_credits: int) -> str: + return ( + "(" + " $mode := widgets.output_mode;" + ' $detailed := $lookup(widgets, "output_mode.texture_quality") = "detailed";' + f' $credits := $mode = "geometry only" ? {geometry_credits} : ($detailed ? {detailed_credits} : {textured_credits});' + ' {"type":"usd","usd": $credits * 0.01, "format": {"approximate": true}}' + ")" + ) + + +def _p1_textured_inputs(*, include_image_alignment: bool) -> list: + """Inputs shown inside the 'Textured' branch of the P1 output_mode DynamicCombo.""" + inputs: list = [ + IO.Boolean.Input("pbr", default=True, tooltip="Include PBR maps. When on, base texture is forced on too."), + IO.Combo.Input("texture_quality", options=["standard", "detailed"], default="standard"), + ] + if include_image_alignment: + inputs.extend( + [ + IO.Combo.Input( + "texture_alignment", + options=["original_image", "geometry"], + default="original_image", + tooltip="Prioritize visual fidelity to the source image, or alignment to the mesh geometry.", + ), + IO.Combo.Input( + "orientation", + options=["default", "align_image"], + default="default", + tooltip="Rotate the output to match the source image. 
Only applies when textured.", + ), + ] + ) + inputs.append(IO.Int.Input("texture_seed", default=42, advanced=True)) + return inputs + + +def _build_p1_output_mode(*, include_image_alignment: bool) -> IO.DynamicCombo.Input: + return IO.DynamicCombo.Input( + "output_mode", + options=[ + IO.DynamicCombo.Option("Geometry only", []), + IO.DynamicCombo.Option("Textured", _p1_textured_inputs(include_image_alignment=include_image_alignment)), + ], + tooltip='"Geometry only" returns an untextured mesh. "Textured" adds color/PBR maps.', + ) + + +def _resolve_p1_texture_fields(output_mode: dict) -> dict: + """Translate the output_mode DynamicCombo payload into P1 request fields. + + pbr=true forces texture=true server-side, but we send both explicitly so the + intent is visible in the request body and logs. + """ + mode = output_mode["output_mode"] + if mode == "Geometry only": + return {"texture": False, "pbr": False} + out = { + "texture": True, + "pbr": bool(output_mode.get("pbr", True)), + "texture_quality": output_mode.get("texture_quality", "standard"), + "texture_seed": output_mode.get("texture_seed"), + } + if "texture_alignment" in output_mode: + out["texture_alignment"] = output_mode["texture_alignment"] + if "orientation" in output_mode: + out["orientation"] = output_mode["orientation"] + return out + + +def _p1_common_inputs() -> list: + """Inputs shared by all P1 nodes (placed after output_mode).""" + return [ + IO.Int.Input( + "face_limit", + default=-1, + min=-1, + max=20000, + optional=True, + advanced=True, + tooltip="Target face count, 48-20000. -1 lets Tripo pick adaptively.", + ), + IO.Int.Input("model_seed", default=42, optional=True, advanced=True), + IO.Boolean.Input( + "auto_size", + default=False, + optional=True, + advanced=True, + tooltip="Scale the output to approximate real-world meters.", + ), + IO.Boolean.Input( + "export_uv", + default=True, + optional=True, + advanced=True, + tooltip="UV unwrap during generation. 
Turn off for faster geometry-only runs.", + ), + IO.Boolean.Input( + "compress_geometry", + default=False, + optional=True, + advanced=True, + tooltip="Apply geometry-based compression. Decompress before editing.", + ), + ] + + +def _build_p1_request_kwargs( + *, + output_mode: dict, + face_limit: int, + model_seed: int, + auto_size: bool, + export_uv: bool, + compress_geometry: bool, +) -> dict: + """Common P1 request fields shared by all three node types.""" + kwargs: dict = { + "model_seed": model_seed, + "face_limit": face_limit if face_limit != -1 else None, + "auto_size": auto_size, + "export_uv": export_uv, + "compress": "geometry" if compress_geometry else None, + } + kwargs.update(_resolve_p1_texture_fields(output_mode)) + return kwargs + + +class TripoP1TextToModelNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="TripoP1TextToModelNode", + display_name="Tripo P1: Text to Model", + category="3d/partner/Tripo", + description="Tripo P1 text-to-3D. 
Optimized for low-poly, game-ready meshes with stable topology.", + inputs=[ + IO.String.Input("prompt", multiline=True, tooltip="Up to 1024 characters."), + IO.String.Input("negative_prompt", multiline=True, optional=True, tooltip="Up to 255 characters."), + _build_p1_output_mode(include_image_alignment=False), + IO.Int.Input("image_seed", default=42, optional=True, advanced=True), + *_p1_common_inputs(), + ], + outputs=[ + IO.String.Output(display_name="model_file"), # for backward compatibility only + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + IO.File3DGLB.Output(display_name="GLB"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["output_mode", "output_mode.texture_quality"]), + expr=_p1_price_expr(geometry_credits=30, textured_credits=40, detailed_credits=50), + ), + ) + + @classmethod + async def execute( + cls, + prompt: str, + output_mode: dict, + negative_prompt: str | None = None, + image_seed: int | None = None, + face_limit: int = -1, + model_seed: int | None = None, + auto_size: bool = False, + export_uv: bool = True, + compress_geometry: bool = False, + ) -> IO.NodeOutput: + if not prompt: + raise RuntimeError("Prompt is required") + common = _build_p1_request_kwargs( + output_mode=output_mode, + face_limit=face_limit, + model_seed=model_seed, + auto_size=auto_size, + export_uv=export_uv, + compress_geometry=compress_geometry, + ) + request = TripoP1TextToModelRequest( + prompt=prompt, + negative_prompt=negative_prompt or None, + image_seed=image_seed, + **common, + ) + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=request, + ) + return await poll_until_finished(cls, response, average_duration=60) + + +class TripoP1ImageToModelNode(IO.ComfyNode): + + @classmethod + def 
define_schema(cls): + return IO.Schema( + node_id="TripoP1ImageToModelNode", + display_name="Tripo P1: Image to Model", + category="3d/partner/Tripo", + description="Tripo P1 image-to-3D. Optimized for low-poly, game-ready meshes.", + inputs=[ + IO.Image.Input("image"), + _build_p1_output_mode(include_image_alignment=True), + IO.Boolean.Input( + "enable_image_autofix", + default=False, + optional=True, + advanced=True, + tooltip="Pre-process the input image for better generation quality.", + ), + *_p1_common_inputs(), + ], + outputs=[ + IO.String.Output(display_name="model_file"), # for backward compatibility only + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + IO.File3DGLB.Output(display_name="GLB"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["output_mode", "output_mode.texture_quality"]), + expr=_p1_price_expr(geometry_credits=40, textured_credits=50, detailed_credits=60), + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + output_mode: dict, + enable_image_autofix: bool = False, + face_limit: int = -1, + model_seed: int | None = None, + auto_size: bool = False, + export_uv: bool = True, + compress_geometry: bool = False, + ) -> IO.NodeOutput: + if image is None: + raise RuntimeError("Image is required") + tripo_file = TripoFileReference( + root=TripoUrlReference( + url=(await upload_images_to_comfyapi(cls, image, max_images=1))[0], + type="jpeg", + ) + ) + common = _build_p1_request_kwargs( + output_mode=output_mode, + face_limit=face_limit, + model_seed=model_seed, + auto_size=auto_size, + export_uv=export_uv, + compress_geometry=compress_geometry, + ) + request = TripoP1ImageToModelRequest( + file=tripo_file, + enable_image_autofix=enable_image_autofix, + **common, + ) + response = await sync_op( + cls, + 
endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=request, + ) + return await poll_until_finished(cls, response, average_duration=60) + + +class TripoP1MultiviewToModelNode(IO.ComfyNode): + + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="TripoP1MultiviewToModelNode", + display_name="Tripo P1: Multiview to Model", + category="3d/partner/Tripo", + description="Tripo P1 multiview-to-3D from 2-4 reference images in [front, left, back, right] order. " + "Front is required; any combination of the other three may be omitted.", + inputs=[ + IO.Image.Input("image", tooltip="Front view (0°). Required."), + IO.Image.Input( + "image_left", + optional=True, + tooltip="Left view (90°), i.e. the subject's left side.", + ), + IO.Image.Input("image_back", optional=True, tooltip="Back view (180°)."), + IO.Image.Input( + "image_right", + optional=True, + tooltip="Right view (270°), i.e. the subject's right side.", + ), + _build_p1_output_mode(include_image_alignment=True), + *_p1_common_inputs(), + ], + outputs=[ + IO.String.Output(display_name="model_file"), # for backward compatibility only + IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id"), + IO.File3DGLB.Output(display_name="GLB"), + ], + hidden=[ + IO.Hidden.auth_token_comfy_org, + IO.Hidden.api_key_comfy_org, + IO.Hidden.unique_id, + ], + is_api_node=True, + price_badge=IO.PriceBadge( + depends_on=IO.PriceBadgeDepends(widgets=["output_mode", "output_mode.texture_quality"]), + expr=_p1_price_expr(geometry_credits=40, textured_credits=50, detailed_credits=60), + ), + ) + + @classmethod + async def execute( + cls, + image: Input.Image, + output_mode: dict, + image_left: Input.Image | None = None, + image_back: Input.Image | None = None, + image_right: Input.Image | None = None, + face_limit: int = -1, + model_seed: int | None = None, + auto_size: bool = False, + export_uv: bool = True, + compress_geometry: bool = False, + ) -> 
IO.NodeOutput: + views = [image, image_left, image_back, image_right] + if sum(1 for v in views if v is not None) < 2: + raise RuntimeError("Tripo P1 multiview requires at least 2 images (front plus one of left/back/right).") + + files: list[TripoFileReference] = [] + for view in views: + if view is None: + files.append(TripoFileReference(root=TripoFileEmptyReference())) + continue + url = (await upload_images_to_comfyapi(cls, view, max_images=1))[0] + files.append(TripoFileReference(root=TripoUrlReference(url=url, type="jpeg"))) + + common = _build_p1_request_kwargs( + output_mode=output_mode, + face_limit=face_limit, + model_seed=model_seed, + auto_size=auto_size, + export_uv=export_uv, + compress_geometry=compress_geometry, + ) + request = TripoP1MultiviewToModelRequest(files=files, **common) + response = await sync_op( + cls, + endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/task", method="POST"), + response_model=TripoTaskResponse, + data=request, + ) + return await poll_until_finished(cls, response, average_duration=80) + + class TripoExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: @@ -855,6 +1289,9 @@ class TripoExtension(ComfyExtension): TripoTextToModelNode, TripoImageToModelNode, TripoMultiviewToModelNode, + TripoP1TextToModelNode, + TripoP1ImageToModelNode, + TripoP1MultiviewToModelNode, TripoTextureNode, TripoRefineNode, TripoRigNode, diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py index 2ff75d9b2..068862397 100644 --- a/comfy_api_nodes/nodes_veo2.py +++ b/comfy_api_nodes/nodes_veo2.py @@ -45,7 +45,7 @@ class VeoVideoGenerationNode(IO.ComfyNode): return IO.Schema( node_id="VeoVideoGenerationNode", display_name="Google Veo 2 Video Generation", - category="api node/video/Veo", + category="video/partner/Veo", description="Generates videos from text prompts using Google's Veo 2 API", inputs=[ IO.String.Input( @@ -256,7 +256,7 @@ class Veo3VideoGenerationNode(IO.ComfyNode): return 
IO.Schema( node_id="Veo3VideoGenerationNode", display_name="Google Veo 3 Video Generation", - category="api node/video/Veo", + category="video/partner/Veo", description="Generates videos from text prompts using Google's Veo 3 API", inputs=[ IO.String.Input( @@ -468,7 +468,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode): return IO.Schema( node_id="Veo3FirstLastFrameNode", display_name="Google Veo 3 First-Last-Frame to Video", - category="api node/video/Veo", + category="video/partner/Veo", description="Generate video using prompt and first and last frames.", inputs=[ IO.String.Input( diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py index 8d90cefeb..16f6113de 100644 --- a/comfy_api_nodes/nodes_vidu.py +++ b/comfy_api_nodes/nodes_vidu.py @@ -71,7 +71,7 @@ class ViduTextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduTextToVideoNode", display_name="Vidu Text To Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from a text prompt", inputs=[ IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"), @@ -169,7 +169,7 @@ class ViduImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduImageToVideoNode", display_name="Vidu Image To Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from image and optional prompt", inputs=[ IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"), @@ -273,7 +273,7 @@ class ViduReferenceVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduReferenceVideoNode", display_name="Vidu Reference To Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from multiple images and a prompt", inputs=[ IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"), @@ -388,7 +388,7 @@ class ViduStartEndToVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduStartEndToVideoNode", display_name="Vidu Start 
End To Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from start and end frames and a prompt", inputs=[ IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"), @@ -492,7 +492,7 @@ class Vidu2TextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu2TextToVideoNode", display_name="Vidu2 Text-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from a text prompt", inputs=[ IO.Combo.Input("model", options=["viduq2"]), @@ -584,7 +584,7 @@ class Vidu2ImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu2ImageToVideoNode", display_name="Vidu2 Image-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from an image and an optional prompt.", inputs=[ IO.Combo.Input("model", options=["viduq2-pro-fast", "viduq2-pro", "viduq2-turbo"]), @@ -714,7 +714,7 @@ class Vidu2ReferenceVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu2ReferenceVideoNode", display_name="Vidu2 Reference-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from multiple reference images and a prompt.", inputs=[ IO.Combo.Input("model", options=["viduq2"]), @@ -849,7 +849,7 @@ class Vidu2StartEndToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu2StartEndToVideoNode", display_name="Vidu2 Start/End Frame-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from a start frame, an end frame, and a prompt.", inputs=[ IO.Combo.Input("model", options=["viduq2-pro-fast", "viduq2-pro", "viduq2-turbo"]), @@ -969,7 +969,7 @@ class ViduExtendVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduExtendVideoNode", display_name="Vidu Video Extension", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Extend an existing video 
by generating additional frames.", inputs=[ IO.DynamicCombo.Input( @@ -1138,7 +1138,7 @@ class ViduMultiFrameVideoNode(IO.ComfyNode): return IO.Schema( node_id="ViduMultiFrameVideoNode", display_name="Vidu Multi-Frame Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video with multiple keyframe transitions.", inputs=[ IO.Combo.Input("model", options=["viduq2-pro", "viduq2-turbo"]), @@ -1284,7 +1284,7 @@ class Vidu3TextToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu3TextToVideoNode", display_name="Vidu Q3 Text-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate video from a text prompt.", inputs=[ IO.DynamicCombo.Input( @@ -1429,7 +1429,7 @@ class Vidu3ImageToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu3ImageToVideoNode", display_name="Vidu Q3 Image-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from an image and an optional prompt.", inputs=[ IO.DynamicCombo.Input( @@ -1571,7 +1571,7 @@ class Vidu3StartEndToVideoNode(IO.ComfyNode): return IO.Schema( node_id="Vidu3StartEndToVideoNode", display_name="Vidu Q3 Start/End Frame-to-Video Generation", - category="api node/video/Vidu", + category="video/partner/Vidu", description="Generate a video from a start frame, an end frame, and a prompt.", inputs=[ IO.DynamicCombo.Input( diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py index 68061bb5c..a235dc387 100644 --- a/comfy_api_nodes/nodes_wan.py +++ b/comfy_api_nodes/nodes_wan.py @@ -61,7 +61,7 @@ class WanTextToImageApi(IO.ComfyNode): return IO.Schema( node_id="WanTextToImageApi", display_name="Wan Text to Image", - category="api node/image/Wan", + category="image/partner/Wan", description="Generates an image based on a text prompt.", inputs=[ IO.Combo.Input( @@ -184,7 +184,7 @@ class WanImageToImageApi(IO.ComfyNode): return IO.Schema( 
node_id="WanImageToImageApi", display_name="Wan Image to Image", - category="api node/image/Wan", + category="image/partner/Wan", description="Generates an image from one or two input images and a text prompt. " "The output image is currently fixed at 1.6 MP, and its aspect ratio matches the input image(s).", inputs=[ @@ -312,7 +312,7 @@ class WanTextToVideoApi(IO.ComfyNode): return IO.Schema( node_id="WanTextToVideoApi", display_name="Wan Text to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generates a video based on a text prompt.", inputs=[ IO.Combo.Input( @@ -495,7 +495,7 @@ class WanImageToVideoApi(IO.ComfyNode): return IO.Schema( node_id="WanImageToVideoApi", display_name="Wan Image to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generates a video from the first frame and a text prompt.", inputs=[ IO.Combo.Input( @@ -674,7 +674,7 @@ class WanReferenceVideoApi(IO.ComfyNode): return IO.Schema( node_id="WanReferenceVideoApi", display_name="Wan Reference to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Use the character and voice from input videos, combined with a prompt, " "to generate a new video that maintains character consistency.", inputs=[ @@ -828,7 +828,7 @@ class Wan2TextToVideoApi(IO.ComfyNode): return IO.Schema( node_id="Wan2TextToVideoApi", display_name="Wan 2.7 Text to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generates a video based on a text prompt using the Wan 2.7 model.", inputs=[ IO.DynamicCombo.Input( @@ -981,7 +981,7 @@ class Wan2ImageToVideoApi(IO.ComfyNode): return IO.Schema( node_id="Wan2ImageToVideoApi", display_name="Wan 2.7 Image to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generate a video from a first-frame image, with optional last-frame image and audio.", inputs=[ IO.DynamicCombo.Input( @@ -1152,7 +1152,7 @@ class 
Wan2VideoContinuationApi(IO.ComfyNode): return IO.Schema( node_id="Wan2VideoContinuationApi", display_name="Wan 2.7 Video Continuation", - category="api node/video/Wan", + category="video/partner/Wan", description="Continue a video from where it left off, with optional last-frame control.", inputs=[ IO.DynamicCombo.Input( @@ -1319,7 +1319,7 @@ class Wan2VideoEditApi(IO.ComfyNode): return IO.Schema( node_id="Wan2VideoEditApi", display_name="Wan 2.7 Video Edit", - category="api node/video/Wan", + category="video/partner/Wan", description="Edit a video using text instructions, reference images, or style transfer.", inputs=[ IO.DynamicCombo.Input( @@ -1477,7 +1477,7 @@ class Wan2ReferenceVideoApi(IO.ComfyNode): return IO.Schema( node_id="Wan2ReferenceVideoApi", display_name="Wan 2.7 Reference to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generate a video featuring a person or object from reference materials. " "Supports single-character performances and multi-character interactions.", inputs=[ @@ -1651,7 +1651,7 @@ class HappyHorseTextToVideoApi(IO.ComfyNode): return IO.Schema( node_id="HappyHorseTextToVideoApi", display_name="HappyHorse Text to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generates a video based on a text prompt using the HappyHorse model.", inputs=[ IO.DynamicCombo.Input( @@ -1775,7 +1775,7 @@ class HappyHorseImageToVideoApi(IO.ComfyNode): return IO.Schema( node_id="HappyHorseImageToVideoApi", display_name="HappyHorse Image to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generate a video from a first-frame image using the HappyHorse model.", inputs=[ IO.DynamicCombo.Input( @@ -1905,7 +1905,7 @@ class HappyHorseVideoEditApi(IO.ComfyNode): return IO.Schema( node_id="HappyHorseVideoEditApi", display_name="HappyHorse Video Edit", - category="api node/video/Wan", + category="video/partner/Wan", description="Edit a video using text 
instructions or reference images with the HappyHorse model. " "Output duration is 3-15s and matches the input video; inputs longer than 15s are truncated.", inputs=[ @@ -2046,7 +2046,7 @@ class HappyHorseReferenceVideoApi(IO.ComfyNode): return IO.Schema( node_id="HappyHorseReferenceVideoApi", display_name="HappyHorse Reference to Video", - category="api node/video/Wan", + category="video/partner/Wan", description="Generate a video featuring a person or object from reference materials with the HappyHorse " "model. Supports single-character performances and multi-character interactions.", inputs=[ diff --git a/comfy_api_nodes/nodes_wavespeed.py b/comfy_api_nodes/nodes_wavespeed.py index 65e45f60a..a250015c3 100644 --- a/comfy_api_nodes/nodes_wavespeed.py +++ b/comfy_api_nodes/nodes_wavespeed.py @@ -27,7 +27,7 @@ class WavespeedFlashVSRNode(IO.ComfyNode): return IO.Schema( node_id="WavespeedFlashVSRNode", display_name="FlashVSR Video Upscale", - category="api node/video/WaveSpeed", + category="video/partner/WaveSpeed", description="Fast, high-quality video upscaler that " "boosts resolution and restores clarity for low-resolution or blurry footage.", inputs=[ @@ -98,7 +98,7 @@ class WavespeedImageUpscaleNode(IO.ComfyNode): return IO.Schema( node_id="WavespeedImageUpscaleNode", display_name="WaveSpeed Image Upscale", - category="api node/image/WaveSpeed", + category="image/partner/WaveSpeed", description="Boost image resolution and quality, upscaling photos to 4K or 8K for sharp, detailed results.", inputs=[ IO.Combo.Input("model", options=["SeedVR2", "Ultimate"]), diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py index 052301c33..57c501724 100644 --- a/comfy_api_nodes/util/client.py +++ b/comfy_api_nodes/util/client.py @@ -86,7 +86,7 @@ class _PollUIState: _RETRY_STATUS = {408, 500, 502, 503, 504} # status 429 is handled separately COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"] 
FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"] -QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing", "wait"] +QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing", "wait", "in_queue"] async def sync_op( diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py index 247d9ae8a..044077b18 100644 --- a/comfy_extras/nodes_ace.py +++ b/comfy_extras/nodes_ace.py @@ -11,7 +11,7 @@ class TextEncodeAceStepAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="TextEncodeAceStepAudio", - category="conditioning", + category="model/conditioning", inputs=[ IO.Clip.Input("clip"), IO.String.Input("tags", multiline=True, dynamic_prompts=True), @@ -33,7 +33,7 @@ class TextEncodeAceStepAudio15(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="TextEncodeAceStepAudio1.5", - category="conditioning", + category="model/conditioning", inputs=[ IO.Clip.Input("clip"), IO.String.Input("tags", multiline=True, dynamic_prompts=True), @@ -67,7 +67,7 @@ class EmptyAceStepLatentAudio(IO.ComfyNode): return IO.Schema( node_id="EmptyAceStepLatentAudio", display_name="Empty Ace Step 1.0 Latent Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1), IO.Int.Input( @@ -90,7 +90,7 @@ class EmptyAceStep15LatentAudio(IO.ComfyNode): return IO.Schema( node_id="EmptyAceStep1.5LatentAudio", display_name="Empty Ace Step 1.5 Latent Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01), IO.Int.Input( diff --git a/comfy_extras/nodes_advanced_samplers.py b/comfy_extras/nodes_advanced_samplers.py index 20717ca38..77a561e30 100644 --- a/comfy_extras/nodes_advanced_samplers.py +++ b/comfy_extras/nodes_advanced_samplers.py @@ -45,7 +45,7 @@ class SamplerLCMUpscale(io.ComfyNode): def define_schema(cls) 
-> io.Schema: return io.Schema( node_id="SamplerLCMUpscale", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("scale_ratio", default=1.0, min=0.1, max=20.0, step=0.01, advanced=True), io.Int.Input("scale_steps", default=-1, min=-1, max=1000, step=1, advanced=True), @@ -91,7 +91,7 @@ class SamplerLCM(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="SamplerLCM", - category="sampling/samplers", + category="model/sampling/samplers", description=("LCM sampler with tunable per-step noise. s_noise is a multiplier on the model's training noise scale"), inputs=[ io.Float.Input("s_noise", default=1.0, min=0.0, max=64.0, step=0.01, diff --git a/comfy_extras/nodes_align_your_steps.py b/comfy_extras/nodes_align_your_steps.py index 307f41337..f89a809bb 100644 --- a/comfy_extras/nodes_align_your_steps.py +++ b/comfy_extras/nodes_align_your_steps.py @@ -29,7 +29,7 @@ class AlignYourStepsScheduler(io.ComfyNode): return io.Schema( node_id="AlignYourStepsScheduler", search_aliases=["AYS scheduler"], - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Combo.Input("model_type", options=["SD1", "SDXL", "SVD"]), io.Int.Input("steps", default=10, min=1, max=10000), diff --git a/comfy_extras/nodes_apg.py b/comfy_extras/nodes_apg.py index fd561d360..4a352038a 100644 --- a/comfy_extras/nodes_apg.py +++ b/comfy_extras/nodes_apg.py @@ -16,7 +16,7 @@ class APG(io.ComfyNode): return io.Schema( node_id="APG", display_name="Adaptive Projected Guidance", - category="sampling/custom_sampling", + category="model/sampling/custom_sampling", inputs=[ io.Model.Input("model"), io.Float.Input( diff --git a/comfy_extras/nodes_ar_video.py b/comfy_extras/nodes_ar_video.py index 1a15facfa..c22359eb2 100644 --- a/comfy_extras/nodes_ar_video.py +++ b/comfy_extras/nodes_ar_video.py @@ -19,7 +19,7 @@ class EmptyARVideoLatent(io.ComfyNode): def define_schema(cls): return io.Schema( 
node_id="EmptyARVideoLatent", - category="latent/video", + category="model/latent/video", inputs=[ io.Int.Input("width", default=832, min=16, max=8192, step=16), io.Int.Input("height", default=480, min=16, max=8192, step=16), @@ -53,7 +53,7 @@ class SamplerARVideo(io.ComfyNode): return io.Schema( node_id="SamplerARVideo", display_name="Sampler AR Video", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Int.Input( "num_frame_per_block", @@ -85,7 +85,7 @@ class ARVideoI2V(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ARVideoI2V", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Model.Input("model"), io.Vae.Input("vae"), diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index f09a8a874..ff078f74c 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -16,7 +16,7 @@ class EmptyLatentAudio(IO.ComfyNode): return IO.Schema( node_id="EmptyLatentAudio", display_name="Empty Latent Audio", - category="latent/audio", + category="model/latent/audio", essentials_category="Audio", inputs=[ IO.Float.Input("seconds", default=47.6, min=1.0, max=1000.0, step=0.1), @@ -41,7 +41,7 @@ class ConditioningStableAudio(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="ConditioningStableAudio", - category="conditioning", + category="model/conditioning", inputs=[ IO.Conditioning.Input("positive"), IO.Conditioning.Input("negative"), @@ -70,7 +70,7 @@ class VAEEncodeAudio(IO.ComfyNode): node_id="VAEEncodeAudio", search_aliases=["audio to latent"], display_name="VAE Encode Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Audio.Input("audio"), IO.Vae.Input("vae"), @@ -115,7 +115,7 @@ class VAEDecodeAudio(IO.ComfyNode): node_id="VAEDecodeAudio", search_aliases=["latent to audio"], display_name="VAE Decode Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Latent.Input("samples"), 
IO.Vae.Input("vae"), @@ -137,7 +137,7 @@ class VAEDecodeAudioTiled(IO.ComfyNode): node_id="VAEDecodeAudioTiled", search_aliases=["latent to audio"], display_name="VAE Decode Audio (Tiled)", - category="latent/audio", + category="model/latent/audio", inputs=[ IO.Latent.Input("samples"), IO.Vae.Input("vae"), diff --git a/comfy_extras/nodes_audio_encoder.py b/comfy_extras/nodes_audio_encoder.py index 6a85da89b..2ae30d321 100644 --- a/comfy_extras/nodes_audio_encoder.py +++ b/comfy_extras/nodes_audio_encoder.py @@ -11,7 +11,7 @@ class AudioEncoderLoader(io.ComfyNode): return io.Schema( node_id="AudioEncoderLoader", display_name="Load Audio Encoder", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input( "audio_encoder_name", @@ -36,7 +36,7 @@ class AudioEncoderEncode(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="AudioEncoderEncode", - category="conditioning", + category="model/conditioning", inputs=[ io.AudioEncoder.Input("audio_encoder"), io.Audio.Input("audio"), diff --git a/comfy_extras/nodes_bg_removal.py b/comfy_extras/nodes_bg_removal.py index 793fd802b..9dc9ad854 100644 --- a/comfy_extras/nodes_bg_removal.py +++ b/comfy_extras/nodes_bg_removal.py @@ -11,7 +11,7 @@ class LoadBackgroundRemovalModel(IO.ComfyNode): return IO.Schema( node_id="LoadBackgroundRemovalModel", display_name="Load Background Removal Model", - category="loaders", + category="model/loaders", inputs=[ IO.Combo.Input("bg_removal_name", options=sorted(files), tooltip="The model used to remove backgrounds from images"), ], diff --git a/comfy_extras/nodes_camera_trajectory.py b/comfy_extras/nodes_camera_trajectory.py index 34b78e81b..13a1448f4 100644 --- a/comfy_extras/nodes_camera_trajectory.py +++ b/comfy_extras/nodes_camera_trajectory.py @@ -153,7 +153,7 @@ class WanCameraEmbedding(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanCameraEmbedding", - category="conditioning/video_models", + 
category="model/conditioning/video_models", inputs=[ io.Combo.Input( "camera_pose", diff --git a/comfy_extras/nodes_chroma_radiance.py b/comfy_extras/nodes_chroma_radiance.py index 509436062..ca427e5cb 100644 --- a/comfy_extras/nodes_chroma_radiance.py +++ b/comfy_extras/nodes_chroma_radiance.py @@ -13,7 +13,7 @@ class EmptyChromaRadianceLatentImage(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="EmptyChromaRadianceLatentImage", - category="latent/chroma_radiance", + category="model/latent/chroma_radiance", inputs=[ io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -33,7 +33,7 @@ class ChromaRadianceOptions(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="ChromaRadianceOptions", - category="model_patches/chroma_radiance", + category="model/patch/chroma_radiance", description="Allows setting advanced options for the Chroma Radiance model.", inputs=[ io.Model.Input(id="model"), diff --git a/comfy_extras/nodes_color.py b/comfy_extras/nodes_color.py index 80ba121cd..01a05035e 100644 --- a/comfy_extras/nodes_color.py +++ b/comfy_extras/nodes_color.py @@ -8,7 +8,7 @@ class ColorToRGBInt(io.ComfyNode): return io.Schema( node_id="ColorToRGBInt", display_name="Color to RGB Int", - category="utils", + category="utilities", description="Convert a color to a RGB integer value.", inputs=[ io.Color.Input("color"), diff --git a/comfy_extras/nodes_context_windows.py b/comfy_extras/nodes_context_windows.py index 2ad5bd65b..042992971 100644 --- a/comfy_extras/nodes_context_windows.py +++ b/comfy_extras/nodes_context_windows.py @@ -9,7 +9,7 @@ class ContextWindowsManualNode(io.ComfyNode): return io.Schema( node_id="ContextWindowsManual", display_name="Context Windows (Manual)", - category="model_patches", + category="model/patch", description="Manually set context windows.", inputs=[ 
io.Model.Input("model", tooltip="The model to apply context windows to during sampling."), diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py index 847cb0bdf..17d965405 100644 --- a/comfy_extras/nodes_controlnet.py +++ b/comfy_extras/nodes_controlnet.py @@ -9,7 +9,7 @@ class SetUnionControlNetType(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SetUnionControlNetType", - category="conditioning/controlnet", + category="model/conditioning/controlnet", inputs=[ io.ControlNet.Input("control_net"), io.Combo.Input("type", options=["auto"] + list(UNION_CONTROLNET_TYPES.keys())), @@ -39,7 +39,7 @@ class ControlNetInpaintingAliMamaApply(io.ComfyNode): return io.Schema( node_id="ControlNetInpaintingAliMamaApply", search_aliases=["masked controlnet"], - category="conditioning/controlnet", + category="model/conditioning/controlnet", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_cosmos.py b/comfy_extras/nodes_cosmos.py index 7dd129d19..d754ab442 100644 --- a/comfy_extras/nodes_cosmos.py +++ b/comfy_extras/nodes_cosmos.py @@ -13,7 +13,7 @@ class EmptyCosmosLatentVideo(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="EmptyCosmosLatentVideo", - category="latent/video", + category="model/latent/video", inputs=[ io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -45,7 +45,7 @@ class CosmosImageToVideoLatent(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="CosmosImageToVideoLatent", - category="conditioning/inpaint", + category="model/conditioning/inpaint", inputs=[ io.Vae.Input("vae"), io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -88,7 +88,7 @@ class CosmosPredict2ImageToVideoLatent(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( 
node_id="CosmosPredict2ImageToVideoLatent", - category="conditioning/inpaint", + category="model/conditioning/inpaint", inputs=[ io.Vae.Input("vae"), io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), diff --git a/comfy_extras/nodes_curve.py b/comfy_extras/nodes_curve.py index 099453131..aa2d94bb6 100644 --- a/comfy_extras/nodes_curve.py +++ b/comfy_extras/nodes_curve.py @@ -11,7 +11,7 @@ class CurveEditor(io.ComfyNode): return io.Schema( node_id="CurveEditor", display_name="Curve Editor", - category="utils", + category="utilities", inputs=[ io.Curve.Input("curve"), io.Histogram.Input("histogram", optional=True), @@ -38,7 +38,7 @@ class ImageHistogram(io.ComfyNode): return io.Schema( node_id="ImageHistogram", display_name="Image Histogram", - category="utils", + category="utilities", inputs=[ io.Image.Input("image"), ], diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py index 10b56b91c..c3346bf09 100644 --- a/comfy_extras/nodes_custom_sampler.py +++ b/comfy_extras/nodes_custom_sampler.py @@ -17,7 +17,7 @@ class BasicScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="BasicScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Model.Input("model"), io.Combo.Input("scheduler", options=comfy.samplers.SCHEDULER_NAMES), @@ -47,7 +47,7 @@ class KarrasScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="KarrasScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), @@ -69,7 +69,7 @@ class ExponentialScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ExponentialScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, 
max=10000), io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), @@ -90,7 +90,7 @@ class PolyexponentialScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="PolyexponentialScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), @@ -112,7 +112,7 @@ class LaplaceScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LaplaceScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), @@ -136,7 +136,7 @@ class SDTurboScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SDTurboScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Model.Input("model"), io.Int.Input("steps", default=1, min=1, max=10), @@ -160,7 +160,7 @@ class BetaSamplingScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="BetaSamplingScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Model.Input("model"), io.Int.Input("steps", default=20, min=1, max=10000), @@ -182,7 +182,7 @@ class VPScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VPScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("beta_d", default=19.9, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), #TODO: fix default values @@ -204,7 +204,7 @@ class SplitSigmas(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SplitSigmas", - category="sampling/sigmas", + 
category="model/sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), io.Int.Input("step", default=0, min=0, max=10000), @@ -228,7 +228,7 @@ class SplitSigmasDenoise(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SplitSigmasDenoise", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01), @@ -254,7 +254,7 @@ class FlipSigmas(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="FlipSigmas", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[io.Sigmas.Input("sigmas")], outputs=[io.Sigmas.Output()] ) @@ -276,7 +276,7 @@ class SetFirstSigma(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SetFirstSigma", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), io.Float.Input("sigma", default=136.0, min=0.0, max=20000.0, step=0.001, round=False), @@ -298,7 +298,7 @@ class ExtendIntermediateSigmas(io.ComfyNode): return io.Schema( node_id="ExtendIntermediateSigmas", search_aliases=["interpolate sigmas"], - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Sigmas.Input("sigmas"), io.Int.Input("steps", default=2, min=1, max=100), @@ -351,7 +351,7 @@ class SamplingPercentToSigma(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplingPercentToSigma", - category="sampling/sigmas", + category="model/sampling/sigmas", inputs=[ io.Model.Input("model"), io.Float.Input("sampling_percent", default=0.0, min=0.0, max=1.0, step=0.0001), @@ -379,7 +379,7 @@ class KSamplerSelect(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="KSamplerSelect", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[io.Combo.Input("sampler_name", options=comfy.samplers.SAMPLER_NAMES)], outputs=[io.Sampler.Output()] ) @@ -396,7 +396,7 @@ class SamplerDPMPP_3M_SDE(io.ComfyNode): def 
define_schema(cls): return io.Schema( node_id="SamplerDPMPP_3M_SDE", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -421,7 +421,7 @@ class SamplerDPMPP_2M_SDE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMPP_2M_SDE", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Combo.Input("solver_type", options=['midpoint', 'heun']), io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -448,7 +448,7 @@ class SamplerDPMPP_SDE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMPP_SDE", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -474,7 +474,7 @@ class SamplerDPMPP_2S_Ancestral(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMPP_2S_Ancestral", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False), io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False), @@ -494,7 +494,7 @@ class SamplerEulerAncestral(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerEulerAncestral", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -515,7 +515,7 @@ class SamplerEulerAncestralCFGPP(io.ComfyNode): return io.Schema( 
node_id="SamplerEulerAncestralCFGPP", display_name="SamplerEulerAncestralCFG++", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Float.Input("eta", default=1.0, min=0.0, max=1.0, step=0.01, round=False), io.Float.Input("s_noise", default=1.0, min=0.0, max=10.0, step=0.01, round=False), @@ -537,7 +537,7 @@ class SamplerLMS(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerLMS", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[io.Int.Input("order", default=4, min=1, max=100, advanced=True)], outputs=[io.Sampler.Output()] ) @@ -554,7 +554,7 @@ class SamplerDPMAdaptative(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerDPMAdaptative", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Int.Input("order", default=3, min=2, max=3, advanced=True), io.Float.Input("rtol", default=0.05, min=0.0, max=100.0, step=0.01, round=False, advanced=True), @@ -585,7 +585,7 @@ class SamplerER_SDE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerER_SDE", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Combo.Input("solver_type", options=["ER-SDE", "Reverse-time SDE", "ODE"]), io.Int.Input("max_stage", default=3, min=1, max=3, advanced=True), @@ -623,7 +623,7 @@ class SamplerSASolver(io.ComfyNode): return io.Schema( node_id="SamplerSASolver", search_aliases=["sde"], - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Model.Input("model"), io.Float.Input("eta", default=1.0, min=0.0, max=10.0, step=0.01, round=False, advanced=True), @@ -668,7 +668,7 @@ class SamplerSEEDS2(io.ComfyNode): return io.Schema( node_id="SamplerSEEDS2", search_aliases=["sde", "exp heun"], - category="sampling/samplers", + category="model/sampling/samplers", inputs=[ io.Combo.Input("solver_type", options=["phi_1", "phi_2"]), io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, 
round=False, tooltip="Stochastic strength", advanced=True), @@ -727,7 +727,7 @@ class SamplerCustom(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerCustom", - category="sampling/custom_sampling", + category="model/sampling/custom_sampling", inputs=[ io.Model.Input("model"), io.Boolean.Input("add_noise", default=True, advanced=True), @@ -795,7 +795,7 @@ class BasicGuider(io.ComfyNode): return io.Schema( node_id="BasicGuider", display_name="Basic Guider", - category="sampling/guiders", + category="model/sampling/guiders", inputs=[ io.Model.Input("model"), io.Conditioning.Input("conditioning"), @@ -817,7 +817,7 @@ class CFGGuider(io.ComfyNode): return io.Schema( node_id="CFGGuider", display_name="CFG Guider", - category="sampling/guiders", + category="model/sampling/guiders", inputs=[ io.Model.Input("model"), io.Conditioning.Input("positive"), @@ -872,7 +872,7 @@ class DualCFGGuider(io.ComfyNode): node_id="DualCFGGuider", search_aliases=["dual prompt guidance"], display_name="Dual CFG Guider", - category="sampling/guiders", + category="model/sampling/guiders", inputs=[ io.Model.Input("model"), io.Conditioning.Input("cond1"), @@ -900,7 +900,7 @@ class DisableNoise(io.ComfyNode): return io.Schema( node_id="DisableNoise", search_aliases=["zero noise"], - category="sampling/noise", + category="model/sampling/noise", inputs=[], outputs=[io.Noise.Output()] ) @@ -917,7 +917,7 @@ class RandomNoise(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="RandomNoise", - category="sampling/noise", + category="model/sampling/noise", inputs=[io.Int.Input("noise_seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True)], outputs=[io.Noise.Output()] ) @@ -934,7 +934,7 @@ class SamplerCustomAdvanced(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SamplerCustomAdvanced", - category="sampling/custom_sampling", + category="model/sampling/custom_sampling", inputs=[ io.Noise.Input("noise"), io.Guider.Input("guider"), 
diff --git a/comfy_extras/nodes_dataset.py b/comfy_extras/nodes_dataset.py index 22f5ff203..104d16d91 100644 --- a/comfy_extras/nodes_dataset.py +++ b/comfy_extras/nodes_dataset.py @@ -157,7 +157,7 @@ class LoadImageTextDataSetFromFolderNode(io.ComfyNode): return io.NodeOutput(output_tensor, captions) -def save_images_to_folder(image_list, output_dir, prefix="image"): +def save_images_to_folder(image_list, output_dir, prefix="image", overwrite=True): """Utility function to save a list of image tensors to disk. Args: @@ -197,7 +197,11 @@ def save_images_to_folder(image_list, output_dir, prefix="image"): raise ValueError(f"Expected torch.Tensor, got {type(img_tensor)}") # Save image - filename = f"{prefix}_{idx:05d}.png" + if overwrite: + filename = f"{prefix}_{idx:05d}.png" + else: + _, _, counter, _, resolved_prefix = folder_paths.get_save_image_path(prefix, output_dir) + filename = f"{resolved_prefix}_{counter:05}_{idx:05d}.png" filepath = os.path.join(output_dir, filename) img.save(filepath) saved_files.append(filename) @@ -230,19 +234,26 @@ class SaveImageDataSetToFolderNode(io.ComfyNode): tooltip="Prefix for saved image filenames.", advanced=True, ), + io.Combo.Input( + "mode", + default="overwrite", + options=["overwrite", "increment"], + tooltip="Whether to overwrite existing files or increment filenames to avoid overwriting." 
+ ), ], outputs=[], is_deprecated=True, # This node is redundant and superseded by existing Save Image nodes where the target folder can be specified in the filename_prefix ) @classmethod - def execute(cls, images, folder_name, filename_prefix): + def execute(cls, images, folder_name, filename_prefix, mode): # Extract scalar values folder_name = folder_name[0] filename_prefix = filename_prefix[0] + mode = mode[0] output_dir = os.path.join(folder_paths.get_output_directory(), folder_name) - saved_files = save_images_to_folder(images, output_dir, filename_prefix) + saved_files = save_images_to_folder(images, output_dir, filename_prefix, mode=='overwrite') logging.info(f"Saved {len(saved_files)} images to {output_dir}.") return io.NodeOutput() @@ -278,18 +289,25 @@ class SaveImageTextDataSetToFolderNode(io.ComfyNode): tooltip="Prefix for saved image filenames.", advanced=True, ), + io.Combo.Input( + "mode", + default="overwrite", + options=["overwrite", "increment"], + tooltip="Whether to overwrite existing files or increment filenames to avoid overwriting." 
+ ), ], outputs=[], ) @classmethod - def execute(cls, images, folder_name, filename_prefix, texts=None): + def execute(cls, images, folder_name, filename_prefix, mode, texts=None): # Extract scalar values folder_name = folder_name[0] filename_prefix = filename_prefix[0] + mode = mode[0] output_dir = os.path.join(folder_paths.get_output_directory(), folder_name) - saved_files = save_images_to_folder(images, output_dir, filename_prefix) + saved_files = save_images_to_folder(images, output_dir, filename_prefix, mode=='overwrite') # Save captions if texts: @@ -574,7 +592,7 @@ class TextProcessingNode(io.ComfyNode): return io.Schema( node_id=cls.node_id, display_name=cls.display_name or cls.node_id, - category="dataset/text", + category="text", is_experimental=True, is_input_list=is_group, # True for group, False for individual inputs=inputs, @@ -1208,7 +1226,7 @@ class ResolutionBucket(io.ComfyNode): node_id="ResolutionBucket", search_aliases=["bucket by resolution", "group by resolution", "batch by resolution"], display_name="Resolution Bucket", - category="training", + category="model/training", description="Group latents and conditionings into buckets", is_experimental=True, is_input_list=True, @@ -1302,7 +1320,7 @@ class MakeTrainingDataset(io.ComfyNode): node_id="MakeTrainingDataset", search_aliases=["encode dataset"], display_name="Make Training Dataset", - category="training", + category="model/training", description="Encode images with VAE and texts with CLIP to create a training dataset of latents and conditionings.", is_experimental=True, is_input_list=True, # images and texts as lists @@ -1390,7 +1408,7 @@ class SaveTrainingDataset(io.ComfyNode): node_id="SaveTrainingDataset", search_aliases=["export dataset", "save dataset"], display_name="Save Training Dataset", - category="training", + category="model/training", description="Save encoded training dataset (latents + conditioning) to disk for efficient loading during training.", is_experimental=True, 
is_output_node=True, @@ -1493,7 +1511,7 @@ class LoadTrainingDataset(io.ComfyNode): node_id="LoadTrainingDataset", search_aliases=["import dataset", "training data"], display_name="Load Training Dataset", - category="training", + category="model/training", description="Load encoded training dataset (latents + conditioning) from disk for use in training.", is_experimental=True, inputs=[ diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py index 0fb3871c8..8c397f132 100644 --- a/comfy_extras/nodes_eps.py +++ b/comfy_extras/nodes_eps.py @@ -18,7 +18,7 @@ class EpsilonScaling(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Epsilon Scaling", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Float.Input( @@ -84,7 +84,7 @@ class TemporalScoreRescaling(io.ComfyNode): return io.Schema( node_id="TemporalScoreRescaling", display_name="TSR - Temporal Score Rescaling", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Float.Input( diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py index 997f21c09..afc663b22 100644 --- a/comfy_extras/nodes_flux.py +++ b/comfy_extras/nodes_flux.py @@ -40,7 +40,7 @@ class EmptyFlux2LatentImage(io.ComfyNode): return io.Schema( node_id="EmptyFlux2LatentImage", display_name="Empty Flux 2 Latent", - category="latent", + category="model/latent", inputs=[ io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -215,7 +215,7 @@ class Flux2Scheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Flux2Scheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=4096), io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1), diff --git a/comfy_extras/nodes_frame_interpolation.py 
b/comfy_extras/nodes_frame_interpolation.py index 9dd34cfb8..4d5bca17e 100644 --- a/comfy_extras/nodes_frame_interpolation.py +++ b/comfy_extras/nodes_frame_interpolation.py @@ -19,7 +19,7 @@ class FrameInterpolationModelLoader(io.ComfyNode): return io.Schema( node_id="FrameInterpolationModelLoader", display_name="Load Frame Interpolation Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("frame_interpolation"), tooltip="Select a frame interpolation model to load. Models must be placed in the 'frame_interpolation' folder."), diff --git a/comfy_extras/nodes_freelunch.py b/comfy_extras/nodes_freelunch.py index 248efdef3..ccbd1fd90 100644 --- a/comfy_extras/nodes_freelunch.py +++ b/comfy_extras/nodes_freelunch.py @@ -29,7 +29,7 @@ class FreeU(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="FreeU", - category="model_patches/unet", + category="model/patch/unet", inputs=[ IO.Model.Input("model"), IO.Float.Input("b1", default=1.1, min=0.0, max=10.0, step=0.01, advanced=True), @@ -76,7 +76,7 @@ class FreeU_V2(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="FreeU_V2", - category="model_patches/unet", + category="model/patch/unet", inputs=[ IO.Model.Input("model"), IO.Float.Input("b1", default=1.3, min=0.0, max=10.0, step=0.01, advanced=True), diff --git a/comfy_extras/nodes_gaussian_splat.py b/comfy_extras/nodes_gaussian_splat.py new file mode 100644 index 000000000..7fb878b8b --- /dev/null +++ b/comfy_extras/nodes_gaussian_splat.py @@ -0,0 +1,1663 @@ +# Generic utility nodes for the SPLAT type (3D gaussian splats) + +import gzip +import logging +import math +import struct +from io import BytesIO + +import numpy as np +import torch +from typing_extensions import override +from scipy.ndimage import map_coordinates, minimum as _ndi_minimum, maximum as _ndi_maximum +from scipy.sparse import coo_matrix +from scipy.sparse.csgraph import connected_components + 
+import comfy.model_management +import comfy.utils +from comfy_api.latest import ComfyExtension, IO, Types +from comfy_extras.nodes_save_3d import pack_variable_mesh_batch +from server import PromptServer + +_C0 = 0.28209479177387814 # SH band-0 constant: DC coefficient -> base RGB + + +def _srgb_to_linear(c): + return torch.where(c <= 0.04045, c / 12.92, ((c.clamp_min(0) + 0.055) / 1.055) ** 2.4) + + +def _linear_to_srgb(c): + return torch.where(c <= 0.0031308, c * 12.92, 1.055 * c.clamp_min(0) ** (1 / 2.4) - 0.055) + + +def _real_len(g: Types.SPLAT, i: int) -> int: + # Real splat count of batch item i (honors variable-length `counts`). + return int(g.counts[i].item()) if g.counts is not None else g.positions.shape[1] + + +def _hex_to_rgb(h: str) -> tuple[float, float, float]: + # "#RRGGBB" -> (r,g,b) in [0,1]; falls back to black. + h = h.lstrip("#") + if len(h) != 6: + return (0.0, 0.0, 0.0) + return tuple(int(h[i:i + 2], 16) / 255.0 for i in (0, 2, 4)) + + +def _quantile(x, q): + # torch.quantile errors above 2**24 elements; stride-subsample large inputs for the estimate. + lim = 1 << 24 + if x.numel() > lim: + x = x[:: x.numel() // lim + 1] + return torch.quantile(x, q) + + +def _gaussian_ply_bytes(positions, scales, rotations, opacities, sh) -> bytes: + """Serialize render-ready gaussian tensors as a binary 3DGS .ply. + + positions (N,3) world; scales (N,3) linear; rotations (N,4) quat wxyz; opacities (N,1) in [0,1]; + sh (N,K,3) SH coefficients. Activated values are inverted to the standard 3D gaussian splat storage convention + (log scale, logit opacity). 
+ """ + xyz = positions.cpu().numpy().astype(np.float32) + n = xyz.shape[0] + if n == 0: + raise ValueError("SplatToFile3D: gaussian is empty") + normals = np.zeros_like(xyz) + f = sh.cpu().numpy().astype(np.float32) # (N, K, 3) + f_dc = f[:, 0, :] # (N, 3) + f_rest = f[:, 1:, :].transpose(0, 2, 1).reshape(n, -1) # (N, 3*(K-1)) channel-major + op = opacities.cpu().numpy().astype(np.float32).reshape(n, 1).clip(1e-6, 1 - 1e-6) + op = np.log(op / (1.0 - op)) # inverse sigmoid (logit) + scale = np.log(scales.cpu().numpy().astype(np.float32).clip(min=1e-8)) + rot = rotations.cpu().numpy().astype(np.float32) # (N, 4) + + attrs = (['x', 'y', 'z', 'nx', 'ny', 'nz'] + + [f'f_dc_{i}' for i in range(3)] + + [f'f_rest_{i}' for i in range(f_rest.shape[1])] + + ['opacity'] + [f'scale_{i}' for i in range(3)] + [f'rot_{i}' for i in range(4)]) + elements = np.empty(n, dtype=[(a, 'f4') for a in attrs]) + elements[:] = list(map(tuple, np.concatenate([xyz, normals, f_dc, f_rest, op, scale, rot], axis=1))) + + header = "ply\nformat binary_little_endian 1.0\n" + f"element vertex {n}\n" + header += "".join(f"property float {a}\n" for a in attrs) + "end_header\n" + return header.encode('ascii') + elements.tobytes() + + +# .ksplat (mkkellogg SplatBuffer) level 0, SH degree 0: 4096-byte header, one 1024-byte section header, +# then N 44-byte records. Bucketing/quantization only exist at levels >= 1. See SplatBuffer.js. +_KSPLAT_HEADER_BYTES = 4096 +_KSPLAT_SECTION_HEADER_BYTES = 1024 +_KSPLAT_BYTES_PER_SPLAT = 44 # center 12 + scale 12 + rotation 16 + color(RGBA u8) 4 +_KSPLAT_VERSION = (0, 1) # SplatBuffer CurrentMajor/MinorVersion + + +def _gaussian_ksplat_bytes(positions, scales, rotations, opacities, sh) -> bytes: + """Serialize gaussian tensors as a level-0, SH degree-0 .ksplat (linear scale, opacity in color alpha). + + positions (N,3) world; scales (N,3) linear; rotations (N,4) wxyz; opacities (N,1) in [0,1]; sh (N,K,3). 
+ """ + xyz = positions.cpu().numpy().astype(np.float32) + n = xyz.shape[0] + if n == 0: + raise ValueError("SplatToFile3D: gaussian is empty") + scale = scales.cpu().numpy().astype(np.float32) + rot = rotations.cpu().numpy().astype(np.float32) # wxyz, mirrors the .ply rot order + rot = rot / np.linalg.norm(rot, axis=1, keepdims=True).clip(1e-12) + rgb = np.clip(sh[:, 0, :].cpu().numpy().astype(np.float32) * _C0 + 0.5, 0, 1) + op = opacities.cpu().numpy().astype(np.float32).reshape(n, 1).clip(0, 1) + rgba = np.round(np.concatenate([rgb, op], axis=1) * 255.0).astype(np.uint8) # (N, 4) RGBA + + # 44-byte record: float center(3) + scale(3) + rot(4), then uint8 rgba(4). + floats = np.concatenate([xyz, scale, rot], axis=1).astype(' bytes: + """Serialize gaussian tensors as a gzip-compressed .spz (Niantic v2, SH degree 0, base color only). + + positions (N,3) world; scales (N,3) linear; rotations (N,4) wxyz; opacities (N,1) in [0,1]; sh (N,K,3). + """ + xyz = positions.cpu().numpy().astype(np.float32) + n = xyz.shape[0] + if n == 0: + raise ValueError("SplatToFile3D: gaussian is empty") + + # Positions: fixed point, masked to 24 bits, little-endian 3-byte words. 
+ fixed = 1 << _SPZ_FRACTIONAL_BITS + qi = np.clip(np.round(xyz * fixed), -(1 << 23), (1 << 23) - 1).astype(np.int32) + qu = (qi & 0xFFFFFF).astype(np.uint32) + pos = np.stack([qu & 0xFF, (qu >> 8) & 0xFF, (qu >> 16) & 0xFF], axis=-1).reshape(n, 9).astype(np.uint8) + + alpha = np.round(opacities.cpu().numpy().astype(np.float32).reshape(n) * 255.0).clip(0, 255).astype(np.uint8) + + rgb = sh[:, 0, :].cpu().numpy().astype(np.float32) * _C0 + 0.5 + col = np.round(((rgb - 0.5) / _SPZ_COLOR_SCALE + 0.5) * 255.0).clip(0, 255).astype(np.uint8) # (N,3) + + sln = np.log(scales.cpu().numpy().astype(np.float32).clip(min=1e-9)) + scb = np.round((sln + 10.0) * 16.0).clip(0, 255).astype(np.uint8) # (N,3) inverts exp(b/16-10) + + rot = rotations.cpu().numpy().astype(np.float32) # wxyz + rot = rot / np.linalg.norm(rot, axis=1, keepdims=True).clip(1e-12) + rot[rot[:, 0] < 0] *= -1.0 # canonical w >= 0 (w dropped on decode) + rotb = np.round((rot[:, 1:4] + 1.0) * 127.5).clip(0, 255).astype(np.uint8) # (N,3) x,y,z + + header = bytearray(16) + struct.pack_into(' (positions, scales linear, rotations wxyz, opacities [0,1], sh (N,K,3)) ---- +# Inverse of the writers above and of spark's loaders. ksplat/splat/spz carry base color only (SH degree 0 +# -> K=1); .ply round-trips full SH. None of the formats flip axes, so import is the identity of export. 
+_PLY_DTYPES = {'char': 'i1', 'uchar': 'u1', 'short': 'i2', 'ushort': 'u2', 'int': 'i4', 'uint': 'u4', + 'float': 'f4', 'double': 'f8', 'int8': 'i1', 'uint8': 'u1', 'int16': 'i2', 'uint16': 'u2', + 'int32': 'i4', 'uint32': 'u4', 'float32': 'f4', 'float64': 'f8'} +_KSPLAT_COMPRESSION = { # level -> (bytesPerCenter, scale, rotation, color, shComponent, defaultScaleRange) + 0: (12, 12, 16, 4, 4, 1), 1: (6, 6, 8, 4, 2, 32767), 2: (6, 6, 8, 4, 1, 32767)} +_KSPLAT_SH_COMPONENTS = {0: 0, 1: 9, 2: 24, 3: 45} + + +def _rgb_to_sh_dc(rgb): + return ((np.asarray(rgb, np.float32) - 0.5) / _C0)[:, None, :] # (N,3) base color -> (N,1,3) SH DC + + +def _norm_quat(q): + return q / np.linalg.norm(q, axis=1, keepdims=True).clip(1e-12) + + +def _parse_ply_gaussian(data: bytes): + end = data.find(b'end_header') + if end < 0: + raise ValueError("File3DToSplat: not a PLY (missing end_header)") + header = data[:end].decode('ascii', 'replace') + body = end + len(b'end_header') + body += 2 if data[body:body + 2] == b'\r\n' else 1 + count, props, in_vertex = 0, [], False + for line in header.splitlines(): + p = line.split() + if not p: + continue + if p[0] == 'format' and p[1] != 'binary_little_endian': + raise ValueError(f"File3DToSplat: unsupported PLY format '{p[1]}' (need binary_little_endian)") + if p[0] == 'element': + in_vertex = p[1] == 'vertex' + if in_vertex: + count = int(p[2]) + elif p[0] == 'property' and in_vertex: + if p[1] == 'list': + raise ValueError("File3DToSplat: PLY vertex has list properties (unsupported)") + props.append((p[2], '<' + _PLY_DTYPES[p[1]])) + arr = np.frombuffer(data, np.dtype(props), count=count, offset=body) + names = arr.dtype.names + c = lambda k: arr[k].astype(np.float32) + n = count + + xyz = np.stack([c('x'), c('y'), c('z')], 1) + if 'scale_0' in names: + scale = np.exp(np.stack([c('scale_0'), c('scale_1'), c('scale_2')], 1)) # 3DGS stores log scale + else: + scale = np.full((n, 3), 0.01, np.float32) + if 'rot_0' in names: + rot = 
_norm_quat(np.stack([c('rot_0'), c('rot_1'), c('rot_2'), c('rot_3')], 1)) # wxyz + else: + rot = np.tile(np.array([1, 0, 0, 0], np.float32), (n, 1)) + opacity = 1.0 / (1.0 + np.exp(-c('opacity'))) if 'opacity' in names else np.ones(n, np.float32) + + if 'f_dc_0' in names: + dc = np.stack([c('f_dc_0'), c('f_dc_1'), c('f_dc_2')], 1) # (N,3) + rest = sorted((k for k in names if k.startswith('f_rest_')), key=lambda s: int(s.split('_')[-1])) + if rest: + r = np.stack([c(k) for k in rest], 1) # (N, 3*(K-1)) channel-major + kk = r.shape[1] // 3 + 1 + r = r.reshape(n, 3, kk - 1).transpose(0, 2, 1) # -> (N, K-1, 3) + sh = np.concatenate([dc[:, None, :], r], 1) + else: + sh = dc[:, None, :] + elif 'red' in names: + sh = _rgb_to_sh_dc(np.stack([c('red'), c('green'), c('blue')], 1) / 255.0) + else: + sh = np.zeros((n, 1, 3), np.float32) + return xyz, scale, rot, opacity, sh + + +def _parse_splat_gaussian(data: bytes): + # antimatter15 .splat: 32-byte records (f32 xyz, f32 scale, u8 rgba, u8 quat as (b-128)/128 wxyz). 
+ if len(data) % 32 != 0: + raise ValueError("File3DToSplat: .splat size is not a multiple of 32 bytes") + rec = np.frombuffer(data, np.dtype([('xyz', ' 0: + ct, ft = (' full_splats: + lengths = np.frombuffer(data, '> 30) & 3 + q = np.zeros((n, 4), np.float32) # x,y,z,w + remaining, sumsq = combined.copy(), np.zeros(n, np.float64) + for comp in (3, 2, 1, 0): + active = comp != largest + value = (remaining & 0x1FF).astype(np.float64) + sign = (remaining >> 9) & 1 + remaining = np.where(active, remaining >> 10, remaining) + val = (1.0 / math.sqrt(2)) * (value / 0x1FF) + val = np.where(sign == 1, -val, val) + q[active, comp] = val[active] + sumsq += np.where(active, val * val, 0.0) + q[np.arange(n), largest] = np.sqrt(np.clip(1.0 - sumsq, 0, None)) + rot = _norm_quat(np.stack([q[:, 3], q[:, 0], q[:, 1], q[:, 2]], 1)) # xyzw -> wxyz + else: + qb = np.frombuffer(raw, np.uint8, count=n * 3, offset=off).reshape(n, 3).astype(np.float32) + xq = qb / 127.5 - 1.0 + w = np.sqrt(np.clip(1.0 - (xq ** 2).sum(1), 0, None)) + rot = _norm_quat(np.concatenate([w[:, None], xq], 1)) # wxyz + return xyz, scale, rot, alpha, _rgb_to_sh_dc(rgb) + + +_GAUSSIAN_PARSERS = {"ply": _parse_ply_gaussian, "splat": _parse_splat_gaussian, + "ksplat": _parse_ksplat_gaussian, "spz": _parse_spz_gaussian} + + +def _detect_splat_format(data: bytes) -> str: + if data[:3] == b'ply': + return "ply" + if data[:2] == b'\x1f\x8b': # gzip -> spz + return "spz" + if len(data) >= 2 and data[0] == 0 and data[1] >= 1: # ksplat version 0.x header + return "ksplat" + if len(data) % 32 == 0: + return "splat" + raise ValueError("File3DToSplat: could not determine splat format from contents") + + +def _gaussian_item(g: Types.SPLAT, i: int, device): + # Slice batch item i to its real length, as float32 torch tensors on `device` (SH DC -> base RGB). 
+ end = _real_len(g, i) + to = lambda a: a.to(device=device, dtype=torch.float32) + xyz = to(g.positions[i, :end]) + rgb = (to(g.sh[i, :end, 0, :]) * _C0 + 0.5).clamp(0, 1) + opacity = to(g.opacities[i, :end]).reshape(-1) + scale = to(g.scales[i, :end]) + rot = to(g.rotations[i, :end]) + return xyz, rgb, opacity, scale, rot + + +def _quat_to_mat(q): + # q: (N, 4) wxyz, normalized -> (N, 3, 3) + q = q / q.norm(dim=-1, keepdim=True).clamp_min(1e-12) + w, x, y, z = q.unbind(-1) + return torch.stack([ + 1 - 2 * (y * y + z * z), 2 * (x * y - w * z), 2 * (x * z + w * y), + 2 * (x * y + w * z), 1 - 2 * (x * x + z * z), 2 * (y * z - w * x), + 2 * (x * z - w * y), 2 * (y * z + w * x), 1 - 2 * (x * x + y * y), + ], dim=-1).reshape(-1, 3, 3) + + +def _quat_mul(a, b): + # Hamilton product a (x) b, wxyz. + aw, ax, ay, az = a.unbind(-1) + bw, bx, by, bz = b.unbind(-1) + return torch.stack([ + aw * bw - ax * bx - ay * by - az * bz, + aw * bx + ax * bw + ay * bz - az * by, + aw * by - ax * bz + ay * bw + az * bx, + aw * bz + ax * by - ay * bx + az * bw, + ], dim=-1) + + +def _euler_to_quat(rx, ry, rz): + # Degrees, applied as Rz @ Ry @ Rx (rotate about X, then Y, then Z in world). Returns wxyz. + c, s = np.cos(np.radians([rx, ry, rz]) / 2.0), np.sin(np.radians([rx, ry, rz]) / 2.0) + qx = torch.tensor([c[0], s[0], 0.0, 0.0], dtype=torch.float32) + qy = torch.tensor([c[1], 0.0, s[1], 0.0], dtype=torch.float32) + qz = torch.tensor([c[2], 0.0, 0.0, s[2]], dtype=torch.float32) + return _quat_mul(_quat_mul(qz, qy), qx) + + +def _mat_to_quat(m): + # Rotation matrix (..., 3, 3) -> quaternion (..., 4) wxyz. Batched; builds the four candidate quaternions + # and keeps the one with the largest component (numerically stable across all rotations). 
+ m00, m11, m22 = m[..., 0, 0], m[..., 1, 1], m[..., 2, 2] + m21, m12 = m[..., 2, 1], m[..., 1, 2] + m02, m20 = m[..., 0, 2], m[..., 2, 0] + m10, m01 = m[..., 1, 0], m[..., 0, 1] + q2 = torch.stack([1 + m00 + m11 + m22, 1 + m00 - m11 - m22, + 1 - m00 + m11 - m22, 1 - m00 - m11 + m22], -1) # 4 * (w^2, x^2, y^2, z^2) + cand = torch.stack([ + torch.stack([q2[..., 0], m21 - m12, m02 - m20, m10 - m01], -1), + torch.stack([m21 - m12, q2[..., 1], m10 + m01, m02 + m20], -1), + torch.stack([m02 - m20, m10 + m01, q2[..., 2], m12 + m21], -1), + torch.stack([m10 - m01, m02 + m20, m12 + m21, q2[..., 3]], -1), + ], -2) # (...,4,4) candidates, rows = wxyz + sel = q2.argmax(-1) + q = torch.gather(cand, -2, sel[..., None, None].expand(sel.shape + (1, 4)))[..., 0, :] + return q / q.norm(dim=-1, keepdim=True).clamp_min(1e-12) + + +class SplatToFile3D(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="SplatToFile3D", + display_name="Create 3D File (from Splat)", + search_aliases=["gaussian to ply", "splat to file", "export gaussian"], + category="3d/splat", + description="Serialize a gaussian splat to a File3D object for Save / Preview 3D nodes. " + "Supports one item per batch only.", + inputs=[ + IO.Splat.Input("splat"), + IO.Combo.Input("format", options=["ply", "ksplat", "spz"], # TODO: add "splat" when we have a writer for it + tooltip="ply: standard 3D Gaussian Splat with full spherical harmonics. " + "ksplat: mkkellogg SplatBuffer (level 0, uncompressed), base color only " + "spz: Niantic gzip-compressed (~10x smaller), base color only " + ), + ], + outputs=[IO.File3DAny.Output(display_name="model_3d")], + ) + + @classmethod + def execute(cls, splat, format="ply") -> IO.NodeOutput: + if splat.positions.shape[0] > 1: + logging.warning("SplatToFile3D supports one item per batch only. 
Got %d; using first.", splat.positions.shape[0]) + end = _real_len(splat, 0) + writer = {"ksplat": _gaussian_ksplat_bytes, "spz": _gaussian_spz_bytes}.get(format, _gaussian_ply_bytes) + data = writer(splat.positions[0, :end], splat.scales[0, :end], + splat.rotations[0, :end], splat.opacities[0, :end], splat.sh[0, :end]) + return IO.NodeOutput(Types.File3D(BytesIO(data), file_format=format)) + + +class File3DToSplat(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="File3DToSplat", + display_name="Get Splat", + search_aliases=["load splat", "ply to splat", "import splat", "file to splat"], + category="3d/splat", + description="Parse a splat File3D into a gaussian splat. Inverse of Create 3D File (from Splat). " + "Supported format: PLY, SPLAT, KSPLAT, SPZ. PLY carries full spherical harmonics, " + "the other formats are base color only. Format is auto-detected from the file contents.", + inputs=[ + IO.MultiType.Input( + IO.File3DAny.Input("model_3d"), + types=[IO.File3DPLY, IO.File3DSPLAT, IO.File3DKSPLAT, IO.File3DSPZ], + tooltip="A gaussian splat 3D file", + ), + ], + outputs=[IO.Splat.Output(display_name="splat")], + ) + + @classmethod + def execute(cls, model_3d: Types.File3D) -> IO.NodeOutput: + data = model_3d.get_bytes() + fmt = (model_3d.format or "").lower() + parser = _GAUSSIAN_PARSERS.get(fmt) or _GAUSSIAN_PARSERS[_detect_splat_format(data)] + xyz, scale, rot, opacity, sh = parser(data) + + t = lambda a: torch.from_numpy(np.ascontiguousarray(a)).float() + splat = Types.SPLAT( + t(xyz)[None], # (1, N, 3) + t(scale)[None], # (1, N, 3) linear + t(rot)[None], # (1, N, 4) wxyz + t(opacity).reshape(1, -1, 1), # (1, N, 1) + t(sh)[None], # (1, N, K, 3) + ) + return IO.NodeOutput(splat) + + +def _view_matrix_t(yaw_deg, pitch_deg, device): + y, p = math.radians(yaw_deg), math.radians(pitch_deg) + cy, sy, cp, sp = math.cos(y), math.sin(y), math.cos(p), math.sin(p) + Ry = torch.tensor([[cy, 0, sy], [0, 1, 0], [-sy, 0, cy]], 
device=device) + Rx = torch.tensor([[1, 0, 0], [0, cp, -sp], [0, sp, cp]], device=device) + return Rx @ Ry + + +def _camera_basis(camera_info, dev): + # Look-at basis in the splat frame, named by their projection rows: right = image +x, up = image +y + # (down, since yflip=1), fwd = view/depth axis (eye -> scene). Load3D is three.js (right-handed, Y-up, + # camera looks down -Z); the splat is 3DGS (Y-down, Z-forward). World -> splat is a 180 deg rotation + # about X: (x, y, z) -> (x, -y, -z) (det +1, no mirror, no axis swap). + pos, tgt = camera_info.get("position", {}), camera_info.get("target", {}) + m = lambda d: torch.tensor([float(d.get("x", 0.0)), -float(d.get("y", 0.0)), -float(d.get("z", 0.0))], device=dev) + eye, target = m(pos), m(tgt) + mv = lambda v: torch.stack([v[0], -v[1], -v[2]]) # same world->splat map, for direction vectors + n = lambda v: v / v.norm().clamp_min(1e-8) + q = camera_info.get("quaternion") + if q: # exact camera world rotation (incl. roll) + qwxyz = torch.tensor([float(q.get("w", 1.0)), float(q.get("x", 0.0)), + float(q.get("y", 0.0)), float(q.get("z", 0.0))], device=dev) + R = _quat_to_mat(qwxyz[None])[0] # columns = camera world axes; looks down local -Z + right = n(mv(R[:, 0])) # camera +X -> image right + up = n(mv(-R[:, 1])) # camera +Y is image up; image-down row is its negative + fwd = n(mv(-R[:, 2])) # camera looks down local -Z -> view direction + return eye, target, right, up, fwd + fwd = n(target - eye) # no quaternion: orbit-consistent, roll-free + yaw = math.degrees(math.atan2(-float(fwd[0]), float(fwd[2]))) + pitch = math.degrees(math.asin(max(-1.0, min(1.0, float(fwd[1]))))) + W = _view_matrix_t(yaw, pitch, dev) + return eye, target, W[0], W[1], W[2] + + +def _lookat_quat_wxyz(position, target, dev): + # three.js lookAt in world frame: camera local +Z = (eye - target), up = world +Y. Returns wxyz. 
+ z = position - target + z = z / z.norm().clamp_min(1e-8) + up0 = torch.tensor([0.0, 1.0, 0.0], device=dev) + if z.dot(up0).abs() > 0.999: # looking straight up/down + up0 = torch.tensor([0.0, 0.0, 1.0], device=dev) + x = torch.linalg.cross(up0, z) + x = x / x.norm().clamp_min(1e-8) + y = torch.linalg.cross(z, x) + R = torch.stack([x, y, z], dim=1) # columns = camera world axes + return _mat_to_quat(R[None])[0] + + +def _lookat_camera_info(position, target, fov, dev, zoom=1.0, camera_type="perspective", roll=0.0): + # Build a camera_info from a world-space (right-handed, Y-up) eye + look-at target; up = world +Y. + pos = torch.as_tensor(position, dtype=torch.float32, device=dev) + tgt = torch.as_tensor(target, dtype=torch.float32, device=dev) + q = _lookat_quat_wxyz(pos, tgt, dev) + if roll: # roll about the view axis (camera local Z) + a = math.radians(roll) + qz = torch.tensor([math.cos(a / 2), 0.0, 0.0, math.sin(a / 2)], device=dev) + q = _quat_mul(q[None], qz[None])[0] + xyz = lambda v: {"x": float(v[0]), "y": float(v[1]), "z": float(v[2])} + return {"position": xyz(pos), "target": xyz(tgt), + "quaternion": {"x": float(q[1]), "y": float(q[2]), "z": float(q[3]), "w": float(q[0])}, + "fov": float(fov), "cameraType": str(camera_type), "zoom": float(zoom)} + + +def _quat_camera_info(position, quat_xyzw, fov, dev, zoom=1.0, camera_type="perspective"): + # camera_info from an explicit world position + camera-rotation quaternion (three.js: looks down local -Z). 
+ pos = torch.as_tensor(position, dtype=torch.float32, device=dev) + qx, qy, qz, qw = (float(c) for c in quat_xyzw) + qwxyz = torch.tensor([qw, qx, qy, qz], dtype=torch.float32, device=dev) + qwxyz = qwxyz / qwxyz.norm().clamp_min(1e-8) + R = _quat_to_mat(qwxyz[None])[0] + tgt = pos - R[:, 2] # look one unit down local -Z + xyz = lambda v: {"x": float(v[0]), "y": float(v[1]), "z": float(v[2])} + return {"position": xyz(pos), "target": xyz(tgt), + "quaternion": {"x": float(qwxyz[1]), "y": float(qwxyz[2]), "z": float(qwxyz[3]), "w": float(qwxyz[0])}, + "fov": float(fov), "cameraType": str(camera_type), "zoom": float(zoom)} + + +def _orbit_camera_info(yaw, pitch, distance, fov, pivot_splat, dev): + # Orbit helper for RenderSplat's default camera: yaw/pitch about `pivot_splat` (splat frame) at `distance`. + # World<->splat is the (x,-y,-z) map, so _camera_basis recovers exactly _view_matrix_t(yaw, pitch). + y, p = math.radians(yaw), math.radians(pitch) + cy, sy, cp, sp = math.cos(y), math.sin(y), math.cos(p), math.sin(p) + fwd_splat = torch.tensor([-cp * sy, sp, cp * cy], device=dev) # == _view_matrix_t(yaw, pitch)[2] + m = lambda v: torch.stack([v[0], -v[1], -v[2]]) # splat<->world (its own inverse) + return _lookat_camera_info(m(pivot_splat - distance * fwd_splat), m(pivot_splat), fov, dev) + + +def _orbit_camera_info_yaw(camera_info, angle_deg, dev): + # Turntable: rigidly rotate a camera_info about world +Y around its target by angle_deg. Returns a new dict. 
+ a = math.radians(angle_deg) + ca, sa = math.cos(a), math.sin(a) + v = lambda d: torch.tensor([float(d.get("x", 0.0)), float(d.get("y", 0.0)), float(d.get("z", 0.0))], device=dev) + pos, tgt = v(camera_info.get("position", {})), v(camera_info.get("target", {})) + Ry = torch.tensor([[ca, 0.0, sa], [0.0, 1.0, 0.0], [-sa, 0.0, ca]], device=dev) + new_pos = tgt + Ry @ (pos - tgt) + q = camera_info.get("quaternion") or {} + qcur = torch.tensor([float(q.get("w", 1.0)), float(q.get("x", 0.0)), + float(q.get("y", 0.0)), float(q.get("z", 0.0))], device=dev) + qy = torch.tensor([math.cos(a / 2), 0.0, math.sin(a / 2), 0.0], device=dev) # world +Y rotation + qn = _quat_mul(qy[None], qcur[None])[0] + xyz = lambda t: {"x": float(t[0]), "y": float(t[1]), "z": float(t[2])} + return {**camera_info, "position": xyz(new_pos), + "quaternion": {"x": float(qn[1]), "y": float(qn[2]), "z": float(qn[3]), "w": float(qn[0])}} + + +def _gauss_blur(x, sigma, dev): + # Separable Gaussian blur of (1, C, H, W). Used to denoise the screen-space normal map. + r = max(1, int(round(3 * sigma))) + k = torch.exp(-0.5 * (torch.arange(-r, r + 1, device=dev, dtype=torch.float32) / sigma) ** 2) + k = k / k.sum() + c = x.shape[1] + x = torch.nn.functional.conv2d(x, k.view(1, 1, 1, -1).expand(c, 1, 1, -1), padding=(0, r), groups=c) + x = torch.nn.functional.conv2d(x, k.view(1, 1, -1, 1).expand(c, 1, -1, 1), padding=(r, 0), groups=c) + return x + + +def _render_gaussian(xyz, rgb, opacity, scale, rot, width, height, splat_scale, bg, camera_info, + sharpen=1.0, headlight_shading=0.0, render_style="color"): + # Perspective-correct anisotropic gaussian splat rasterizer. Each splat is weighted by its 3D Gaussian's + # peak along each pixel's ray (AAA / Hahlbohm), composited front-to-back across depth slabs. `render_style` + # selects the image: color / clay / depth / normal. Returns (image HxWx3, coverage mask HxW) on CPU. 
def _render_gaussian(xyz, rgb, opacity, scale, rot, width, height, splat_scale, bg, camera_info,
                     sharpen=1.0, headlight_shading=0.0, render_style="color"):
    # Perspective-correct anisotropic gaussian splat rasterizer. Each splat is weighted by its 3D Gaussian's
    # peak along each pixel's ray (AAA / Hahlbohm), composited front-to-back across depth slabs. `render_style`
    # selects the image: color / clay / depth / normal.
    #
    # xyz/rgb/opacity/scale/rot: per-splat arrays (anything torch.as_tensor accepts); rot is a quaternion
    #   consumed by _quat_to_mat. `splat_scale` multiplies the splat scales.
    # bg: background colour (3,) or a full (height, width, 3) plate; only composited for color / clay.
    # camera_info: dict read by _camera_basis plus the "fov", "zoom" and "cameraType" keys.
    # sharpen: exponent on alpha for the colour blend (1.0 = plain alpha-weighted mean).
    # Returns (image HxWx3, coverage mask HxW), both on the intermediate device / dtype.
    dev = comfy.model_management.get_torch_device()
    t = lambda a: torch.as_tensor(a, dtype=torch.float32, device=dev)
    idev, idtype = comfy.model_management.intermediate_device(), comfy.model_management.intermediate_dtype()
    xyz, rgb, opacity = t(xyz), t(rgb).clamp(0, 1), t(opacity).reshape(-1)
    scale, rot = t(scale) * float(splat_scale), t(rot)
    do_linear = render_style == "color"                      # colour blends in linear light, re-encoded at the end
    if do_linear:
        rgb = _srgb_to_linear(rgb)
    flat = width * height
    bg_t = t(bg)
    bg_comp = _srgb_to_linear(bg_t) if do_linear else bg_t   # background blended in the same space as the splats
    need_depth = render_style == "depth"
    need_normal = render_style in ("normal", "clay") or headlight_shading > 0

    def background_only():                                   # no splats to rasterize -> just the background + empty mask
        # color AND clay composite over the background in the normal path (trans == 1 everywhere here), so an
        # empty frame must show the background for both; depth / normal are black off-object.
        if render_style in ("color", "clay"):
            img = bg_t.expand(height, width, 3)
        else:
            img = torch.zeros(height, width, 3, device=dev)
        return img.to(idev, idtype), torch.zeros(height, width, device=idev, dtype=idtype)

    if xyz.shape[0] == 0:                                    # empty input (e.g. all culled by opacity_threshold)
        return background_only()

    eye, target, right, up, fwd = _camera_basis(camera_info, dev)  # all camera state comes from camera_info
    W = torch.stack([right, up, fwd], 0)                     # rows = camera axes (world -> camera)
    cam = (xyz - eye) @ W.T
    fov = float(camera_info.get("fov", 0) or 0) or 35.0
    zoom = float(camera_info.get("zoom", 1.0) or 1.0)        # three.js digital zoom: scales the focal length
    is_ortho = str(camera_info.get("cameraType", "")).lower().startswith("ortho")
    xc, yc, zc = cam.unbind(-1)

    keep = zc > 1e-2
    xc, yc, zc, rgb, opacity, scale, rot = (a[keep] for a in (xc, yc, zc, rgb, opacity, scale, rot))
    if xc.shape[0] == 0:                                     # nothing in front of the camera -> background only
        return background_only()
    if render_style == "clay":
        rgb = torch.full_like(rgb, 0.75)                     # neutral albedo -> shading shows pure geometry

    f = (min(width, height) / 2) / math.tan(math.radians(fov) / 2) * zoom  # fov over the smaller axis, x camera zoom
    cx0, cy0 = width / 2, height / 2

    # Camera-space 3D covariance per splat: Sigma = (W Rq) diag(scale^2) (W Rq)^T, plus a tiny relative
    # regularizer for a stable inverse (a pixel-size Mip low-pass would over-thicken flat surfels and blur).
    Mw = W[None] @ _quat_to_mat(rot)                         # (N,3,3) world -> camera
    cam_cov = (Mw * scale.square()[:, None, :]) @ Mw.transpose(1, 2)
    cam_cov = cam_cov + (cam_cov.diagonal(dim1=-2, dim2=-1).mean(-1) * 1e-3)[:, None, None] * torch.eye(3, device=dev)

    # Perspective-correct weighting: peak of the 3D Gaussian along each pixel ray. Precompute Si, Si@mu, mu^T Si mu.
    mu = torch.stack([xc, yc, zc], -1)
    si = torch.linalg.inv(cam_cov)
    simu = (si @ mu[:, :, None])[:, :, 0]                    # (N,3)
    musimu = (mu * simu).sum(-1)                             # (N,)
    s00, s01, s02 = si[:, 0, 0], si[:, 0, 1], si[:, 0, 2]
    s11, s12, s22 = si[:, 1, 1], si[:, 1, 2], si[:, 2, 2]
    simu0, simu1, simu2 = simu.unbind(-1)
    if need_normal:                                          # surfel normal = thinnest axis, oriented toward camera
        nrm = Mw[torch.arange(Mw.shape[0], device=dev), :, scale.argmin(-1)]  # (N,3) camera-space normal
        nrm = nrm * torch.where(nrm[:, 2:3] > 0, -1.0, 1.0)  # flip so nz <= 0 (faces camera)

    # Screen centre (exact) + footprint radius from the affine 2D projection (used only to size the kernel).
    # The image is +y-down, so the projection's y row is unflipped - it matches the splat frame's +Y.
    jm = torch.zeros(xc.shape[0], 2, 3, device=dev)
    if is_ortho:                                             # parallel projection: screen = s * (xc, yc)
        s = f / float((target - eye).norm().clamp_min(1e-6))  # pixels per world unit at the target plane
        cx, cy = cx0 + s * xc, cy0 + s * yc
        jm[:, 0, 0] = s
        jm[:, 1, 1] = s
    else:                                                    # perspective: screen = f * (xc, yc) / zc
        invz = 1.0 / zc
        cx, cy = cx0 + f * xc * invz, cy0 + f * yc * invz
        jm[:, 0, 0], jm[:, 0, 2] = f * invz, -f * xc * invz.square()
        jm[:, 1, 1], jm[:, 1, 2] = f * invz, -f * yc * invz.square()
    cov2 = jm @ cam_cov @ jm.transpose(1, 2)
    a, b, c = cov2[:, 0, 0], cov2[:, 0, 1], cov2[:, 1, 1]
    max_eig = (a + c) * 0.5 + (((a - c) * 0.5).square() + b * b).clamp_min(0).sqrt()
    radius = 3.0 * max_eig.clamp_min(1e-8).sqrt()
    K = int(min(max(24, min(width, height) // 16), max(2, math.ceil(_quantile(radius, 0.995).item()))))

    # Per-splat kernel size: bucket splats by radius into a coarse ladder of window sizes (global K stays the cap) so
    # small splats (the bulk of it) use a small window.
    levels = [L for L in (16, 64, 256) if L < K] + [K]
    levels_t = torch.tensor(levels, device=dev, dtype=torch.float32)
    grids = []
    for L in levels:
        rng = torch.arange(-L, L + 1, device=dev, dtype=torch.float32)
        gy, gx = torch.meshgrid(rng, rng, indexing="ij")
        grids.append((gx.reshape(-1), gy.reshape(-1)))
    blevel = torch.bucketize(radius * (4.0 / 3.0), levels_t).clamp_(max=len(levels) - 1)  # window >= ~4 sigma

    n = zc.shape[0]
    ns = int(min(256, max(1, n // 1000)))                    # depth slabs: 1 per ~1000 splats, capped
    nl = len(levels)
    order = torch.argsort(zc)                                # front (small zc) -> back -> defines the slabs
    bounds = torch.linspace(0, n, ns + 1, device=dev).round().long()
    rank = torch.empty(n, dtype=torch.long, device=dev)
    rank[order] = torch.arange(n, device=dev)                # depth rank of each splat
    slab_id = (torch.searchsorted(bounds, rank, right=True) - 1).clamp_(0, ns - 1)
    key = slab_id * nl + blevel                              # group by slab, then kernel level (order-free within)
    order = torch.argsort(key)
    key = key[order]

    cxr, cyr = cx[order].round(), cy[order].round()
    s00, s01, s02 = s00[order], s01[order], s02[order]
    s11, s12, s22 = s11[order], s12[order], s22[order]
    s01b, s02b, s12b = s01 * 2, s02 * 2, s12 * 2             # doubled cross terms for the fused quadratic forms
    simu0, simu1, simu2, musimu = simu0[order], simu1[order], simu2[order], musimu[order]
    opacity, rgb = opacity[order], rgb[order]
    zc_o = zc[order] if need_depth else None
    nrm_o = nrm[order] if need_normal else None
    mux_o, muy_o, muz_o = (xc[order], yc[order], zc[order]) if is_ortho else (None, None, None)

    # Pack the per-splat scalars into one tensor so each chunk slices once
    common = [cxr, cyr, s00, s11, s22, s01b, s02b, s12b, opacity]
    pstack = torch.stack(common + ([s02, s12, mux_o, muy_o, muz_o] if is_ortho else [simu0, simu1, simu2, musimu]))

    # Precompute the (slab, level) run table on-GPU and pull it to the CPU once
    starts = torch.cat([torch.zeros(1, dtype=torch.long, device=dev), (key[1:] != key[:-1]).nonzero().flatten() + 1])
    ks = key[starts]
    run_lo = starts.tolist() + [n]
    run_lev = (ks % nl).tolist()
    run_slab = torch.div(ks, nl, rounding_mode="floor").tolist()
    slab_runs = [[] for _ in range(ns)]
    for r in range(len(run_lev)):
        slab_runs[run_slab[r]].append((run_lo[r], run_lo[r + 1], run_lev[r]))

    def splat(lo, hi, ox, oy):  # -> pixel idx (m,M), alpha (m,M); weight = 3D Gaussian peak along each pixel's ray
        cols = pstack[:, lo:hi, None].unbind(0)
        cxr_, cyr_, a00, a11, a22, b01, b02, b12, opa = cols[:9]  # a* = Si components; b* = 2 * cross terms
        px = cxr_ + ox[None, :]
        py = cyr_ + oy[None, :]
        valid = (px >= 0) & (px < width) & (py >= 0) & (py < height)
        if is_ortho:                                         # parallel ray (0,0,1) from screen point (X, Y, 0); rz constant per splat
            c02, c12, mx, my, mz = cols[9:]
            rx = (px - cx0) / s - mx
            ry = (py - cy0) / s - my
            rz = -mz
            a22rz = a22 * rz
            inx = torch.addcmul(b02 * rz, a00, rx).addcmul_(b01, ry)  # a00 rx + b01 ry + b02 rz
            rSr = torch.addcmul(a22rz * rz, rx, inx).addcmul_(ry, torch.addcmul(b12 * rz, a11, ry))
            dsr = torch.addcmul(a22rz, c02, rx).addcmul_(c12, ry)
            q = torch.addcdiv(rSr, dsr * dsr, a22.clamp_min(1e-12), value=-1).clamp_min_(0)
        else:                                                # perspective ray (dx,dy,1) through the camera origin
            su0, su1, su2, mus = cols[9:]
            dx, dy = (px - cx0) / f, (py - cy0) / f
            dsid = torch.addcmul(a22, dx, torch.addcmul(b02, a00, dx))  # a22 + dx*(a00 dx + b02)
            dsid = dsid.addcmul_(dy, torch.addcmul(b12, a11, dy))       # + dy*(a11 dy + b12)
            dsid = dsid.addcmul_(b01 * dx, dy)                          # + (2 s01) dx dy
            dsimu = torch.addcmul(su2, dx, su0).addcmul_(dy, su1)
            q = torch.addcdiv(mus, dsimu * dsimu, dsid.clamp_min(1e-12), value=-1).clamp_min_(0)
        alpha = (opa * torch.exp(-0.5 * q) * valid).clamp_(0, 0.999)
        # out-of-frame taps have alpha 0 (via `valid`); their indices are clamped only to stay addressable
        idx = py.long().clamp(0, height - 1) * width + px.long().clamp(0, width - 1)
        return idx, alpha

    # Front-to-back compositing over the depth slabs set up above. Within a slab the accumulation is a pure
    # sum (order-independent), so splats are grouped by kernel level and each level uses its own tight window.
    sharp = sharpen != 1.0                                   # winner-take-more colour blend: dominant splat shows more
    cacc = torch.zeros((flat, 3), device=dev)
    trans = torch.ones((flat,), device=dev)
    a_buf = torch.zeros((flat,), device=dev)                 # sum alpha -> colour/depth/normal weight (alpha-weighted mean)
    tau_buf = torch.zeros((flat,), device=dev)               # sum -ln(1-alpha) -> slab opacity = 1-prod(1-alpha)
    crgb = torch.zeros((flat, 3), device=dev)                # sum alpha^p * rgb -> slab colour
    wbuf = torch.zeros((flat,), device=dev) if sharp else None  # sum alpha^p -> colour normalizer (sharp only)
    dacc = torch.zeros((flat,), device=dev) if need_depth else None  # front-weighted depth
    nacc = torch.zeros((flat, 3), device=dev) if need_normal else None  # front-weighted camera-space normal
    zslab = torch.zeros((flat,), device=dev) if need_depth else None
    nslab = torch.zeros((flat, 3), device=dev) if need_normal else None
    stale = 0                                                # consecutive fully-occluded slabs -> early-out
    for si in range(ns):                                     # NOTE: rebinds `si`; the inverse covariance is no longer needed
        runs = slab_runs[si]
        if not runs:
            continue
        a_buf.zero_()
        tau_buf.zero_()
        crgb.zero_()
        if sharp:
            wbuf.zero_()
        if need_depth:
            zslab.zero_()
        if need_normal:
            nslab.zero_()
        for r_lo, r_hi, li in runs:                          # contiguous same-kernel-level runs in this slab
            ox, oy = grids[li]
            ch = max(2048, 10_000_000 // ox.shape[0])        # splats/chunk, bounded by this level's kernel size
            for lo in range(r_lo, r_hi, ch):
                hi = min(lo + ch, r_hi)
                idx, alpha = splat(lo, hi, ox, oy)
                idx, af = idx.reshape(-1), alpha.reshape(-1)
                a_buf.index_add_(0, idx, af)
                tau_buf.index_add_(0, idx, (-torch.log1p(-alpha)).reshape(-1))  # -ln(1-alpha), correct opacity merge
                apw = alpha.pow(sharpen) if sharp else alpha  # bias colour toward the highest-alpha splat
                crgb.index_add_(0, idx, (apw[:, :, None] * rgb[lo:hi, None, :]).reshape(-1, 3))
                if sharp:
                    wbuf.index_add_(0, idx, apw.reshape(-1))
                if need_depth:
                    zslab.index_add_(0, idx, (alpha * zc_o[lo:hi, None]).reshape(-1))
                if need_normal:
                    nslab.index_add_(0, idx, (alpha[:, :, None] * nrm_o[lo:hi, None, :]).reshape(-1, 3))
        slab_a = 1 - torch.exp(-tau_buf)                     # 1 - prod(1-alpha): true opacity of the slab's splats
        front = trans * slab_a
        denom = wbuf if sharp else a_buf
        cacc.addcmul_(front[:, None], crgb / denom.clamp_min(1e-8)[:, None])  # cacc += front * (crgb/denom)
        if need_depth or need_normal:
            ainv = a_buf.clamp_min(1e-8)                     # alpha-weighted-mean normalizer (depth/normal only)
            if need_depth:
                dacc.addcmul_(front, zslab / ainv)
            if need_normal:
                nacc.addcmul_(front[:, None], nslab / ainv[:, None])
        trans.mul_(1 - slab_a)
        if si % 8 == 7:                                      # checkpoint every 8 slabs (a per-slab GPU sync would cost more)
            if float(front.max()) < 1e-3:                    # this checkpoint slab is fully occluded by what is in front
                stale += 1
                if stale >= 2:                               # two occluded checkpoints running -> the rest are too -> stop
                    break
            else:
                stale = 0

    cov = 1 - trans
    covg = cov.reshape(height, width)
    covm = covg > 0.5 if render_style in ("depth", "normal") else None  # silhouette mask (depth/normal styles only)
    depth_map = (dacc / cov.clamp_min(1e-6)).reshape(height, width) if need_depth else None
    nrm_map = None
    if need_normal:
        # Per-splat surfel normals are jittery, so do a masked blur
        nb = nacc.reshape(height, width, 3).permute(2, 0, 1)[None]
        cb = cov.reshape(1, 1, height, width)
        nb, cb = _gauss_blur(nb, 1.2, dev), _gauss_blur(cb, 1.2, dev)
        normal = (nb / cb.clamp_min(1e-6))[0].permute(1, 2, 0)
        nrm_map = normal / normal.norm(dim=-1, keepdim=True).clamp_min(1e-6)

    if render_style == "depth":                              # near = bright, far = dark, 0 off-object
        d = torch.zeros(height, width, device=dev)
        if bool(covm.any()):
            lo, hi = depth_map[covm].min(), depth_map[covm].max()
            d = torch.where(covm, ((hi - depth_map) / (hi - lo).clamp_min(1e-6)).clamp(0, 1), d)
        img = d[:, :, None].expand(height, width, 3)
    elif render_style == "normal":                           # OpenGL normal map: +X right, +Y up, +Z to viewer
        enc = (nrm_map * t([1.0, -1.0, -1.0]) * 0.5 + 0.5).clamp(0, 1)
        img = enc * covm[:, :, None]
    else:                                                    # color / clay
        img = cacc.reshape(height, width, 3)
        if render_style == "clay":                           # studio key light + ambient -> sculpted matte look
            kl = t([-0.4, -0.7, -0.6])                       # key from screen upper-left, angled toward the viewer
            kl = kl / kl.norm()
            hl = (0.5 * (nrm_map * kl).sum(-1) + 0.5).clamp(0, 1)  # half-Lambert: soft terminator, no harsh dark side
            img = img * (0.35 + 0.65 * hl * hl)[:, :, None]  # ambient floor + diffuse key
        elif headlight_shading > 0:                          # camera headlight: darken faces turned from view
            k = float(headlight_shading)
            ndotl = (-nrm_map[:, :, 2]).clamp(0, 1)
            img = img * (1 - 0.6 * k + 0.6 * k * ndotl)[:, :, None]
        img = img.addcmul_(trans.reshape(height, width, 1), bg_comp)
        if do_linear:                                        # back to display space after linear compositing
            img = _linear_to_srgb(img)
    return img.clamp(0, 1).to(idev, idtype), covg.clamp(0, 1).to(idev, idtype)
class RenderSplat(IO.ComfyNode):
    # Node: rasterize every item of a SPLAT batch with _render_gaussian, optionally as a turntable
    # (one image per frame, camera yawed about its target), and return stacked images + coverage masks.

    @classmethod
    def define_schema(cls):
        node_inputs = [
            IO.Splat.Input("splat"),
            IO.Int.Input("width", default=1024, min=64, max=2048, step=8),
            IO.Int.Input("height", default=1024, min=64, max=2048, step=8),
            IO.Int.Input("frames", default=1, min=-240, max=240,
                         tooltip="-1, 0, 1 = single still image; >1 = turntable, the camera orbits over a full "
                                 "360 turn (works with any camera_info). Negative value orbits the other way."),
            IO.Float.Input("splat_scale", default=1.0, min=0.1, max=5.0, step=0.05, advanced=True,
                           tooltip="Multiplier on each splat's projected footprint (lower = crisper points, "
                                   "higher = softer/fuller surface)."),
            IO.Float.Input("sharpen", default=2.0, min=1.0, max=8.0, step=0.5,
                           tooltip="Sharpen overlapping splats: 1.0 = physically-correct blend; higher biases "
                                   "each pixel toward its dominant (nearest) splat for crisper texture, without "
                                   "shrinking splats or opening gaps. Non-physical above 1."),
            IO.Float.Input("headlight_shading", default=0.0, min=0.0, max=3.0, step=0.05, advanced=True,
                           tooltip="Diffuse shading from a light at the camera (headlight), using the splat surfel "
                                   "normals: darkens surfaces that turn away from view to reveal form/curvature. "
                                   "0 = flat albedo, 1 = strongest shading."),
            IO.Float.Input("opacity_threshold", default=0.0, min=0.0, max=1.0, step=0.01, advanced=True,
                           tooltip="Cull gaussians with opacity below this (removes faint floaters)."),
            IO.Combo.Input("render_style", options=["color", "clay", "depth", "normal"],
                           tooltip="What the image output shows: color, clay (neutral-albedo shaded), "
                                   "depth (near=bright), normal (OpenGL normal map)."),
            IO.Color.Input("background", default="#000000"),
            IO.Image.Input("bg_image", optional=True,
                           tooltip="Optional background plate composited behind the splat (overrides the solid "
                                   "background colour). Resized to the render size; a batch is used per frame, "
                                   "a single image for all. color/clay only."),
            IO.Load3DCamera.Input("camera_info", optional=True,
                                  tooltip="Camera to render from - a Load3D / Preview3D camera or a Create Camera "
                                          "Info node. If empty, the splat is auto-framed from a default 3/4 view."),
        ]
        return IO.Schema(
            node_id="RenderSplat",
            display_name="Render Splat",
            search_aliases=["splat to image", "render splat", "gaussian turntable"],
            category="3d/splat",
            description="Render a gaussian splat as an image with an anisotropic EWA rasterizer (oriented "
                        "elliptical splats, antialiased, depth-sorted front-to-back). The camera comes from a "
                        "camera_info input (Load / Preview 3D, or a Create Camera Info node); leave it empty to "
                        "auto-frame the splat. Set frames greater than 1 for a turntable batch of images to feed a Video node.",
            inputs=node_inputs,
            outputs=[IO.Image.Output(display_name="image"), IO.Mask.Output(display_name="mask")],
        )

    @classmethod
    def execute(cls, splat, width, height, frames, splat_scale, sharpen, headlight_shading,
                opacity_threshold, background, render_style, camera_info=None, bg_image=None) -> IO.NodeOutput:
        solid_bg = _hex_to_rgb(background)
        plates = None
        if bg_image is not None:  # resize the plate(s) to the render size: (B,H,W,3)
            resized = comfy.utils.common_upscale(bg_image.movedim(-1, 1), width, height, "bicubic", "disabled")
            plates = resized.movedim(1, -1).clamp(0, 1)

        frame_count = abs(int(frames)) or 1        # |frames| images per item; 0 collapses to a single still
        direction = -1.0 if frames < 0 else 1.0    # negative frames orbit the other way
        device = comfy.model_management.get_torch_device()
        batch = splat.positions.shape[0]
        total = batch * frame_count
        pbar = comfy.utils.ProgressBar(total) if total > 1 else None

        images, masks = [], []
        rendered = 0                               # running frame index across the whole batch (selects the plate)
        for item in range(batch):
            xyz, rgb, opacity, scale, rot = _gaussian_item(splat, item, device)
            if opacity_threshold > 0:
                visible = opacity >= opacity_threshold
                xyz, rgb, opacity, scale, rot = (v[visible] for v in (xyz, rgb, opacity, scale, rot))

            base_cam = camera_info
            if base_cam is None:  # no camera -> default 3/4 view, auto-framed on the splat
                if xyz.shape[0]:
                    center = xyz.mean(0)
                    extent = _quantile((xyz - center).norm(dim=-1), 0.99).clamp_min(1e-4)
                else:
                    center = torch.zeros(3, device=device)
                    extent = torch.tensor(1.0, device=device)
                dist = float(extent / (math.tan(math.radians(35.0) / 2) * 0.9))
                base_cam = _orbit_camera_info(35.0, 30.0, dist, 35.0, center, device)

            for fr in range(frame_count):
                if frame_count == 1:
                    cam = base_cam
                else:
                    cam = _orbit_camera_info_yaw(base_cam, direction * 360.0 * fr / frame_count, device)
                # per-frame plate when a background image is connected, otherwise the solid colour
                frame_bg = solid_bg if plates is None else plates[rendered % plates.shape[0]]
                img, mask = _render_gaussian(xyz, rgb, opacity, scale, rot, width, height, splat_scale, frame_bg, cam,
                                             sharpen=sharpen, headlight_shading=headlight_shading,
                                             render_style=render_style)
                images.append(img)
                masks.append(mask)
                rendered += 1
                if pbar is not None:
                    pbar.update(1)
        return IO.NodeOutput(torch.stack(images), torch.stack(masks))
class CreateCameraInfo(IO.ComfyNode):  # TODO: move to better file
    # Node: build a camera_info dict from one of three modes (orbit angles, look-at position, or
    # position + quaternion) for nodes that accept a Load3DCamera input.

    @classmethod
    def define_schema(cls):
        return IO.Schema(
            node_id="CreateCameraInfo",
            display_name="Create Camera Info",
            search_aliases=["camera position", "make camera info", "orbit camera", "look at camera"],
            category="3d",
            # The first fragment previously ended without punctuation or a space, so the adjacent literals
            # rendered as "Build a camera_infoMode 'orbit' ..." in the UI.
            description="Build a camera_info. "
                        "Mode 'orbit' aims with yaw/pitch/distance around the target; "
                        "'look_at' places the camera at world position. Coordinates are the viewer's world space (right-handed,Y-up).",
            inputs=[
                IO.DynamicCombo.Input("mode", options=[
                    IO.DynamicCombo.Option("orbit", [
                        IO.Float.Input("yaw", default=35.0, min=-360.0, max=360.0, step=1.0),
                        IO.Float.Input("pitch", default=30.0, min=-89.0, max=89.0, step=1.0),
                        IO.Float.Input("distance", default=4.0, min=0.01, max=1000.0, step=0.01,
                                       tooltip="Camera distance from the target."),
                    ]),
                    IO.DynamicCombo.Option("look_at", [
                        IO.Float.Input("position_x", default=4.0, min=-1000.0, max=1000.0, step=0.01,
                                       tooltip="Camera position in world space (right-handed, Y-up)."),
                        IO.Float.Input("position_y", default=4.0, min=-1000.0, max=1000.0, step=0.01),
                        IO.Float.Input("position_z", default=4.0, min=-1000.0, max=1000.0, step=0.01),
                    ]),
                    IO.DynamicCombo.Option("quaternion", [
                        IO.Float.Input("position_x", default=4.0, min=-1000.0, max=1000.0, step=0.01,
                                       tooltip="Camera position in world space (right-handed, Y-up)."),
                        IO.Float.Input("position_y", default=4.0, min=-1000.0, max=1000.0, step=0.01),
                        IO.Float.Input("position_z", default=4.0, min=-1000.0, max=1000.0, step=0.01),
                        IO.Float.Input("quat_x", default=0.0, min=-1.0, max=1.0, step=0.001),
                        IO.Float.Input("quat_y", default=0.0, min=-1.0, max=1.0, step=0.001),
                        IO.Float.Input("quat_z", default=0.0, min=-1.0, max=1.0, step=0.001),
                        IO.Float.Input("quat_w", default=1.0, min=-1.0, max=1.0, step=0.001,
                                       tooltip="Camera world-rotation quaternion (three.js: looks down local -Z). Normalized for you."),
                    ]),
                ], tooltip="How to define the camera: orbit angles, an explicit position, or a position + quaternion."),
                IO.Float.Input("target_x", default=0.0, min=-1000.0, max=1000.0, step=0.01, advanced=True,
                               tooltip="Look-at point (orbit pivot / aim). In orbit mode, move it to pan/translate the "
                                       "whole camera. Ignored in quaternion mode. Defaults to the origin."),
                IO.Float.Input("target_y", default=0.0, min=-1000.0, max=1000.0, step=0.01, advanced=True),
                IO.Float.Input("target_z", default=0.0, min=-1000.0, max=1000.0, step=0.01, advanced=True),
                IO.Float.Input("roll", default=0.0, min=-180.0, max=180.0, step=1.0,
                               tooltip="Camera roll about the view axis, degrees."),
                IO.Float.Input("fov", default=35.0, min=1.0, max=120.0, step=1.0,
                               tooltip="Vertical field of view in degrees."),
                IO.Float.Input("zoom", default=1.0, min=0.01, max=100.0, step=0.01,
                               tooltip="Digital zoom (focal-length multiplier). >1 zooms in without moving the camera."),
                IO.Combo.Input("camera_type", options=["perspective", "orthographic"],
                               tooltip="Projection used by Render Splat: perspective (foreshortening) or orthographic (parallel)."),
            ],
            outputs=[IO.Load3DCamera.Output(display_name="camera_info")],
        )

    @classmethod
    def execute(cls, mode, target_x, target_y, target_z, roll, fov, zoom=1.0, camera_type="perspective") -> IO.NodeOutput:
        # `mode` is the DynamicCombo payload: mode["mode"] names the selected option and the option's own
        # inputs sit next to it in the same dict.
        dev = comfy.model_management.get_torch_device()
        kind = mode["mode"]
        if kind == "quaternion":  # explicit world position + camera rotation (target and roll are not used here)
            position = [mode["position_x"], mode["position_y"], mode["position_z"]]
            quat = [mode["quat_x"], mode["quat_y"], mode["quat_z"], mode["quat_w"]]
            return IO.NodeOutput(_quat_camera_info(position, quat, fov, dev, zoom=zoom, camera_type=camera_type))
        target = [target_x, target_y, target_z]  # orbit pivot / aim; move it to pan the whole camera
        if kind == "orbit":  # yaw/pitch/distance about the target (world Y-up)
            y, p = math.radians(mode["yaw"]), math.radians(mode["pitch"])
            cy, sy, cp, sp = math.cos(y), math.sin(y), math.cos(p), math.sin(p)
            d = mode["distance"]
            position = [target_x + d * cp * sy, target_y + d * sp, target_z + d * cp * cy]
        else:  # look_at: explicit world-space camera position
            position = [mode["position_x"], mode["position_y"], mode["position_z"]]
        return IO.NodeOutput(_lookat_camera_info(position, target, fov, dev, zoom=zoom, camera_type=camera_type, roll=roll))
class TransformSplat(IO.ComfyNode):
    # Node: apply rotate -> per-axis scale -> translate to a SPLAT, updating positions, scales and
    # rotations; opacities, SH and counts are passed through.

    @classmethod
    def define_schema(cls):
        def axis_inputs(prefix, default, lo, hi, step):
            return [IO.Float.Input(f"{prefix}_{axis}", default=default, min=lo, max=hi, step=step)
                    for axis in ("x", "y", "z")]

        return IO.Schema(
            node_id="TransformSplat",
            display_name="Transform Splat",
            search_aliases=["move splat", "rotate splat", "scale splat", "gaussian transform"],
            category="3d/splat",
            description="Translate, rotate, and scale a gaussian splat. "
                        "Non-uniform scale also reshapes every individual splat, slower process.",
            inputs=[
                IO.Splat.Input("splat"),
                *axis_inputs("translate", 0.0, -100.0, 100.0, 0.01),
                *axis_inputs("rotate", 0.0, -360.0, 360.0, 1.0),
                *axis_inputs("scale", 1.0, 0.01, 100.0, 0.01),
            ],
            outputs=[IO.Splat.Output(display_name="splat")],
        )

    @classmethod
    def execute(cls, splat, translate_x, translate_y, translate_z,
                rotate_x, rotate_y, rotate_z, scale_x, scale_y, scale_z) -> IO.NodeOutput:
        src_pos = splat.positions
        dev, dt = src_pos.device, src_pos.dtype
        node_quat = _euler_to_quat(rotate_x, rotate_y, rotate_z).to(device=dev, dtype=dt)
        node_rot = _quat_to_mat(node_quat[None])[0]                    # (3, 3) node rotation
        axis_scale = torch.tensor([scale_x, scale_y, scale_z], dtype=dt, device=dev)
        linear = axis_scale[:, None] * node_rot                        # diag(scale) @ R: scale applied after rotation
        offset = torch.tensor([translate_x, translate_y, translate_z], dtype=dt, device=dev)

        positions = src_pos @ linear.T + offset                        # rotate, scale per-axis, then translate
        is_uniform = scale_x == scale_y == scale_z
        if is_uniform:
            # uniform scale commutes with rotation: scale the extents, compose the quaternions
            scales = splat.scales * scale_x
            rotations = _quat_mul(node_quat.expand_as(splat.rotations), splat.rotations)
            rotations = rotations / rotations.norm(dim=-1, keepdim=True).clamp_min(1e-12)
        else:
            # non-uniform: push the covariance through the linear map and re-extract axes / extents
            splat_rot = _quat_to_mat(splat.rotations.reshape(-1, 4))   # (M,3,3) per-splat rotation
            variances = splat.scales.reshape(-1, 3).square()
            sigma = (splat_rot * variances[:, None, :]) @ splat_rot.transpose(-1, -2)
            sigma = linear @ sigma @ linear.T                          # A Sigma A^T (A broadcast over splats)
            eigvals, axes = torch.linalg.eigh(sigma)                   # symmetric -> eigenvalues (asc), orthonormal axes
            handedness = torch.where(torch.linalg.det(axes) < 0, -1.0, 1.0)
            axes = axes * handedness[..., None, None]                  # keep a proper rotation
            scales = eigvals.clamp_min(0).sqrt().reshape(splat.scales.shape)
            rotations = _mat_to_quat(axes).reshape(splat.rotations.shape)
        out = Types.SPLAT(positions, scales, rotations, splat.opacities, splat.sh,
                          counts=getattr(splat, "counts", None))
        return IO.NodeOutput(out)


class GetSplatCount(IO.ComfyNode):
    # Node: pass the splat through and report the number of real splats summed over the batch.

    @classmethod
    def define_schema(cls):
        return IO.Schema(
            node_id="GetSplatCount",
            display_name="Get Splat Count",
            search_aliases=["splat count", "gaussian count", "number of splats", "splat info"],
            category="3d/splat",
            description="Returns the number of splats summed across the batch.",
            inputs=[IO.Splat.Input("splat")],
            outputs=[
                IO.Splat.Output(display_name="splat"),
                IO.Int.Output(display_name="count"),
            ],
            hidden=[IO.Hidden.unique_id],
        )

    @classmethod
    def execute(cls, splat) -> IO.NodeOutput:
        count = 0
        for item in range(splat.positions.shape[0]):
            count += _real_len(splat, item)
        node_id = cls.hidden.unique_id
        if node_id:  # show the count inline on the node
            PromptServer.instance.send_progress_text(f"{count:,} splats", node_id)
        return IO.NodeOutput(splat, count)
None, :]) @ rg.transpose(-1, -2) # Sigma + cov = A @ cov @ A.T # A Sigma A^T (A broadcast over splats) + lam, V = torch.linalg.eigh(cov) # symmetric -> eigenvalues (asc), orthonormal axes + V = V * torch.where(torch.linalg.det(V) < 0, -1.0, 1.0)[..., None, None] # keep a proper rotation + scales = lam.clamp_min(0).sqrt().reshape(splat.scales.shape) + rotations = _mat_to_quat(V).reshape(splat.rotations.shape) + out = Types.SPLAT(positions, scales, rotations, splat.opacities, splat.sh, + counts=getattr(splat, "counts", None)) + return IO.NodeOutput(out) + + +class GetSplatCount(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="GetSplatCount", + display_name="Get Splat Count", + search_aliases=["splat count", "gaussian count", "number of splats", "splat info"], + category="3d/splat", + description="Returns the number of splats summed across the batch.", + inputs=[IO.Splat.Input("splat")], + outputs=[IO.Splat.Output(display_name="splat"), + IO.Int.Output(display_name="count"), + ], + hidden=[IO.Hidden.unique_id], + ) + + @classmethod + def execute(cls, splat) -> IO.NodeOutput: + count = sum(_real_len(splat, i) for i in range(splat.positions.shape[0])) + if cls.hidden.unique_id: # show the count inline on the node + PromptServer.instance.send_progress_text(f"{count:,} splats", cls.hidden.unique_id) + return IO.NodeOutput(splat, count) + + +def _pad_stack(items, n): + # Stack a list of (Lᵢ, *tail) tensors into (B, n, *tail), zero-padding each row up to n. + tail = items[0].shape[1:] + out = items[0].new_zeros((len(items), n, *tail)) + for i, t in enumerate(items): + out[i, :t.shape[0]] = t + return out + + +def _merge_gaussians(gaussians: list) -> Types.SPLAT: + # Concatenate SPLAT batches along the splat dimension (per item), padding SH to the highest degree. 
def _merge_gaussians(gaussians: list) -> Types.SPLAT:
    # Concatenate SPLAT batches along the splat dimension, item by item. `None` entries are ignored, all
    # inputs must share the batch size, and lower-degree SH blocks are zero-padded to the widest one.
    # Items that end up with different lengths are zero-padded and their real lengths returned as `counts`.
    present = [g for g in gaussians if g is not None]
    if not present:
        raise ValueError("MergeSplat: no gaussians to merge")
    batch = present[0].positions.shape[0]
    for g in present:
        if g.positions.shape[0] != batch:
            raise ValueError(f"MergeSplat: batch size mismatch ({batch} vs {g.positions.shape[0]}).")
    sh_width = max(g.sh.shape[2] for g in present)

    def widen_sh(sh):  # (end, K, 3) -> (end, sh_width, 3), zero-filling the missing coefficients
        if sh.shape[1] < sh_width:
            filler = sh.new_zeros(sh.shape[0], sh_width - sh.shape[1], sh.shape[2])
            sh = torch.cat([sh, filler], dim=1)
        return sh

    rows = []  # one (positions, scales, rotations, opacities, sh) tuple per batch item
    for i in range(batch):
        pieces = []
        for g in present:
            end = _real_len(g, i)
            pieces.append((g.positions[i, :end], g.scales[i, :end], g.rotations[i, :end],
                           g.opacities[i, :end], widen_sh(g.sh[i, :end])))
        rows.append(tuple(torch.cat(column) for column in zip(*pieces)))

    lengths = [row[0].shape[0] for row in rows]
    n = max(lengths)
    counts = None
    if len(set(lengths)) > 1:  # ragged batch -> remember each item's real length
        counts = torch.tensor(lengths, device=present[0].positions.device, dtype=torch.int64)
    pos_b, scl_b, rot_b, op_b, sh_b = (list(column) for column in zip(*rows))
    return Types.SPLAT(_pad_stack(pos_b, n), _pad_stack(scl_b, n), _pad_stack(rot_b, n),
                       _pad_stack(op_b, n), _pad_stack(sh_b, n), counts=counts)


class MergeSplat(IO.ComfyNode):
    # Node front-end for _merge_gaussians: takes an auto-growing list of splat inputs.

    @classmethod
    def define_schema(cls):
        # Autogrow: a splat0/splat1/... input list that grows a fresh slot as you connect splats.
        slot_template = IO.Autogrow.TemplatePrefix(IO.Splat.Input("splat"), prefix="splat", min=2, max=32)
        return IO.Schema(
            node_id="MergeSplat",
            display_name="Merge Splats",
            search_aliases=["union splat", "densify gaussian", "combine splat", "merge gaussian"],
            category="3d/splat",
            description="Concatenate any number of gaussian splats into one. Unioning several decodes of the same "
                        "latent at different seeds densifies the surface, this can improve surface quality when meshing.",
            inputs=[IO.Autogrow.Input("splats", template=slot_template)],
            outputs=[IO.Splat.Output(display_name="splat")],
        )

    @classmethod
    def execute(cls, splats: IO.Autogrow.Type) -> IO.NodeOutput:
        connected = []
        for value in splats.values():
            if value is not None:
                connected.append(value)
        if not connected:
            raise ValueError("MergeSplat: connect at least one splat.")
        return IO.NodeOutput(_merge_gaussians(connected))
Unioning several decodes of the same " + "latent at different seeds densifies the surface, this can improve surface quality when meshing.", + inputs=[IO.Autogrow.Input("splats", template=splats)], + outputs=[IO.Splat.Output(display_name="splat")], + ) + + @classmethod + def execute(cls, splats: IO.Autogrow.Type) -> IO.NodeOutput: + gs = [v for v in splats.values() if v is not None] + if not gs: + raise ValueError("MergeSplat: connect at least one splat.") + return IO.NodeOutput(_merge_gaussians(gs)) + + +def _inverse_covariance(scale, quat): + # Per-splat Sigma^-1 = R diag(1/s^2) R^T. scale (N,3) linear std, quat (N,4) wxyz -> (N,3,3). + q = quat / quat.norm(dim=1, keepdim=True).clamp_min(1e-12) + w, x, y, z = q.unbind(-1) + R = torch.stack([ + 1 - 2 * (y * y + z * z), 2 * (x * y - w * z), 2 * (x * z + w * y), + 2 * (x * y + w * z), 1 - 2 * (x * x + z * z), 2 * (y * z - w * x), + 2 * (x * z - w * y), 2 * (y * z + w * x), 1 - 2 * (x * x + y * y), + ], dim=1).reshape(-1, 3, 3) + inv_s2 = 1.0 / scale.clamp_min(1e-8) ** 2 # (N, 3) + return torch.einsum("nij,nj,nkj->nik", R, inv_s2, R) + + +def _splat_density(xyz, opacity, scale, quat, rgb, res, kernel, device, color_sharpen=1.0, chunk=4096, progress=None, + col_dtype=torch.float16): + # Splat each gaussian as its oriented-covariance disk (3-sigma, opacity-weighted) into a density grid, + # plus a colour volume. Each gaussian uses a voxel window sized to its OWN 3-sigma (capped at `kernel`). + # Colour is weighted by w^color_sharpen: >1 biases each voxel toward its dominant gaussian (crisper + # texture). Returns (density, colour numerator, colour normaliser, origin, voxel). 
def _splat_density(xyz, opacity, scale, quat, rgb, res, kernel, device, color_sharpen=1.0, chunk=4096, progress=None,
                   col_dtype=torch.float16):
    # Accumulate every gaussian into a voxel grid: an opacity-weighted density plus a colour volume.
    # Each gaussian is evaluated with its oriented inverse covariance over a cubic voxel window whose
    # half-width is its own 3-sigma extent in voxels (clamped to [1, kernel]); weights beyond 3 sigma are
    # zeroed. Colour is weighted by w^color_sharpen: >1 biases each voxel toward its dominant gaussian.
    # `progress`, if given, is called with the fraction of gaussians processed so far.
    # Returns (density, colour numerator, colour normaliser, grid origin as numpy, voxel size as float).
    margin = 4.0 * scale.median()
    lo = xyz.amin(0) - margin
    hi = xyz.amax(0) + margin
    voxel = ((hi - lo).max() / res).clamp_min(1e-8)
    dx, dy, dz = (torch.ceil((hi - lo) / voxel).long() + 1).tolist()
    cells = dx * dy * dz

    inv_cov = _inverse_covariance(scale, quat)
    max_half = int(kernel)
    half_width = torch.ceil(3.0 * scale.amax(-1) / voxel).long().clamp(1, max_half)  # per-gaussian half-width
    sharp = color_sharpen != 1.0
    vol = torch.zeros(cells, device=device)                                          # Sum(w) density (surface)
    colvol = torch.zeros(cells, 3, device=device, dtype=col_dtype)                   # Sum(w^p * rgb) colour numerator
    wcol = torch.zeros(cells, device=device, dtype=col_dtype) if sharp else None     # Sum(w^p) normaliser (p>1)

    total, done = xyz.shape[0], 0
    for k in range(1, max_half + 1):
        members = (half_width == k).nonzero(as_tuple=True)[0]
        if members.numel() == 0:
            continue
        steps = torch.arange(-k, k + 1, device=device, dtype=torch.float32)
        window = torch.stack(torch.meshgrid(steps, steps, steps, indexing="ij"), -1).reshape(-1, 3)  # (M, 3)
        for start in range(0, members.numel(), chunk):
            gi = members[start:start + chunk]
            centers = xyz[gi]
            coords = ((centers - lo) / voxel).round()[:, None, :] + window[None]     # (b, M, 3) voxel coords
            delta = (lo + coords * voxel) - centers[:, None, :]                      # world offset to voxel center
            quad = torch.einsum("bmi,bij,bmj->bm", delta, inv_cov[gi], delta)
            weight = opacity[gi, None] * torch.exp(-0.5 * quad)
            weight = torch.where(quad < 9.0, weight, torch.zeros_like(weight))       # clip beyond 3 sigma
            cell = coords.long()
            ix = cell[..., 0].clamp(0, dx - 1)
            iy = cell[..., 1].clamp(0, dy - 1)
            iz = cell[..., 2].clamp(0, dz - 1)
            flat = (ix * (dy * dz) + iy * dz + iz).reshape(-1)
            vol.index_add_(0, flat, weight.reshape(-1))
            col_weight = weight.pow(color_sharpen) if sharp else weight              # winner-take-more colour weight
            colvol.index_add_(0, flat, (col_weight[..., None] * rgb[gi, None, :]).reshape(-1, 3).to(col_dtype))
            if sharp:
                wcol.index_add_(0, flat, col_weight.reshape(-1).to(col_dtype))
            done += gi.numel()
            if progress is not None:
                progress(min(1.0, done / max(1, total)))

    normaliser = wcol if sharp else vol                                              # p==1 -> Sum(w) == density
    colnorm = normaliser.reshape(dx, dy, dz)
    return vol.reshape(dx, dy, dz), colvol.reshape(dx, dy, dz, 3), colnorm, lo.cpu().numpy(), float(voxel)
vol).reshape(dx, dy, dz) # p==1 -> Sum(w) == density + return vol.reshape(dx, dy, dz), colvol.reshape(dx, dy, dz, 3), colnorm, lo.cpu().numpy(), float(voxel) + + +def _connected_components_gpu(faces, nv): + # FastSV connected components: grandparent hooking + shortcutting, ~O(log nv) iterations. + # Returns per-vertex component labels (min node id, not densified). + a = torch.cat([faces[:, 0], faces[:, 1]]) # 2F edge endpoints: (v0,v1),(v1,v2) + b = torch.cat([faces[:, 1], faces[:, 2]]) + f = torch.arange(nv, device=faces.device) + while True: + gp = f[f] # grandparent + ga, gb = gp[a], gp[b] + new = f.clone() + new.scatter_reduce_(0, f[a], gb, "amin", include_self=True) # stochastic hooking onto roots + new.scatter_reduce_(0, f[b], ga, "amin", include_self=True) + new.scatter_reduce_(0, a, gb, "amin", include_self=True) # aggressive hooking, both directions + new.scatter_reduce_(0, b, ga, "amin", include_self=True) + new = new[new] # shortcut (path compression) + if torch.equal(new, f): + return f + f = new + + +def _clean_components_gpu(verts, faces, min_verts, device): + # GPU port of _clean_components: FastSV components + scatter reductions. 
Byte-identical to the numpy path + vt = torch.as_tensor(verts, device=device) + ft = torch.as_tensor(faces, device=device) + nv = vt.shape[0] + _, label = torch.unique(_connected_components_gpu(ft, nv), return_inverse=True) # dense 0..ncomp-1 + ncomp = int(label.max()) + 1 + flabel = label[ft[:, 0]] # component id per face + keep = torch.bincount(label, minlength=ncomp) >= min_verts # per-component vertex-count gate + if int(keep.sum()) > 1: + fcount = torch.bincount(flabel, minlength=ncomp) + largest = int(torch.where(keep, fcount, fcount.new_tensor(-1)).argmax()) + v0, v1, v2 = vt[ft[:, 0]], vt[ft[:, 1]], vt[ft[:, 2]] + cvol = torch.zeros(ncomp, device=device).scatter_add_(0, flabel, (v0 * torch.linalg.cross(v1, v2)).sum(-1)) + idx3 = label[:, None].expand(-1, 3) # per-component vertex bbox + cmin = torch.full((ncomp, 3), float("inf"), device=device).scatter_reduce_(0, idx3, vt, "amin", include_self=True) + cmax = torch.full((ncomp, 3), float("-inf"), device=device).scatter_reduce_(0, idx3, vt, "amax", include_self=True) + tol = 1e-4 * (cmax[largest] - cmin[largest]).max() + enclosed = (cmin >= cmin[largest] - tol).all(1) & (cmax <= cmax[largest] + tol).all(1) + inner = enclosed & (torch.sign(cvol) != torch.sign(cvol[largest])) & (torch.arange(ncomp, device=device) != largest) + keep &= ~inner + faces_k = ft[keep[flabel]] + if faces_k.shape[0] == 0: + return verts[:0], faces[:0] + used = torch.unique(faces_k) # sorted, matches np.unique + remap = torch.full((nv,), -1, dtype=torch.int64, device=device) + remap[used] = torch.arange(used.shape[0], device=device) + return vt[used].cpu().numpy(), remap[faces_k].cpu().numpy() + + +def _clean_components(verts, faces, min_verts, device=None): + # Drop floaters (components with < min_verts vertices) and inner shells - the surfel shell density + # extracts a double wall (outer + inner cavity surface). GPU path (FastSV CC + scatter reductions, ~13x + # faster) when an accelerator has headroom; else numpy/scipy. 
Both produce byte-identical output. + if device is not None and not comfy.model_management.is_device_cpu(device) and \ + comfy.model_management.get_free_memory(device) > 10 * faces.size * 8: # peak ~8.4x faces bytes + return _clean_components_gpu(verts, faces, min_verts, device) + nv = len(verts) + e = np.concatenate([faces[:, [0, 1]], faces[:, [1, 2]], faces[:, [0, 2]]], 0) + ncomp, label = connected_components(coo_matrix((np.ones(len(e)), (e[:, 0], e[:, 1])), shape=(nv, nv)), directed=False) + flabel = label[faces[:, 0]] # component id per face + keep = np.bincount(label, minlength=ncomp) >= min_verts # per-component vertex-count gate + if keep.sum() > 1: + fcount = np.bincount(flabel, minlength=ncomp) + largest = np.where(keep, fcount, -1).argmax() + v0, v1, v2 = verts[faces[:, 0]], verts[faces[:, 1]], verts[faces[:, 2]] + cvol = np.bincount(flabel, weights=np.einsum("ij,ij->i", v0, np.cross(v1, v2)), minlength=ncomp) # 6*signed vol + cidx = np.arange(ncomp) # per-component vertex bbox via ndimage (~6x faster than ufunc.at) + cmin = np.stack([_ndi_minimum(verts[:, a], label, cidx) for a in range(3)], 1) + cmax = np.stack([_ndi_maximum(verts[:, a], label, cidx) for a in range(3)], 1) + tol = 1e-4 * (cmax[largest] - cmin[largest]).max() + enclosed = (cmin >= cmin[largest] - tol).all(1) & (cmax <= cmax[largest] + tol).all(1) + inner = enclosed & (np.sign(cvol) != np.sign(cvol[largest])) & (np.arange(ncomp) != largest) + keep &= ~inner + faces = faces[keep[flabel]] + if len(faces) == 0: + return verts[:0], faces + used = np.unique(faces) + remap = np.full(nv, -1, np.int64) + remap[used] = np.arange(len(used)) + return verts[used], remap[faces] + + +def _surface_nets(vol, level, voxel, origin, device): + # Vectorized Surface Nets: one dual vertex per sign-changing cell at its edge-crossing mean, quads wound CCW-outward. + # Returns verts (V,3), faces (F,3). 
+ vol = vol.to(device=device, dtype=torch.float32) + dx, dy, dz = vol.shape + origin_t = torch.as_tensor(origin, device=device, dtype=torch.float32) + empty = (np.zeros((0, 3), np.float32), np.zeros((0, 3), np.int64)) + if dx < 2 or dy < 2 or dz < 2: + return empty + + # Active = cells whose 8 corners aren't all in/all out. + inside = vol >= level # (dx,dy,dz) bool + cs8 = [inside[ox:ox + dx - 1, oy:oy + dy - 1, oz:oz + dz - 1] + for ox, oy, oz in ((0, 0, 0), (1, 0, 0), (0, 1, 0), (1, 1, 0), + (0, 0, 1), (1, 0, 1), (0, 1, 1), (1, 1, 1))] + any_in = cs8[0] | cs8[1] | cs8[2] | cs8[3] | cs8[4] | cs8[5] | cs8[6] | cs8[7] + all_in = cs8[0] & cs8[1] & cs8[2] & cs8[3] & cs8[4] & cs8[5] & cs8[6] & cs8[7] + active = any_in & ~all_in # (cx,cy,cz) straddling cells + nv = int(active.sum()) + if nv == 0: + return empty + + # Active cells only (a thin shell): each dual vertex = mean of its 12 edges' zero-crossings. + del any_in, all_in, cs8 # corner bool grids no longer needed + ac = active.nonzero(as_tuple=False) # (nv,3) cell min-corner indices + offs = torch.tensor([[0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0], + [0, 0, 1], [1, 0, 1], [0, 1, 1], [1, 1, 1]], device=device) + offf = offs.to(torch.float32) + edges = torch.tensor([[0, 1], [0, 2], [0, 4], [1, 3], [1, 5], [2, 3], + [2, 6], [3, 7], [4, 5], [4, 6], [5, 7], [6, 7]], device=device) + e0, e1 = edges[:, 0], edges[:, 1] + oe0, oe1 = offf[e0], offf[e1] # (12,3) edge endpoints + + cstep = 1 << 18 # chunk to bound peak memory (CPU RAM too) + loc = [] + for st in range(0, nv, cstep): + ci = ac[st:st + cstep, None, :] + offs[None] # (m,8,3) + cval = vol[ci[..., 0], ci[..., 1], ci[..., 2]] # (m,8) corner values + csl = cval >= level + v0, v1 = cval[:, e0], cval[:, e1] # (m,12) + cross = (csl[:, e0] != csl[:, e1])[..., None].to(torch.float32) + denom = v1 - v0 + t = torch.where(denom.abs() > 1e-12, (level - v0) / denom, torch.full_like(denom, 0.5)).clamp(0, 1) + pts = torch.lerp(oe0, oe1, t[..., None]) # (m,12,3) local crossings 
(fused interp) + loc.append((pts * cross).sum(1) / cross.sum(1).clamp_min(1.0)) # (m,3) in [0,1] + local = torch.cat(loc, 0) if len(loc) > 1 else loc[0] # (nv,3) + verts = origin_t + (ac.to(torch.float32) + local) * voxel # world space + del loc, local, ac + + vid = torch.full((dx - 1, dy - 1, dz - 1), -1, dtype=torch.int32, device=device) + vid[active] = torch.arange(nv, dtype=torch.int32, device=device) + del active + + # Each straddling grid edge -> one quad from its 4 cells; `sol` (low-end sign) picks outward winding. + faces = [] + + def emit(cr, sol, a, b, d, c): + valid = cr & (a >= 0) & (b >= 0) & (c >= 0) & (d >= 0) + if not bool(valid.any()): + return + a, b, c, d, sol = a[valid], b[valid], c[valid], d[valid], sol[valid] + p2, p4 = torch.where(sol, b, c), torch.where(sol, c, b) # reverse quad winding where ~sol + faces.append(torch.stack([a, p2, d], 1)) + faces.append(torch.stack([a, d, p4], 1)) + + a = inside[0:dx - 1, 1:dy - 1, 1:dz - 1] + emit(a != inside[1:dx, 1:dy - 1, 1:dz - 1], a, + vid[:, 0:dy - 2, 0:dz - 2], vid[:, 1:dy - 1, 0:dz - 2], + vid[:, 1:dy - 1, 1:dz - 1], vid[:, 0:dy - 2, 1:dz - 1]) + a = inside[1:dx - 1, 0:dy - 1, 1:dz - 1] + emit(a != inside[1:dx - 1, 1:dy, 1:dz - 1], a, + vid[0:dx - 2, :, 0:dz - 2], vid[0:dx - 2, :, 1:dz - 1], + vid[1:dx - 1, :, 1:dz - 1], vid[1:dx - 1, :, 0:dz - 2]) + a = inside[1:dx - 1, 1:dy - 1, 0:dz - 1] + emit(a != inside[1:dx - 1, 1:dy - 1, 1:dz], a, + vid[0:dx - 2, 0:dy - 2, :], vid[1:dx - 1, 0:dy - 2, :], + vid[1:dx - 1, 1:dy - 1, :], vid[0:dx - 2, 1:dy - 1, :]) + + if not faces: + return empty + return verts.cpu().numpy().astype(np.float32), torch.cat(faces, 0).cpu().numpy().astype(np.int64) + + +def _otsu_level(values, bins=256): + # Otsu threshold: the density value that best splits inside/outside (max between-class variance). 
+ hist, edges = np.histogram(values, bins=bins) + hist = hist.astype(np.float64) + centers = (edges[:-1] + edges[1:]) * 0.5 + w = np.cumsum(hist) # background-class weight at each split + mu = np.cumsum(hist * centers) + wf = w[-1] - w # foreground-class weight + mb = mu / np.where(w > 0, w, 1.0) + mf = (mu[-1] - mu) / np.where(wf > 0, wf, 1.0) + var_b = w * wf * (mb - mf) ** 2 # between-class variance + var_b[(w <= 0) | (wf <= 0)] = -1.0 + return float(centers[int(np.argmax(var_b))]) + + +def _taubin_smooth(verts, faces, iters, lam=0.5, mu=-0.53): + # Taubin lambda|mu smoothing: low-pass the mesh surface without the shrinkage of a Laplacian blur + # (the mu inflation pass cancels the lambda pass's volume loss). Uniform (umbrella) weights. + if iters <= 0 or len(verts) == 0 or len(faces) == 0: + return verts + nv = len(verts) + e = np.concatenate([faces[:, [0, 1]], faces[:, [1, 2]], faces[:, [0, 2]]], 0) + e = np.concatenate([e, e[:, ::-1]], 0) # symmetric adjacency + adj = coo_matrix((np.ones(len(e), np.float32), (e[:, 0], e[:, 1])), shape=(nv, nv)).tocsr() + adj.data[:] = 1.0 + deg = np.clip(np.asarray(adj.sum(1)).ravel(), 1.0, None).astype(np.float32)[:, None] + v = verts.astype(np.float32) # fp32 matvec: ~2x faster, sub-micron drift on unit-scale verts + for _ in range(int(iters)): + for fac in (lam, mu): + v = v + np.float32(fac) * ((adj @ v) / deg - v) # fac * (mean(neighbours) - v) + return np.ascontiguousarray(v) + + +def _sample_vertex_colours_gpu(colvol, colnorm, verts, origin, voxel, device): + # GPU trilinear sampling of the colour numerator (3ch) and normaliser (1ch) at vertex grid-coords + # reproduces scipy map_coordinates(order=1, mode='nearest'). Returns col (V,3) numpy. 
+ dx, dy, dz = colnorm.shape + vt = torch.as_tensor(verts, device=device, dtype=torch.float32) + org = torch.as_tensor(origin, device=device, dtype=torch.float32) + gi = (vt - org) / voxel # (V,3) grid-index coords (x,y,z) + size = torch.tensor([dx, dy, dz], device=device, dtype=torch.float32) + g = 2.0 * gi / (size - 1).clamp_min(1.0) - 1.0 # -> [-1,1] (align_corners) + grid = torch.stack([g[:, 2], g[:, 1], g[:, 0]], -1)[None, None, None] # (1,1,1,V,3): grid_sample order (W=z,H=y,D=x) + + def samp(v): # (dx,dy,dz,C) cpu fp16 -> (C,V) fp32 on device + inp = v.to(device).permute(3, 0, 1, 2)[None].float() + o = torch.nn.functional.grid_sample(inp, grid, mode="bilinear", padding_mode="border", align_corners=True) + return o[0, :, 0, 0, :] + num = samp(colvol) # (3,V) + den = samp(colnorm[..., None]) # (1,V) + return (num / den.clamp_min(1e-8)).T.cpu().numpy() # (V,3) + + +def _gaussian_to_mesh(g: Types.SPLAT, i, res, kernel, taubin, level_bias, min_component, min_opacity, color_sharpen, device, progress=None): + # Mesh one splat: density + colour grids -> Surface Nets -> floater removal -> Taubin smoothing -> + # volume-sampled colours. Returns (verts, faces int64, colors in [0,1]), or None if no surface. 
+ rep = progress if progress is not None else (lambda *_: None) + + end = _real_len(g, i) + xyz = g.positions[i, :end].to(device=device, dtype=torch.float32) + scale = g.scales[i, :end].to(device=device, dtype=torch.float32) + quat = g.rotations[i, :end].to(device=device, dtype=torch.float32) + opacity = g.opacities[i, :end].reshape(-1).to(device=device, dtype=torch.float32) + rgb = (g.sh[i, :end, 0, :].to(device=device, dtype=torch.float32) * _C0 + 0.5).clamp(0, 1) + + keep = opacity >= min_opacity + xyz, scale, quat, opacity, rgb = xyz[keep], scale[keep], quat[keep], opacity[keep], rgb[keep] + if xyz.shape[0] == 0: + return None + + vol, colvol, colnorm, origin, voxel = _splat_density(xyz, opacity, scale, quat, rgb, res, kernel, device, + color_sharpen=color_sharpen, + progress=lambda f: rep(0.25 * f)) # density build: 0 -> 25% + # Colour: sample on the GPU (grid_sample) when there's headroom + colour_gpu = not comfy.model_management.is_device_cpu(device) and comfy.model_management.get_free_memory(device) > 6 * vol.numel() * 4 + if colour_gpu: + colvol_cpu, colnorm_cpu = colvol.cpu(), colnorm.half().cpu() # park colours (fp16) off-GPU during meshing + colvol_np = colnorm_np = None + else: + colvol_np = colvol.cpu().numpy().astype(np.float32) # Sum(w^p * rgb) colour numerator (fp16 grid -> fp32) + colnorm_np = colnorm.cpu().numpy().astype(np.float32) # Sum(w^p) colour normaliser + del colvol, colnorm # free the colour grids before iso-surfacing + rep(0.40) + + vmin, vmax = float(vol.min()), float(vol.max()) + occ = vol[vol > vmax * 1e-3] # occupied voxels (skip the empty-space peak) + if occ.numel() == 0: + return None + # Otsu picks the inside/outside split principledly; `level_bias` nudges it (1.0 = auto). Clamp strictly + # inside the data range so a bias can't push the iso off the histogram. 
+ level = min(max(_otsu_level(occ.cpu().numpy()) * level_bias, vmin + 1e-6 * (vmax - vmin)), + vmax - 1e-6 * (vmax - vmin)) + + # Iso-surface on the accelerator when there's headroom: ~15x faster than CPU, identical output. Chunked + # Surface Nets peaks at ~3-3.5x the density grid, so fall back to CPU for large grids / tight VRAM. + sn_dev = device + if not comfy.model_management.is_device_cpu(device) and comfy.model_management.get_free_memory(device) < 6 * vol.numel() * 4: + sn_dev = torch.device("cpu") + vol = vol.cpu() + verts, faces = _surface_nets(vol, level, voxel, origin, sn_dev) + del vol + rep(0.55) + if min_component > 0 and len(faces) > 0: + verts, faces = _clean_components(verts, faces, min_component, device) + if len(verts) == 0 or len(faces) == 0: + return None + + # Taubin smooths the blocky iso without shrinking it (unlike blurring the density, which rounds features). + verts = _taubin_smooth(verts, faces, taubin) + rep(0.7) + + # Colour each vertex from the co-splatted colour volume: trilinearly sample the numerator Sum(w^p*rgb) + # and normaliser Sum(w^p) separately, then divide. Normalising AFTER interpolation keeps zero-density + # edge voxels from pulling colours toward black, and matches the gaussians that formed the surface. + if colour_gpu: + col = _sample_vertex_colours_gpu(colvol_cpu, colnorm_cpu, verts, origin, voxel, device) + else: + coords = ((verts - origin) / voxel).T # (3, V) grid-index coords, matching volume axes + num = np.stack([map_coordinates(colvol_np[..., c], coords, order=1, mode="nearest") for c in range(3)], -1) + den = map_coordinates(colnorm_np, coords, order=1, mode="nearest") + col = num / np.clip(den, 1e-8, None)[:, None] + rep(1.0) + + # The unlit material's COLOR_0 is linear and the viewer sRGB-encodes it on output; the splat colours + # are display (sRGB) values, so convert sRGB -> linear here to land at the same brightness as the splat. 
+ col = np.clip(col, 0, 1) + col = np.where(col <= 0.04045, col / 12.92, ((col + 0.055) / 1.055) ** 2.4).astype(np.float32) + + # Splat +Y is glTF's -Y: rotate 180 deg about X (negate Y,Z) to land upright. Proper rotation, so + # winding is kept; done after colouring (which works in the splat frame). + verts = np.ascontiguousarray(verts * np.array([1.0, -1.0, -1.0], dtype=np.float32)) + return (torch.from_numpy(verts), torch.from_numpy(faces), torch.from_numpy(col)) + + +class SplatToMesh(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="SplatToMesh", + display_name="Extract Mesh from Splat", + search_aliases=["splat to mesh", "gaussian surface nets", "splat surface", "mesh splat"], + category="3d/splat", + description="Extract a coloured mesh from a gaussian splat.", + inputs=[ + IO.Splat.Input("splat"), + IO.Int.Input("resolution", default=384, min=64, max=768, step=16, + tooltip="Density-grid resolution along the longest axis. Higher = finer surface, " + "more VRAM/time (grows with resolution^3)."), + IO.Int.Input("kernel", default=5, min=1, max=8, + tooltip="Max splat half-width in voxels. Each gaussian is rasterized over a window " + "sized to its own 3-sigma, capped here - small surfels stay cheap, large ones " + "aren't truncated. Raise if sparse splats leave gaps."), + IO.Int.Input("smooth", default=0, min=0, max=60, advanced = True, + tooltip="Taubin mesh-smoothing iterations. Smooths the surface without shrinking it " + "(volume-preserving), unlike blurring the density. 0 = raw surface."), + IO.Float.Input("level", default=0.4, min=0.0, max=2.0, step=0.01, + tooltip="Iso-surface level. Auto-picked by Otsu; this biases it (1.0 = auto, lower = " + "fatter/more-connected surface, higher = thinner/tighter)."), + IO.Int.Input("min_component", default=500, min=0, max=100000, step=50, advanced=True, + tooltip="Drop connected components smaller than this many vertices (0 = keep all). 
" + "Removes detached floater blobs and the inner shell of the double wall."), + IO.Float.Input("min_opacity", default=0.02, min=0.0, max=1.0, step=0.01, advanced=True, + tooltip="Ignore gaussians fainter than this before meshing."), + IO.Float.Input("color_sharpen", default=2.0, min=1.0, max=8.0, step=0.5, + tooltip="Crisp up the vertex texture: 1.0 = physically-correct blend; higher biases " + "each voxel's colour toward its dominant gaussian instead of averaging " + "neighbours (de-smears the texture). Colour only - geometry is unchanged."), + ], + outputs=[IO.Mesh.Output(display_name="mesh")], + ) + + @classmethod + def execute(cls, splat, resolution, kernel, smooth, level, min_component, min_opacity, color_sharpen) -> IO.NodeOutput: + device = comfy.model_management.get_torch_device() + b = splat.positions.shape[0] + prec = 1000 # each splat owns a 0..prec block of the bar; its callback advances within that block + pbar = comfy.utils.ProgressBar(b * prec) + + verts_l, faces_l, colors_l = [], [], [] + for i in range(b): + cb = lambda f, base=i * prec: pbar.update_absolute(base + int(min(max(f, 0.0), 1.0) * prec)) + res = _gaussian_to_mesh(splat, i, resolution, kernel, smooth, level, min_component, min_opacity, color_sharpen, device, cb) + if res is None: + logging.warning("SplatToMesh: splat %d produced no surface; emitting an empty mesh.", i) + v, f, c = torch.zeros((0, 3)), torch.zeros((0, 3), dtype=torch.int64), torch.zeros((0, 3)) + else: + v, f, c = res + verts_l.append(v) + faces_l.append(f) + colors_l.append(c) + pbar.update_absolute((i + 1) * prec) # snap to block end (covers empty / early-out splats) + # unlit: render flat (emissive-like) so SaveGLB matches the splat instead of lighting/washing it. 
+ return IO.NodeOutput(pack_variable_mesh_batch(verts_l, faces_l, colors=colors_l, unlit=True)) + + +class GaussianExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [SplatToFile3D, File3DToSplat, RenderSplat, CreateCameraInfo, TransformSplat, + GetSplatCount, MergeSplat, SplatToMesh] + + +async def comfy_entrypoint() -> GaussianExtension: + return GaussianExtension() diff --git a/comfy_extras/nodes_gits.py b/comfy_extras/nodes_gits.py index 0b7666524..434a24387 100644 --- a/comfy_extras/nodes_gits.py +++ b/comfy_extras/nodes_gits.py @@ -340,7 +340,7 @@ class GITSScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="GITSScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Float.Input("coeff", default=1.20, min=0.80, max=1.50, step=0.05, advanced=True), io.Int.Input("steps", default=10, min=2, max=1000), diff --git a/comfy_extras/nodes_hidream_o1.py b/comfy_extras/nodes_hidream_o1.py index f393745f6..8648d2e26 100644 --- a/comfy_extras/nodes_hidream_o1.py +++ b/comfy_extras/nodes_hidream_o1.py @@ -14,7 +14,7 @@ class EmptyHiDreamO1LatentImage(io.ComfyNode): return io.Schema( node_id="EmptyHiDreamO1LatentImage", display_name="Empty HiDream-O1 Latent Image", - category="latent/image", + category="model/latent/image", description=( "Empty pixel-space latent for HiDream-O1-Image. The model was " "trained at ~4 megapixels; lower resolutions go off-distribution " @@ -47,7 +47,7 @@ class HiDreamO1ReferenceImages(io.ComfyNode): return io.Schema( node_id="HiDreamO1ReferenceImages", display_name="HiDream-O1 Reference Images", - category="conditioning/image", + category="model/conditioning/image", description=( "Attach 1-10 reference images to conditioning, one for edit instruction" "or multiple for subject-driven personalization." 
diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py index 9e4873be5..16fff12af 100644 --- a/comfy_extras/nodes_hunyuan.py +++ b/comfy_extras/nodes_hunyuan.py @@ -41,7 +41,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode): return io.Schema( node_id="EmptyHunyuanLatentVideo", display_name="Empty HunyuanVideo 1.0 Latent", - category="latent/video", + category="model/latent/video", inputs=[ io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -81,7 +81,7 @@ class HunyuanVideo15ImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="HunyuanVideo15ImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -132,7 +132,7 @@ class HunyuanVideo15SuperResolution(io.ComfyNode): return io.Schema( node_id="HunyuanVideo15SuperResolution", display_name="Hunyuan Video 1.5 Super Resolution", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -178,7 +178,7 @@ class LatentUpscaleModelLoader(io.ComfyNode): return io.Schema( node_id="LatentUpscaleModelLoader", display_name="Load Latent Upscale Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("latent_upscale_models")), ], @@ -227,7 +227,7 @@ class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode): return io.Schema( node_id="HunyuanVideo15LatentUpscaleWithModel", display_name="Hunyuan Video 15 Latent Upscale With Model", - category="latent", + category="model/latent", inputs=[ io.LatentUpscaleModel.Input("model"), io.Latent.Input("samples"), @@ -308,7 +308,7 @@ class HunyuanImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( 
node_id="HunyuanImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Vae.Input("vae"), @@ -359,7 +359,7 @@ class EmptyHunyuanImageLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptyHunyuanImageLatent", - category="latent", + category="model/latent", inputs=[ io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32), io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32), @@ -384,7 +384,7 @@ class HunyuanRefinerLatent(io.ComfyNode): return io.Schema( node_id="HunyuanRefinerLatent", display_name="Hunyuan Latent Refiner", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py index bcd3f9198..60e530626 100644 --- a/comfy_extras/nodes_hunyuan3d.py +++ b/comfy_extras/nodes_hunyuan3d.py @@ -12,7 +12,7 @@ class EmptyLatentHunyuan3Dv2(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="EmptyLatentHunyuan3Dv2", - category="latent/3d", + category="model/latent/3d", inputs=[ IO.Int.Input("resolution", default=3072, min=1, max=8192), IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."), @@ -35,7 +35,7 @@ class Hunyuan3Dv2Conditioning(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="Hunyuan3Dv2Conditioning", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ IO.ClipVisionOutput.Input("clip_vision_output"), ], @@ -60,7 +60,7 @@ class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="Hunyuan3Dv2ConditioningMultiView", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ IO.ClipVisionOutput.Input("front", optional=True), 
IO.ClipVisionOutput.Input("left", optional=True), @@ -97,7 +97,7 @@ class VAEDecodeHunyuan3D(IO.ComfyNode): def define_schema(cls): return IO.Schema( node_id="VAEDecodeHunyuan3D", - category="latent/3d", + category="model/latent/3d", inputs=[ IO.Latent.Input("samples"), IO.Vae.Input("vae"), diff --git a/comfy_extras/nodes_hypernetwork.py b/comfy_extras/nodes_hypernetwork.py index 44a9c6f97..2d3f1bd05 100644 --- a/comfy_extras/nodes_hypernetwork.py +++ b/comfy_extras/nodes_hypernetwork.py @@ -103,7 +103,7 @@ class HypernetworkLoader(IO.ComfyNode): return IO.Schema( node_id="HypernetworkLoader", display_name="Load Hypernetwork", - category="loaders", + category="model/loaders", inputs=[ IO.Model.Input("model"), IO.Combo.Input("hypernetwork_name", options=folder_paths.get_filename_list("hypernetworks")), diff --git a/comfy_extras/nodes_hypertile.py b/comfy_extras/nodes_hypertile.py index 354d96db1..2a96416be 100644 --- a/comfy_extras/nodes_hypertile.py +++ b/comfy_extras/nodes_hypertile.py @@ -27,7 +27,7 @@ class HyperTile(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="HyperTile", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Int.Input("tile_size", default=256, min=1, max=2048, advanced=True), diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py index fe6008aa3..469a7be55 100644 --- a/comfy_extras/nodes_images.py +++ b/comfy_extras/nodes_images.py @@ -95,7 +95,7 @@ class BoundingBox(IO.ComfyNode): return IO.Schema( node_id="PrimitiveBoundingBox", display_name="Bounding Box", - category="utils/primitive", + category="utilities/primitive", inputs=[ IO.Int.Input("x", default=0, min=0, max=MAX_RESOLUTION), IO.Int.Input("y", default=0, min=0, max=MAX_RESOLUTION), diff --git a/comfy_extras/nodes_ip2p.py b/comfy_extras/nodes_ip2p.py index 78f29915d..9c80834f0 100644 --- a/comfy_extras/nodes_ip2p.py +++ b/comfy_extras/nodes_ip2p.py @@ -9,7 +9,7 @@ class 
InstructPixToPixConditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="InstructPixToPixConditioning", - category="conditioning/instructpix2pix", + category="model/conditioning/instructpix2pix", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_kandinsky5.py b/comfy_extras/nodes_kandinsky5.py index 346c50cde..015965498 100644 --- a/comfy_extras/nodes_kandinsky5.py +++ b/comfy_extras/nodes_kandinsky5.py @@ -13,7 +13,7 @@ class Kandinsky5ImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Kandinsky5ImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -71,7 +71,7 @@ class NormalizeVideoLatentStart(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="NormalizeVideoLatentStart", - category="conditioning/video_models", + category="model/conditioning/video_models", description="Normalizes the initial frames of a video latent to match the mean and standard deviation of subsequent reference frames. 
Helps reduce differences between the starting frames and the rest of the video.", inputs=[ io.Latent.Input("latent"), diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py index 8bb368dec..32da9e8ac 100644 --- a/comfy_extras/nodes_latent.py +++ b/comfy_extras/nodes_latent.py @@ -22,7 +22,7 @@ class LatentAdd(io.ComfyNode): return io.Schema( node_id="LatentAdd", search_aliases=["combine latents", "sum latents"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples1"), io.Latent.Input("samples2"), @@ -49,7 +49,7 @@ class LatentSubtract(io.ComfyNode): return io.Schema( node_id="LatentSubtract", search_aliases=["difference latent", "remove features"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples1"), io.Latent.Input("samples2"), @@ -76,7 +76,7 @@ class LatentMultiply(io.ComfyNode): return io.Schema( node_id="LatentMultiply", search_aliases=["scale latent", "amplify latent", "latent gain"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples"), io.Float.Input("multiplier", default=1.0, min=-10.0, max=10.0, step=0.01), @@ -100,7 +100,7 @@ class LatentInterpolate(io.ComfyNode): return io.Schema( node_id="LatentInterpolate", search_aliases=["blend latent", "mix latent", "lerp latent", "transition"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples1"), io.Latent.Input("samples2"), @@ -139,7 +139,7 @@ class LatentConcat(io.ComfyNode): return io.Schema( node_id="LatentConcat", search_aliases=["join latents", "stitch latents"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples1"), io.Latent.Input("samples2"), @@ -179,7 +179,7 @@ class LatentCut(io.ComfyNode): return io.Schema( node_id="LatentCut", search_aliases=["crop latent", "slice latent", "extract region"], - category="latent/advanced", + 
category="model/latent/advanced", inputs=[ io.Latent.Input("samples"), io.Combo.Input("dim", options=["x", "y", "t"]), @@ -220,7 +220,7 @@ class LatentCutToBatch(io.ComfyNode): return io.Schema( node_id="LatentCutToBatch", search_aliases=["slice to batch", "split latent", "tile latent"], - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples"), io.Combo.Input("dim", options=["t", "x", "y"]), @@ -262,7 +262,7 @@ class LatentBatch(io.ComfyNode): return io.Schema( node_id="LatentBatch", search_aliases=["combine latents", "merge latents", "join latents"], - category="latent/batch", + category="model/latent/batch", is_deprecated=True, inputs=[ io.Latent.Input("samples1"), @@ -290,7 +290,7 @@ class LatentBatchSeedBehavior(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentBatchSeedBehavior", - category="latent/advanced", + category="model/latent/advanced", inputs=[ io.Latent.Input("samples"), io.Combo.Input("seed_behavior", options=["random", "fixed"], default="fixed"), @@ -319,7 +319,7 @@ class LatentApplyOperation(io.ComfyNode): return io.Schema( node_id="LatentApplyOperation", search_aliases=["transform latent"], - category="latent/advanced/operations", + category="model/latent/advanced/operations", is_experimental=True, inputs=[ io.Latent.Input("samples"), @@ -343,7 +343,7 @@ class LatentApplyOperationCFG(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentApplyOperationCFG", - category="latent/advanced/operations", + category="model/latent/advanced/operations", is_experimental=True, inputs=[ io.Model.Input("model"), @@ -375,7 +375,7 @@ class LatentOperationTonemapReinhard(io.ComfyNode): return io.Schema( node_id="LatentOperationTonemapReinhard", search_aliases=["hdr latent"], - category="latent/advanced/operations", + category="model/latent/advanced/operations", is_experimental=True, inputs=[ io.Float.Input("multiplier", default=1.0, min=0.0, max=100.0, step=0.01), @@ -410,7 +410,7 
@@ class LatentOperationSharpen(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LatentOperationSharpen", - category="latent/advanced/operations", + category="model/latent/advanced/operations", is_experimental=True, inputs=[ io.Int.Input("sharpen_radius", default=9, min=1, max=31, step=1, advanced=True), @@ -447,7 +447,7 @@ class ReplaceVideoLatentFrames(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ReplaceVideoLatentFrames", - category="latent/batch", + category="model/latent/batch", inputs=[ io.Latent.Input("destination", tooltip="The destination latent where frames will be replaced."), io.Latent.Input("source", optional=True, tooltip="The source latent providing frames to insert into the destination latent. If not provided, the destination latent is returned unchanged."), diff --git a/comfy_extras/nodes_load_3d.py b/comfy_extras/nodes_load_3d.py index 9112bdd0a..b339dc4ff 100644 --- a/comfy_extras/nodes_load_3d.py +++ b/comfy_extras/nodes_load_3d.py @@ -34,7 +34,7 @@ class Load3D(IO.ComfyNode): essentials_category="Basics", is_experimental=True, inputs=[ - IO.Combo.Input("model_file", options=sorted(files), upload=IO.UploadType.model), + IO.Combo.Input("model_file", options=["none"] + sorted(files), upload=IO.UploadType.model), IO.Load3D.Input("image"), IO.Int.Input("width", default=1024, min=1, max=4096, step=1), IO.Int.Input("height", default=1024, min=1, max=4096, step=1), @@ -47,6 +47,7 @@ class Load3D(IO.ComfyNode): IO.Load3DCamera.Output(display_name="camera_info"), IO.Video.Output(display_name="recording_video"), IO.File3DAny.Output(display_name="model_3d"), + IO.Load3DModelInfo.Output(display_name="model_3d_info"), ], ) @@ -68,8 +69,13 @@ class Load3D(IO.ComfyNode): video = InputImpl.VideoFromFile(recording_video_path) - file_3d = Types.File3D(folder_paths.get_annotated_filepath(model_file)) - return IO.NodeOutput(output_image, output_mask, model_file, normal_image, image['camera_info'], video, file_3d) + file_3d = 
None + mesh_path = "" + if model_file and model_file != "none": + file_3d = Types.File3D(folder_paths.get_annotated_filepath(model_file)) + mesh_path = model_file + model_3d_info = image.get('model_3d_info', []) + return IO.NodeOutput(output_image, output_mask, mesh_path, normal_image, image['camera_info'], video, file_3d, model_3d_info) process = execute # TODO: remove @@ -118,12 +124,71 @@ class Preview3D(IO.ComfyNode): process = execute # TODO: remove +class Preview3DAdvanced(IO.ComfyNode): + @classmethod + def define_schema(cls): + return IO.Schema( + node_id="Preview3DAdvanced", + display_name="Preview 3D (Advanced)", + search_aliases=["preview 3d", "3d viewer", "view mesh", "frame 3d", "3d camera output"], + category="3d", + is_experimental=True, + is_output_node=True, + inputs=[ + IO.MultiType.Input( + "model_file", + types=[ + IO.File3DGLB, + IO.File3DGLTF, + IO.File3DFBX, + IO.File3DOBJ, + IO.File3DSTL, + IO.File3DUSDZ, + IO.File3DAny, + ], + tooltip="3D model file from an upstream 3D node.", + ), + IO.Load3D.Input("image"), + IO.Load3DCamera.Input("camera_info", optional=True, advanced=True), + IO.Load3DModelInfo.Input("model_3d_info", optional=True, advanced=True), + IO.Int.Input("width", default=1024, min=1, max=4096, step=1), + IO.Int.Input("height", default=1024, min=1, max=4096, step=1), + ], + outputs=[ + IO.File3DAny.Output(display_name="model_file"), + IO.Load3DCamera.Output(display_name="camera_info"), + IO.Load3DModelInfo.Output(display_name="model_3d_info"), + IO.Int.Output(display_name="width"), + IO.Int.Output(display_name="height"), + ], + ) + + @classmethod + def execute(cls, model_file: Types.File3D, image, width: int, height: int, **kwargs) -> IO.NodeOutput: + filename = f"preview3d_advanced_{uuid.uuid4().hex}.{model_file.format}" + model_file.save_to(os.path.join(folder_paths.get_output_directory(), filename)) + + camera_info_input = kwargs.get("camera_info", None) + camera_info = camera_info_input if camera_info_input is not None else 
image['camera_info'] + model_3d_info_input = kwargs.get("model_3d_info", None) + model_3d_info = model_3d_info_input if model_3d_info_input is not None else image.get('model_3d_info', []) + return IO.NodeOutput( + model_file, + camera_info, + model_3d_info, + width, + height, + ui=UI.PreviewUI3DAdvanced(filename, camera_info, model_3d_info), + ) + + class Load3DExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[IO.ComfyNode]]: return [ Load3D, Preview3D, + Preview3DAdvanced, ] diff --git a/comfy_extras/nodes_logic.py b/comfy_extras/nodes_logic.py index 92507f1fc..95f6ab848 100644 --- a/comfy_extras/nodes_logic.py +++ b/comfy_extras/nodes_logic.py @@ -13,7 +13,7 @@ class NotNode(io.ComfyNode): return io.Schema( node_id="ComfyNotNode", display_name="Not", - category="utils/logic", + category="utilities/logic", description="Logical NOT operation. Returns true if the value is falsy. Uses Python's rules for truthiness.", search_aliases=["invert", "toggle", "negate", "flip boolean"], inputs=[ @@ -40,7 +40,7 @@ class AndNode(io.ComfyNode): return io.Schema( node_id="ComfyAndNode", display_name="And", - category="utils/logic", + category="utilities/logic", description="Logical AND operation. Returns true if all of the values are truthy. Uses Python's rules for truthiness.", search_aliases=["all", "every"], inputs=[ @@ -67,7 +67,7 @@ class OrNode(io.ComfyNode): return io.Schema( node_id="ComfyOrNode", display_name="Or", - category="utils/logic", + category="utilities/logic", description="Logical OR operation. Returns true if any of the values are truthy. 
Uses Python's rules for truthiness.", search_aliases=["any", "some"], inputs=[ @@ -90,7 +90,7 @@ class SwitchNode(io.ComfyNode): return io.Schema( node_id="ComfySwitchNode", display_name="Switch", - category="utils/logic", + category="utilities/logic", is_experimental=True, inputs=[ io.Boolean.Input("switch"), @@ -121,7 +121,7 @@ class SoftSwitchNode(io.ComfyNode): return io.Schema( node_id="ComfySoftSwitchNode", display_name="Soft Switch", - category="utils/logic", + category="utilities/logic", is_experimental=True, inputs=[ io.Boolean.Input("switch"), @@ -176,7 +176,7 @@ class CustomComboNode(io.ComfyNode): return io.Schema( node_id="CustomCombo", display_name="Custom Combo", - category="utils", + category="utilities", is_experimental=True, inputs=[io.Combo.Input("choice", options=[])], outputs=[ @@ -211,7 +211,7 @@ class DCTestNode(io.ComfyNode): return io.Schema( node_id="DCTestNode", display_name="DCTest", - category="utils/logic", + category="utilities/logic", is_output_node=True, inputs=[io.DynamicCombo.Input("combo", options=[ io.DynamicCombo.Option("option1", [io.String.Input("string")]), @@ -249,7 +249,7 @@ class AutogrowNamesTestNode(io.ComfyNode): return io.Schema( node_id="AutogrowNamesTestNode", display_name="AutogrowNamesTest", - category="utils/logic", + category="utilities/logic", inputs=[ _io.Autogrow.Input("autogrow", template=template) ], @@ -269,7 +269,7 @@ class AutogrowPrefixTestNode(io.ComfyNode): return io.Schema( node_id="AutogrowPrefixTestNode", display_name="AutogrowPrefixTest", - category="utils/logic", + category="utilities/logic", inputs=[ _io.Autogrow.Input("autogrow", template=template) ], @@ -288,7 +288,7 @@ class ComboOutputTestNode(io.ComfyNode): return io.Schema( node_id="ComboOptionTestNode", display_name="ComboOptionTest", - category="utils/logic", + category="utilities/logic", inputs=[io.Combo.Input("combo", options=["option1", "option2", "option3"]), io.Combo.Input("combo2", options=["option4", "option5", "option6"])], 
outputs=[io.Combo.Output(), io.Combo.Output()], @@ -305,7 +305,7 @@ class ConvertStringToComboNode(io.ComfyNode): node_id="ConvertStringToComboNode", search_aliases=["string to dropdown", "text to combo"], display_name="Convert String to Combo", - category="utils/logic", + category="utilities/logic", inputs=[io.String.Input("string")], outputs=[io.Combo.Output()], ) @@ -321,7 +321,7 @@ class InvertBooleanNode(io.ComfyNode): node_id="InvertBooleanNode", search_aliases=["not", "toggle", "negate", "flip boolean"], display_name="Invert Boolean", - category="utils/logic", + category="utilities/logic", inputs=[io.Boolean.Input("boolean")], outputs=[io.Boolean.Output()], ) diff --git a/comfy_extras/nodes_lora_debug.py b/comfy_extras/nodes_lora_debug.py index 937a0fbfb..3f68064e5 100644 --- a/comfy_extras/nodes_lora_debug.py +++ b/comfy_extras/nodes_lora_debug.py @@ -30,7 +30,7 @@ class LoraLoaderBypass: OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.") FUNCTION = "load_lora" - CATEGORY = "loaders" + CATEGORY = "model/loaders" DESCRIPTION = "Apply LoRA in bypass mode. Unlike regular LoRA, this doesn't modify model weights - instead it injects the LoRA computation during forward pass. Useful for training scenarios." 
EXPERIMENTAL = True diff --git a/comfy_extras/nodes_lotus.py b/comfy_extras/nodes_lotus.py index 9f62ba2bf..9fe4c5c7b 100644 --- a/comfy_extras/nodes_lotus.py +++ b/comfy_extras/nodes_lotus.py @@ -10,7 +10,7 @@ class LotusConditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LotusConditioning", - category="conditioning/lotus", + category="model/conditioning/lotus", inputs=[], outputs=[io.Conditioning.Output(display_name="conditioning")], ) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index 51cf7951f..6d6078abe 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -25,7 +25,7 @@ class GetICLoRAParameters(io.ComfyNode): display_name="Get IC-LoRA Parameters", description="Extracts IC-LoRA parameters from the safetensors metadata of a LoRA-loaded " "model and outputs them for LTXVAddGuide (eg. reference_downscale_factor).", - category="conditioning/video_models", + category="model/conditioning/video_models", search_aliases=["ic-lora", "ic lora", "iclora", "downscale factor", "reference downscale"], inputs=[ io.Model.Input( @@ -62,7 +62,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptyLTXVLatentVideo", - category="latent/video/ltxv", + category="model/latent/video/ltxv", inputs=[ io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32), io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32), @@ -86,7 +86,7 @@ class LTXVImgToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVImgToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -131,7 +131,7 @@ class LTXVImgToVideoInplace(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVImgToVideoInplace", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ 
io.Vae.Input("vae"), io.Image.Input("image"), @@ -226,10 +226,20 @@ def get_noise_mask(latent): noise_mask = noise_mask.clone() return noise_mask -def get_keyframe_idxs(cond): +def get_keyframe_idxs(cond, latent_shape=None): keyframe_idxs = conditioning_get_any_value(cond, "keyframe_idxs", None) if keyframe_idxs is None: return None, 0 + # Get number of keyframes from latent_shape or guide_attention_entries if available + if latent_shape is not None and len(latent_shape) == 5: + tokens_per_frame = latent_shape[-2] * latent_shape[-1] + num_keyframes = keyframe_idxs.shape[2] // tokens_per_frame + return keyframe_idxs, num_keyframes + entries = conditioning_get_any_value(cond, "guide_attention_entries", None) + if entries: + num_keyframes = sum(e["latent_shape"][0] for e in entries) + return keyframe_idxs, num_keyframes + # fallback, may under-count if keyframes share t-start # keyframe_idxs contains start/end positions (last dimension), checking for unqiue values only for start num_keyframes = torch.unique(keyframe_idxs[:, 0, :, 0]).shape[0] return keyframe_idxs, num_keyframes @@ -241,7 +251,7 @@ class LTXVAddGuide(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVAddGuide", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -322,9 +332,9 @@ class LTXVAddGuide(io.ComfyNode): return factor @classmethod - def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors): + def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors, latent_shape=None): time_scale_factor, _, _ = scale_factors - _, num_keyframes = get_keyframe_idxs(cond) + _, num_keyframes = get_keyframe_idxs(cond, latent_shape) latent_count = latent_length - num_keyframes frame_idx = frame_idx if frame_idx >= 0 else max((latent_count - 1) * time_scale_factor + 1 + frame_idx, 0) if guide_length > 1 and frame_idx != 0: @@ 
-436,7 +446,7 @@ class LTXVAddGuide(io.ComfyNode): num_frames_to_keep = ((image.shape[0] - 1) // time_scale_factor) * time_scale_factor + 1 resolved_frame_idx = frame_idx if frame_idx < 0: - _, num_keyframes = get_keyframe_idxs(positive) + _, num_keyframes = get_keyframe_idxs(positive, latent_image.shape) resolved_frame_idx = max((latent_length - num_keyframes - 1) * time_scale_factor + 1 + frame_idx, 0) causal_fix = resolved_frame_idx == 0 or num_frames_to_keep == 1 @@ -454,7 +464,7 @@ class LTXVAddGuide(io.ComfyNode): if latent_downscale_factor > 1: t, guide_mask = cls.dilate_latent(t, latent_downscale_factor) - frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors) + frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors, latent_shape=latent_image.shape) assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence." positive, negative, latent_image, noise_mask = cls.append_keyframe( @@ -488,7 +498,7 @@ class LTXVCropGuides(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVCropGuides", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -506,7 +516,7 @@ class LTXVCropGuides(io.ComfyNode): latent_image = latent["samples"].clone() noise_mask = get_noise_mask(latent) - _, num_keyframes = get_keyframe_idxs(positive) + _, num_keyframes = get_keyframe_idxs(positive, latent_image.shape) if num_keyframes == 0: return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask},) @@ -532,7 +542,7 @@ class LTXVConditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVConditioning", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), 
io.Conditioning.Input("negative"), @@ -601,7 +611,7 @@ class LTXVScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Int.Input("steps", default=20, min=1, max=10000), io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01), @@ -736,7 +746,7 @@ class LTXVConcatAVLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVConcatAVLatent", - category="latent/video/ltxv", + category="model/latent/video/ltxv", inputs=[ io.Latent.Input("video_latent"), io.Latent.Input("audio_latent"), @@ -771,7 +781,7 @@ class LTXVSeparateAVLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="LTXVSeparateAVLatent", - category="latent/video/ltxv", + category="model/latent/video/ltxv", description="LTXV Separate AV Latent", inputs=[ io.Latent.Input("av_latent"), @@ -804,7 +814,7 @@ class LTXVReferenceAudio(io.ComfyNode): return io.Schema( node_id="LTXVReferenceAudio", display_name="LTXV Reference Audio (ID-LoRA)", - category="conditioning/audio", + category="model/conditioning/audio", description="Set reference audio for ID-LoRA speaker identity transfer. 
Encodes a reference audio clip into the conditioning and optionally patches the model with identity guidance (extra forward pass without reference, amplifying the speaker identity effect).", inputs=[ io.Model.Input("model"), diff --git a/comfy_extras/nodes_lt_audio.py b/comfy_extras/nodes_lt_audio.py index 51ddf584a..052186083 100644 --- a/comfy_extras/nodes_lt_audio.py +++ b/comfy_extras/nodes_lt_audio.py @@ -12,7 +12,7 @@ class LTXVAudioVAELoader(io.ComfyNode): return io.Schema( node_id="LTXVAudioVAELoader", display_name="Load LTXV Audio VAE", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input( "ckpt_name", @@ -40,7 +40,7 @@ class LTXVAudioVAEEncode(VAEEncodeAudio): return io.Schema( node_id="LTXVAudioVAEEncode", display_name="LTXV Audio VAE Encode", - category="latent/audio", + category="model/latent/audio", inputs=[ io.Audio.Input("audio", tooltip="The audio to be encoded."), io.Vae.Input( @@ -63,7 +63,7 @@ class LTXVAudioVAEDecode(io.ComfyNode): return io.Schema( node_id="LTXVAudioVAEDecode", display_name="LTXV Audio VAE Decode", - category="latent/audio", + category="model/latent/audio", inputs=[ io.Latent.Input("samples", tooltip="The latent to be decoded."), io.Vae.Input( @@ -96,7 +96,7 @@ class LTXVEmptyLatentAudio(io.ComfyNode): return io.Schema( node_id="LTXVEmptyLatentAudio", display_name="LTXV Empty Latent Audio", - category="latent/audio", + category="model/latent/audio", inputs=[ io.Int.Input( "frames_number", diff --git a/comfy_extras/nodes_lt_upsampler.py b/comfy_extras/nodes_lt_upsampler.py index f99ba13fb..be9a36e69 100644 --- a/comfy_extras/nodes_lt_upsampler.py +++ b/comfy_extras/nodes_lt_upsampler.py @@ -1,32 +1,32 @@ from comfy import model_management +from comfy_api.latest import ComfyExtension, IO +from typing_extensions import override import math -class LTXVLatentUpsampler: + +class LTXVLatentUpsampler(IO.ComfyNode): """ Upsamples a video latent by a factor of 2. 
""" @classmethod - def INPUT_TYPES(s): - return { - "required": { - "samples": ("LATENT",), - "upscale_model": ("LATENT_UPSCALE_MODEL",), - "vae": ("VAE",), - } - } + def define_schema(cls): + return IO.Schema( + node_id="LTXVLatentUpsampler", + category="model/latent/video", + is_experimental=True, + inputs=[ + IO.Latent.Input("samples"), + IO.LatentUpscaleModel.Input("upscale_model"), + IO.Vae.Input("vae"), + ], + outputs=[ + IO.Latent.Output(), + ], + ) - RETURN_TYPES = ("LATENT",) - FUNCTION = "upsample_latent" - CATEGORY = "latent/video" - EXPERIMENTAL = True - - def upsample_latent( - self, - samples: dict, - upscale_model, - vae, - ) -> tuple: + @classmethod + def execute(cls, samples, upscale_model, vae) -> IO.NodeOutput: """ Upsample the input latent using the provided model. @@ -34,7 +34,6 @@ class LTXVLatentUpsampler: samples (dict): Input latent samples upscale_model (LatentUpsampler): Loaded upscale model vae: VAE model for normalization - auto_tiling (bool): Whether to automatically tile the input for processing Returns: tuple: Tuple containing the upsampled latent @@ -67,9 +66,16 @@ class LTXVLatentUpsampler: return_dict = samples.copy() return_dict["samples"] = upsampled_latents return_dict.pop("noise_mask", None) - return (return_dict,) + return IO.NodeOutput(return_dict) + + upsample_latent = execute # TODO: remove -NODE_CLASS_MAPPINGS = { - "LTXVLatentUpsampler": LTXVLatentUpsampler, -} +class LTXVLatentUpsamplerExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[IO.ComfyNode]]: + return [LTXVLatentUpsampler] + + +async def comfy_entrypoint() -> LTXVLatentUpsamplerExtension: + return LTXVLatentUpsamplerExtension() diff --git a/comfy_extras/nodes_lumina2.py b/comfy_extras/nodes_lumina2.py index b35ab8b7d..c060a86a0 100644 --- a/comfy_extras/nodes_lumina2.py +++ b/comfy_extras/nodes_lumina2.py @@ -81,7 +81,7 @@ class CLIPTextEncodeLumina2(io.ComfyNode): node_id="CLIPTextEncodeLumina2", search_aliases=["lumina 
prompt"], display_name="CLIP Text Encode for Lumina2", - category="conditioning", + category="model/conditioning", description="Encodes a system prompt and a user prompt using a CLIP model into an embedding " "that can be used to guide the diffusion model towards generating specific images.", inputs=[ diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py index d15f1f4e7..52484697a 100644 --- a/comfy_extras/nodes_mask.py +++ b/comfy_extras/nodes_mask.py @@ -53,7 +53,7 @@ class LatentCompositeMasked(IO.ComfyNode): return IO.Schema( node_id="LatentCompositeMasked", search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"], - category="latent", + category="model/latent", inputs=[ IO.Latent.Input("destination"), IO.Latent.Input("source"), diff --git a/comfy_extras/nodes_math.py b/comfy_extras/nodes_math.py index 0040d1a92..873ee7b51 100644 --- a/comfy_extras/nodes_math.py +++ b/comfy_extras/nodes_math.py @@ -69,7 +69,7 @@ class MathExpressionNode(io.ComfyNode): return io.Schema( node_id="ComfyMathExpression", display_name="Math Expression", - category="utils", + category="utilities", search_aliases=[ "expression", "formula", "calculate", "calculator", "eval", "math", diff --git a/comfy_extras/nodes_mediapipe.py b/comfy_extras/nodes_mediapipe.py index 32dc22de3..343d88dbb 100644 --- a/comfy_extras/nodes_mediapipe.py +++ b/comfy_extras/nodes_mediapipe.py @@ -205,7 +205,7 @@ class LoadMediaPipeFaceLandmarker(io.ComfyNode): node_id="LoadMediaPipeFaceLandmarker", search_aliases=["face", "facial", "mediapipe", "face landmark", "face mesh", "blazeface", "face detection"], display_name="Load Face Detection Model (MediaPipe)", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("detection"), tooltip="Face detection model from models/detection/."), diff --git a/comfy_extras/nodes_mochi.py b/comfy_extras/nodes_mochi.py index d750194fc..3dcea6ab3 100644 --- 
a/comfy_extras/nodes_mochi.py +++ b/comfy_extras/nodes_mochi.py @@ -10,7 +10,7 @@ class EmptyMochiLatentVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptyMochiLatentVideo", - category="latent/video", + category="model/latent/video", inputs=[ io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16), diff --git a/comfy_extras/nodes_model_downscale.py b/comfy_extras/nodes_model_downscale.py index 24d47a903..817542452 100644 --- a/comfy_extras/nodes_model_downscale.py +++ b/comfy_extras/nodes_model_downscale.py @@ -10,7 +10,7 @@ class PatchModelAddDownscale(io.ComfyNode): return io.Schema( node_id="PatchModelAddDownscale", display_name="PatchModelAddDownscale (Kohya Deep Shrink)", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Int.Input("block_number", default=3, min=1, max=32, step=1, advanced=True), diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py index 748559a6b..bdccbf8c4 100644 --- a/comfy_extras/nodes_model_patch.py +++ b/comfy_extras/nodes_model_patch.py @@ -548,7 +548,7 @@ class USOStyleReference: FUNCTION = "apply_patch" EXPERIMENTAL = True - CATEGORY = "advanced/model_patches/flux" + CATEGORY = "model/patch/flux" def apply_patch(self, model, model_patch, clip_vision_output): encoded_image = torch.stack((clip_vision_output.all_hidden_states[:, -20], clip_vision_output.all_hidden_states[:, -11], clip_vision_output.penultimate_hidden_states)) @@ -594,7 +594,7 @@ class SUPIRApply(io.ComfyNode): def define_schema(cls) -> io.Schema: return io.Schema( node_id="SUPIRApply", - category="model_patches/supir", + category="model/patch/supir", is_experimental=True, inputs=[ io.Model.Input("model"), diff --git a/comfy_extras/nodes_moge.py b/comfy_extras/nodes_moge.py index 79aec5d7f..422949531 100644 --- a/comfy_extras/nodes_moge.py +++ 
b/comfy_extras/nodes_moge.py @@ -78,7 +78,7 @@ class LoadMoGeModel(io.ComfyNode): return io.Schema( node_id="LoadMoGeModel", display_name="Load MoGe Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("geometry_estimation")), ], @@ -104,7 +104,7 @@ class MoGePanoramaInference(io.ComfyNode): node_id="MoGePanoramaInference", search_aliases=["moge", "panorama", "depth", "geometry", "depth estimation", "geometry estimation"], display_name="Run MoGe Panorama Inference", - category="image/geometry_estimation", + category="image/geometry estimation", description="Run MoGe on an equirectangular panorama by splitting it into 12 perspective views, running inference on each, and merging the results into a single depth map.", inputs=[ MoGeModelType.Input("moge_model"), @@ -226,7 +226,7 @@ class MoGeInference(io.ComfyNode): search_aliases=["moge", "depth", "geometry", "depth estimation", "geometry estimation"], display_name="Run MoGe Inference", description="Run MoGe on a single image to estimate depth and geometry.", - category="image/geometry_estimation", + category="image/geometry estimation", inputs=[ MoGeModelType.Input("moge_model"), io.Image.Input("image"), @@ -283,7 +283,7 @@ class MoGeRender(io.ComfyNode): search_aliases=["moge", "render", "geometry", "depth", "normal"], display_name="Render MoGe Geometry", description="Render a depth map or normal map from geometry data", - category="image/geometry_estimation", + category="image/geometry estimation", inputs=[ MoGeGeometry.Input("moge_geometry"), io.Combo.Input("output", options=["depth", "depth_colored", "normal_opengl", "normal_directx", "mask"], default="depth", @@ -350,7 +350,7 @@ class MoGePointMapToMesh(io.ComfyNode): search_aliases=["moge", "mesh", "geometry", "point map"], display_name="Convert MoGe Point Map to Mesh", description="Convert a MoGe point map into a 3D mesh.", - category="image/geometry_estimation", + 
category="image/geometry estimation", inputs=[ MoGeGeometry.Input("moge_geometry"), io.Int.Input("batch_index", default=0, min=0, max=4096, diff --git a/comfy_extras/nodes_number_convert.py b/comfy_extras/nodes_number_convert.py index 01593b6e6..d7e557e95 100644 --- a/comfy_extras/nodes_number_convert.py +++ b/comfy_extras/nodes_number_convert.py @@ -20,7 +20,7 @@ class NumberConvertNode(io.ComfyNode): return io.Schema( node_id="ComfyNumberConvert", display_name="Convert Number", - category="utils", + category="utilities", search_aliases=[ "int to float", "float to int", "number convert", "int2float", "float2int", "cast", "parse number", diff --git a/comfy_extras/nodes_optimalsteps.py b/comfy_extras/nodes_optimalsteps.py index 5beeaa7db..19629790f 100644 --- a/comfy_extras/nodes_optimalsteps.py +++ b/comfy_extras/nodes_optimalsteps.py @@ -31,7 +31,7 @@ class OptimalStepsScheduler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="OptimalStepsScheduler", - category="sampling/schedulers", + category="model/sampling/schedulers", inputs=[ io.Combo.Input("model_type", options=["FLUX", "Wan", "Chroma"]), io.Int.Input("steps", default=20, min=3, max=1000), diff --git a/comfy_extras/nodes_pag.py b/comfy_extras/nodes_pag.py index 79fea5f0c..c875e1e06 100644 --- a/comfy_extras/nodes_pag.py +++ b/comfy_extras/nodes_pag.py @@ -15,7 +15,7 @@ class PerturbedAttentionGuidance(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="PerturbedAttentionGuidance", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Float.Input("scale", default=3.0, min=0.0, max=100.0, step=0.01, round=0.01), diff --git a/comfy_extras/nodes_pid.py b/comfy_extras/nodes_pid.py new file mode 100644 index 000000000..811b9ae8e --- /dev/null +++ b/comfy_extras/nodes_pid.py @@ -0,0 +1,55 @@ +"""PiD (Pixel Diffusion Decoder) node""" + +import torch +from typing_extensions import override + +import node_helpers +import 
comfy.latent_formats +from comfy_api.latest import ComfyExtension, io + + +class PiDConditioning(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="PiDConditioning", + display_name="PiD Conditioning", + category="advanced/conditioning", + description=( + "Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling" + ), + inputs=[ + io.Conditioning.Input("positive"), + io.Latent.Input("latent", tooltip="latent (from VAEEncode or a KSampler)."), + io.Combo.Input("latent_format", options=["flux", "sd3"], default="flux", + tooltip="Flux1 and Flux2 latents auto-detected from channel dim, sd3 has to be selected manually."), + io.Float.Input( + "degrade_sigma", default=0.0, min=0.0, max=1.0, step=0.01, + tooltip="0 = clean latent. Increase to denoise corrupted latent outputs.", + ), + ], + outputs=[io.Conditioning.Output()], + ) + + @classmethod + def execute(cls, positive, latent, latent_format: str, degrade_sigma: float) -> io.NodeOutput: + samples = latent["samples"] + if latent_format == "flux": + fmt_cls = comfy.latent_formats.Flux2 if samples.shape[1] == 128 else comfy.latent_formats.Flux + else: + fmt_cls = comfy.latent_formats.SD3 + lq_latent = fmt_cls().process_in(samples) + sigma_t = torch.tensor([float(degrade_sigma)], dtype=torch.float32) + return io.NodeOutput(node_helpers.conditioning_set_values( + positive, {"lq_latent": lq_latent, "degrade_sigma": sigma_t}, + )) + + +class PiDExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [PiDConditioning] + + +async def comfy_entrypoint() -> PiDExtension: + return PiDExtension() diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py index a25db277c..3e440433e 100644 --- a/comfy_extras/nodes_post_processing.py +++ b/comfy_extras/nodes_post_processing.py @@ -616,7 +616,7 @@ class BatchLatentsNode(io.ComfyNode): node_id="BatchLatentsNode", 
search_aliases=["combine latents", "stack latents", "merge latents"], display_name="Batch Latents", - category="latent", + category="model/latent", inputs=[ io.Autogrow.Input("latents", template=autogrow_template) ], diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py index 17e25d514..1070a69d0 100644 --- a/comfy_extras/nodes_preview_any.py +++ b/comfy_extras/nodes_preview_any.py @@ -16,7 +16,7 @@ class PreviewAny(): FUNCTION = "main" OUTPUT_NODE = True - CATEGORY = "utils" + CATEGORY = "utilities" SEARCH_ALIASES = ["show output", "inspect", "debug", "print value", "show text"] def main(self, source=None): diff --git a/comfy_extras/nodes_primitive.py b/comfy_extras/nodes_primitive.py index 33373266b..c44b09098 100644 --- a/comfy_extras/nodes_primitive.py +++ b/comfy_extras/nodes_primitive.py @@ -11,7 +11,7 @@ class String(io.ComfyNode): node_id="PrimitiveString", search_aliases=["text", "string", "text box", "prompt"], display_name="Text String", - category="utils/primitive", + category="utilities/primitive", inputs=[ io.String.Input("value"), ], @@ -30,7 +30,7 @@ class StringMultiline(io.ComfyNode): node_id="PrimitiveStringMultiline", search_aliases=["text", "string", "text multiline", "string multiline", "text box", "prompt"], display_name="Text String (Multiline)", - category="utils/primitive", + category="utilities/primitive", essentials_category="Basics", inputs=[ io.String.Input("value", multiline=True), @@ -49,7 +49,7 @@ class Int(io.ComfyNode): return io.Schema( node_id="PrimitiveInt", display_name="Int", - category="utils/primitive", + category="utilities/primitive", inputs=[ io.Int.Input("value", min=-sys.maxsize, max=sys.maxsize, control_after_generate=io.ControlAfterGenerate.fixed), ], @@ -67,7 +67,7 @@ class Float(io.ComfyNode): return io.Schema( node_id="PrimitiveFloat", display_name="Float", - category="utils/primitive", + category="utilities/primitive", inputs=[ io.Float.Input("value", min=-sys.maxsize, 
max=sys.maxsize, step=0.1), ], @@ -85,7 +85,7 @@ class Boolean(io.ComfyNode): return io.Schema( node_id="PrimitiveBoolean", display_name="Boolean", - category="utils/primitive", + category="utilities/primitive", inputs=[ io.Boolean.Input("value"), ], diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py index fde8fac9a..5b92814a4 100644 --- a/comfy_extras/nodes_qwen.py +++ b/comfy_extras/nodes_qwen.py @@ -112,7 +112,7 @@ class EmptyQwenImageLayeredLatentImage(io.ComfyNode): return io.Schema( node_id="EmptyQwenImageLayeredLatentImage", display_name="Empty Qwen Image Layered Latent", - category="latent/qwen", + category="model/latent/qwen", inputs=[ io.Int.Input("width", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16), diff --git a/comfy_extras/nodes_rebatch.py b/comfy_extras/nodes_rebatch.py index 5f4e82aef..2185385f0 100644 --- a/comfy_extras/nodes_rebatch.py +++ b/comfy_extras/nodes_rebatch.py @@ -10,7 +10,7 @@ class LatentRebatch(io.ComfyNode): return io.Schema( node_id="RebatchLatents", display_name="Rebatch Latents", - category="latent/batch", + category="model/latent/batch", is_input_list=True, inputs=[ io.Latent.Input("latents"), diff --git a/comfy_extras/nodes_resolution.py b/comfy_extras/nodes_resolution.py index 1628038cc..dc405291c 100644 --- a/comfy_extras/nodes_resolution.py +++ b/comfy_extras/nodes_resolution.py @@ -35,7 +35,7 @@ class ResolutionSelector(io.ComfyNode): return io.Schema( node_id="ResolutionSelector", display_name="Resolution Selector", - category="utils", + category="utilities", description="Calculate width and height from aspect ratio and megapixel target. 
Useful for setting up Empty Latent Image dimensions.", inputs=[ io.Combo.Input( diff --git a/comfy_extras/nodes_rope.py b/comfy_extras/nodes_rope.py index 918ddc02b..808eee29b 100644 --- a/comfy_extras/nodes_rope.py +++ b/comfy_extras/nodes_rope.py @@ -7,7 +7,7 @@ class ScaleROPE(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="ScaleROPE", - category="advanced/model_patches", + category="model/patch", description="Scale and shift the ROPE of the model.", is_experimental=True, inputs=[ diff --git a/comfy_extras/nodes_save_3d.py b/comfy_extras/nodes_save_3d.py index c03524246..a91549e7f 100644 --- a/comfy_extras/nodes_save_3d.py +++ b/comfy_extras/nodes_save_3d.py @@ -16,7 +16,7 @@ from comfy.cli_args import args from comfy_api.latest import ComfyExtension, IO, Types -def pack_variable_mesh_batch(vertices, faces, colors=None, uvs=None, texture=None): +def pack_variable_mesh_batch(vertices, faces, colors=None, uvs=None, texture=None, unlit=False): # Pack lists of (Nᵢ, *) vertex/face/color/uv tensors into padded batched tensors, # stashing per-item lengths as runtime attrs so consumers can recover the real slice. # colors and uvs are 1:1 with vertices, so they're padded to max_vertices and read with vertex_counts. @@ -54,7 +54,7 @@ def pack_variable_mesh_batch(vertices, faces, colors=None, uvs=None, texture=Non return Types.MESH(packed_vertices, packed_faces, uvs=packed_uvs, vertex_colors=packed_colors, texture=texture, - vertex_counts=vertex_counts, face_counts=face_counts) + vertex_counts=vertex_counts, face_counts=face_counts, unlit=unlit) def get_mesh_batch_item(mesh, index): @@ -77,7 +77,7 @@ def get_mesh_batch_item(mesh, index): def save_glb(vertices, faces, filepath, metadata=None, - uvs=None, vertex_colors=None, texture_image=None): + uvs=None, vertex_colors=None, texture_image=None, unlit=False): """ Save PyTorch tensor vertices and faces as a GLB file without external dependencies. 
@@ -234,6 +234,17 @@ def save_glb(vertices, faces, filepath, metadata=None, textures = [] samplers = [] materials = [] + extensions_used = [] + if unlit and texture_png_bytes is None: + # Flat, light-independent shading (KHR_materials_unlit): COLOR_0 is shown as-is, matching how a + # gaussian splat renders (emissive). Without this the viewer lights the mesh and washes the colours. + materials.append({ + "pbrMetallicRoughness": {"baseColorFactor": [1.0, 1.0, 1.0, 1.0], "metallicFactor": 0.0, "roughnessFactor": 1.0}, + "extensions": {"KHR_materials_unlit": {}}, + "doubleSided": True, + }) + extensions_used.append("KHR_materials_unlit") + primitive["material"] = 0 if texture_png_bytes is not None and "TEXCOORD_0" in primitive_attributes: buffer_views.append({ "buffer": 0, @@ -271,6 +282,8 @@ def save_glb(vertices, faces, filepath, metadata=None, gltf["textures"] = textures if materials: gltf["materials"] = materials + if extensions_used: + gltf["extensionsUsed"] = extensions_used if metadata: gltf["asset"]["extras"] = metadata @@ -376,7 +389,8 @@ class SaveGLB(IO.ComfyNode): save_glb(vertices_i, faces_i, os.path.join(full_output_folder, f), metadata, uvs=uvs_i, vertex_colors=v_colors, - texture_image=tex_img) + texture_image=tex_img, + unlit=getattr(mesh, "unlit", False)) results.append({ "filename": f, "subfolder": subfolder, diff --git a/comfy_extras/nodes_sd3.py b/comfy_extras/nodes_sd3.py index 6655c1ba7..38cbf117b 100644 --- a/comfy_extras/nodes_sd3.py +++ b/comfy_extras/nodes_sd3.py @@ -41,7 +41,7 @@ class EmptySD3LatentImage(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="EmptySD3LatentImage", - category="latent/sd3", + category="model/latent/sd3", inputs=[ io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16), @@ -113,7 +113,7 @@ class ControlNetApplySD3(io.ComfyNode): return io.Schema( node_id="ControlNetApplySD3", display_name="Apply 
Controlnet with VAE", - category="conditioning/controlnet", + category="model/conditioning/controlnet", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_sdupscale.py b/comfy_extras/nodes_sdupscale.py index 5877719d3..ea283e971 100644 --- a/comfy_extras/nodes_sdupscale.py +++ b/comfy_extras/nodes_sdupscale.py @@ -9,7 +9,7 @@ class SD_4XUpscale_Conditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SD_4XUpscale_Conditioning", - category="conditioning/upscale_diffusion", + category="model/conditioning/upscale_diffusion", inputs=[ io.Image.Input("images"), io.Conditioning.Input("positive"), diff --git a/comfy_extras/nodes_stable3d.py b/comfy_extras/nodes_stable3d.py index 829c837a1..8a6e5b726 100644 --- a/comfy_extras/nodes_stable3d.py +++ b/comfy_extras/nodes_stable3d.py @@ -27,7 +27,7 @@ class StableZero123_Conditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableZero123_Conditioning", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ io.ClipVision.Input("clip_vision"), io.Image.Input("init_image"), @@ -65,7 +65,7 @@ class StableZero123_Conditioning_Batched(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableZero123_Conditioning_Batched", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ io.ClipVision.Input("clip_vision"), io.Image.Input("init_image"), @@ -112,7 +112,7 @@ class SV3D_Conditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="SV3D_Conditioning", - category="conditioning/3d_models", + category="model/conditioning/3d_models", inputs=[ io.ClipVision.Input("clip_vision"), io.Image.Input("init_image"), diff --git a/comfy_extras/nodes_stable_cascade.py b/comfy_extras/nodes_stable_cascade.py index 0dc6c9fcd..e55f248ae 100644 --- a/comfy_extras/nodes_stable_cascade.py +++ b/comfy_extras/nodes_stable_cascade.py @@ -29,7 +29,7 @@ 
class StableCascade_EmptyLatentImage(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableCascade_EmptyLatentImage", - category="latent/stable_cascade", + category="model/latent/stable_cascade", inputs=[ io.Int.Input("width", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8), io.Int.Input("height", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8), @@ -58,7 +58,7 @@ class StableCascade_StageC_VAEEncode(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableCascade_StageC_VAEEncode", - category="latent/stable_cascade", + category="model/latent/stable_cascade", inputs=[ io.Image.Input("image"), io.Vae.Input("vae"), @@ -93,7 +93,7 @@ class StableCascade_StageB_Conditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="StableCascade_StageB_Conditioning", - category="conditioning/stable_cascade", + category="model/conditioning/stable_cascade", inputs=[ io.Conditioning.Input("conditioning"), io.Latent.Input("stage_c"), diff --git a/comfy_extras/nodes_tomesd.py b/comfy_extras/nodes_tomesd.py index 87bf29b8f..3667fac3a 100644 --- a/comfy_extras/nodes_tomesd.py +++ b/comfy_extras/nodes_tomesd.py @@ -151,7 +151,7 @@ class TomePatchModel(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="TomePatchModel", - category="model_patches/unet", + category="model/patch/unet", inputs=[ io.Model.Input("model"), io.Float.Input("ratio", default=0.3, min=0.0, max=1.0, step=0.01), diff --git a/comfy_extras/nodes_toolkit.py b/comfy_extras/nodes_toolkit.py index 0548a0cf8..9f709bbe3 100644 --- a/comfy_extras/nodes_toolkit.py +++ b/comfy_extras/nodes_toolkit.py @@ -13,7 +13,7 @@ class CreateList(io.ComfyNode): return io.Schema( node_id="CreateList", display_name="Create List", - category="utils", + category="utilities", is_input_list=True, search_aliases=["Image Iterator", "Text Iterator", "Iterator"], inputs=[io.Autogrow.Input("inputs", template=template_autogrow)], diff --git a/comfy_extras/nodes_train.py 
b/comfy_extras/nodes_train.py index e9871369b..046eeaaf5 100644 --- a/comfy_extras/nodes_train.py +++ b/comfy_extras/nodes_train.py @@ -951,7 +951,7 @@ class TrainLoraNode(io.ComfyNode): return io.Schema( node_id="TrainLoraNode", display_name="Train LoRA", - category="training", + category="model/training", is_experimental=True, is_input_list=True, # All inputs become lists inputs=[ @@ -1309,7 +1309,7 @@ class LoraModelLoader(io.ComfyNode): return io.Schema( node_id="LoraModelLoader", display_name="Load LoRA Model", - category="loaders", + category="model/loaders", is_experimental=True, inputs=[ io.Model.Input( @@ -1405,7 +1405,7 @@ class LossGraphNode(io.ComfyNode): node_id="LossGraphNode", search_aliases=["training chart", "training visualization", "plot loss"], display_name="Plot Loss Graph", - category="training", + category="model/training", is_experimental=True, is_output_node=True, inputs=[ diff --git a/comfy_extras/nodes_upscale_model.py b/comfy_extras/nodes_upscale_model.py index d3ee3f1c1..1cf5a5d01 100644 --- a/comfy_extras/nodes_upscale_model.py +++ b/comfy_extras/nodes_upscale_model.py @@ -22,7 +22,7 @@ class UpscaleModelLoader(io.ComfyNode): return io.Schema( node_id="UpscaleModelLoader", display_name="Load Upscale Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input("model_name", options=folder_paths.get_filename_list("upscale_models")), ], diff --git a/comfy_extras/nodes_video_model.py b/comfy_extras/nodes_video_model.py index 8f19895a1..0d6cae6a8 100644 --- a/comfy_extras/nodes_video_model.py +++ b/comfy_extras/nodes_video_model.py @@ -15,7 +15,7 @@ class ImageOnlyCheckpointLoader: RETURN_TYPES = ("MODEL", "CLIP_VISION", "VAE") FUNCTION = "load_checkpoint" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_checkpoint(self, ckpt_name, output_vae=True, output_clip=True): ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name) @@ -41,7 +41,7 @@ class SVD_img2vid_Conditioning: FUNCTION = "encode" 
- CATEGORY = "conditioning/video_models" + CATEGORY = "model/conditioning/video_models" def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level): output = clip_vision.encode_image(init_image) @@ -65,7 +65,7 @@ class VideoLinearCFGGuidance: RETURN_TYPES = ("MODEL",) FUNCTION = "patch" - CATEGORY = "sampling/guiders" + CATEGORY = "model/sampling/guiders" def patch(self, model, min_cfg): def linear_cfg(args): @@ -89,7 +89,7 @@ class VideoTriangleCFGGuidance: RETURN_TYPES = ("MODEL",) FUNCTION = "patch" - CATEGORY = "sampling/guiders" + CATEGORY = "model/sampling/guiders" def patch(self, model, min_cfg): def linear_cfg(args): @@ -138,7 +138,7 @@ class ConditioningSetAreaPercentageVideo: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, width, height, temporal, x, y, z, strength): c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", temporal, height, width, z, y, x), diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index be724371a..b43154b8d 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -58,7 +58,7 @@ class OpticalFlowLoader(io.ComfyNode): return io.Schema( node_id="OpticalFlowLoader", display_name="Load Optical Flow Model", - category="loaders", + category="model/loaders", inputs=[ io.Combo.Input( "model_name", @@ -175,7 +175,7 @@ class VOIDInpaintConditioning(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VOIDInpaintConditioning", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -288,7 +288,7 @@ class VOIDWarpedNoise(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VOIDWarpedNoise", - category="latent/video", + category="model/latent/video", inputs=[ OpticalFlow.Input( "optical_flow", 
@@ -393,7 +393,7 @@ class VOIDWarpedNoiseSource(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VOIDWarpedNoiseSource", - category="sampling/noise", + category="model/sampling/noise", inputs=[ io.Latent.Input("warped_noise", tooltip="Warped noise latent from VOIDWarpedNoise"), @@ -455,7 +455,7 @@ class VOIDSampler(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="VOIDSampler", - category="sampling/samplers", + category="model/sampling/samplers", inputs=[], outputs=[io.Sampler.Output()], ) diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py index e50bfcd2c..67d3a8443 100644 --- a/comfy_extras/nodes_wan.py +++ b/comfy_extras/nodes_wan.py @@ -18,7 +18,7 @@ class WanImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -66,7 +66,7 @@ class WanFunControlToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanFunControlToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -119,7 +119,7 @@ class Wan22FunControlToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Wan22FunControlToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -184,7 +184,7 @@ class WanFirstLastFrameToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanFirstLastFrameToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -256,7 +256,7 @@ class WanFunInpaintToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( 
node_id="WanFunInpaintToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -288,7 +288,7 @@ class WanVaceToVideo(io.ComfyNode): return io.Schema( node_id="WanVaceToVideo", search_aliases=["video conditioning", "video control"], - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -375,7 +375,7 @@ class TrimVideoLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="TrimVideoLatent", - category="latent/video", + category="model/latent/video", inputs=[ io.Latent.Input("samples"), io.Int.Input("trim_amount", default=0, min=0, max=99999), @@ -398,7 +398,7 @@ class WanCameraImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanCameraImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -452,7 +452,7 @@ class WanPhantomSubjectToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanPhantomSubjectToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -707,7 +707,7 @@ class WanTrackToVideo(io.ComfyNode): return io.Schema( node_id="WanTrackToVideo", search_aliases=["motion tracking", "trajectory video", "point tracking", "keypoint animation"], - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -951,7 +951,7 @@ class WanSoundImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanSoundImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", 
inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -984,7 +984,7 @@ class WanSoundImageToVideoExtend(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanSoundImageToVideoExtend", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -1046,7 +1046,7 @@ class WanHuMoImageToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanHuMoImageToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -1112,7 +1112,7 @@ class WanAnimateToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanAnimateToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), @@ -1252,7 +1252,7 @@ class Wan22ImageToVideoLatent(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="Wan22ImageToVideoLatent", - category="conditioning/inpaint", + category="model/conditioning/inpaint", inputs=[ io.Vae.Input("vae"), io.Int.Input("width", default=1280, min=32, max=nodes.MAX_RESOLUTION, step=32), @@ -1302,7 +1302,7 @@ class WanInfiniteTalkToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanInfiniteTalkToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.DynamicCombo.Input("mode", options=[ io.DynamicCombo.Option("single_speaker", []), @@ -1461,7 +1461,7 @@ class WanSCAILToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanSCAILToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_wandancer.py 
b/comfy_extras/nodes_wandancer.py index fc005ed4c..a96885745 100644 --- a/comfy_extras/nodes_wandancer.py +++ b/comfy_extras/nodes_wandancer.py @@ -713,7 +713,7 @@ class WanDancerEncodeAudio(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanDancerEncodeAudio", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Audio.Input("audio"), io.Int.Input("video_frames", default=149, min=1, max=nodes.MAX_RESOLUTION, step=4), @@ -787,7 +787,7 @@ class WanDancerVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanDancerVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/comfy_extras/nodes_wanmove.py b/comfy_extras/nodes_wanmove.py index 5acae03eb..2db064922 100644 --- a/comfy_extras/nodes_wanmove.py +++ b/comfy_extras/nodes_wanmove.py @@ -247,7 +247,7 @@ class WanMoveVisualizeTracks(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanMoveVisualizeTracks", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Image.Input("images"), io.Tracks.Input("tracks", optional=True), @@ -283,7 +283,7 @@ class WanMoveTracksFromCoords(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanMoveTracksFromCoords", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.String.Input("track_coords", force_input=True, default="[]", optional=True), io.Mask.Input("track_mask", optional=True), @@ -325,7 +325,7 @@ class GenerateTracks(io.ComfyNode): return io.Schema( node_id="GenerateTracks", search_aliases=["motion paths", "camera movement", "trajectory"], - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Int.Input("width", default=832, min=16, max=4096, step=16), io.Int.Input("height", default=480, min=16, max=4096, 
step=16), @@ -434,7 +434,7 @@ class WanMoveConcatTrack(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanMoveConcatTrack", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Tracks.Input("tracks_1"), io.Tracks.Input("tracks_2", optional=True), @@ -463,7 +463,7 @@ class WanMoveTrackToVideo(io.ComfyNode): def define_schema(cls): return io.Schema( node_id="WanMoveTrackToVideo", - category="conditioning/video_models", + category="model/conditioning/video_models", inputs=[ io.Conditioning.Input("positive"), io.Conditioning.Input("negative"), diff --git a/nodes.py b/nodes.py index 13d3864cd..5678bc22d 100644 --- a/nodes.py +++ b/nodes.py @@ -68,7 +68,7 @@ class CLIPTextEncode(ComfyNodeABC): OUTPUT_TOOLTIPS = ("A conditioning containing the embedded text used to guide the diffusion model.",) FUNCTION = "encode" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" DESCRIPTION = "Encodes a text prompt using a CLIP model into an embedding that can be used to guide the diffusion model towards generating specific images." 
SEARCH_ALIASES = ["text", "prompt", "text prompt", "positive prompt", "negative prompt", "encode text", "text encoder", "encode prompt"] @@ -87,7 +87,7 @@ class ConditioningCombine: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "combine" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" SEARCH_ALIASES = ["combine", "merge conditioning", "combine prompts", "merge prompts", "mix prompts", "add prompt"] def combine(self, conditioning_1, conditioning_2): @@ -104,7 +104,7 @@ class ConditioningAverage : RETURN_TYPES = ("CONDITIONING",) FUNCTION = "addWeighted" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def addWeighted(self, conditioning_to, conditioning_from, conditioning_to_strength): out = [] @@ -143,7 +143,7 @@ class ConditioningConcat: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "concat" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def concat(self, conditioning_to, conditioning_from): out = [] @@ -176,7 +176,7 @@ class ConditioningSetArea: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, width, height, x, y, strength): c = node_helpers.conditioning_set_values(conditioning, {"area": (height // 8, width // 8, y // 8, x // 8), @@ -197,7 +197,7 @@ class ConditioningSetAreaPercentage: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, width, height, x, y, strength): c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", height, width, y, x), @@ -214,7 +214,7 @@ class ConditioningSetAreaStrength: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, strength): c = node_helpers.conditioning_set_values(conditioning, {"strength": strength}) @@ -234,7 +234,7 @@ class ConditioningSetMask: RETURN_TYPES = ("CONDITIONING",) FUNCTION 
= "append" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def append(self, conditioning, mask, set_cond_area, strength): set_area_to_bounds = False @@ -303,7 +303,7 @@ class VAEDecode: OUTPUT_TOOLTIPS = ("The decoded image.",) FUNCTION = "decode" - CATEGORY = "latent" + CATEGORY = "model/latent" DESCRIPTION = "Decodes latent images back into pixel space images." SEARCH_ALIASES = ["decode", "decode latent", "latent to image", "render latent"] @@ -357,7 +357,7 @@ class VAEEncode: RETURN_TYPES = ("LATENT",) FUNCTION = "encode" - CATEGORY = "latent" + CATEGORY = "model/latent" SEARCH_ALIASES = ["encode", "encode image", "image to latent"] def encode(self, vae, pixels): @@ -389,7 +389,7 @@ class VAEEncodeForInpaint: RETURN_TYPES = ("LATENT",) FUNCTION = "encode" - CATEGORY = "latent/inpaint" + CATEGORY = "model/latent/inpaint" def encode(self, vae, pixels, mask, grow_mask_by=6): downscale_ratio = vae.spacial_compression_encode() @@ -438,7 +438,7 @@ class InpaintModelConditioning: RETURN_NAMES = ("positive", "negative", "latent") FUNCTION = "encode" - CATEGORY = "conditioning/inpaint" + CATEGORY = "model/conditioning/inpaint" def encode(self, positive, negative, pixels, vae, mask, noise_mask=True): x = (pixels.shape[1] // 8) * 8 @@ -598,7 +598,7 @@ class CheckpointLoaderSimple: "The VAE model used for encoding and decoding images to and from latent space.") FUNCTION = "load_checkpoint" - CATEGORY = "loaders" + CATEGORY = "model/loaders" DESCRIPTION = "Loads a diffusion model checkpoint, diffusion models are used to denoise latents." 
SEARCH_ALIASES = ["load model", "checkpoint", "model loader", "load checkpoint", "ckpt", "model"] @@ -644,7 +644,7 @@ class unCLIPCheckpointLoader: RETURN_TYPES = ("MODEL", "CLIP", "VAE", "CLIP_VISION") FUNCTION = "load_checkpoint" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_checkpoint(self, ckpt_name, output_vae=True, output_clip=True): ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name) @@ -660,7 +660,7 @@ class CLIPSetLastLayer: RETURN_TYPES = ("CLIP",) FUNCTION = "set_last_layer" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def set_last_layer(self, clip, stop_at_clip_layer): clip = clip.clone() @@ -689,7 +689,7 @@ class LoraLoader: OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.") FUNCTION = "load_lora" - CATEGORY = "loaders" + CATEGORY = "model/loaders" DESCRIPTION = "This LoRA loader is used to modify both diffusion and CLIP models, altering the way in which latents are denoised such as applying styles. Multiple LoRA nodes can be linked together." SEARCH_ALIASES = ["lora", "load lora", "apply lora", "lora loader", "lora model"] @@ -789,7 +789,7 @@ class VAELoader: RETURN_TYPES = ("VAE",) FUNCTION = "load_vae" - CATEGORY = "loaders" + CATEGORY = "model/loaders" #TODO: scale factor? 
def load_vae(self, vae_name): @@ -831,7 +831,7 @@ class ControlNetLoader: RETURN_TYPES = ("CONTROL_NET",) FUNCTION = "load_controlnet" - CATEGORY = "loaders" + CATEGORY = "model/loaders" SEARCH_ALIASES = ["controlnet", "control net", "cn", "load controlnet", "controlnet loader"] def load_controlnet(self, control_net_name): @@ -850,7 +850,7 @@ class DiffControlNetLoader: RETURN_TYPES = ("CONTROL_NET",) FUNCTION = "load_controlnet" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_controlnet(self, model, control_net_name): controlnet_path = folder_paths.get_full_path_or_raise("controlnet", control_net_name) @@ -870,7 +870,7 @@ class ControlNetApply: FUNCTION = "apply_controlnet" DEPRECATED = True - CATEGORY = "conditioning/controlnet" + CATEGORY = "model/conditioning/controlnet" def apply_controlnet(self, conditioning, control_net, image, strength): if strength == 0: @@ -908,7 +908,7 @@ class ControlNetApplyAdvanced: RETURN_NAMES = ("positive", "negative") FUNCTION = "apply_controlnet" - CATEGORY = "conditioning/controlnet" + CATEGORY = "model/conditioning/controlnet" SEARCH_ALIASES = ["controlnet", "apply controlnet", "use controlnet", "control net"] def apply_controlnet(self, positive, negative, control_net, image, strength, start_percent, end_percent, vae=None, extra_concat=[]): @@ -969,7 +969,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit"], ), 
}, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -979,7 +979,7 @@ class CLIPLoader: CATEGORY = "advanced/loaders" - DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b" + DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b\n pixeldit: gemma 2 2B elm" def load_clip(self, clip_name, type="stable_diffusion", device="default"): clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION) @@ -1030,7 +1030,7 @@ class CLIPVisionLoader: RETURN_TYPES = ("CLIP_VISION",) FUNCTION = "load_clip" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_clip(self, clip_name): clip_path = folder_paths.get_full_path_or_raise("clip_vision", clip_name) @@ -1049,7 +1049,7 @@ class CLIPVisionEncode: RETURN_TYPES = ("CLIP_VISION_OUTPUT",) FUNCTION = "encode" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def encode(self, clip_vision, image, crop): crop_image = True @@ -1066,7 +1066,7 @@ class StyleModelLoader: RETURN_TYPES = ("STYLE_MODEL",) FUNCTION = "load_style_model" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_style_model(self, style_model_name): style_model_path = folder_paths.get_full_path_or_raise("style_models", style_model_name) @@ -1088,7 +1088,7 @@ class StyleModelApply: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "apply_stylemodel" - CATEGORY = "conditioning/style_model" + CATEGORY = "model/conditioning/style_model" def apply_stylemodel(self, conditioning, 
style_model, clip_vision_output, strength, strength_type): cond = style_model.get_cond(clip_vision_output).flatten(start_dim=0, end_dim=1).unsqueeze(dim=0) @@ -1148,7 +1148,7 @@ class unCLIPConditioning: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "apply_adm" - CATEGORY = "conditioning" + CATEGORY = "model/conditioning" def apply_adm(self, conditioning, clip_vision_output, strength, noise_augmentation): if strength == 0: @@ -1165,7 +1165,7 @@ class GLIGENLoader: RETURN_TYPES = ("GLIGEN",) FUNCTION = "load_gligen" - CATEGORY = "loaders" + CATEGORY = "model/loaders" def load_gligen(self, gligen_name): gligen_path = folder_paths.get_full_path_or_raise("gligen", gligen_name) @@ -1187,7 +1187,7 @@ class GLIGENTextBoxApply: RETURN_TYPES = ("CONDITIONING",) FUNCTION = "append" - CATEGORY = "conditioning/gligen" + CATEGORY = "model/conditioning/gligen" def append(self, conditioning_to, clip, gligen_textbox_model, text, width, height, x, y): c = [] @@ -1217,7 +1217,7 @@ class EmptyLatentImage: OUTPUT_TOOLTIPS = ("The empty latent image batch.",) FUNCTION = "generate" - CATEGORY = "latent" + CATEGORY = "model/latent" DESCRIPTION = "Create a new batch of empty latent images to be denoised via sampling." 
SEARCH_ALIASES = ["empty", "empty latent", "new latent", "create latent", "blank latent", "blank"] @@ -1238,7 +1238,7 @@ class LatentFromBatch: RETURN_TYPES = ("LATENT",) FUNCTION = "frombatch" - CATEGORY = "latent/batch" + CATEGORY = "model/latent/batch" def frombatch(self, samples, batch_index, length): s = samples.copy() @@ -1273,7 +1273,7 @@ class RepeatLatentBatch: RETURN_TYPES = ("LATENT",) FUNCTION = "repeat" - CATEGORY = "latent/batch" + CATEGORY = "model/latent/batch" def repeat(self, samples, amount): s = samples.copy() @@ -1305,7 +1305,7 @@ class LatentUpscale: RETURN_TYPES = ("LATENT",) FUNCTION = "upscale" - CATEGORY = "latent" + CATEGORY = "model/latent" def upscale(self, samples, upscale_method, width, height, crop): if width == 0 and height == 0: @@ -1338,7 +1338,7 @@ class LatentUpscaleBy: RETURN_TYPES = ("LATENT",) FUNCTION = "upscale" - CATEGORY = "latent" + CATEGORY = "model/latent" def upscale(self, samples, upscale_method, scale_by): s = samples.copy() @@ -1356,7 +1356,7 @@ class LatentRotate: RETURN_TYPES = ("LATENT",) FUNCTION = "rotate" - CATEGORY = "latent/transform" + CATEGORY = "model/latent/transform" def rotate(self, samples, rotation): s = samples.copy() @@ -1382,7 +1382,7 @@ class LatentFlip: RETURN_TYPES = ("LATENT",) FUNCTION = "flip" - CATEGORY = "latent/transform" + CATEGORY = "model/latent/transform" def flip(self, samples, flip_method): s = samples.copy() @@ -1407,7 +1407,7 @@ class LatentComposite: RETURN_TYPES = ("LATENT",) FUNCTION = "composite" - CATEGORY = "latent" + CATEGORY = "model/latent" def composite(self, samples_to, samples_from, x, y, composite_method="normal", feather=0): x = x // 8 @@ -1494,7 +1494,7 @@ class LatentCrop: RETURN_TYPES = ("LATENT",) FUNCTION = "crop" - CATEGORY = "latent/transform" + CATEGORY = "model/latent/transform" def crop(self, samples, width, height, x, y): s = samples.copy() @@ -1524,7 +1524,7 @@ class SetLatentNoiseMask: RETURN_TYPES = ("LATENT",) FUNCTION = "set_mask" - CATEGORY = 
"latent/inpaint" + CATEGORY = "model/latent/inpaint" def set_mask(self, samples, mask): s = samples.copy() @@ -1578,7 +1578,7 @@ class KSampler: OUTPUT_TOOLTIPS = ("The denoised latent.",) FUNCTION = "sample" - CATEGORY = "sampling" + CATEGORY = "model/sampling" DESCRIPTION = "Uses the provided model, positive and negative conditioning to denoise the latent image." SEARCH_ALIASES = ["sampler", "sample", "generate", "denoise", "diffuse", "txt2img", "img2img"] @@ -1608,7 +1608,7 @@ class KSamplerAdvanced: RETURN_TYPES = ("LATENT",) FUNCTION = "sample" - CATEGORY = "sampling" + CATEGORY = "model/sampling" def sample(self, model, add_noise, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, start_at_step, end_at_step, return_with_leftover_noise, denoise=1.0): force_full_denoise = True @@ -2420,6 +2420,7 @@ async def init_builtin_extra_nodes(): "nodes_context_windows.py", "nodes_qwen.py", "nodes_chroma_radiance.py", + "nodes_pid.py", "nodes_model_patch.py", "nodes_easycache.py", "nodes_audio_encoder.py", @@ -2454,6 +2455,7 @@ async def init_builtin_extra_nodes(): "nodes_save_3d.py", "nodes_moge.py", "nodes_mediapipe.py", + "nodes_gaussian_splat.py", ] import_failed = [] diff --git a/openapi.yaml b/openapi.yaml index 502e518c7..f801a39d9 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -275,7 +275,10 @@ paths: responses: "200": description: Queue updated - + content: + application/json: + schema: + $ref: "#/components/schemas/QueueManageResponse" '400': description: Invalid request parameters content: @@ -3092,18 +3095,34 @@ paths: application/json: schema: type: object - required: - - asset_ids properties: + job_ids: + type: array + items: + type: string + description: Job IDs whose associated assets should all be included in the ZIP bundle. asset_ids: type: array items: type: string format: uuid - description: IDs of assets to export + description: Asset IDs to include in the ZIP bundle. 
Additive to assets associated with provided job IDs. export_name: type: string description: Name for the export archive + naming_strategy: + type: string + enum: [group_by_job_id, preserve, asset_id, group_by_job_time] + default: group_by_job_time + description: "Strategy for naming files in the ZIP: group by job ID, preserve original names, use the asset ID, or group by job creation time." + job_asset_name_filters: + type: object + additionalProperties: + type: array + minItems: 1 + items: + type: string + description: Optional per-job asset name filters. When provided for a job ID, only assets whose name matches one of the listed names are included. responses: "202": description: Export task accepted @@ -3575,10 +3594,7 @@ paths: content: application/json: schema: - type: array - items: - $ref: "#/components/schemas/HubLabel" - + $ref: "#/components/schemas/HubLabelListResponse" '400': description: Bad request (e.g. invalid type parameter) content: @@ -7466,6 +7482,25 @@ components: type: string description: Array of prompt IDs to delete from queue + QueueManageResponse: + type: object + x-runtime: [cloud] + description: >- + [cloud-only] Result of a queue mutation. The Cloud runtime returns which + items were deleted and whether the queue was cleared; local ComfyUI + returns an empty 200 body. + properties: + deleted: + type: array + nullable: true + items: + type: string + description: Prompt IDs that were deleted from the queue. + cleared: + type: boolean + nullable: true + description: Whether the queue was cleared. + # ------------------------------------------------------------------- # History # ------------------------------------------------------------------- @@ -7546,6 +7581,16 @@ components: outputs_count: type: integer description: Total number of output files + workflow_id: + type: string + nullable: true + x-runtime: [cloud] + description: "[cloud-only] UUID of the Cloud workflow entity this job is associated with. Local ComfyUI returns null." 
+ execution_error: + x-runtime: [cloud] + description: "[cloud-only] Detailed execution error from ComfyUI for failed jobs. Absent on local ComfyUI." + allOf: + - $ref: "#/components/schemas/ExecutionError" JobDetailResponse: type: object @@ -10433,6 +10478,19 @@ components: - custom_node description: Label category. + HubLabelListResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response wrapper for the available Hub label catalog.' + required: + - labels + properties: + labels: + type: array + items: + $ref: '#/components/schemas/HubLabelInfo' + description: Available labels, optionally filtered by type. + HubProfileSummary: type: object x-runtime: [cloud] diff --git a/requirements.txt b/requirements.txt index 2ca6d8929..14bba1437 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ comfyui-frontend-package==1.44.19 -comfyui-workflow-templates==0.9.82 -comfyui-embedded-docs==0.5.1 +comfyui-workflow-templates==0.9.91 +comfyui-embedded-docs==0.5.2 torch torchsde torchvision @@ -21,9 +21,9 @@ psutil alembic SQLAlchemy>=2.0.0 filelock -av>=14.2.0 -comfy-kitchen>=0.2.8 -comfy-aimdo==0.4.5 +av>=16.0.0 +comfy-kitchen==0.2.10 +comfy-aimdo==0.4.7 requests simpleeval>=1.0.0 blake3