Merge branch 'master' into add-openapi-spec
Some checks are pending
Python Linting / Run Ruff (push) Waiting to run
Python Linting / Run Pylint (push) Waiting to run
Build package / Build Test (3.10) (push) Waiting to run
Build package / Build Test (3.11) (push) Waiting to run
Build package / Build Test (3.12) (push) Waiting to run
Build package / Build Test (3.13) (push) Waiting to run
Build package / Build Test (3.14) (push) Waiting to run

This commit is contained in:
Matt Miller 2026-04-22 14:18:24 -07:00 committed by GitHub
commit 8cdb41b6b2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
38 changed files with 1819 additions and 316 deletions

View File

@ -195,7 +195,9 @@ The portable above currently comes with python 3.13 and pytorch cuda 13.0. Updat
#### Alternative Downloads:
[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
[Experimental portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).

View File

@ -67,7 +67,7 @@ class InternalRoutes:
(entry for entry in os.scandir(directory) if is_visible_file(entry)),
key=lambda entry: -entry.stat().st_mtime
)
return web.json_response([entry.name for entry in sorted_files], status=200)
return web.json_response([f"{entry.name} [{directory_type}]" for entry in sorted_files], status=200)
def get_app(self):

View File

@ -15,7 +15,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=device) / dim
omega = 1.0 / (theta**scale)
out = torch.einsum("...n,d->...nd", pos, omega)
out = torch.einsum("...n,d->...nd", pos.to(device), omega)
out = torch.stack([torch.cos(out), torch.sin(out)], dim=0)
return out.to(dtype=torch.float32, device=pos.device)
@ -118,8 +118,6 @@ class ErnieImageAttention(nn.Module):
query = apply_rotary_emb(query, image_rotary_emb)
key = apply_rotary_emb(key, image_rotary_emb)
query, key = query.to(x.dtype), key.to(x.dtype)
q_flat = query.reshape(B, S, -1)
k_flat = key.reshape(B, S, -1)
@ -161,16 +159,16 @@ class ErnieImageSharedAdaLNBlock(nn.Module):
residual = x
x_norm = self.adaLN_sa_ln(x)
x_norm = (x_norm.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype)
x_norm = x_norm * (1 + scale_msa) + shift_msa
attn_out = self.self_attention(x_norm, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype)
x = residual + gate_msa * attn_out
residual = x
x_norm = self.adaLN_mlp_ln(x)
x_norm = (x_norm.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype)
x_norm = x_norm * (1 + scale_mlp) + shift_mlp
return residual + (gate_mlp.float() * self.mlp(x_norm).float()).to(x.dtype)
return residual + gate_mlp * self.mlp(x_norm)
class ErnieImageAdaLNContinuous(nn.Module):
def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
@ -183,7 +181,7 @@ class ErnieImageAdaLNContinuous(nn.Module):
def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
scale, shift = self.linear(conditioning).chunk(2, dim=-1)
x = self.norm(x)
x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
x = torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1))
return x
class ErnieImageModel(nn.Module):

View File

@ -4,9 +4,6 @@ import math
import torch
import torchaudio
import comfy.model_management
import comfy.model_patcher
import comfy.utils as utils
from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
from comfy.ldm.lightricks.vae.causal_audio_autoencoder import (
@ -43,30 +40,6 @@ class AudioVAEComponentConfig:
return cls(autoencoder=audio_config, vocoder=vocoder_config)
class ModelDeviceManager:
"""Manages device placement and GPU residency for the composed model."""
def __init__(self, module: torch.nn.Module):
load_device = comfy.model_management.get_torch_device()
offload_device = comfy.model_management.vae_offload_device()
self.patcher = comfy.model_patcher.ModelPatcher(module, load_device, offload_device)
def ensure_model_loaded(self) -> None:
comfy.model_management.free_memory(
self.patcher.model_size(),
self.patcher.load_device,
)
comfy.model_management.load_model_gpu(self.patcher)
def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor:
return tensor.to(self.patcher.load_device)
@property
def load_device(self):
return self.patcher.load_device
class AudioLatentNormalizer:
"""Applies per-channel statistics in patch space and restores original layout."""
@ -132,23 +105,17 @@ class AudioPreprocessor:
class AudioVAE(torch.nn.Module):
"""High-level Audio VAE wrapper exposing encode and decode entry points."""
def __init__(self, state_dict: dict, metadata: dict):
def __init__(self, metadata: dict):
super().__init__()
component_config = AudioVAEComponentConfig.from_metadata(metadata)
vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True)
vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True)
self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder)
if "bwe" in component_config.vocoder:
self.vocoder = VocoderWithBWE(config=component_config.vocoder)
else:
self.vocoder = Vocoder(config=component_config.vocoder)
self.autoencoder.load_state_dict(vae_sd, strict=False)
self.vocoder.load_state_dict(vocoder_sd, strict=False)
autoencoder_config = self.autoencoder.get_config()
self.normalizer = AudioLatentNormalizer(
AudioPatchifier(
@ -168,18 +135,12 @@ class AudioVAE(torch.nn.Module):
n_fft=autoencoder_config["n_fft"],
)
self.device_manager = ModelDeviceManager(self)
def encode(self, audio: dict) -> torch.Tensor:
def encode(self, audio, sample_rate=44100) -> torch.Tensor:
"""Encode a waveform dictionary into normalized latent tensors."""
waveform = audio["waveform"]
waveform_sample_rate = audio["sample_rate"]
waveform = audio
waveform_sample_rate = sample_rate
input_device = waveform.device
# Ensure that Audio VAE is loaded on the correct device.
self.device_manager.ensure_model_loaded()
waveform = self.device_manager.move_to_load_device(waveform)
expected_channels = self.autoencoder.encoder.in_channels
if waveform.shape[1] != expected_channels:
if waveform.shape[1] == 1:
@ -190,7 +151,7 @@ class AudioVAE(torch.nn.Module):
)
mel_spec = self.preprocessor.waveform_to_mel(
waveform, waveform_sample_rate, device=self.device_manager.load_device
waveform, waveform_sample_rate, device=waveform.device
)
latents = self.autoencoder.encode(mel_spec)
@ -204,17 +165,13 @@ class AudioVAE(torch.nn.Module):
"""Decode normalized latent tensors into an audio waveform."""
original_shape = latents.shape
# Ensure that Audio VAE is loaded on the correct device.
self.device_manager.ensure_model_loaded()
latents = self.device_manager.move_to_load_device(latents)
latents = self.normalizer.denormalize(latents)
target_shape = self.target_shape_from_latents(original_shape)
mel_spec = self.autoencoder.decode(latents, target_shape=target_shape)
waveform = self.run_vocoder(mel_spec)
return self.device_manager.move_to_load_device(waveform)
return waveform
def target_shape_from_latents(self, latents_shape):
batch, _, time, _ = latents_shape

View File

@ -34,6 +34,16 @@ class TimestepBlock(nn.Module):
#This is needed because accelerate makes a copy of transformer_options which breaks "transformer_index"
def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, output_shape=None, time_context=None, num_video_frames=None, image_only_indicator=None):
for layer in ts:
if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
found_patched = False
for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
if isinstance(layer, class_type):
x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
found_patched = True
break
if found_patched:
continue
if isinstance(layer, VideoResBlock):
x = layer(x, emb, num_video_frames, image_only_indicator)
elif isinstance(layer, TimestepBlock):
@ -49,15 +59,6 @@ def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, out
elif isinstance(layer, Upsample):
x = layer(x, output_shape=output_shape)
else:
if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
found_patched = False
for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
if isinstance(layer, class_type):
x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
found_patched = True
break
if found_patched:
continue
x = layer(x)
return x
@ -894,6 +895,12 @@ class UNetModel(nn.Module):
h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
h = apply_control(h, control, 'middle')
if "middle_block_after_patch" in transformer_patches:
patch = transformer_patches["middle_block_after_patch"]
for p in patch:
out = p({"h": h, "x": x, "emb": emb, "context": context, "y": y,
"timesteps": timesteps, "transformer_options": transformer_options})
h = out["h"]
for id, module in enumerate(self.output_blocks):
transformer_options["block"] = ("output", id)
@ -905,8 +912,9 @@ class UNetModel(nn.Module):
for p in patch:
h, hsp = p(h, hsp, transformer_options)
h = th.cat([h, hsp], dim=1)
del hsp
if hsp is not None:
h = th.cat([h, hsp], dim=1)
del hsp
if len(hs) > 0:
output_shape = hs[-1].shape
else:

View File

View File

@ -0,0 +1,226 @@
import torch
import torch.nn as nn
from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
from comfy.ldm.modules.diffusionmodules.openaimodel import Downsample, TimestepEmbedSequential, ResBlock, SpatialTransformer
from comfy.ldm.modules.attention import optimized_attention
class ZeroSFT(nn.Module):
    """Zero-initialized SFT (spatial feature transform) adapter used by SUPIR.

    Injects control features ``c`` into UNet features ``h`` with SPADE-style
    conditioning: ``h`` is group-normalized, then modulated per-pixel by
    ``gamma``/``beta`` predicted from ``c``, and finally blended with the
    unmodulated features by ``control_scale``.
    """

    def __init__(self, label_nc, norm_nc, concat_channels=0, dtype=None, device=None, operations=None):
        super().__init__()
        kernel_size = 3
        padding = kernel_size // 2
        hidden_nc = 128
        # Parameter-free normalization over the (optionally concatenated) features.
        self.param_free_norm = operations.GroupNorm(32, norm_nc + concat_channels, dtype=dtype, device=device)
        self.mlp_shared = nn.Sequential(
            operations.Conv2d(label_nc, hidden_nc, kernel_size=kernel_size, padding=padding, dtype=dtype, device=device),
            nn.SiLU(),
        )
        # The "zero_*" convs carry the checkpoint's zero-initialized weights;
        # submodules are created in the original attribute order so state_dict
        # keys (and any seeded init) line up exactly.
        self.zero_mul = operations.Conv2d(hidden_nc, norm_nc + concat_channels, kernel_size=kernel_size, padding=padding, dtype=dtype, device=device)
        self.zero_add = operations.Conv2d(hidden_nc, norm_nc + concat_channels, kernel_size=kernel_size, padding=padding, dtype=dtype, device=device)
        self.zero_conv = operations.Conv2d(label_nc, norm_nc, 1, 1, 0, dtype=dtype, device=device)
        self.pre_concat = concat_channels != 0

    def forward(self, c, h, h_ori=None, control_scale=1):
        """Modulate ``h`` with control features ``c``; optionally fuse skip features ``h_ori``."""
        concat_first = h_ori is not None and self.pre_concat
        # Keep the unmodulated features around for the final blend.
        h_raw = torch.cat([h_ori, h], dim=1) if concat_first else h

        h = h + self.zero_conv(c)
        if concat_first:
            h = torch.cat([h_ori, h], dim=1)

        shared = self.mlp_shared(c)
        gamma = self.zero_mul(shared)
        beta = self.zero_add(shared)
        h = self.param_free_norm(h)
        # h * (1 + gamma) + beta, fused into a single addcmul.
        h = torch.addcmul(h + beta, h, gamma)

        if h_ori is not None and not self.pre_concat:
            h = torch.cat([h_ori, h], dim=1)
        return torch.lerp(h_raw, h, control_scale)
class _CrossAttnInner(nn.Module):
    """Inner cross-attention module matching the state_dict layout of the
    original CrossAttention (``to_q``/``to_k``/``to_v``/``to_out.0``)."""

    def __init__(self, query_dim, context_dim, heads, dim_head, dtype=None, device=None, operations=None):
        super().__init__()
        self.heads = heads
        inner_dim = heads * dim_head
        # Projections are registered in checkpoint order so keys line up.
        self.to_q = operations.Linear(query_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.to_k = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.to_v = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
        # Sequential wrapper preserves the "to_out.0" key prefix of the original layout.
        self.to_out = nn.Sequential(operations.Linear(inner_dim, query_dim, dtype=dtype, device=device))

    def forward(self, x, context):
        """Cross-attend queries from ``x`` to keys/values from ``context``."""
        attn_out = optimized_attention(self.to_q(x), self.to_k(context), self.to_v(context), self.heads)
        return self.to_out(attn_out)
class ZeroCrossAttn(nn.Module):
    """Zero-initialized cross-attention adapter: UNet features ``x`` attend
    (as queries) over control features ``context``; the attention output is
    added back to ``x`` scaled by ``control_scale``."""

    def __init__(self, context_dim, query_dim, dtype=None, device=None, operations=None):
        super().__init__()
        dim_head = 64
        # Head count is derived so heads * dim_head == query_dim.
        self.attn = _CrossAttnInner(query_dim, context_dim, query_dim // dim_head, dim_head,
                                    dtype=dtype, device=device, operations=operations)
        self.norm1 = operations.GroupNorm(32, query_dim, dtype=dtype, device=device)
        self.norm2 = operations.GroupNorm(32, context_dim, dtype=dtype, device=device)

    def forward(self, context, x, control_scale=1):
        _, _, height, width = x.shape
        # Normalize, flatten spatial dims to a token sequence, attend, then
        # fold the sequence back into the (H, W) grid.
        queries = self.norm1(x).flatten(2).transpose(1, 2)
        keys = self.norm2(context).flatten(2).transpose(1, 2)
        delta = self.attn(queries, keys).transpose(1, 2).unflatten(2, (height, width))
        return x + delta * control_scale
class GLVControl(nn.Module):
    """SUPIR's Guided Latent Vector control encoder. Truncated UNet (input + middle blocks only).

    Builds only the encoder half of a UNet (input_blocks + middle_block) so
    checkpoint keys load directly; there are no output (decoder) blocks.
    ``forward`` returns the list of intermediate feature maps, one per input
    block plus the middle block, for downstream adapter fusion.
    """

    def __init__(
        self,
        in_channels=4,
        model_channels=320,
        num_res_blocks=2,
        attention_resolutions=(4, 2),
        channel_mult=(1, 2, 4),
        num_head_channels=64,
        transformer_depth=(1, 2, 10),
        context_dim=2048,
        adm_in_channels=2816,
        use_linear_in_transformer=True,
        use_checkpoint=False,
        dtype=None,
        device=None,
        operations=None,
        **kwargs,
    ):
        super().__init__()
        self.model_channels = model_channels
        time_embed_dim = model_channels * 4
        # MLP mapping the sinusoidal timestep embedding to time_embed_dim.
        self.time_embed = nn.Sequential(
            operations.Linear(model_channels, time_embed_dim, dtype=dtype, device=device),
            nn.SiLU(),
            operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
        )
        # Embedding for the conditioning vector `y` (adm_in_channels wide).
        # The nested Sequential preserves the "label_emb.0.0" checkpoint key layout.
        self.label_emb = nn.Sequential(
            nn.Sequential(
                operations.Linear(adm_in_channels, time_embed_dim, dtype=dtype, device=device),
                nn.SiLU(),
                operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
            )
        )
        # Stem: lift the input latent to model_channels.
        self.input_blocks = nn.ModuleList([
            TimestepEmbedSequential(
                operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
            )
        ])
        ch = model_channels
        ds = 1  # current downsample factor relative to the input resolution
        for level, mult in enumerate(channel_mult):
            for nr in range(num_res_blocks):
                layers = [
                    ResBlock(ch, time_embed_dim, 0, out_channels=mult * model_channels,
                             dtype=dtype, device=device, operations=operations)
                ]
                ch = mult * model_channels
                # Attach a transformer only at the configured downsample factors.
                if ds in attention_resolutions:
                    num_heads = ch // num_head_channels
                    layers.append(
                        SpatialTransformer(ch, num_heads, num_head_channels,
                                           depth=transformer_depth[level], context_dim=context_dim,
                                           use_linear=use_linear_in_transformer,
                                           use_checkpoint=use_checkpoint,
                                           dtype=dtype, device=device, operations=operations)
                    )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
            # Downsample between resolution levels (none after the last level).
            if level != len(channel_mult) - 1:
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        Downsample(ch, True, out_channels=ch, dtype=dtype, device=device, operations=operations)
                    )
                )
                ds *= 2
        num_heads = ch // num_head_channels
        self.middle_block = TimestepEmbedSequential(
            ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
            SpatialTransformer(ch, num_heads, num_head_channels,
                               depth=transformer_depth[-1], context_dim=context_dim,
                               use_linear=use_linear_in_transformer,
                               use_checkpoint=use_checkpoint,
                               dtype=dtype, device=device, operations=operations),
            ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
        )
        # Projects the control hint into feature space before the first block.
        self.input_hint_block = TimestepEmbedSequential(
            operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
        )

    def forward(self, x, timesteps, xt, context=None, y=None, **kwargs):
        """Encode the control hint.

        Args:
            x: control hint latent (fed through input_hint_block).
            timesteps: diffusion timesteps for the embedding.
            xt: the noisy latent currently being denoised (main feature path).
            context: cross-attention context for the transformers.
            y: conditioning vector for label_emb.

        Returns:
            list of feature maps: one per input block, plus the middle block output.
        """
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
        emb = self.time_embed(t_emb) + self.label_emb(y)
        guided_hint = self.input_hint_block(x, emb, context)
        hs = []
        h = xt
        for module in self.input_blocks:
            if guided_hint is not None:
                # Add the hint once, after the first block, then discard it.
                h = module(h, emb, context)
                h += guided_hint
                guided_hint = None
            else:
                h = module(h, emb, context)
            hs.append(h)
        h = self.middle_block(h, emb, context)
        hs.append(h)
        return hs
class SUPIR(nn.Module):
    """SUPIR wrapper bundling the GLVControl encoder with its adapter stack.

    Attribute names mirror the original SUPIR checkpoint layout so its state
    dict loads directly:
        control_model.*   -> GLVControl
        project_modules.* -> nn.ModuleList of ZeroSFT / ZeroCrossAttn
    """

    def __init__(self, device=None, dtype=None, operations=None):
        super().__init__()
        self.control_model = GLVControl(dtype=dtype, device=device, operations=operations)

        channel_scale = 2
        cond_output_channels = [320] * 4 + [640] * 3 + [1280] * 3
        project_channels = [int(c * channel_scale) for c in [160] * 4 + [320] * 3 + [640] * 3]
        concat_channels = [320] * 2 + [640] * 3 + [1280] * 4 + [0]

        # One ZeroSFT adapter per control feature map, in checkpoint order.
        self.project_modules = nn.ModuleList(
            ZeroSFT(
                proj_ch, cond_ch,
                concat_channels=cat_ch,
                dtype=dtype, device=device, operations=operations,
            )
            for proj_ch, cond_ch, cat_ch in zip(project_channels, cond_output_channels, concat_channels)
        )

        # Interleave the two ZeroCrossAttn adapters. Insert order is
        # significant: inserting at 6 before 3 keeps index 3 referring to the
        # pre-insert list, reproducing the original module numbering.
        for idx in (6, 3):
            self.project_modules.insert(idx, ZeroCrossAttn(
                cond_output_channels[idx], concat_channels[idx],
                dtype=dtype, device=device, operations=operations,
            ))

View File

@ -0,0 +1,103 @@
import torch
from comfy.ldm.modules.diffusionmodules.openaimodel import Upsample
class SUPIRPatch:
    """
    Holds GLVControl (control encoder) + project_modules (ZeroSFT/ZeroCrossAttn adapters).
    Runs GLVControl lazily on first patch invocation per step, applies adapters through
    middle_block_after_patch, output_block_merge_patch, and forward_timestep_embed_patch.

    State notes:
      - cached_features: list of GLVControl outputs for the current step; reset
        at the start of each middle_after call.
      - adapter_idx / control_idx: walked backwards (deepest feature first) as
        successive patch callbacks consume adapters during one UNet pass.
      - current_scale / active: set per step in middle_after and read by the
        other callbacks.
    """

    # Normalization constant for the sigma -> [0, 1] strength interpolation in
    # _get_control_scale. NOTE(review): presumably the sampler schedule's
    # maximum sigma — confirm against the schedule this patch is used with.
    SIGMA_MAX = 14.6146

    def __init__(self, model_patch, project_modules, hint_latent, strength_start, strength_end):
        self.model_patch = model_patch  # CoreModelPatcher wrapping GLVControl
        self.project_modules = project_modules  # nn.ModuleList of ZeroSFT/ZeroCrossAttn
        self.hint_latent = hint_latent  # encoded LQ image latent
        # Strength is interpolated from strength_start (high sigma) toward
        # strength_end (sigma -> 0) over the sampling schedule.
        self.strength_start = strength_start
        self.strength_end = strength_end
        self.cached_features = None
        self.adapter_idx = 0
        self.control_idx = 0
        self.current_control_idx = 0
        self.active = True

    def _ensure_features(self, kwargs):
        """Run GLVControl on first call per step, cache results."""
        if self.cached_features is not None:
            return
        x = kwargs["x"]
        b = x.shape[0]
        hint = self.hint_latent.to(device=x.device, dtype=x.dtype)
        # Match the hint's batch size to the latent: expand a singleton batch,
        # otherwise tile and truncate to exactly b entries.
        if hint.shape[0] != b:
            hint = hint.expand(b, -1, -1, -1) if hint.shape[0] == 1 else hint.repeat((b + hint.shape[0] - 1) // hint.shape[0], 1, 1, 1)[:b]
        self.cached_features = self.model_patch.model.control_model(
            hint, kwargs["timesteps"], x,
            kwargs["context"], kwargs["y"]
        )
        # Start both cursors at the deepest feature/adapter; callbacks walk
        # them backwards as the UNet decodes.
        self.adapter_idx = len(self.project_modules) - 1
        self.control_idx = len(self.cached_features) - 1

    def _get_control_scale(self, kwargs):
        # Interpolate control strength from the current sigma; fall back to
        # strength_end when sigma is unavailable or no interpolation is needed.
        if self.strength_start == self.strength_end:
            return self.strength_end
        sigma = kwargs["transformer_options"].get("sigmas")
        if sigma is None:
            return self.strength_end
        s = sigma[0].item() if sigma.dim() > 0 else sigma.item()
        # t = 1 at (or above) SIGMA_MAX, 0 as sigma -> 0.
        t = min(s / self.SIGMA_MAX, 1.0)
        return t * (self.strength_start - self.strength_end) + self.strength_end

    def middle_after(self, kwargs):
        """middle_block_after_patch: run GLVControl lazily, apply last adapter after middle block."""
        self.cached_features = None  # reset from previous step
        self.current_scale = self._get_control_scale(kwargs)
        self.active = self.current_scale > 0
        if not self.active:
            # Skip the control pass entirely when strength has decayed to 0.
            return {"h": kwargs["h"]}
        self._ensure_features(kwargs)
        h = kwargs["h"]
        h = self.project_modules[self.adapter_idx](
            self.cached_features[self.control_idx], h, control_scale=self.current_scale
        )
        self.adapter_idx -= 1
        self.control_idx -= 1
        return {"h": h}

    def output_block(self, h, hsp, transformer_options):
        """output_block_patch: ZeroSFT adapter fusion replaces cat([h, hsp]). Returns (h, None) to skip cat."""
        if not self.active:
            return h, hsp
        # Remember which control feature this block used so pre_upsample can
        # reuse it before the next upsample.
        self.current_control_idx = self.control_idx
        h = self.project_modules[self.adapter_idx](
            self.cached_features[self.control_idx], hsp, h, control_scale=self.current_scale
        )
        self.adapter_idx -= 1
        self.control_idx -= 1
        return h, None

    def pre_upsample(self, layer, x, emb, context, transformer_options, output_shape, *args, **kw):
        """forward_timestep_embed_patch for Upsample: extra cross-attn adapter before upsample."""
        block_type, _ = transformer_options["block"]
        if block_type == "output" and self.active and self.cached_features is not None:
            x = self.project_modules[self.adapter_idx](
                self.cached_features[self.current_control_idx], x, control_scale=self.current_scale
            )
            self.adapter_idx -= 1
        return layer(x, output_shape=output_shape)

    def to(self, device_or_dtype):
        # Mirrors the model-patch ``to`` protocol used by the model patcher.
        if isinstance(device_or_dtype, torch.device):
            # Cached features live on the old device; drop them so they are
            # recomputed on the next step.
            self.cached_features = None
            if self.hint_latent is not None:
                self.hint_latent = self.hint_latent.to(device_or_dtype)
        return self

    def models(self):
        # Expose the wrapped patcher so memory management can load/offload it.
        return [self.model_patch]

    def register(self, model_patcher):
        """Register all patches on a cloned model patcher."""
        model_patcher.set_model_patch(self.middle_after, "middle_block_after_patch")
        model_patcher.set_model_output_block_patch(self.output_block)
        model_patcher.set_model_patch((Upsample, self.pre_upsample), "forward_timestep_embed_patch")

View File

@ -506,6 +506,10 @@ class ModelPatcher:
def set_model_noise_refiner_patch(self, patch):
self.set_model_patch(patch, "noise_refiner")
def set_model_middle_block_after_patch(self, patch):
self.set_model_patch(patch, "middle_block_after_patch")
def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs):
rope_options = self.model_options["transformer_options"].get("rope_options", {})
rope_options["scale_x"] = scale_x

View File

@ -1151,7 +1151,7 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
if param is None:
continue
p = fn(param)
if p.is_inference():
if (not torch.is_inference_mode_enabled()) and p.is_inference():
p = p.clone()
self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False))
for key, buf in self._buffers.items():

View File

@ -12,6 +12,7 @@ from .ldm.cascade.stage_c_coder import StageC_coder
from .ldm.audio.autoencoder import AudioOobleckVAE
import comfy.ldm.genmo.vae.model
import comfy.ldm.lightricks.vae.causal_video_autoencoder
import comfy.ldm.lightricks.vae.audio_vae
import comfy.ldm.cosmos.vae
import comfy.ldm.wan.vae
import comfy.ldm.wan.vae2_2
@ -805,6 +806,24 @@ class VAE:
self.downscale_index_formula = (4, 8, 8)
self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
elif "vocoder.resblocks.0.convs1.0.weight" in sd or "vocoder.vocoder.resblocks.0.convs1.0.weight" in sd: # LTX Audio
sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder."})
self.first_stage_model = comfy.ldm.lightricks.vae.audio_vae.AudioVAE(metadata=metadata)
self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
self.latent_channels = self.first_stage_model.latent_channels
self.audio_sample_rate_output = self.first_stage_model.output_sample_rate
self.autoencoder = self.first_stage_model.autoencoder # TODO: remove hack for ltxv custom nodes
self.output_channels = 2
self.pad_channel_value = "replicate"
self.upscale_ratio = 4096
self.downscale_ratio = 4096
self.latent_dim = 2
self.process_output = lambda audio: audio
self.process_input = lambda audio: audio
self.working_dtypes = [torch.float32]
self.disable_offload = True
self.extra_1d_channel = 16
else:
logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
self.first_stage_model = None

View File

@ -35,4 +35,4 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
model_options = model_options.copy()
model_options["quantization_metadata"] = llama_quantization_metadata
super().__init__(device=device, dtype=dtype, model_options=model_options)
return ErnieTEModel
return ErnieTEModel_

View File

@ -158,10 +158,17 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
("Custom", None, None),
]
# Seedance 2.0 reference video pixel count limits per model.
# Seedance 2.0 reference video pixel count limits per model and output resolution.
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
"dreamina-seedance-2-0-260128": {"min": 409_600, "max": 927_408},
"dreamina-seedance-2-0-fast-260128": {"min": 409_600, "max": 927_408},
"dreamina-seedance-2-0-260128": {
"480p": {"min": 409_600, "max": 927_408},
"720p": {"min": 409_600, "max": 927_408},
"1080p": {"min": 409_600, "max": 2_073_600},
},
"dreamina-seedance-2-0-fast-260128": {
"480p": {"min": 409_600, "max": 927_408},
"720p": {"min": 409_600, "max": 927_408},
},
}
# The time in this dictionary are given for 10 seconds duration.

View File

@ -35,6 +35,7 @@ from comfy_api_nodes.util import (
get_number_of_images,
image_tensor_pair_to_batch,
poll_op,
resize_video_to_pixel_budget,
sync_op,
upload_audio_to_comfyapi,
upload_image_to_comfyapi,
@ -69,9 +70,12 @@ DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-2504
logger = logging.getLogger(__name__)
def _validate_ref_video_pixels(video: Input.Video, model_id: str, index: int) -> None:
"""Validate reference video pixel count against Seedance 2.0 model limits."""
limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: str, index: int) -> None:
"""Validate reference video pixel count against Seedance 2.0 model limits for the selected resolution."""
model_limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
if not model_limits:
return
limits = model_limits.get(resolution)
if not limits:
return
try:
@ -1066,7 +1070,7 @@ PRICE_BADGE_VIDEO = IO.PriceBadge(
)
def _seedance2_text_inputs():
def _seedance2_text_inputs(resolutions: list[str]):
return [
IO.String.Input(
"prompt",
@ -1076,7 +1080,7 @@ def _seedance2_text_inputs():
),
IO.Combo.Input(
"resolution",
options=["480p", "720p"],
options=resolutions,
tooltip="Resolution of the output video.",
),
IO.Combo.Input(
@ -1114,8 +1118,8 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
IO.DynamicCombo.Input(
"model",
options=[
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs()),
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs()),
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs(["480p", "720p", "1080p"])),
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs(["480p", "720p"])),
],
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
),
@ -1152,11 +1156,14 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
(
$rate480 := 10044;
$rate720 := 21600;
$rate1080 := 48800;
$m := widgets.model;
$pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
$res := $lookup(widgets, "model.resolution");
$dur := $lookup(widgets, "model.duration");
$rate := $res = "720p" ? $rate720 : $rate480;
$rate := $res = "1080p" ? $rate1080 :
$res = "720p" ? $rate720 :
$rate480;
$cost := $dur * $rate * $pricePer1K / 1000;
{"type": "usd", "usd": $cost, "format": {"approximate": true}}
)
@ -1195,6 +1202,7 @@ class ByteDance2TextToVideoNode(IO.ComfyNode):
status_extractor=lambda r: r.status,
price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
poll_interval=9,
max_poll_attempts=180,
)
return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
@ -1212,8 +1220,8 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
IO.DynamicCombo.Input(
"model",
options=[
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs()),
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs()),
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs(["480p", "720p", "1080p"])),
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs(["480p", "720p"])),
],
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
),
@ -1259,11 +1267,14 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
(
$rate480 := 10044;
$rate720 := 21600;
$rate1080 := 48800;
$m := widgets.model;
$pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
$res := $lookup(widgets, "model.resolution");
$dur := $lookup(widgets, "model.duration");
$rate := $res = "720p" ? $rate720 : $rate480;
$rate := $res = "1080p" ? $rate1080 :
$res = "720p" ? $rate720 :
$rate480;
$cost := $dur * $rate * $pricePer1K / 1000;
{"type": "usd", "usd": $cost, "format": {"approximate": true}}
)
@ -1324,13 +1335,14 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
status_extractor=lambda r: r.status,
price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
poll_interval=9,
max_poll_attempts=180,
)
return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
def _seedance2_reference_inputs():
def _seedance2_reference_inputs(resolutions: list[str]):
return [
*_seedance2_text_inputs(),
*_seedance2_text_inputs(resolutions),
IO.Autogrow.Input(
"reference_images",
template=IO.Autogrow.TemplateNames(
@ -1365,6 +1377,14 @@ def _seedance2_reference_inputs():
min=0,
),
),
IO.Boolean.Input(
"auto_downscale",
default=False,
advanced=True,
optional=True,
tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
"for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
),
]
@ -1382,8 +1402,8 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
IO.DynamicCombo.Input(
"model",
options=[
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_reference_inputs()),
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_reference_inputs()),
IO.DynamicCombo.Option("Seedance 2.0", _seedance2_reference_inputs(["480p", "720p", "1080p"])),
IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_reference_inputs(["480p", "720p"])),
],
tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
),
@ -1423,13 +1443,16 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
(
$rate480 := 10044;
$rate720 := 21600;
$rate1080 := 48800;
$m := widgets.model;
$hasVideo := $lookup(inputGroups, "model.reference_videos") > 0;
$noVideoPricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
$videoPricePer1K := $contains($m, "fast") ? 0.004719 : 0.006149;
$res := $lookup(widgets, "model.resolution");
$dur := $lookup(widgets, "model.duration");
$rate := $res = "720p" ? $rate720 : $rate480;
$rate := $res = "1080p" ? $rate1080 :
$res = "720p" ? $rate720 :
$rate480;
$noVideoCost := $dur * $rate * $noVideoPricePer1K / 1000;
$minVideoFactor := $ceil($dur * 5 / 3);
$minVideoCost := $minVideoFactor * $rate * $videoPricePer1K / 1000;
@ -1469,10 +1492,23 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
model_id = SEEDANCE_MODELS[model["model"]]
has_video_input = len(reference_videos) > 0
if model.get("auto_downscale") and reference_videos:
max_px = (
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {})
.get(model["resolution"], {})
.get("max")
)
if max_px:
for key in reference_videos:
reference_videos[key] = resize_video_to_pixel_budget(
reference_videos[key], max_px
)
total_video_duration = 0.0
for i, key in enumerate(reference_videos, 1):
video = reference_videos[key]
_validate_ref_video_pixels(video, model_id, i)
_validate_ref_video_pixels(video, model_id, model["resolution"], i)
try:
dur = video.get_duration()
if dur < 1.8:
@ -1559,6 +1595,7 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
status_extractor=lambda r: r.status,
price_extractor=_seedance2_price_extractor(model_id, has_video_input=has_video_input),
poll_interval=9,
max_poll_attempts=180,
)
return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))

View File

@ -221,14 +221,17 @@ class TencentTextToModelNode(IO.ComfyNode):
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
obj_result = await download_and_extract_obj_zip(get_file_from_response(result.ResultFile3Ds, "obj").Url)
obj_file_response = get_file_from_response(result.ResultFile3Ds, "obj", raise_if_not_found=False)
obj_result = None
if obj_file_response:
obj_result = await download_and_extract_obj_zip(obj_file_response.Url)
return IO.NodeOutput(
f"{task_id}.glb",
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
),
obj_result.obj,
obj_result.texture,
obj_result.obj if obj_result else None,
obj_result.texture if obj_result else None,
)
@ -378,17 +381,30 @@ class TencentImageToModelNode(IO.ComfyNode):
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
obj_result = await download_and_extract_obj_zip(get_file_from_response(result.ResultFile3Ds, "obj").Url)
obj_file_response = get_file_from_response(result.ResultFile3Ds, "obj", raise_if_not_found=False)
if obj_file_response:
obj_result = await download_and_extract_obj_zip(obj_file_response.Url)
return IO.NodeOutput(
f"{task_id}.glb",
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
),
obj_result.obj,
obj_result.texture,
obj_result.metallic if obj_result.metallic is not None else torch.zeros(1, 1, 1, 3),
obj_result.normal if obj_result.normal is not None else torch.zeros(1, 1, 1, 3),
obj_result.roughness if obj_result.roughness is not None else torch.zeros(1, 1, 1, 3),
)
return IO.NodeOutput(
f"{task_id}.glb",
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
),
obj_result.obj,
obj_result.texture,
obj_result.metallic if obj_result.metallic is not None else torch.zeros(1, 1, 1, 3),
obj_result.normal if obj_result.normal is not None else torch.zeros(1, 1, 1, 3),
obj_result.roughness if obj_result.roughness is not None else torch.zeros(1, 1, 1, 3),
None,
None,
None,
None,
None,
)

View File

@ -363,7 +363,7 @@ class OpenAIGPTImage1(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="OpenAIGPTImage1",
display_name="OpenAI GPT Image 1.5",
display_name="OpenAI GPT Image 2",
category="api node/image/OpenAI",
description="Generates images synchronously via OpenAI's GPT Image endpoint.",
inputs=[
@ -427,8 +427,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
),
IO.Combo.Input(
"model",
options=["gpt-image-1", "gpt-image-1.5"],
default="gpt-image-1.5",
options=["gpt-image-1", "gpt-image-1.5", 'gpt-image-2'],
default="gpt-image-2",
optional=True,
),
],
@ -487,6 +487,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
price_extractor = calculate_tokens_price_image_1
elif model == "gpt-image-1.5":
price_extractor = calculate_tokens_price_image_1_5
elif model == "gpt-image-2":
price_extractor = calculate_tokens_price_image_1_5
else:
raise ValueError(f"Unknown model: {model}")

View File

@ -17,6 +17,44 @@ from comfy_api_nodes.util import (
)
from comfy_extras.nodes_images import SVG
_ARROW_MODELS = ["arrow-1.1", "arrow-1.1-max", "arrow-preview"]
def _arrow_sampling_inputs():
    """Shared sampling inputs for all Arrow model variants.

    Returns the three advanced sampling widgets (temperature, top_p,
    presence_penalty) used by every Arrow DynamicCombo option, so the
    list is declared once instead of duplicated per model option.
    """
    return [
        IO.Float.Input(
            "temperature",
            default=1.0,
            min=0.0,
            max=2.0,
            step=0.1,
            display_mode=IO.NumberDisplay.slider,
            tooltip="Randomness control. Higher values increase randomness.",
            advanced=True,
        ),
        IO.Float.Input(
            "top_p",
            default=1.0,
            min=0.05,
            max=1.0,
            step=0.05,
            display_mode=IO.NumberDisplay.slider,
            tooltip="Nucleus sampling parameter.",
            advanced=True,
        ),
        IO.Float.Input(
            "presence_penalty",
            default=0.0,
            min=-2.0,
            max=2.0,
            step=0.1,
            display_mode=IO.NumberDisplay.slider,
            tooltip="Token presence penalty.",
            advanced=True,
        ),
    ]
class QuiverTextToSVGNode(IO.ComfyNode):
@classmethod
@ -39,6 +77,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
default="",
tooltip="Additional style or formatting guidance.",
optional=True,
advanced=True,
),
IO.Autogrow.Input(
"reference_images",
@ -53,43 +92,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
),
IO.DynamicCombo.Input(
"model",
options=[
IO.DynamicCombo.Option(
"arrow-preview",
[
IO.Float.Input(
"temperature",
default=1.0,
min=0.0,
max=2.0,
step=0.1,
display_mode=IO.NumberDisplay.slider,
tooltip="Randomness control. Higher values increase randomness.",
advanced=True,
),
IO.Float.Input(
"top_p",
default=1.0,
min=0.05,
max=1.0,
step=0.05,
display_mode=IO.NumberDisplay.slider,
tooltip="Nucleus sampling parameter.",
advanced=True,
),
IO.Float.Input(
"presence_penalty",
default=0.0,
min=-2.0,
max=2.0,
step=0.1,
display_mode=IO.NumberDisplay.slider,
tooltip="Token presence penalty.",
advanced=True,
),
],
),
],
options=[IO.DynamicCombo.Option(m, _arrow_sampling_inputs()) for m in _ARROW_MODELS],
tooltip="Model to use for SVG generation.",
),
IO.Int.Input(
@ -112,7 +115,16 @@ class QuiverTextToSVGNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.429}""",
depends_on=IO.PriceBadgeDepends(widgets=["model"]),
expr="""
(
$contains(widgets.model, "max")
? {"type":"usd","usd":0.3575}
: $contains(widgets.model, "preview")
? {"type":"usd","usd":0.429}
: {"type":"usd","usd":0.286}
)
""",
),
)
@ -176,12 +188,13 @@ class QuiverImageToSVGNode(IO.ComfyNode):
"auto_crop",
default=False,
tooltip="Automatically crop to the dominant subject.",
advanced=True,
),
IO.DynamicCombo.Input(
"model",
options=[
IO.DynamicCombo.Option(
"arrow-preview",
m,
[
IO.Int.Input(
"target_size",
@ -189,39 +202,12 @@ class QuiverImageToSVGNode(IO.ComfyNode):
min=128,
max=4096,
tooltip="Square resize target in pixels.",
),
IO.Float.Input(
"temperature",
default=1.0,
min=0.0,
max=2.0,
step=0.1,
display_mode=IO.NumberDisplay.slider,
tooltip="Randomness control. Higher values increase randomness.",
advanced=True,
),
IO.Float.Input(
"top_p",
default=1.0,
min=0.05,
max=1.0,
step=0.05,
display_mode=IO.NumberDisplay.slider,
tooltip="Nucleus sampling parameter.",
advanced=True,
),
IO.Float.Input(
"presence_penalty",
default=0.0,
min=-2.0,
max=2.0,
step=0.1,
display_mode=IO.NumberDisplay.slider,
tooltip="Token presence penalty.",
advanced=True,
),
*_arrow_sampling_inputs(),
],
),
)
for m in _ARROW_MODELS
],
tooltip="Model to use for SVG vectorization.",
),
@ -245,7 +231,16 @@ class QuiverImageToSVGNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.429}""",
depends_on=IO.PriceBadgeDepends(widgets=["model"]),
expr="""
(
$contains(widgets.model, "max")
? {"type":"usd","usd":0.3575}
: $contains(widgets.model, "preview")
? {"type":"usd","usd":0.429}
: {"type":"usd","usd":0.286}
)
""",
),
)

View File

@ -401,7 +401,7 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.25}""",
expr="""{"type":"usd","usd":0.4}""",
),
)
@ -510,7 +510,7 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.25}""",
expr="""{"type":"usd","usd":0.6}""",
),
)
@ -593,7 +593,7 @@ class StabilityUpscaleFastNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.01}""",
expr="""{"type":"usd","usd":0.02}""",
),
)

View File

@ -24,8 +24,9 @@ from comfy_api_nodes.util import (
AVERAGE_DURATION_VIDEO_GEN = 32
MODELS_MAP = {
"veo-2.0-generate-001": "veo-2.0-generate-001",
"veo-3.1-generate": "veo-3.1-generate-preview",
"veo-3.1-fast-generate": "veo-3.1-fast-generate-preview",
"veo-3.1-generate": "veo-3.1-generate-001",
"veo-3.1-fast-generate": "veo-3.1-fast-generate-001",
"veo-3.1-lite": "veo-3.1-lite-generate-001",
"veo-3.0-generate-001": "veo-3.0-generate-001",
"veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001",
}
@ -247,17 +248,8 @@ class VeoVideoGenerationNode(IO.ComfyNode):
raise Exception("Video generation completed but no video was returned")
class Veo3VideoGenerationNode(VeoVideoGenerationNode):
"""
Generates videos from text prompts using Google's Veo 3 API.
Supported models:
- veo-3.0-generate-001
- veo-3.0-fast-generate-001
This node extends the base Veo node with Veo 3 specific features including
audio generation and fixed 8-second duration.
"""
class Veo3VideoGenerationNode(IO.ComfyNode):
"""Generates videos from text prompts using Google's Veo 3 API."""
@classmethod
def define_schema(cls):
@ -279,6 +271,13 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
default="16:9",
tooltip="Aspect ratio of the output video",
),
IO.Combo.Input(
"resolution",
options=["720p", "1080p", "4k"],
default="720p",
tooltip="Output video resolution. 4K is not available for veo-3.1-lite and veo-3.0 models.",
optional=True,
),
IO.String.Input(
"negative_prompt",
multiline=True,
@ -289,11 +288,11 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
IO.Int.Input(
"duration_seconds",
default=8,
min=8,
min=4,
max=8,
step=1,
step=2,
display_mode=IO.NumberDisplay.number,
tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)",
tooltip="Duration of the output video in seconds",
optional=True,
),
IO.Boolean.Input(
@ -332,10 +331,10 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
options=[
"veo-3.1-generate",
"veo-3.1-fast-generate",
"veo-3.1-lite",
"veo-3.0-generate-001",
"veo-3.0-fast-generate-001",
],
default="veo-3.0-generate-001",
tooltip="Veo 3 model to use for video generation",
optional=True,
),
@ -356,21 +355,111 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio"]),
depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "resolution", "duration_seconds"]),
expr="""
(
$m := widgets.model;
$r := widgets.resolution;
$a := widgets.generate_audio;
($contains($m,"veo-3.0-fast-generate-001") or $contains($m,"veo-3.1-fast-generate"))
? {"type":"usd","usd": ($a ? 1.2 : 0.8)}
: ($contains($m,"veo-3.0-generate-001") or $contains($m,"veo-3.1-generate"))
? {"type":"usd","usd": ($a ? 3.2 : 1.6)}
: {"type":"range_usd","min_usd":0.8,"max_usd":3.2}
$seconds := widgets.duration_seconds;
$pps :=
$contains($m, "lite")
? ($r = "1080p" ? ($a ? 0.08 : 0.05) : ($a ? 0.05 : 0.03))
: $contains($m, "3.1-fast")
? ($r = "4k" ? ($a ? 0.30 : 0.25) : $r = "1080p" ? ($a ? 0.12 : 0.10) : ($a ? 0.10 : 0.08))
: $contains($m, "3.1-generate")
? ($r = "4k" ? ($a ? 0.60 : 0.40) : ($a ? 0.40 : 0.20))
: $contains($m, "3.0-fast")
? ($a ? 0.15 : 0.10)
: ($a ? 0.40 : 0.20);
{"type":"usd","usd": $pps * $seconds}
)
""",
),
)
@classmethod
async def execute(
cls,
prompt,
aspect_ratio="16:9",
resolution="720p",
negative_prompt="",
duration_seconds=8,
enhance_prompt=True,
person_generation="ALLOW",
seed=0,
image=None,
model="veo-3.0-generate-001",
generate_audio=False,
):
if resolution == "4k" and ("lite" in model or "3.0" in model):
raise Exception("4K resolution is not supported by the veo-3.1-lite or veo-3.0 models.")
model = MODELS_MAP[model]
instances = [{"prompt": prompt}]
if image is not None:
image_base64 = tensor_to_base64_string(image)
if image_base64:
instances[0]["image"] = {"bytesBase64Encoded": image_base64, "mimeType": "image/png"}
parameters = {
"aspectRatio": aspect_ratio,
"personGeneration": person_generation,
"durationSeconds": duration_seconds,
"enhancePrompt": True,
"generateAudio": generate_audio,
}
if negative_prompt:
parameters["negativePrompt"] = negative_prompt
if seed > 0:
parameters["seed"] = seed
if "veo-3.1" in model:
parameters["resolution"] = resolution
initial_response = await sync_op(
cls,
ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"),
response_model=VeoGenVidResponse,
data=VeoGenVidRequest(
instances=instances,
parameters=parameters,
),
)
poll_response = await poll_op(
cls,
ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"),
response_model=VeoGenVidPollResponse,
status_extractor=lambda r: "completed" if r.done else "pending",
data=VeoGenVidPollRequest(operationName=initial_response.name),
poll_interval=9.0,
estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
)
if poll_response.error:
raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})")
response = poll_response.response
filtered_count = response.raiMediaFilteredCount
if filtered_count:
reasons = response.raiMediaFilteredReasons or []
reason_part = f": {reasons[0]}" if reasons else ""
raise Exception(
f"Content blocked by Google's Responsible AI filters{reason_part} "
f"({filtered_count} video{'s' if filtered_count != 1 else ''} filtered)."
)
if response.videos:
video = response.videos[0]
if video.bytesBase64Encoded:
return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
if video.gcsUri:
return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
raise Exception("Video returned but no data or URL was provided")
raise Exception("Video generation completed but no video was returned")
class Veo3FirstLastFrameNode(IO.ComfyNode):
@ -394,7 +483,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
default="",
tooltip="Negative text prompt to guide what to avoid in the video",
),
IO.Combo.Input("resolution", options=["720p", "1080p"]),
IO.Combo.Input("resolution", options=["720p", "1080p", "4k"]),
IO.Combo.Input(
"aspect_ratio",
options=["16:9", "9:16"],
@ -424,8 +513,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
IO.Image.Input("last_frame", tooltip="End frame"),
IO.Combo.Input(
"model",
options=["veo-3.1-generate", "veo-3.1-fast-generate"],
default="veo-3.1-fast-generate",
options=["veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.1-lite"],
),
IO.Boolean.Input(
"generate_audio",
@ -443,26 +531,20 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration"]),
depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration", "resolution"]),
expr="""
(
$prices := {
"veo-3.1-fast-generate": { "audio": 0.15, "no_audio": 0.10 },
"veo-3.1-generate": { "audio": 0.40, "no_audio": 0.20 }
};
$m := widgets.model;
$ga := (widgets.generate_audio = "true");
$r := widgets.resolution;
$ga := widgets.generate_audio;
$seconds := widgets.duration;
$modelKey :=
$contains($m, "veo-3.1-fast-generate") ? "veo-3.1-fast-generate" :
$contains($m, "veo-3.1-generate") ? "veo-3.1-generate" :
"";
$audioKey := $ga ? "audio" : "no_audio";
$modelPrices := $lookup($prices, $modelKey);
$pps := $lookup($modelPrices, $audioKey);
($pps != null)
? {"type":"usd","usd": $pps * $seconds}
: {"type":"range_usd","min_usd": 0.4, "max_usd": 3.2}
$pps :=
$contains($m, "lite")
? ($r = "1080p" ? ($ga ? 0.08 : 0.05) : ($ga ? 0.05 : 0.03))
: $contains($m, "fast")
? ($r = "4k" ? ($ga ? 0.30 : 0.25) : $r = "1080p" ? ($ga ? 0.12 : 0.10) : ($ga ? 0.10 : 0.08))
: ($r = "4k" ? ($ga ? 0.60 : 0.40) : ($ga ? 0.40 : 0.20));
{"type":"usd","usd": $pps * $seconds}
)
""",
),
@ -482,6 +564,9 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
model: str,
generate_audio: bool,
):
if "lite" in model and resolution == "4k":
raise Exception("4K resolution is not supported by the veo-3.1-lite model.")
model = MODELS_MAP[model]
initial_response = await sync_op(
cls,
@ -519,7 +604,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
data=VeoGenVidPollRequest(
operationName=initial_response.name,
),
poll_interval=5.0,
poll_interval=9.0,
estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
)

View File

@ -19,6 +19,7 @@ from .conversions import (
image_tensor_pair_to_batch,
pil_to_bytesio,
resize_mask_to_image,
resize_video_to_pixel_budget,
tensor_to_base64_string,
tensor_to_bytesio,
tensor_to_pil,
@ -90,6 +91,7 @@ __all__ = [
"image_tensor_pair_to_batch",
"pil_to_bytesio",
"resize_mask_to_image",
"resize_video_to_pixel_budget",
"tensor_to_base64_string",
"tensor_to_bytesio",
"tensor_to_pil",

View File

@ -129,22 +129,38 @@ def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO:
return img_byte_arr
def _compute_downscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None:
"""Return downscaled (w, h) with even dims fitting ``total_pixels``, or None if already fits.
Source aspect ratio is preserved; output may drift by a fraction of a percent because both dimensions
are rounded down to even values (many codecs require divisible-by-2).
"""
pixels = src_w * src_h
if pixels <= total_pixels:
return None
scale = math.sqrt(total_pixels / pixels)
new_w = max(2, int(src_w * scale))
new_h = max(2, int(src_h * scale))
new_w -= new_w % 2
new_h -= new_h % 2
return new_w, new_h
def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) -> torch.Tensor:
"""Downscale input image tensor to roughly the specified total pixels."""
"""Downscale input image tensor to roughly the specified total pixels.
Output dimensions are rounded down to even values so that the result is guaranteed to fit within ``total_pixels``
and is compatible with codecs that require even dimensions (e.g. yuv420p).
"""
samples = image.movedim(-1, 1)
total = int(total_pixels)
scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
if scale_by >= 1:
dims = _compute_downscale_dims(samples.shape[3], samples.shape[2], int(total_pixels))
if dims is None:
return image
width = round(samples.shape[3] * scale_by)
height = round(samples.shape[2] * scale_by)
s = common_upscale(samples, width, height, "lanczos", "disabled")
s = s.movedim(1, -1)
return s
new_w, new_h = dims
return common_upscale(samples, new_w, new_h, "lanczos", "disabled").movedim(1, -1)
def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
"""Downscale input image tensor so the largest dimension is at most max_side pixels."""
samples = image.movedim(-1, 1)
height, width = samples.shape[2], samples.shape[3]
@ -399,6 +415,72 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video:
raise RuntimeError(f"Failed to trim video: {str(e)}") from e
def resize_video_to_pixel_budget(video: Input.Video, total_pixels: int) -> Input.Video:
    """Downscale ``video`` so that width * height fits ``total_pixels``, keeping aspect ratio.

    The original video object is returned unchanged when it already fits the budget.
    Frame rate, duration, and audio are preserved; the aspect ratio may drift by a
    fraction of a percent due to even-dimension rounding.
    """
    width, height = video.get_dimensions()
    target_dims = _compute_downscale_dims(width, height, total_pixels)
    return video if target_dims is None else _apply_video_scale(video, target_dims)
def _apply_video_scale(video: Input.Video, scale_dims: tuple[int, int]) -> Input.Video:
    """Re-encode ``video`` scaled to ``scale_dims`` with a single decode/encode pass.

    Video is re-encoded as h264/yuv420p into an in-memory MP4; the first audio
    stream (if any) is re-encoded as AAC. Raises RuntimeError on any decode or
    encode failure, chaining the original exception.
    """
    out_w, out_h = scale_dims
    output_buffer = BytesIO()
    input_container = None
    output_container = None
    try:
        input_source = video.get_stream_source()
        input_container = av.open(input_source, mode="r")
        output_container = av.open(output_buffer, mode="w", format="mp4")

        video_stream = output_container.add_stream("h264", rate=video.get_frame_rate())
        video_stream.width = out_w
        video_stream.height = out_h
        # yuv420p requires even dimensions; scale_dims is expected to already be even.
        video_stream.pix_fmt = "yuv420p"

        # Mirror only the first audio stream's rate/layout into the output.
        audio_stream = None
        for stream in input_container.streams:
            if isinstance(stream, av.AudioStream):
                audio_stream = output_container.add_stream("aac", rate=stream.sample_rate)
                audio_stream.sample_rate = stream.sample_rate
                audio_stream.layout = stream.layout
                break

        # Pass 1: decode, scale, and re-encode every video frame.
        for frame in input_container.decode(video=0):
            frame = frame.reformat(width=out_w, height=out_h, format="yuv420p")
            for packet in video_stream.encode(frame):
                output_container.mux(packet)
        # Flush any frames buffered inside the encoder.
        for packet in video_stream.encode():
            output_container.mux(packet)

        # Pass 2: rewind and copy audio through the AAC encoder.
        if audio_stream is not None:
            input_container.seek(0)
            for audio_frame in input_container.decode(audio=0):
                for packet in audio_stream.encode(audio_frame):
                    output_container.mux(packet)
            for packet in audio_stream.encode():
                output_container.mux(packet)

        output_container.close()
        input_container.close()
        output_buffer.seek(0)
        return InputImpl.VideoFromFile(output_buffer)
    except Exception as e:
        # Best-effort cleanup before surfacing a single wrapped error.
        if input_container is not None:
            input_container.close()
        if output_container is not None:
            output_container.close()
        raise RuntimeError(f"Failed to resize video: {str(e)}") from e
def _f32_pcm(wav: torch.Tensor) -> torch.Tensor:
"""Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file."""
if wav.dtype.is_floating_point:

View File

@ -0,0 +1,258 @@
"""FILM: Frame Interpolation for Large Motion (ECCV 2022)."""
import torch
import torch.nn as nn
import torch.nn.functional as F
import comfy.ops
ops = comfy.ops.disable_weight_init
class FilmConv2d(nn.Module):
    """2-D convolution with FILM-style padding and an optional LeakyReLU.

    Odd kernel sizes use symmetric "same" padding inside the conv; even kernel
    sizes instead get a one-pixel right/bottom pad applied in ``forward``.
    """
    def __init__(self, in_channels, out_channels, size, activation=True, device=None, dtype=None, operations=ops):
        super().__init__()
        is_odd = bool(size % 2)
        self.even_pad = not is_odd
        pad = size // 2 if is_odd else 0
        self.conv = operations.Conv2d(in_channels, out_channels, kernel_size=size, padding=pad, device=device, dtype=dtype)
        self.activation = nn.LeakyReLU(0.2) if activation else None

    def forward(self, x):
        out = F.pad(x, (0, 1, 0, 1)) if self.even_pad else x
        out = self.conv(out)
        return out if self.activation is None else self.activation(out)
def _warp_core(image, flow, grid_x, grid_y):
dtype = image.dtype
H, W = flow.shape[2], flow.shape[3]
dx = flow[:, 0].float() / (W * 0.5)
dy = flow[:, 1].float() / (H * 0.5)
grid = torch.stack([grid_x[None, None, :] + dx, grid_y[None, :, None] + dy], dim=3)
return F.grid_sample(image.float(), grid, mode="bilinear", padding_mode="border", align_corners=False).to(dtype)
def build_image_pyramid(image, pyramid_levels):
    """Return ``pyramid_levels`` copies of ``image``, each 2x average-pooled, finest first."""
    levels = [image]
    current = image
    while len(levels) < pyramid_levels:
        current = F.avg_pool2d(current, 2, 2)
        levels.append(current)
    return levels
def flow_pyramid_synthesis(residual_pyramid):
    """Accumulate residual flows (finest-first input) into absolute flows, returned finest-first."""
    flow = residual_pyramid[-1]
    accumulated = [flow]
    # Walk the remaining residuals from coarse to fine: upsample the running
    # flow, double it (flow magnitudes scale with resolution), add the residual.
    for residual in reversed(residual_pyramid[:-1]):
        flow = F.interpolate(flow, size=residual.shape[2:4], mode="bilinear").mul_(2).add_(residual)
        accumulated.append(flow)
    return accumulated[::-1]
def multiply_pyramid(pyramid, scalar):
    """Scale every pyramid level by a per-batch-element ``scalar`` (shape [B])."""
    weights = scalar[:, None, None, None]
    scaled = []
    for level in pyramid:
        scaled.append(level * weights)
    return scaled
def pyramid_warp(feature_pyramid, flow_pyramid, warp_fn):
    """Apply ``warp_fn`` level-wise across matching feature/flow pyramids."""
    warped = []
    for feats, flow in zip(feature_pyramid, flow_pyramid):
        warped.append(warp_fn(feats, flow))
    return warped
def concatenate_pyramids(pyramid1, pyramid2):
    """Channel-concatenate two pyramids level by level."""
    return [torch.cat(pair, dim=1) for pair in zip(pyramid1, pyramid2)]
class SubTreeExtractor(nn.Module):
    """Cascade of conv pairs extracting features at progressively coarser scales.

    Layer ``i`` doubles the channel count relative to layer ``i-1``; between
    layers the spatial resolution is halved, for up to ``n - 1`` poolings in
    ``forward``. NOTE(review): layers past index ``n - 1`` still run (at
    constant resolution); callers appear to read only the first ``n`` entries.
    """
    def __init__(self, in_channels=3, channels=64, n_layers=4, device=None, dtype=None, operations=ops):
        super().__init__()
        blocks = []
        ch_in = in_channels
        for i in range(n_layers):
            ch_out = channels << i
            blocks.append(nn.Sequential(
                FilmConv2d(ch_in, ch_out, 3, device=device, dtype=dtype, operations=operations),
                FilmConv2d(ch_out, ch_out, 3, device=device, dtype=dtype, operations=operations)))
            ch_in = ch_out
        self.convs = nn.ModuleList(blocks)

    def forward(self, image, n):
        pyramid = []
        head = image
        for idx, block in enumerate(self.convs):
            head = block(head)
            pyramid.append(head)
            if idx < n - 1:
                head = F.avg_pool2d(head, 2, 2)
        return pyramid
class FeatureExtractor(nn.Module):
    """Builds the FILM cascaded feature pyramid from an image pyramid.

    For image level ``i``, the output concatenates features computed at the
    same spatial resolution from up to ``sub_levels`` finer image levels.
    """
    def __init__(self, in_channels=3, channels=64, sub_levels=4, device=None, dtype=None, operations=ops):
        super().__init__()
        self.extract_sublevels = SubTreeExtractor(in_channels, channels, sub_levels, device=device, dtype=dtype, operations=operations)
        self.sub_levels = sub_levels
    def forward(self, image_pyramid):
        # One sub-pyramid per image level; each is truncated near the coarse end
        # so no sub-pyramid extends past the image pyramid's depth.
        sub_pyramids = [self.extract_sublevels(image_pyramid[i], min(len(image_pyramid) - i, self.sub_levels))
                        for i in range(len(image_pyramid))]
        feature_pyramid = []
        for i in range(len(image_pyramid)):
            features = sub_pyramids[i][0]
            # sub_pyramids[i - j][j] has the same spatial size as level i:
            # it started j levels finer and was pooled j times.
            for j in range(1, self.sub_levels):
                if j <= i:
                    features = torch.cat([features, sub_pyramids[i - j][j]], dim=1)
            feature_pyramid.append(features)
            # Free sub-pyramids no longer needed by future levels
            if i >= self.sub_levels - 1:
                sub_pyramids[i - self.sub_levels + 1] = None
        return feature_pyramid
class FlowEstimator(nn.Module):
    """Small CNN predicting a 2-channel flow field from two concatenated feature maps."""
    def __init__(self, in_channels, num_convs, num_filters, device=None, dtype=None, operations=ops):
        super().__init__()
        shared = dict(device=device, dtype=dtype, operations=operations)
        layers = []
        ch = in_channels
        for _ in range(num_convs):
            layers.append(FilmConv2d(ch, num_filters, 3, **shared))
            ch = num_filters
        # Funnel down to 2 channels (dx, dy); the last layer is linear (no activation).
        layers.append(FilmConv2d(ch, num_filters // 2, 1, **shared))
        layers.append(FilmConv2d(num_filters // 2, 2, 1, activation=False, **shared))
        self._convs = nn.ModuleList(layers)

    def forward(self, features_a, features_b):
        net = torch.cat((features_a, features_b), dim=1)
        for layer in self._convs:
            net = layer(net)
        return net
class PyramidFlowEstimator(nn.Module):
    """Coarse-to-fine residual flow estimation over a feature pyramid.

    The deepest predictor is shared across all levels coarser than the
    specialized set; ``len(flow_convs)`` predictors are built, the last of
    which handles every extra-deep level.
    """
    def __init__(self, filters=64, flow_convs=(3, 3, 3, 3), flow_filters=(32, 64, 128, 256), device=None, dtype=None, operations=ops):
        super().__init__()
        # Input channels double per direction at each level (features of a and warped b).
        in_channels = filters << 1
        predictors = []
        for i in range(len(flow_convs)):
            predictors.append(FlowEstimator(in_channels, flow_convs[i], flow_filters[i], device=device, dtype=dtype, operations=operations))
            in_channels += filters << (i + 2)
        # Shared predictor for all deep (coarse) levels; the rest are specialized,
        # stored fine-to-coarse reversed so enumerate() walks coarse-to-fine.
        self._predictor = predictors[-1]
        self._predictors = nn.ModuleList(predictors[:-1][::-1])
    def forward(self, feature_pyramid_a, feature_pyramid_b, warp_fn):
        levels = len(feature_pyramid_a)
        v = self._predictor(feature_pyramid_a[-1], feature_pyramid_b[-1])
        residuals = [v]
        # Coarse-to-fine: shared predictor for deep levels, then specialized predictors for fine levels
        steps = [(i, self._predictor) for i in range(levels - 2, len(self._predictors) - 1, -1)]
        steps += [(len(self._predictors) - 1 - k, p) for k, p in enumerate(self._predictors)]
        for i, predictor in steps:
            # Upsample and double the running flow, warp b's features by it,
            # then predict the residual correction at this level.
            v = F.interpolate(v, size=feature_pyramid_a[i].shape[2:4], mode="bilinear").mul_(2)
            v_residual = predictor(feature_pyramid_a[i], warp_fn(feature_pyramid_b[i], v))
            residuals.append(v_residual)
            v = v.add_(v_residual)
        # Return finest-first, matching flow_pyramid_synthesis's expected input order.
        residuals.reverse()
        return residuals
def _get_fusion_channels(level, filters):
# Per direction: multi-scale features + RGB image (3ch) + flow (2ch), doubled for both directions
return (sum(filters << i for i in range(level)) + 3 + 2) * 2
class Fusion(nn.Module):
    """Fine-to-coarse U-Net-style decoder that fuses the aligned pyramid into an RGB frame.

    ``convs[k]`` handles pyramid level ``n_layers - 1 - k`` (coarsest first);
    each step upsamples the running state, concatenates the level's aligned
    features, and refines with two 3x3 convs.
    """
    def __init__(self, n_layers=4, specialized_layers=3, filters=64, device=None, dtype=None, operations=ops):
        super().__init__()
        self.output_conv = operations.Conv2d(filters, 3, kernel_size=1, device=device, dtype=dtype)
        self.convs = nn.ModuleList()
        in_channels = _get_fusion_channels(n_layers, filters)
        increase = 0
        # Iterate levels coarse-to-fine; levels >= specialized_layers share a filter width.
        for i in range(n_layers)[::-1]:
            num_filters = (filters << i) if i < specialized_layers else (filters << specialized_layers)
            self.convs.append(nn.ModuleList([
                # 2x2 conv applied after nearest-neighbor upsampling (no activation).
                FilmConv2d(in_channels, num_filters, 2, activation=False, device=device, dtype=dtype, operations=operations),
                # Input is cat(pyramid[i], upsampled state); `increase or num_filters`
                # accounts for the aligned-feature channels at this level.
                # NOTE(review): channel bookkeeping mirrors the original FILM fusion; confirm against checkpoint shapes.
                FilmConv2d(in_channels + (increase or num_filters), num_filters, 3, device=device, dtype=dtype, operations=operations),
                FilmConv2d(num_filters, num_filters, 3, device=device, dtype=dtype, operations=operations)]))
            in_channels = num_filters
            increase = _get_fusion_channels(i, filters) - num_filters // 2
    def forward(self, pyramid):
        # Start from the coarsest aligned level and decode upward.
        net = pyramid[-1]
        for k, layers in enumerate(self.convs):
            i = len(self.convs) - 1 - k
            net = layers[0](F.interpolate(net, size=pyramid[i].shape[2:4], mode="nearest"))
            net = layers[2](layers[1](torch.cat([pyramid[i], net], dim=1)))
        return self.output_conv(net)
class FILMNet(nn.Module):
    """FILM frame-interpolation network: extract features, estimate bidirectional
    flow, warp both frames toward the target time, and fuse into one frame.
    """
    def __init__(self, pyramid_levels=7, fusion_pyramid_levels=5, specialized_levels=3, sub_levels=4,
                 filters=64, flow_convs=(3, 3, 3, 3), flow_filters=(32, 64, 128, 256), device=None, dtype=None, operations=ops):
        super().__init__()
        self.pyramid_levels = pyramid_levels
        self.fusion_pyramid_levels = fusion_pyramid_levels
        self.extract = FeatureExtractor(3, filters, sub_levels, device=device, dtype=dtype, operations=operations)
        self.predict_flow = PyramidFlowEstimator(filters, flow_convs, flow_filters, device=device, dtype=dtype, operations=operations)
        self.fuse = Fusion(sub_levels, specialized_levels, filters, device=device, dtype=dtype, operations=operations)
        # (H, W) -> (grid_x, grid_y) base sampling grids, one entry per pyramid level.
        self._warp_grids = {}
    def get_dtype(self):
        # dtype of the first conv weight stands in for the whole model's dtype.
        return self.extract.extract_sublevels.convs[0][0].conv.weight.dtype
    def _build_warp_grids(self, H, W, device):
        """Pre-compute warp grids for all pyramid levels."""
        if (H, W) in self._warp_grids:
            return
        self._warp_grids = {} # clear old resolution grids to prevent memory leaks
        for _ in range(self.pyramid_levels):
            # align_corners=False identity grid: pixel centers at +-(1 - 1/N).
            self._warp_grids[(H, W)] = (
                torch.linspace(-(1 - 1 / W), 1 - 1 / W, W, dtype=torch.float32, device=device),
                torch.linspace(-(1 - 1 / H), 1 - 1 / H, H, dtype=torch.float32, device=device),
            )
            H, W = H // 2, W // 2
    def warp(self, image, flow):
        # Grids are keyed by the flow's spatial size, one per pyramid level.
        grid_x, grid_y = self._warp_grids[(flow.shape[2], flow.shape[3])]
        return _warp_core(image, flow, grid_x, grid_y)
    def extract_features(self, img):
        """Extract image and feature pyramids for a single frame. Can be cached across pairs."""
        image_pyramid = build_image_pyramid(img, self.pyramid_levels)
        feature_pyramid = self.extract(image_pyramid)
        return image_pyramid, feature_pyramid
    def forward(self, img0, img1, timestep=0.5, cache=None):
        # FILM uses a scalar timestep per batch element (spatially-varying timesteps not supported)
        t = timestep.mean(dim=(1, 2, 3)).item() if isinstance(timestep, torch.Tensor) else timestep
        return self.forward_multi_timestep(img0, img1, [t], cache=cache)
    def forward_multi_timestep(self, img0, img1, timesteps, cache=None):
        """Compute flow once, synthesize at multiple timesteps. Expects batch=1 inputs."""
        self._build_warp_grids(img0.shape[2], img0.shape[3], img0.device)
        # Feature extraction is the expensive step; a caller-provided cache lets
        # consecutive pairs in a sequence reuse one endpoint's pyramids.
        image_pyr0, feat_pyr0 = cache["img0"] if cache and "img0" in cache else self.extract_features(img0)
        image_pyr1, feat_pyr1 = cache["img1"] if cache and "img1" in cache else self.extract_features(img1)
        fwd_flow = flow_pyramid_synthesis(self.predict_flow(feat_pyr0, feat_pyr1, self.warp))[:self.fusion_pyramid_levels]
        bwd_flow = flow_pyramid_synthesis(self.predict_flow(feat_pyr1, feat_pyr0, self.warp))[:self.fusion_pyramid_levels]
        # Build warp targets and free full pyramids (only first fpl levels needed from here)
        fpl = self.fusion_pyramid_levels
        p2w = [concatenate_pyramids(image_pyr0[:fpl], feat_pyr0[:fpl]),
               concatenate_pyramids(image_pyr1[:fpl], feat_pyr1[:fpl])]
        del image_pyr0, image_pyr1, feat_pyr0, feat_pyr1
        results = []
        dt_tensors = torch.tensor(timesteps, device=img0.device, dtype=img0.dtype)
        for idx in range(len(timesteps)):
            batch_dt = dt_tensors[idx:idx + 1]
            # Flow toward time t: frame0 follows the backward flow scaled by t,
            # frame1 follows the forward flow scaled by (1 - t).
            bwd_scaled = multiply_pyramid(bwd_flow, batch_dt)
            fwd_scaled = multiply_pyramid(fwd_flow, 1 - batch_dt)
            fwd_warped = pyramid_warp(p2w[0], bwd_scaled, self.warp)
            bwd_warped = pyramid_warp(p2w[1], fwd_scaled, self.warp)
            # Fusion input per level: both warped image+feature stacks plus both scaled flows.
            aligned = [torch.cat([fw, bw, bf, ff], dim=1)
                       for fw, bw, bf, ff in zip(fwd_warped, bwd_warped, bwd_scaled, fwd_scaled)]
            del fwd_warped, bwd_warped, bwd_scaled, fwd_scaled
            results.append(self.fuse(aligned))
            del aligned
        return torch.cat(results, dim=0)

View File

@ -0,0 +1,128 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import comfy.ops
ops = comfy.ops.disable_weight_init
def _warp(img, flow, warp_grids):
B, _, H, W = img.shape
base_grid, flow_div = warp_grids[(H, W)]
flow_norm = torch.cat([flow[:, 0:1] / flow_div[0], flow[:, 1:2] / flow_div[1]], 1).float()
grid = (base_grid.expand(B, -1, -1, -1) + flow_norm).permute(0, 2, 3, 1)
return F.grid_sample(img.float(), grid, mode="bilinear", padding_mode="border", align_corners=True).to(img.dtype)
class Head(nn.Module):
    """Shallow feature head: stride-2 downsample, two 3x3 convs, transposed-conv upsample."""
    def __init__(self, out_ch=4, device=None, dtype=None, operations=ops):
        super().__init__()
        self.cnn0 = operations.Conv2d(3, 16, 3, 2, 1, device=device, dtype=dtype)
        self.cnn1 = operations.Conv2d(16, 16, 3, 1, 1, device=device, dtype=dtype)
        self.cnn2 = operations.Conv2d(16, 16, 3, 1, 1, device=device, dtype=dtype)
        # Restores the input resolution while projecting to out_ch channels.
        self.cnn3 = operations.ConvTranspose2d(16, out_ch, 4, 2, 1, device=device, dtype=dtype)
        self.relu = nn.LeakyReLU(0.2, True)

    def forward(self, x):
        feat = x
        for conv in (self.cnn0, self.cnn1, self.cnn2):
            feat = self.relu(conv(feat))
        return self.cnn3(feat)
class ResConv(nn.Module):
    """3x3 conv residual unit with a learned per-channel residual scale."""

    def __init__(self, c, device=None, dtype=None, operations=ops):
        super().__init__()
        self.conv = operations.Conv2d(c, c, 3, 1, 1, device=device, dtype=dtype)
        self.beta = nn.Parameter(torch.ones((1, c, 1, 1), device=device, dtype=dtype))
        self.relu = nn.LeakyReLU(0.2, True)

    def forward(self, x):
        # x + beta * conv(x), fused with addcmul, then leaky-relu.
        residual = self.conv(x)
        return self.relu(torch.addcmul(x, residual, self.beta))
class IFBlock(nn.Module):
    """One refinement stage of IFNet.

    Downsamples its input by the given scale, runs a conv encoder plus a
    residual stack, and predicts (residual flow, mask, feature map) upsampled
    back to the input resolution.
    """

    def __init__(self, in_planes, c=64, device=None, dtype=None, operations=ops):
        super().__init__()

        def down(cin, cout):
            # stride-2 conv + activation; wrapped in Sequential to keep the
            # original parameter names (conv0.N.0.weight).
            conv = operations.Conv2d(cin, cout, 3, 2, 1, device=device, dtype=dtype)
            return nn.Sequential(conv, nn.LeakyReLU(0.2, True))

        self.conv0 = nn.Sequential(down(in_planes, c // 2), down(c // 2, c))
        residual_stack = [ResConv(c, device=device, dtype=dtype, operations=operations) for _ in range(8)]
        self.convblock = nn.Sequential(*residual_stack)
        upsample = operations.ConvTranspose2d(c, 4 * 13, 4, 2, 1, device=device, dtype=dtype)
        self.lastconv = nn.Sequential(upsample, nn.PixelShuffle(2))

    def forward(self, x, flow=None, scale=1):
        inv = 1.0 / scale
        x = F.interpolate(x, scale_factor=inv, mode="bilinear")
        if flow is not None:
            # Flow magnitudes must shrink with the spatial downscale.
            flow = F.interpolate(flow, scale_factor=inv, mode="bilinear").div_(scale)
            x = torch.cat((x, flow), 1)
        feat = self.convblock(self.conv0(x))
        tmp = F.interpolate(self.lastconv(feat), scale_factor=scale, mode="bilinear")
        # Channel layout of tmp: 4 flow, 1 mask, 8 feature channels.
        flow_delta = tmp[:, :4] * scale
        mask = tmp[:, 4:5]
        features = tmp[:, 5:]
        return flow_delta, mask, features
class IFNet(nn.Module):
    """RIFE-style optical-flow frame interpolation network.

    Five IFBlocks refine a bidirectional flow field and a blend mask
    coarse-to-fine; the output frame is a sigmoid-mask-weighted blend of the
    two inputs warped by that flow. Inputs are expected to be padded to a
    multiple of ``pad_align``.
    """
    def __init__(self, head_ch=4, channels=(192, 128, 96, 64, 32), device=None, dtype=None, operations=ops):
        super().__init__()
        self.encode = Head(out_ch=head_ch, device=device, dtype=dtype, operations=operations)
        # Block 0 sees img0+img1+timestep (7) plus both head feature maps;
        # later blocks also receive warped imgs/mask (8), flow (4) and the
        # previous block's 8 feature channels.
        block_in = [7 + 2 * head_ch] + [8 + 4 + 8 + 2 * head_ch] * 4
        self.blocks = nn.ModuleList([IFBlock(block_in[i], channels[i], device=device, dtype=dtype, operations=operations) for i in range(5)])
        self.scale_list = [16, 8, 4, 2, 1]  # coarse-to-fine downscale per block
        self.pad_align = 64  # required spatial alignment of inputs
        self._warp_grids = {}  # (H, W) -> (base sampling grid, flow normalizer)
    def get_dtype(self):
        # Parameter dtype, read off the first encoder conv.
        return self.encode.cnn0.weight.dtype
    def _build_warp_grids(self, H, W, device):
        """Cache the normalized base grid and flow divisors for (H, W)."""
        if (H, W) in self._warp_grids:
            return
        self._warp_grids = {}  # clear old resolution grids to prevent memory leaks
        grid_y, grid_x = torch.meshgrid(
            torch.linspace(-1.0, 1.0, H, device=device, dtype=torch.float32),
            torch.linspace(-1.0, 1.0, W, device=device, dtype=torch.float32), indexing="ij")
        self._warp_grids[(H, W)] = (
            torch.stack((grid_x, grid_y), dim=0).unsqueeze(0),
            # pixel-to-normalized-coordinate divisors for x and y flow
            torch.tensor([(W - 1.0) / 2.0, (H - 1.0) / 2.0], dtype=torch.float32, device=device))
    def warp(self, img, flow):
        # Backward-warp img by flow using the grid cached for its resolution.
        return _warp(img, flow, self._warp_grids)
    def extract_features(self, img):
        """Extract head features for a single frame. Can be cached across pairs."""
        return self.encode(img)
    def forward(self, img0, img1, timestep=0.5, cache=None):
        """Synthesize the frame at ``timestep`` (in [0, 1]) between img0 and img1.

        ``cache`` may provide precomputed head features under "img0"/"img1";
        cached single-frame features are expanded to the batch size.
        """
        if not isinstance(timestep, torch.Tensor):
            # Broadcast a scalar timestep into a (B, 1, H, W) map.
            timestep = torch.full((img0.shape[0], 1, img0.shape[2], img0.shape[3]), timestep, device=img0.device, dtype=img0.dtype)
        self._build_warp_grids(img0.shape[2], img0.shape[3], img0.device)
        B = img0.shape[0]
        f0 = cache["img0"].expand(B, -1, -1, -1) if cache and "img0" in cache else self.encode(img0)
        f1 = cache["img1"].expand(B, -1, -1, -1) if cache and "img1" in cache else self.encode(img1)
        flow = mask = feat = None
        warped_img0, warped_img1 = img0, img1
        for i, block in enumerate(self.blocks):
            if flow is None:
                # First block estimates the initial flow from raw inputs.
                flow, mask, feat = block(torch.cat((img0, img1, f0, f1, timestep), 1), None, scale=self.scale_list[i])
            else:
                # Later blocks predict a residual flow from the warped inputs
                # and warped head features; flow is refined in place.
                fd, mask, feat = block(
                    torch.cat((warped_img0, warped_img1, self.warp(f0, flow[:, :2]), self.warp(f1, flow[:, 2:4]), timestep, mask, feat), 1),
                    flow, scale=self.scale_list[i])
                flow = flow.add_(fd)
            warped_img0 = self.warp(img0, flow[:, :2])
            warped_img1 = self.warp(img1, flow[:, 2:4])
        return torch.lerp(warped_img1, warped_img0, torch.sigmoid(mask))
def detect_rife_config(state_dict):
    """Infer ``(head_ch, channels)`` for IFNet from a RIFE state dict.

    The head channel count is taken from the encoder's final ConvTranspose2d
    weight, whose layout is (in_ch, out_ch, kH, kW); each block's width comes
    from its first downsampling conv.

    Raises:
        KeyError: if the encoder key is missing.
        ValueError: if fewer than 5 IFBlocks are present.
    """
    head_ch = state_dict["encode.cnn3.weight"].shape[1]
    channels = [
        state_dict[key].shape[0]
        for key in (f"blocks.{i}.conv0.1.0.weight" for i in range(5))
        if key in state_dict
    ]
    if len(channels) != 5:
        raise ValueError(f"Unsupported RIFE model: expected 5 blocks, found {len(channels)}")
    return head_ch, channels

View File

@ -3,136 +3,136 @@ from typing_extensions import override
import comfy.model_management
import node_helpers
from comfy_api.latest import ComfyExtension, io
from comfy_api.latest import ComfyExtension, IO
class TextEncodeAceStepAudio(io.ComfyNode):
class TextEncodeAceStepAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
return IO.Schema(
node_id="TextEncodeAceStepAudio",
category="conditioning",
inputs=[
io.Clip.Input("clip"),
io.String.Input("tags", multiline=True, dynamic_prompts=True),
io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
io.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
IO.Clip.Input("clip"),
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
IO.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
],
outputs=[io.Conditioning.Output()],
outputs=[IO.Conditioning.Output()],
)
@classmethod
def execute(cls, clip, tags, lyrics, lyrics_strength) -> io.NodeOutput:
def execute(cls, clip, tags, lyrics, lyrics_strength) -> IO.NodeOutput:
tokens = clip.tokenize(tags, lyrics=lyrics)
conditioning = clip.encode_from_tokens_scheduled(tokens)
conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength})
return io.NodeOutput(conditioning)
return IO.NodeOutput(conditioning)
class TextEncodeAceStepAudio15(io.ComfyNode):
class TextEncodeAceStepAudio15(IO.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
return IO.Schema(
node_id="TextEncodeAceStepAudio1.5",
category="conditioning",
inputs=[
io.Clip.Input("clip"),
io.String.Input("tags", multiline=True, dynamic_prompts=True),
io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
io.Int.Input("bpm", default=120, min=10, max=300),
io.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
io.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
IO.Clip.Input("clip"),
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
IO.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
IO.Int.Input("bpm", default=120, min=10, max=300),
IO.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
IO.Combo.Input("timesignature", options=['2', '3', '4', '6']),
IO.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
IO.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
IO.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
IO.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
IO.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
IO.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
IO.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
IO.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
],
outputs=[io.Conditioning.Output()],
outputs=[IO.Conditioning.Output()],
)
@classmethod
def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> io.NodeOutput:
def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> IO.NodeOutput:
tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p)
conditioning = clip.encode_from_tokens_scheduled(tokens)
return io.NodeOutput(conditioning)
return IO.NodeOutput(conditioning)
class EmptyAceStepLatentAudio(io.ComfyNode):
class EmptyAceStepLatentAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
return IO.Schema(
node_id="EmptyAceStepLatentAudio",
display_name="Empty Ace Step 1.0 Latent Audio",
category="latent/audio",
inputs=[
io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
io.Int.Input(
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
IO.Int.Input(
"batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
),
],
outputs=[io.Latent.Output()],
outputs=[IO.Latent.Output()],
)
@classmethod
def execute(cls, seconds, batch_size) -> io.NodeOutput:
def execute(cls, seconds, batch_size) -> IO.NodeOutput:
length = int(seconds * 44100 / 512 / 8)
latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
return io.NodeOutput({"samples": latent, "type": "audio"})
return IO.NodeOutput({"samples": latent, "type": "audio"})
class EmptyAceStep15LatentAudio(io.ComfyNode):
class EmptyAceStep15LatentAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
return IO.Schema(
node_id="EmptyAceStep1.5LatentAudio",
display_name="Empty Ace Step 1.5 Latent Audio",
category="latent/audio",
inputs=[
io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
io.Int.Input(
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
IO.Int.Input(
"batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
),
],
outputs=[io.Latent.Output()],
outputs=[IO.Latent.Output()],
)
@classmethod
def execute(cls, seconds, batch_size) -> io.NodeOutput:
def execute(cls, seconds, batch_size) -> IO.NodeOutput:
length = round((seconds * 48000 / 1920))
latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
return io.NodeOutput({"samples": latent, "type": "audio"})
return IO.NodeOutput({"samples": latent, "type": "audio"})
class ReferenceAudio(io.ComfyNode):
class ReferenceAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
return IO.Schema(
node_id="ReferenceTimbreAudio",
display_name="Reference Audio",
category="advanced/conditioning/audio",
is_experimental=True,
description="This node sets the reference audio for ace step 1.5",
inputs=[
io.Conditioning.Input("conditioning"),
io.Latent.Input("latent", optional=True),
IO.Conditioning.Input("conditioning"),
IO.Latent.Input("latent", optional=True),
],
outputs=[
io.Conditioning.Output(),
IO.Conditioning.Output(),
]
)
@classmethod
def execute(cls, conditioning, latent=None) -> io.NodeOutput:
def execute(cls, conditioning, latent=None) -> IO.NodeOutput:
if latent is not None:
conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_audio_timbre_latents": [latent["samples"]]}, append=True)
return io.NodeOutput(conditioning)
return IO.NodeOutput(conditioning)
class AceExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
return [
TextEncodeAceStepAudio,
EmptyAceStepLatentAudio,

View File

@ -104,7 +104,7 @@ def vae_decode_audio(vae, samples, tile=None, overlap=None):
std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
std[std < 1.0] = 1.0
audio /= std
vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
vae_sample_rate = getattr(vae, "audio_sample_rate_output", getattr(vae, "audio_sample_rate", 44100))
return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]}

View File

@ -0,0 +1,211 @@
import torch
from tqdm import tqdm
from typing_extensions import override
import comfy.model_patcher
import comfy.utils
import folder_paths
from comfy import model_management
from comfy_extras.frame_interpolation_models.ifnet import IFNet, detect_rife_config
from comfy_extras.frame_interpolation_models.film_net import FILMNet
from comfy_api.latest import ComfyExtension, io
FrameInterpolationModel = io.Custom("INTERP_MODEL")
class FrameInterpolationModelLoader(io.ComfyNode):
    """Load a RIFE or FILM frame-interpolation checkpoint from the
    'frame_interpolation' models folder, auto-detecting the architecture."""
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="FrameInterpolationModelLoader",
            display_name="Load Frame Interpolation Model",
            category="loaders",
            inputs=[
                io.Combo.Input("model_name", options=folder_paths.get_filename_list("frame_interpolation"),
                               tooltip="Select a frame interpolation model to load. Models must be placed in the 'frame_interpolation' folder."),
            ],
            outputs=[
                FrameInterpolationModel.Output(),
            ],
        )

    @classmethod
    def execute(cls, model_name) -> io.NodeOutput:
        """Load weights, detect the architecture, and wrap the model in a
        ModelPatcher so model management can move it between devices."""
        model_path = folder_paths.get_full_path_or_raise("frame_interpolation", model_name)
        sd = comfy.utils.load_torch_file(model_path, safe_load=True)
        model = cls._detect_and_load(sd)
        # fp16 where the device supports it, otherwise fp32.
        dtype = torch.float16 if model_management.should_use_fp16(model_management.get_torch_device()) else torch.float32
        model.eval().to(dtype)
        patcher = comfy.model_patcher.ModelPatcher(
            model,
            load_device=model_management.get_torch_device(),
            offload_device=model_management.unet_offload_device(),
        )
        return io.NodeOutput(patcher)

    @classmethod
    def _detect_and_load(cls, sd):
        """Return a loaded FILMNet or IFNet from state dict ``sd``.

        Raises:
            ValueError: if the state dict matches neither architecture.
        """
        # Try FILM
        if "extract.extract_sublevels.convs.0.0.conv.weight" in sd:
            model = FILMNet()
            model.load_state_dict(sd)
            return model
        # Try RIFE (needs key remapping for raw checkpoints)
        sd = comfy.utils.state_dict_prefix_replace(sd, {"module.": "", "flownet.": ""})
        # Raw RIFE checkpoints name blocks "block0."–"block4."; remap to "blocks.N.".
        key_map = {}
        for k in sd:
            for i in range(5):
                if k.startswith(f"block{i}."):
                    key_map[k] = f"blocks.{i}.{k[len(f'block{i}.'):]}"
        if key_map:
            sd = {key_map.get(k, k): v for k, v in sd.items()}
        # Drop training-only submodules not needed for inference.
        sd = {k: v for k, v in sd.items() if not k.startswith(("teacher.", "caltime."))}
        try:
            head_ch, channels = detect_rife_config(sd)
        except (KeyError, ValueError):
            raise ValueError("Unrecognized frame interpolation model format")
        model = IFNet(head_ch=head_ch, channels=channels)
        model.load_state_dict(sd)
        return model
class FrameInterpolate(io.ComfyNode):
    """Insert ``multiplier - 1`` interpolated frames between each pair of
    input frames using a loaded interpolation model, with feature caching
    and OOM-aware batch-size fallback."""
    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="FrameInterpolate",
            display_name="Frame Interpolate",
            category="image/video",
            search_aliases=["rife", "film", "frame interpolation", "slow motion", "interpolate frames", "vfi"],
            inputs=[
                FrameInterpolationModel.Input("interp_model"),
                io.Image.Input("images"),
                io.Int.Input("multiplier", default=2, min=2, max=16),
            ],
            outputs=[
                io.Image.Output(),
            ],
        )

    @classmethod
    def execute(cls, interp_model, images, multiplier) -> io.NodeOutput:
        """Interpolate ``images`` (BHWC, values in [0, 1]) to ``multiplier``x
        the frame count; returns the original images if nothing to do."""
        offload_device = model_management.intermediate_device()
        num_frames = images.shape[0]
        if num_frames < 2 or multiplier < 2:
            return io.NodeOutput(images)
        model_management.load_model_gpu(interp_model)
        device = interp_model.load_device
        dtype = interp_model.model_dtype()
        inference_model = interp_model.model
        # Free VRAM for inference activations (model weights + ~20x a single frame's worth)
        H, W = images.shape[1], images.shape[2]
        activation_mem = H * W * 3 * images.element_size() * 20
        model_management.free_memory(activation_mem, device)
        align = getattr(inference_model, "pad_align", 1)
        # Prepare a single padded frame on device for determining output dimensions
        def prepare_frame(idx):
            # BHWC -> BCHW, cast to model dtype/device, pad to model alignment.
            frame = images[idx:idx + 1].movedim(-1, 1).to(dtype=dtype, device=device)
            if align > 1:
                from comfy.ldm.common_dit import pad_to_patch_size
                frame = pad_to_patch_size(frame, (align, align), padding_mode="reflect")
            return frame
        # Count total interpolation passes for progress bar
        total_pairs = num_frames - 1
        num_interp = multiplier - 1
        total_steps = total_pairs * num_interp
        pbar = comfy.utils.ProgressBar(total_steps)
        tqdm_bar = tqdm(total=total_steps, desc="Frame interpolation")
        batch = num_interp  # reduced on OOM and persists across pairs (same resolution = same limit)
        t_values = [t / multiplier for t in range(1, multiplier)]
        out_dtype = model_management.intermediate_dtype()
        # 1 leading frame + (num_interp interpolated + 1 original) per pair.
        total_out_frames = total_pairs * multiplier + 1
        result = torch.empty((total_out_frames, 3, H, W), dtype=out_dtype, device=offload_device)
        result[0] = images[0].movedim(-1, 0).to(out_dtype)
        out_idx = 1
        # Pre-compute timestep tensor on device (padded dimensions needed)
        sample = prepare_frame(0)
        pH, pW = sample.shape[2], sample.shape[3]
        ts_full = torch.tensor(t_values, device=device, dtype=dtype).reshape(num_interp, 1, 1, 1)
        ts_full = ts_full.expand(-1, 1, pH, pW)
        del sample
        multi_fn = getattr(inference_model, "forward_multi_timestep", None)
        feat_cache = {}
        prev_frame = None
        try:
            for i in range(total_pairs):
                img0_single = prev_frame if prev_frame is not None else prepare_frame(i)
                img1_single = prepare_frame(i + 1)
                prev_frame = img1_single
                # Cache features: img1 of pair N becomes img0 of pair N+1
                feat_cache["img0"] = feat_cache.pop("next") if "next" in feat_cache else inference_model.extract_features(img0_single)
                feat_cache["img1"] = inference_model.extract_features(img1_single)
                feat_cache["next"] = feat_cache["img1"]
                used_multi = False
                if multi_fn is not None:
                    # Models with timestep-independent flow can compute it once for all timesteps
                    try:
                        mids = multi_fn(img0_single, img1_single, t_values, cache=feat_cache)
                        # Crop padding back off before storing.
                        result[out_idx:out_idx + num_interp] = mids[:, :, :H, :W].to(out_dtype)
                        out_idx += num_interp
                        pbar.update(num_interp)
                        tqdm_bar.update(num_interp)
                        used_multi = True
                    except model_management.OOM_EXCEPTION:
                        model_management.soft_empty_cache()
                        multi_fn = None  # fall through to single-timestep path
                if not used_multi:
                    j = 0
                    while j < num_interp:
                        b = min(batch, num_interp - j)
                        try:
                            img0 = img0_single.expand(b, -1, -1, -1)
                            img1 = img1_single.expand(b, -1, -1, -1)
                            mids = inference_model(img0, img1, timestep=ts_full[j:j + b], cache=feat_cache)
                            result[out_idx:out_idx + b] = mids[:, :, :H, :W].to(out_dtype)
                            out_idx += b
                            pbar.update(b)
                            tqdm_bar.update(b)
                            j += b
                        except model_management.OOM_EXCEPTION:
                            # Halve the batch and retry; give up only at batch == 1.
                            if batch <= 1:
                                raise
                            batch = max(1, batch // 2)
                            model_management.soft_empty_cache()
                result[out_idx] = images[i + 1].movedim(-1, 0).to(out_dtype)
                out_idx += 1
        finally:
            tqdm_bar.close()
        # BCHW -> BHWC
        result = result.movedim(1, -1).clamp_(0.0, 1.0)
        return io.NodeOutput(result)
class FrameInterpolationExtension(ComfyExtension):
    """Registers the frame interpolation nodes with ComfyUI."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        nodes = [
            FrameInterpolationModelLoader,
            FrameInterpolate,
        ]
        return nodes
async def comfy_entrypoint() -> FrameInterpolationExtension:
    """Entry point ComfyUI calls to instantiate this extension."""
    extension = FrameInterpolationExtension()
    return extension

View File

@ -3,9 +3,8 @@ import comfy.utils
import comfy.model_management
import torch
from comfy.ldm.lightricks.vae.audio_vae import AudioVAE
from comfy_api.latest import ComfyExtension, io
from comfy_extras.nodes_audio import VAEEncodeAudio
class LTXVAudioVAELoader(io.ComfyNode):
@classmethod
@ -28,10 +27,14 @@ class LTXVAudioVAELoader(io.ComfyNode):
def execute(cls, ckpt_name: str) -> io.NodeOutput:
ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
return io.NodeOutput(AudioVAE(sd, metadata))
sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder.", "vocoder.": "vocoder."}, filter_keys=True)
vae = comfy.sd.VAE(sd=sd, metadata=metadata)
vae.throw_exception_if_invalid()
return io.NodeOutput(vae)
class LTXVAudioVAEEncode(io.ComfyNode):
class LTXVAudioVAEEncode(VAEEncodeAudio):
@classmethod
def define_schema(cls) -> io.Schema:
return io.Schema(
@ -50,15 +53,8 @@ class LTXVAudioVAEEncode(io.ComfyNode):
)
@classmethod
def execute(cls, audio, audio_vae: AudioVAE) -> io.NodeOutput:
audio_latents = audio_vae.encode(audio)
return io.NodeOutput(
{
"samples": audio_latents,
"sample_rate": int(audio_vae.sample_rate),
"type": "audio",
}
)
def execute(cls, audio, audio_vae) -> io.NodeOutput:
return super().execute(audio_vae, audio)
class LTXVAudioVAEDecode(io.ComfyNode):
@ -80,12 +76,12 @@ class LTXVAudioVAEDecode(io.ComfyNode):
)
@classmethod
def execute(cls, samples, audio_vae: AudioVAE) -> io.NodeOutput:
def execute(cls, samples, audio_vae) -> io.NodeOutput:
audio_latent = samples["samples"]
if audio_latent.is_nested:
audio_latent = audio_latent.unbind()[-1]
audio = audio_vae.decode(audio_latent).to(audio_latent.device)
output_audio_sample_rate = audio_vae.output_sample_rate
audio = audio_vae.decode(audio_latent).movedim(-1, 1).to(audio_latent.device)
output_audio_sample_rate = audio_vae.first_stage_model.output_sample_rate
return io.NodeOutput(
{
"waveform": audio,
@ -143,17 +139,17 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
frames_number: int,
frame_rate: int,
batch_size: int,
audio_vae: AudioVAE,
audio_vae,
) -> io.NodeOutput:
"""Generate empty audio latents matching the reference pipeline structure."""
assert audio_vae is not None, "Audio VAE model is required"
z_channels = audio_vae.latent_channels
audio_freq = audio_vae.latent_frequency_bins
sampling_rate = int(audio_vae.sample_rate)
audio_freq = audio_vae.first_stage_model.latent_frequency_bins
sampling_rate = int(audio_vae.first_stage_model.sample_rate)
num_audio_latents = audio_vae.num_of_latents_from_frames(frames_number, frame_rate)
num_audio_latents = audio_vae.first_stage_model.num_of_latents_from_frames(frames_number, frame_rate)
audio_latents = torch.zeros(
(batch_size, z_channels, num_audio_latents, audio_freq),

View File

@ -7,7 +7,10 @@ import comfy.model_management
import comfy.ldm.common_dit
import comfy.latent_formats
import comfy.ldm.lumina.controlnet
import comfy.ldm.supir.supir_modules
from comfy.ldm.wan.model_multitalk import WanMultiTalkAttentionBlock, MultiTalkAudioProjModel
from comfy_api.latest import io
from comfy.ldm.supir.supir_patch import SUPIRPatch
class BlockWiseControlBlock(torch.nn.Module):
@ -266,6 +269,27 @@ class ModelPatchLoader:
out_dim=sd["audio_proj.norm.weight"].shape[0],
device=comfy.model_management.unet_offload_device(),
operations=comfy.ops.manual_cast)
elif 'model.control_model.input_hint_block.0.weight' in sd or 'control_model.input_hint_block.0.weight' in sd:
prefix_replace = {}
if 'model.control_model.input_hint_block.0.weight' in sd:
prefix_replace["model.control_model."] = "control_model."
prefix_replace["model.diffusion_model.project_modules."] = "project_modules."
else:
prefix_replace["control_model."] = "control_model."
prefix_replace["project_modules."] = "project_modules."
# Extract denoise_encoder weights before filter_keys discards them
de_prefix = "first_stage_model.denoise_encoder."
denoise_encoder_sd = {}
for k in list(sd.keys()):
if k.startswith(de_prefix):
denoise_encoder_sd[k[len(de_prefix):]] = sd.pop(k)
sd = comfy.utils.state_dict_prefix_replace(sd, prefix_replace, filter_keys=True)
sd.pop("control_model.mask_LQ", None)
model = comfy.ldm.supir.supir_modules.SUPIR(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
if denoise_encoder_sd:
model.denoise_encoder_sd = denoise_encoder_sd
model_patcher = comfy.model_patcher.CoreModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device())
model.load_state_dict(sd, assign=model_patcher.is_dynamic())
@ -565,9 +589,89 @@ class MultiTalkModelPatch(torch.nn.Module):
)
class SUPIRApply(io.ComfyNode):
    """Attach a SUPIR restoration control patch to a diffusion model.

    The input image is VAE-encoded (preferring the SUPIR checkpoint's own
    denoise-encoder weights when present) into a hint latent that drives the
    control patch. Optionally a post-CFG "restore cfg" term pulls the
    denoised output toward the re-encoded input latent at high sigma.
    """
    @classmethod
    def define_schema(cls) -> io.Schema:
        return io.Schema(
            node_id="SUPIRApply",
            category="model_patches/supir",
            is_experimental=True,
            inputs=[
                io.Model.Input("model"),
                io.ModelPatch.Input("model_patch"),
                io.Vae.Input("vae"),
                io.Image.Input("image"),
                io.Float.Input("strength_start", default=1.0, min=0.0, max=10.0, step=0.01,
                               tooltip="Control strength at the start of sampling (high sigma)."),
                io.Float.Input("strength_end", default=1.0, min=0.0, max=10.0, step=0.01,
                               tooltip="Control strength at the end of sampling (low sigma). Linearly interpolated from start."),
                io.Float.Input("restore_cfg", default=4.0, min=0.0, max=20.0, step=0.1, advanced=True,
                               tooltip="Pulls denoised output toward the input latent. Higher = stronger fidelity to input. 0 to disable."),
                io.Float.Input("restore_cfg_s_tmin", default=0.05, min=0.0, max=1.0, step=0.01, advanced=True,
                               tooltip="Sigma threshold below which restore_cfg is disabled."),
            ],
            outputs=[io.Model.Output()],
        )

    @classmethod
    def _encode_with_denoise_encoder(cls, vae, model_patch, image):
        """Encode using denoise_encoder weights from SUPIR checkpoint if available."""
        denoise_sd = getattr(model_patch.model, 'denoise_encoder_sd', None)
        if not denoise_sd:
            return vae.encode(image)
        # Clone VAE patcher, apply denoise_encoder weights to clone, encode
        orig_patcher = vae.patcher
        vae.patcher = orig_patcher.clone()
        # strength_model=0.0 means the patch weights fully replace the encoder's.
        patches = {f"encoder.{k}": (v,) for k, v in denoise_sd.items()}
        vae.patcher.add_patches(patches, strength_patch=1.0, strength_model=0.0)
        try:
            return vae.encode(image)
        finally:
            # Always restore the original patcher, even if encode raises.
            vae.patcher = orig_patcher

    @classmethod
    def execute(cls, *, model: io.Model.Type, model_patch: io.ModelPatch.Type, vae: io.Vae.Type, image: io.Image.Type,
                strength_start: float, strength_end: float, restore_cfg: float, restore_cfg_s_tmin: float) -> io.NodeOutput:
        """Return a clone of ``model`` with the SUPIR control patch registered."""
        model_patched = model.clone()
        # Encode the RGB channels of the image into the model's latent space as the control hint.
        hint_latent = model.get_model_object("latent_format").process_in(
            cls._encode_with_denoise_encoder(vae, model_patch, image[:, :, :, :3]))
        patch = SUPIRPatch(model_patch, model_patch.model.project_modules, hint_latent, strength_start, strength_end)
        patch.register(model_patched)
        if restore_cfg > 0.0:
            # Round-trip to match original pipeline: decode hint, re-encode with regular VAE
            latent_format = model.get_model_object("latent_format")
            decoded = vae.decode(latent_format.process_out(hint_latent))
            x_center = latent_format.process_in(vae.encode(decoded[:, :, :, :3]))
            sigma_max = 14.6146  # NOTE(review): presumably the sampling schedule's max sigma — confirm
            def restore_cfg_function(args):
                denoised = args["denoised"]
                sigma = args["sigma"]
                if sigma.dim() > 0:
                    s = sigma[0].item()
                else:
                    s = sigma.item()
                if s > restore_cfg_s_tmin:
                    ref = x_center.to(device=denoised.device, dtype=denoised.dtype)
                    b = denoised.shape[0]
                    if ref.shape[0] != b:
                        # Broadcast or tile the reference latent to the current batch size.
                        ref = ref.expand(b, -1, -1, -1) if ref.shape[0] == 1 else ref.repeat((b + ref.shape[0] - 1) // ref.shape[0], 1, 1, 1)[:b]
                    sigma_val = sigma.view(-1, 1, 1, 1) if sigma.dim() > 0 else sigma
                    # Pull denoised toward the reference; effect grows with sigma.
                    d_center = denoised - ref
                    denoised = denoised - d_center * ((sigma_val / sigma_max) ** restore_cfg)
                return denoised
            model_patched.set_model_sampler_post_cfg_function(restore_cfg_function)
        return io.NodeOutput(model_patched)
NODE_CLASS_MAPPINGS = {
"ModelPatchLoader": ModelPatchLoader,
"QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet,
"ZImageFunControlnet": ZImageFunControlnet,
"USOStyleReference": USOStyleReference,
"SUPIRApply": SUPIRApply,
}

View File

@ -6,6 +6,7 @@ from PIL import Image
import math
from enum import Enum
from typing import TypedDict, Literal
import kornia
import comfy.utils
import comfy.model_management
@ -660,6 +661,228 @@ class BatchImagesMasksLatentsNode(io.ComfyNode):
return io.NodeOutput(batched)
class ColorTransfer(io.ComfyNode):
    @classmethod
    def define_schema(cls):
        # Node schema: target image(s), optional reference, matching algorithm,
        # how source statistics are pooled across frames, and a blend strength.
        return io.Schema(
            node_id="ColorTransfer",
            category="image/postprocessing",
            description="Match the colors of one image to another using various algorithms.",
            search_aliases=["color match", "color grading", "color correction", "match colors", "color transform", "mkl", "reinhard", "histogram"],
            inputs=[
                io.Image.Input("image_target", tooltip="Image(s) to apply the color transform to."),
                io.Image.Input("image_ref", optional=True, tooltip="Reference image(s) to match colors to. If not provided, processing is skipped"),
                io.Combo.Input("method", options=['reinhard_lab', 'mkl_lab', 'histogram'],),
                io.DynamicCombo.Input("source_stats",
                                      tooltip="per_frame: each frame matched to image_ref individually. uniform: pool stats across all source frames as baseline, match to image_ref. target_frame: use one chosen frame as the baseline for the transform to image_ref, applied uniformly to all frames (preserves relative differences)",
                                      options=[
                                          io.DynamicCombo.Option("per_frame", []),
                                          io.DynamicCombo.Option("uniform", []),
                                          io.DynamicCombo.Option("target_frame", [
                                              io.Int.Input("target_index", default=0, min=0, max=10000,
                                                           tooltip="Frame index used as the source baseline for computing the transform to image_ref"),
                                          ]),
                                      ]),
                io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
            ],
            outputs=[
                io.Image.Output(display_name="image"),
            ],
        )
@staticmethod
def _to_lab(images, i, device):
return kornia.color.rgb_to_lab(
images[i:i+1].to(device, dtype=torch.float32).permute(0, 3, 1, 2))
    @staticmethod
    def _pool_stats(images, device, is_reinhard, eps):
        """Two-pass pooled mean + std/cov across all frames.

        Processes one frame at a time through _to_lab to bound peak memory.
        Returns (mean (C,1), std (C,1)) when is_reinhard, else (mean, cov (C,C)).
        """
        N, C = images.shape[0], images.shape[3]
        HW = images.shape[1] * images.shape[2]
        # Pass 1: pooled per-channel mean over all frames.
        mean = torch.zeros(C, 1, device=device, dtype=torch.float32)
        for i in range(N):
            mean += ColorTransfer._to_lab(images, i, device).view(C, -1).mean(dim=-1, keepdim=True)
        mean /= N
        # Pass 2: second moments about the pooled mean.
        acc = torch.zeros(C, 1 if is_reinhard else C, device=device, dtype=torch.float32)
        for i in range(N):
            centered = ColorTransfer._to_lab(images, i, device).view(C, -1) - mean
            if is_reinhard:
                acc += (centered * centered).mean(dim=-1, keepdim=True)
            else:
                acc += centered @ centered.T / HW
        if is_reinhard:
            # Average frame variances, take sqrt for pooled std, floor at eps.
            return mean, torch.sqrt(acc / N).clamp_min_(eps)
        return mean, acc / N
@staticmethod
def _frame_stats(lab_flat, hw, is_reinhard, eps):
"""Per-frame mean + std/cov."""
mean = lab_flat.mean(dim=-1, keepdim=True)
if is_reinhard:
return mean, lab_flat.std(dim=-1, keepdim=True, unbiased=False).clamp_min_(eps)
centered = lab_flat - mean
return mean, centered @ centered.T / hw
    @staticmethod
    def _mkl_matrix(cov_s, cov_r, eps):
        """Compute MKL 3x3 transform matrix from source and ref covariances.

        Monge-Kantorovich linear mapping computed via eigendecompositions of
        the symmetric covariance matrices; eigenvalues are clamped to be
        non-negative (and their square roots floored at eps) for stability.
        """
        eig_val_s, eig_vec_s = torch.linalg.eigh(cov_s)
        sqrt_val_s = torch.sqrt(eig_val_s.clamp_min(0)).clamp_min_(eps)
        scaled_V = eig_vec_s * sqrt_val_s.unsqueeze(0)  # V_s · diag(sqrt(λ_s))
        # cov_r conjugated by the source "square root" basis.
        mid = scaled_V.T @ cov_r @ scaled_V
        eig_val_m, eig_vec_m = torch.linalg.eigh(mid)
        sqrt_m = torch.sqrt(eig_val_m.clamp_min(0))
        inv_sqrt_s = 1.0 / sqrt_val_s
        inv_scaled_V = eig_vec_s * inv_sqrt_s.unsqueeze(0)
        # Matrix square root of mid, reassembled from its eigendecomposition.
        M_half = (eig_vec_m * sqrt_m.unsqueeze(0)) @ eig_vec_m.T
        return inv_scaled_V @ M_half @ inv_scaled_V.T
@staticmethod
def _histogram_lut(src, ref, bins=256):
"""Build per-channel LUT from source and ref histograms. src/ref: (C, HW) in [0,1]."""
s_bins = (src * (bins - 1)).long().clamp(0, bins - 1)
r_bins = (ref * (bins - 1)).long().clamp(0, bins - 1)
s_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
r_hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
ones_s = torch.ones_like(src)
ones_r = torch.ones_like(ref)
s_hist.scatter_add_(1, s_bins, ones_s)
r_hist.scatter_add_(1, r_bins, ones_r)
s_cdf = s_hist.cumsum(1)
s_cdf = s_cdf / s_cdf[:, -1:]
r_cdf = r_hist.cumsum(1)
r_cdf = r_cdf / r_cdf[:, -1:]
return torch.searchsorted(r_cdf, s_cdf).clamp_max_(bins - 1).float() / (bins - 1)
@classmethod
def _pooled_cdf(cls, images, device, num_bins=256):
"""Build pooled CDF across all frames, one frame at a time."""
C = images.shape[3]
hist = torch.zeros(C, num_bins, device=device, dtype=torch.float32)
for i in range(images.shape[0]):
frame = images[i].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
bins = (frame * (num_bins - 1)).long().clamp(0, num_bins - 1)
hist.scatter_add_(1, bins, torch.ones_like(frame))
cdf = hist.cumsum(1)
return cdf / cdf[:, -1:]
@classmethod
def _build_histogram_transform(cls, image_target, image_ref, device, stats_mode, target_index, B):
"""Build per-frame or uniform LUT transform for histogram mode."""
if stats_mode == 'per_frame':
return None # LUT computed per-frame in the apply loop
r_cdf = cls._pooled_cdf(image_ref, device)
if stats_mode == 'target_frame':
ti = min(target_index, B - 1)
s_cdf = cls._pooled_cdf(image_target[ti:ti+1], device)
else:
s_cdf = cls._pooled_cdf(image_target, device)
return torch.searchsorted(r_cdf, s_cdf).clamp_max_(255).float() / 255.0
@classmethod
def _build_lab_transform(cls, image_target, image_ref, device, stats_mode, target_index, is_reinhard):
"""Build transform parameters for Lab-based methods. Returns a transform function."""
eps = 1e-6
B, H, W, C = image_target.shape
B_ref = image_ref.shape[0]
single_ref = B_ref == 1
HW = H * W
HW_ref = image_ref.shape[1] * image_ref.shape[2]
# Precompute ref stats
if single_ref or stats_mode in ('uniform', 'target_frame'):
ref_mean, ref_sc = cls._pool_stats(image_ref, device, is_reinhard, eps)
# Uniform/target_frame: precompute single affine transform
if stats_mode in ('uniform', 'target_frame'):
if stats_mode == 'target_frame':
ti = min(target_index, B - 1)
s_lab = cls._to_lab(image_target, ti, device).view(C, -1)
s_mean, s_sc = cls._frame_stats(s_lab, HW, is_reinhard, eps)
else:
s_mean, s_sc = cls._pool_stats(image_target, device, is_reinhard, eps)
if is_reinhard:
scale = ref_sc / s_sc
offset = ref_mean - scale * s_mean
return lambda src_flat, **_: src_flat * scale + offset
T = cls._mkl_matrix(s_sc, ref_sc, eps)
offset = ref_mean - T @ s_mean
return lambda src_flat, **_: T @ src_flat + offset
# per_frame
def per_frame_transform(src_flat, frame_idx):
s_mean, s_sc = cls._frame_stats(src_flat, HW, is_reinhard, eps)
if single_ref:
r_mean, r_sc = ref_mean, ref_sc
else:
ri = min(frame_idx, B_ref - 1)
r_mean, r_sc = cls._frame_stats(cls._to_lab(image_ref, ri, device).view(C, -1), HW_ref, is_reinhard, eps)
centered = src_flat - s_mean
if is_reinhard:
return centered * (r_sc / s_sc) + r_mean
T = cls._mkl_matrix(centered @ centered.T / HW, r_sc, eps)
return T @ centered + r_mean
return per_frame_transform
    @classmethod
    def execute(cls, image_target, image_ref, method, source_stats, strength=1.0) -> io.NodeOutput:
        """Colour-match every frame of image_target to image_ref.

        method selects 'histogram' matching or a Lab-space statistical
        transfer (Reinhard when method == "reinhard_lab", MKL otherwise).
        source_stats carries 'source_stats' (the stats mode) and an optional
        'target_index'; strength lerps between original and matched frames.
        """
        stats_mode = source_stats["source_stats"]
        target_index = source_stats.get("target_index", 0)
        # Fast path: nothing to transfer, return the input unchanged.
        if strength == 0 or image_ref is None:
            return io.NodeOutput(image_target)
        device = comfy.model_management.get_torch_device()
        intermediate_device = comfy.model_management.intermediate_device()
        intermediate_dtype = comfy.model_management.intermediate_dtype()
        B, H, W, C = image_target.shape
        B_ref = image_ref.shape[0]
        pbar = comfy.utils.ProgressBar(B)
        # Output lives on the intermediate device so only one frame at a time
        # occupies the compute device.
        out = torch.empty(B, H, W, C, device=intermediate_device, dtype=intermediate_dtype)
        if method == 'histogram':
            # None means per-frame LUTs are built inside the loop below.
            uniform_lut = cls._build_histogram_transform(
                image_target, image_ref, device, stats_mode, target_index, B)
            for i in range(B):
                src = image_target[i].to(device, dtype=torch.float32).permute(2, 0, 1)
                src_flat = src.reshape(C, -1)
                if uniform_lut is not None:
                    lut = uniform_lut
                else:
                    ri = min(i, B_ref - 1)  # reuse the last ref frame when the ref batch is shorter
                    ref = image_ref[ri].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
                    lut = cls._histogram_lut(src_flat, ref)
                # Map every pixel through its channel's 256-entry LUT.
                bin_idx = (src_flat * 255).long().clamp(0, 255)
                matched = lut.gather(1, bin_idx).view(C, H, W)
                result = matched if strength == 1.0 else torch.lerp(src, matched, strength)
                out[i] = result.permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
                pbar.update(1)
        else:
            # Lab-space statistical transfer (Reinhard or MKL).
            transform = cls._build_lab_transform(image_target, image_ref, device, stats_mode, target_index, is_reinhard=method == "reinhard_lab")
            for i in range(B):
                src_frame = cls._to_lab(image_target, i, device)
                corrected = transform(src_frame.view(C, -1), frame_idx=i)
                if strength == 1.0:
                    result = kornia.color.lab_to_rgb(corrected.view(1, C, H, W))
                else:
                    # Blend in Lab space before converting back to RGB.
                    result = kornia.color.lab_to_rgb(torch.lerp(src_frame, corrected.view(1, C, H, W), strength))
                out[i] = result.squeeze(0).permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
                pbar.update(1)
        return io.NodeOutput(out)
class PostProcessingExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
@ -673,6 +896,7 @@ class PostProcessingExtension(ComfyExtension):
BatchImagesNode,
BatchMasksNode,
BatchLatentsNode,
ColorTransfer,
# BatchImagesMasksLatentsNode,
]

View File

@ -1,4 +1,5 @@
import re
import json
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
@ -375,6 +376,39 @@ class RegexReplace(io.ComfyNode):
return io.NodeOutput(result)
class JsonExtractString(io.ComfyNode):
    """Node that pulls one top-level value out of a JSON object as a string."""

    @classmethod
    def define_schema(cls):
        """Describe the node: a JSON document plus a key, yielding one string."""
        return io.Schema(
            node_id="JsonExtractString",
            display_name="Extract String from JSON",
            category="utils/string",
            search_aliases=["json", "extract json", "parse json", "json value", "read json"],
            inputs=[
                io.String.Input("json_string", multiline=True),
                io.String.Input("key", multiline=False),
            ],
            outputs=[
                io.String.Output(),
            ]
        )

    @classmethod
    def execute(cls, json_string, key):
        """Return str(data[key]) for a JSON object; "" for misses, nulls, or bad JSON."""
        try:
            parsed = json.loads(json_string)
            found = isinstance(parsed, dict) and key in parsed
            extracted = parsed[key] if found else None
        except (json.JSONDecodeError, TypeError):
            return io.NodeOutput("")
        if extracted is None:
            return io.NodeOutput("")
        return io.NodeOutput(str(extracted))
class StringExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
@ -390,6 +424,7 @@ class StringExtension(ComfyExtension):
RegexMatch,
RegexExtract,
RegexReplace,
JsonExtractString,
]
async def comfy_entrypoint() -> StringExtension:

View File

@ -35,6 +35,7 @@ class TextGenerate(io.ComfyNode):
io.Int.Input("max_length", default=256, min=1, max=2048),
io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."),
io.Boolean.Input("use_default_template", optional=True, default=True, tooltip="Use the built in system prompt/template if the model has one.", advanced=True),
],
outputs=[
io.String.Output(display_name="generated_text"),
@ -42,9 +43,9 @@ class TextGenerate(io.ComfyNode):
)
@classmethod
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True) -> io.NodeOutput:
tokens = clip.tokenize(prompt, image=image, skip_template=False, min_length=1, thinking=thinking)
tokens = clip.tokenize(prompt, image=image, skip_template=not use_default_template, min_length=1, thinking=thinking)
# Get sampling parameters from dynamic combo
do_sample = sampling_mode.get("sampling_mode") == "on"
@ -160,12 +161,12 @@ class TextGenerateLTX2Prompt(TextGenerate):
)
@classmethod
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True) -> io.NodeOutput:
if image is None:
formatted_prompt = f"<start_of_turn>system\n{LTX2_T2V_SYSTEM_PROMPT.strip()}<end_of_turn>\n<start_of_turn>user\nUser Raw Input Prompt: {prompt}.<end_of_turn>\n<start_of_turn>model\n"
else:
formatted_prompt = f"<start_of_turn>system\n{LTX2_I2V_SYSTEM_PROMPT.strip()}<end_of_turn>\n<start_of_turn>user\n\n<image_soft_token>\n\nUser Raw Input Prompt: {prompt}.<end_of_turn>\n<start_of_turn>model\n"
return super().execute(clip, formatted_prompt, max_length, sampling_mode, image, thinking)
return super().execute(clip, formatted_prompt, max_length, sampling_mode, image, thinking, use_default_template)
class TextgenExtension(ComfyExtension):

View File

@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
__version__ = "0.19.0"
__version__ = "0.19.3"

View File

@ -52,6 +52,8 @@ folder_names_and_paths["model_patches"] = ([os.path.join(models_dir, "model_patc
folder_names_and_paths["audio_encoders"] = ([os.path.join(models_dir, "audio_encoders")], supported_pt_extensions)
folder_names_and_paths["frame_interpolation"] = ([os.path.join(models_dir, "frame_interpolation")], supported_pt_extensions)
output_directory = os.path.join(base_path, "output")
temp_directory = os.path.join(base_path, "temp")
input_directory = os.path.join(base_path, "input")

View File

@ -9,6 +9,8 @@ import folder_paths
import time
from comfy.cli_args import args, enables_dynamic_vram
from app.logger import setup_logger
setup_logger(log_level=args.verbose, use_stdout=args.log_stdout)
from app.assets.seeder import asset_seeder
from app.assets.services import register_output_files
import itertools
@ -27,8 +29,6 @@ if __name__ == "__main__":
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['DO_NOT_TRACK'] = '1'
setup_logger(log_level=args.verbose, use_stdout=args.log_stdout)
faulthandler.enable(file=sys.stderr, all_threads=False)
import comfy_aimdo.control

View File

@ -2457,7 +2457,8 @@ async def init_builtin_extra_nodes():
"nodes_number_convert.py",
"nodes_painter.py",
"nodes_curve.py",
"nodes_rtdetr.py"
"nodes_rtdetr.py",
"nodes_frame_interpolation.py",
]
import_failed = []

View File

@ -1,6 +1,6 @@
[project]
name = "ComfyUI"
version = "0.19.0"
version = "0.19.3"
readme = "README.md"
license = { file = "LICENSE" }
requires-python = ">=3.10"

View File

@ -1,5 +1,5 @@
comfyui-frontend-package==1.42.11
comfyui-workflow-templates==0.9.50
comfyui-frontend-package==1.42.14
comfyui-workflow-templates==0.9.59
comfyui-embedded-docs==0.4.3
torch
torchsde
@ -19,7 +19,7 @@ scipy
tqdm
psutil
alembic
SQLAlchemy
SQLAlchemy>=2.0
filelock
av>=14.2.0
comfy-kitchen>=0.2.8