mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-05-30 02:47:24 +08:00
Compare commits
66 Commits
72ab6eb8cf
...
f9d53e5d75
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f9d53e5d75 | ||
|
|
96f1cee9f5 | ||
|
|
97f58baaaf | ||
|
|
e8e8fee224 | ||
|
|
e9c311b245 | ||
|
|
e6e0936128 | ||
|
|
b633244635 | ||
|
|
38ecad8f8a | ||
|
|
a7d82baa06 | ||
|
|
d10fc2d652 | ||
|
|
a164c82913 | ||
|
|
5eeae3f1d8 | ||
|
|
0e25a6936e | ||
|
|
6d4f9e86ab | ||
|
|
1548aee40e | ||
|
|
9c210473fc | ||
|
|
c1e9164c63 | ||
|
|
5e9a90186f | ||
|
|
ece906328a | ||
|
|
500ca8e02a | ||
|
|
3143b7981f | ||
|
|
c9b3f81e83 | ||
|
|
5e74e9b3ed | ||
|
|
c702cddf75 | ||
|
|
e13da8104c | ||
|
|
fdcc38b9ea | ||
|
|
c1ce00287c | ||
|
|
6e3bd33665 | ||
|
|
ce05e377a8 | ||
|
|
1a00f7743f | ||
|
|
a6472b1514 | ||
|
|
6158cd5820 | ||
|
|
bff714dda0 | ||
|
|
fce22da313 | ||
|
|
9f9d37bd9a | ||
|
|
088778c35d | ||
|
|
4c5f82971e | ||
|
|
f1d91a4c8c | ||
|
|
dbed5a1b52 | ||
|
|
24fdbb9aca | ||
|
|
a6624a9afd | ||
|
|
0b512198e8 | ||
|
|
9feb26928c | ||
|
|
fadd79ad48 | ||
|
|
77bc7bdd6b | ||
|
|
117afbc1d7 | ||
|
|
064eec2278 | ||
|
|
aceaa5e579 | ||
|
|
763089f681 | ||
|
|
1693dabc8f | ||
|
|
08063d2638 | ||
|
|
e069617e54 | ||
|
|
2bea0ee5d7 | ||
|
|
17863f603a | ||
|
|
31ba844624 | ||
|
|
1451001f64 | ||
|
|
1af99b2e81 | ||
|
|
3568b82b76 | ||
|
|
6728d4d439 | ||
|
|
4b431ffc27 | ||
|
|
880b51ac4f | ||
|
|
4d9516b909 | ||
|
|
39086890e2 | ||
|
|
2adde5a0e1 | ||
|
|
0c1bfad0df | ||
|
|
7d76a4447e |
@ -1,2 +1,2 @@
|
||||
# Admins
|
||||
* @comfyanonymous @kosinkadink @guill
|
||||
* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128
|
||||
|
||||
@ -224,6 +224,7 @@ class Flux2(LatentFormat):
|
||||
|
||||
self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
|
||||
self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
|
||||
self.taesd_decoder_name = "taef2_decoder"
|
||||
|
||||
def process_in(self, latent):
|
||||
return latent
|
||||
@ -783,3 +784,10 @@ class ZImagePixelSpace(ChromaRadiance):
|
||||
No VAE encoding/decoding — the model operates directly on RGB pixels.
|
||||
"""
|
||||
pass
|
||||
|
||||
class CogVideoX(LatentFormat):
|
||||
latent_channels = 16
|
||||
latent_dimensions = 3
|
||||
|
||||
def __init__(self):
|
||||
self.scale_factor = 1.15258426
|
||||
|
||||
0
comfy/ldm/cogvideo/__init__.py
Normal file
0
comfy/ldm/cogvideo/__init__.py
Normal file
573
comfy/ldm/cogvideo/model.py
Normal file
573
comfy/ldm/cogvideo/model.py
Normal file
@ -0,0 +1,573 @@
|
||||
# CogVideoX 3D Transformer - ported to ComfyUI native ops
|
||||
# Architecture reference: diffusers CogVideoXTransformer3DModel
|
||||
# Style reference: comfy/ldm/wan/model.py
|
||||
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.patcher_extension
|
||||
import comfy.ldm.common_dit
|
||||
|
||||
|
||||
def _get_1d_rotary_pos_embed(dim, pos, theta=10000.0):
|
||||
"""Returns (cos, sin) each with shape [seq_len, dim].
|
||||
|
||||
Frequencies are computed at dim//2 resolution then repeat_interleaved
|
||||
to full dim, matching CogVideoX's interleaved (real, imag) pair format.
|
||||
"""
|
||||
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim))
|
||||
angles = torch.outer(pos.float(), freqs.float())
|
||||
cos = angles.cos().repeat_interleave(2, dim=-1).float()
|
||||
sin = angles.sin().repeat_interleave(2, dim=-1).float()
|
||||
return (cos, sin)
|
||||
|
||||
|
||||
def apply_rotary_emb(x, freqs_cos_sin):
|
||||
"""Apply CogVideoX rotary embedding to query or key tensor.
|
||||
|
||||
x: [B, heads, seq_len, head_dim]
|
||||
freqs_cos_sin: (cos, sin) each [seq_len, head_dim//2]
|
||||
|
||||
Uses interleaved pair rotation (same as diffusers CogVideoX/Flux).
|
||||
head_dim is reshaped to (-1, 2) pairs, rotated, then flattened back.
|
||||
"""
|
||||
cos, sin = freqs_cos_sin
|
||||
cos = cos[None, None, :, :].to(x.device)
|
||||
sin = sin[None, None, :, :].to(x.device)
|
||||
|
||||
# Interleaved pairs: [B, H, S, D] -> [B, H, S, D//2, 2] -> (real, imag)
|
||||
x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
|
||||
x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
|
||||
|
||||
return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
|
||||
|
||||
|
||||
def get_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1, max_period=10000):
|
||||
half = dim // 2
|
||||
freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device) / half)
|
||||
args = timesteps[:, None].float() * freqs[None] * scale
|
||||
embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
|
||||
if flip_sin_to_cos:
|
||||
embedding = torch.cat([embedding[:, half:], embedding[:, :half]], dim=-1)
|
||||
if dim % 2:
|
||||
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
||||
return embedding
|
||||
|
||||
|
||||
def get_3d_sincos_pos_embed(embed_dim, spatial_size, temporal_size, spatial_interpolation_scale=1.0, temporal_interpolation_scale=1.0, device=None):
|
||||
if isinstance(spatial_size, int):
|
||||
spatial_size = (spatial_size, spatial_size)
|
||||
|
||||
grid_w = torch.arange(spatial_size[0], dtype=torch.float32, device=device) / spatial_interpolation_scale
|
||||
grid_h = torch.arange(spatial_size[1], dtype=torch.float32, device=device) / spatial_interpolation_scale
|
||||
grid_t = torch.arange(temporal_size, dtype=torch.float32, device=device) / temporal_interpolation_scale
|
||||
|
||||
grid_t, grid_h, grid_w = torch.meshgrid(grid_t, grid_h, grid_w, indexing="ij")
|
||||
|
||||
embed_dim_spatial = 2 * (embed_dim // 3)
|
||||
embed_dim_temporal = embed_dim // 3
|
||||
|
||||
pos_embed_spatial = _get_2d_sincos_pos_embed(embed_dim_spatial, grid_h, grid_w, device=device)
|
||||
pos_embed_temporal = _get_1d_sincos_pos_embed(embed_dim_temporal, grid_t[:, 0, 0], device=device)
|
||||
|
||||
T, H, W = grid_t.shape
|
||||
pos_embed_temporal = pos_embed_temporal.unsqueeze(1).unsqueeze(1).expand(-1, H, W, -1)
|
||||
pos_embed = torch.cat([pos_embed_temporal, pos_embed_spatial], dim=-1)
|
||||
|
||||
return pos_embed
|
||||
|
||||
|
||||
def _get_2d_sincos_pos_embed(embed_dim, grid_h, grid_w, device=None):
|
||||
T, H, W = grid_h.shape
|
||||
half_dim = embed_dim // 2
|
||||
pos_h = _get_1d_sincos_pos_embed(half_dim, grid_h.reshape(-1), device=device).reshape(T, H, W, half_dim)
|
||||
pos_w = _get_1d_sincos_pos_embed(half_dim, grid_w.reshape(-1), device=device).reshape(T, H, W, half_dim)
|
||||
return torch.cat([pos_h, pos_w], dim=-1)
|
||||
|
||||
|
||||
def _get_1d_sincos_pos_embed(embed_dim, pos, device=None):
|
||||
half = embed_dim // 2
|
||||
freqs = torch.exp(-math.log(10000.0) * torch.arange(start=0, end=half, dtype=torch.float32, device=device) / half)
|
||||
args = pos.float().reshape(-1)[:, None] * freqs[None]
|
||||
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
||||
if embed_dim % 2:
|
||||
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
||||
return embedding
|
||||
|
||||
|
||||
|
||||
class CogVideoXPatchEmbed(nn.Module):
|
||||
def __init__(self, patch_size=2, patch_size_t=None, in_channels=16, dim=1920,
|
||||
text_dim=4096, bias=True, sample_width=90, sample_height=60,
|
||||
sample_frames=49, temporal_compression_ratio=4,
|
||||
max_text_seq_length=226, spatial_interpolation_scale=1.875,
|
||||
temporal_interpolation_scale=1.0, use_positional_embeddings=True,
|
||||
use_learned_positional_embeddings=True,
|
||||
device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.patch_size = patch_size
|
||||
self.patch_size_t = patch_size_t
|
||||
self.dim = dim
|
||||
self.sample_height = sample_height
|
||||
self.sample_width = sample_width
|
||||
self.sample_frames = sample_frames
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
self.max_text_seq_length = max_text_seq_length
|
||||
self.spatial_interpolation_scale = spatial_interpolation_scale
|
||||
self.temporal_interpolation_scale = temporal_interpolation_scale
|
||||
self.use_positional_embeddings = use_positional_embeddings
|
||||
self.use_learned_positional_embeddings = use_learned_positional_embeddings
|
||||
|
||||
if patch_size_t is None:
|
||||
self.proj = operations.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size, bias=bias, device=device, dtype=dtype)
|
||||
else:
|
||||
self.proj = operations.Linear(in_channels * patch_size * patch_size * patch_size_t, dim, device=device, dtype=dtype)
|
||||
|
||||
self.text_proj = operations.Linear(text_dim, dim, device=device, dtype=dtype)
|
||||
|
||||
if use_positional_embeddings or use_learned_positional_embeddings:
|
||||
persistent = use_learned_positional_embeddings
|
||||
pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
|
||||
self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
|
||||
|
||||
def _get_positional_embeddings(self, sample_height, sample_width, sample_frames, device=None):
|
||||
post_patch_height = sample_height // self.patch_size
|
||||
post_patch_width = sample_width // self.patch_size
|
||||
post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
|
||||
if self.patch_size_t is not None:
|
||||
post_time_compression_frames = post_time_compression_frames // self.patch_size_t
|
||||
num_patches = post_patch_height * post_patch_width * post_time_compression_frames
|
||||
|
||||
pos_embedding = get_3d_sincos_pos_embed(
|
||||
self.dim,
|
||||
(post_patch_width, post_patch_height),
|
||||
post_time_compression_frames,
|
||||
self.spatial_interpolation_scale,
|
||||
self.temporal_interpolation_scale,
|
||||
device=device,
|
||||
)
|
||||
pos_embedding = pos_embedding.reshape(-1, self.dim)
|
||||
joint_pos_embedding = pos_embedding.new_zeros(
|
||||
1, self.max_text_seq_length + num_patches, self.dim, requires_grad=False
|
||||
)
|
||||
joint_pos_embedding.data[:, self.max_text_seq_length:].copy_(pos_embedding)
|
||||
return joint_pos_embedding
|
||||
|
||||
def forward(self, text_embeds, image_embeds):
|
||||
input_dtype = text_embeds.dtype
|
||||
text_embeds = self.text_proj(text_embeds.to(self.text_proj.weight.dtype)).to(input_dtype)
|
||||
batch_size, num_frames, channels, height, width = image_embeds.shape
|
||||
|
||||
proj_dtype = self.proj.weight.dtype
|
||||
if self.patch_size_t is None:
|
||||
image_embeds = image_embeds.reshape(-1, channels, height, width)
|
||||
image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
|
||||
image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
|
||||
image_embeds = image_embeds.flatten(3).transpose(2, 3)
|
||||
image_embeds = image_embeds.flatten(1, 2)
|
||||
else:
|
||||
p = self.patch_size
|
||||
p_t = self.patch_size_t
|
||||
image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
|
||||
image_embeds = image_embeds.reshape(
|
||||
batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
|
||||
)
|
||||
image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
|
||||
image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
|
||||
|
||||
embeds = torch.cat([text_embeds, image_embeds], dim=1).contiguous()
|
||||
|
||||
if self.use_positional_embeddings or self.use_learned_positional_embeddings:
|
||||
text_seq_length = text_embeds.shape[1]
|
||||
num_image_patches = image_embeds.shape[1]
|
||||
|
||||
if self.use_learned_positional_embeddings:
|
||||
image_pos = self.pos_embedding[
|
||||
:, self.max_text_seq_length:self.max_text_seq_length + num_image_patches
|
||||
].to(device=embeds.device, dtype=embeds.dtype)
|
||||
else:
|
||||
image_pos = get_3d_sincos_pos_embed(
|
||||
self.dim,
|
||||
(width // self.patch_size, height // self.patch_size),
|
||||
num_image_patches // ((height // self.patch_size) * (width // self.patch_size)),
|
||||
self.spatial_interpolation_scale,
|
||||
self.temporal_interpolation_scale,
|
||||
device=embeds.device,
|
||||
).reshape(1, num_image_patches, self.dim).to(dtype=embeds.dtype)
|
||||
|
||||
# Build joint: zeros for text + sincos for image
|
||||
joint_pos = torch.zeros(1, text_seq_length + num_image_patches, self.dim, device=embeds.device, dtype=embeds.dtype)
|
||||
joint_pos[:, text_seq_length:] = image_pos
|
||||
embeds = embeds + joint_pos
|
||||
|
||||
return embeds
|
||||
|
||||
|
||||
class CogVideoXLayerNormZero(nn.Module):
|
||||
def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5, bias=True,
|
||||
device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.silu = nn.SiLU()
|
||||
self.linear = operations.Linear(time_dim, 6 * dim, bias=bias, device=device, dtype=dtype)
|
||||
self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, hidden_states, encoder_hidden_states, temb):
|
||||
shift, scale, gate, enc_shift, enc_scale, enc_gate = self.linear(self.silu(temb)).chunk(6, dim=1)
|
||||
hidden_states = self.norm(hidden_states) * (1 + scale)[:, None, :] + shift[:, None, :]
|
||||
encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale)[:, None, :] + enc_shift[:, None, :]
|
||||
return hidden_states, encoder_hidden_states, gate[:, None, :], enc_gate[:, None, :]
|
||||
|
||||
|
||||
class CogVideoXAdaLayerNorm(nn.Module):
|
||||
def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5,
|
||||
device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.silu = nn.SiLU()
|
||||
self.linear = operations.Linear(time_dim, 2 * dim, device=device, dtype=dtype)
|
||||
self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x, temb):
|
||||
temb = self.linear(self.silu(temb))
|
||||
shift, scale = temb.chunk(2, dim=1)
|
||||
x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
|
||||
return x
|
||||
|
||||
|
||||
class CogVideoXBlock(nn.Module):
|
||||
def __init__(self, dim, num_heads, head_dim, time_dim,
|
||||
eps=1e-5, ff_inner_dim=None, ff_bias=True,
|
||||
device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.dim = dim
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = head_dim
|
||||
|
||||
self.norm1 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
|
||||
|
||||
# Self-attention (joint text + latent)
|
||||
self.q = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
self.k = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
self.v = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
self.norm_q = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
|
||||
self.norm_k = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
|
||||
self.attn_out = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
|
||||
|
||||
self.norm2 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
|
||||
|
||||
# Feed-forward (GELU approximate)
|
||||
inner_dim = ff_inner_dim or dim * 4
|
||||
self.ff_proj = operations.Linear(dim, inner_dim, bias=ff_bias, device=device, dtype=dtype)
|
||||
self.ff_out = operations.Linear(inner_dim, dim, bias=ff_bias, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=None, transformer_options=None):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
text_seq_length = encoder_hidden_states.size(1)
|
||||
|
||||
# Norm & modulate
|
||||
norm_hidden, norm_encoder, gate_msa, enc_gate_msa = self.norm1(hidden_states, encoder_hidden_states, temb)
|
||||
|
||||
# Joint self-attention
|
||||
qkv_input = torch.cat([norm_encoder, norm_hidden], dim=1)
|
||||
b, s, _ = qkv_input.shape
|
||||
n, d = self.num_heads, self.head_dim
|
||||
|
||||
q = self.q(qkv_input).view(b, s, n, d)
|
||||
k = self.k(qkv_input).view(b, s, n, d)
|
||||
v = self.v(qkv_input)
|
||||
|
||||
q = self.norm_q(q).view(b, s, n, d)
|
||||
k = self.norm_k(k).view(b, s, n, d)
|
||||
|
||||
# Apply rotary embeddings to image tokens only (diffusers format: [B, heads, seq, head_dim])
|
||||
if image_rotary_emb is not None:
|
||||
q_img = q[:, text_seq_length:].transpose(1, 2) # [B, heads, img_seq, head_dim]
|
||||
k_img = k[:, text_seq_length:].transpose(1, 2)
|
||||
q_img = apply_rotary_emb(q_img, image_rotary_emb)
|
||||
k_img = apply_rotary_emb(k_img, image_rotary_emb)
|
||||
q = torch.cat([q[:, :text_seq_length], q_img.transpose(1, 2)], dim=1)
|
||||
k = torch.cat([k[:, :text_seq_length], k_img.transpose(1, 2)], dim=1)
|
||||
|
||||
attn_out = optimized_attention(
|
||||
q.reshape(b, s, n * d),
|
||||
k.reshape(b, s, n * d),
|
||||
v,
|
||||
heads=self.num_heads,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
|
||||
attn_out = self.attn_out(attn_out)
|
||||
|
||||
attn_encoder, attn_hidden = attn_out.split([text_seq_length, s - text_seq_length], dim=1)
|
||||
|
||||
hidden_states = hidden_states + gate_msa * attn_hidden
|
||||
encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder
|
||||
|
||||
# Norm & modulate for FF
|
||||
norm_hidden, norm_encoder, gate_ff, enc_gate_ff = self.norm2(hidden_states, encoder_hidden_states, temb)
|
||||
|
||||
# Feed-forward (GELU on concatenated text + latent)
|
||||
ff_input = torch.cat([norm_encoder, norm_hidden], dim=1)
|
||||
ff_output = self.ff_out(F.gelu(self.ff_proj(ff_input), approximate="tanh"))
|
||||
|
||||
hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
|
||||
encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
|
||||
|
||||
return hidden_states, encoder_hidden_states
|
||||
|
||||
|
||||
class CogVideoXTransformer3DModel(nn.Module):
|
||||
def __init__(self,
|
||||
num_attention_heads=30,
|
||||
attention_head_dim=64,
|
||||
in_channels=16,
|
||||
out_channels=16,
|
||||
flip_sin_to_cos=True,
|
||||
freq_shift=0,
|
||||
time_embed_dim=512,
|
||||
ofs_embed_dim=None,
|
||||
text_embed_dim=4096,
|
||||
num_layers=30,
|
||||
dropout=0.0,
|
||||
attention_bias=True,
|
||||
sample_width=90,
|
||||
sample_height=60,
|
||||
sample_frames=49,
|
||||
patch_size=2,
|
||||
patch_size_t=None,
|
||||
temporal_compression_ratio=4,
|
||||
max_text_seq_length=226,
|
||||
spatial_interpolation_scale=1.875,
|
||||
temporal_interpolation_scale=1.0,
|
||||
use_rotary_positional_embeddings=False,
|
||||
use_learned_positional_embeddings=False,
|
||||
patch_bias=True,
|
||||
image_model=None,
|
||||
device=None,
|
||||
dtype=None,
|
||||
operations=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
dim = num_attention_heads * attention_head_dim
|
||||
self.dim = dim
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_head_dim = attention_head_dim
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.patch_size = patch_size
|
||||
self.patch_size_t = patch_size_t
|
||||
self.max_text_seq_length = max_text_seq_length
|
||||
self.use_rotary_positional_embeddings = use_rotary_positional_embeddings
|
||||
|
||||
# 1. Patch embedding
|
||||
self.patch_embed = CogVideoXPatchEmbed(
|
||||
patch_size=patch_size,
|
||||
patch_size_t=patch_size_t,
|
||||
in_channels=in_channels,
|
||||
dim=dim,
|
||||
text_dim=text_embed_dim,
|
||||
bias=patch_bias,
|
||||
sample_width=sample_width,
|
||||
sample_height=sample_height,
|
||||
sample_frames=sample_frames,
|
||||
temporal_compression_ratio=temporal_compression_ratio,
|
||||
max_text_seq_length=max_text_seq_length,
|
||||
spatial_interpolation_scale=spatial_interpolation_scale,
|
||||
temporal_interpolation_scale=temporal_interpolation_scale,
|
||||
use_positional_embeddings=not use_rotary_positional_embeddings,
|
||||
use_learned_positional_embeddings=use_learned_positional_embeddings,
|
||||
device=device, dtype=torch.float32, operations=operations,
|
||||
)
|
||||
|
||||
# 2. Time embedding
|
||||
self.time_proj_dim = dim
|
||||
self.time_proj_flip = flip_sin_to_cos
|
||||
self.time_proj_shift = freq_shift
|
||||
self.time_embedding_linear_1 = operations.Linear(dim, time_embed_dim, device=device, dtype=dtype)
|
||||
self.time_embedding_act = nn.SiLU()
|
||||
self.time_embedding_linear_2 = operations.Linear(time_embed_dim, time_embed_dim, device=device, dtype=dtype)
|
||||
|
||||
# Optional OFS embedding (CogVideoX 1.5 I2V)
|
||||
self.ofs_proj_dim = ofs_embed_dim
|
||||
if ofs_embed_dim:
|
||||
self.ofs_embedding_linear_1 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
|
||||
self.ofs_embedding_act = nn.SiLU()
|
||||
self.ofs_embedding_linear_2 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
|
||||
else:
|
||||
self.ofs_embedding_linear_1 = None
|
||||
|
||||
# 3. Transformer blocks
|
||||
self.blocks = nn.ModuleList([
|
||||
CogVideoXBlock(
|
||||
dim=dim,
|
||||
num_heads=num_attention_heads,
|
||||
head_dim=attention_head_dim,
|
||||
time_dim=time_embed_dim,
|
||||
eps=1e-5,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
for _ in range(num_layers)
|
||||
])
|
||||
|
||||
self.norm_final = operations.LayerNorm(dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype)
|
||||
|
||||
# 4. Output
|
||||
self.norm_out = CogVideoXAdaLayerNorm(
|
||||
time_dim=time_embed_dim, dim=dim, eps=1e-5,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
|
||||
if patch_size_t is None:
|
||||
output_dim = patch_size * patch_size * out_channels
|
||||
else:
|
||||
output_dim = patch_size * patch_size * patch_size_t * out_channels
|
||||
|
||||
self.proj_out = operations.Linear(dim, output_dim, device=device, dtype=dtype)
|
||||
|
||||
self.spatial_interpolation_scale = spatial_interpolation_scale
|
||||
self.temporal_interpolation_scale = temporal_interpolation_scale
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
|
||||
def forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
|
||||
self._forward,
|
||||
self,
|
||||
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
|
||||
).execute(x, timestep, context, ofs, transformer_options, **kwargs)
|
||||
|
||||
def _forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
|
||||
if transformer_options is None:
|
||||
transformer_options = {}
|
||||
# ComfyUI passes [B, C, T, H, W]
|
||||
batch_size, channels, t, h, w = x.shape
|
||||
|
||||
# Pad to patch size (temporal + spatial), same pattern as WAN
|
||||
p_t = self.patch_size_t if self.patch_size_t is not None else 1
|
||||
x = comfy.ldm.common_dit.pad_to_patch_size(x, (p_t, self.patch_size, self.patch_size))
|
||||
|
||||
# CogVideoX expects [B, T, C, H, W]
|
||||
x = x.permute(0, 2, 1, 3, 4)
|
||||
batch_size, num_frames, channels, height, width = x.shape
|
||||
|
||||
# Time embedding
|
||||
t_emb = get_timestep_embedding(timestep, self.time_proj_dim, self.time_proj_flip, self.time_proj_shift)
|
||||
t_emb = t_emb.to(dtype=x.dtype)
|
||||
emb = self.time_embedding_linear_2(self.time_embedding_act(self.time_embedding_linear_1(t_emb)))
|
||||
|
||||
if self.ofs_embedding_linear_1 is not None and ofs is not None:
|
||||
ofs_emb = get_timestep_embedding(ofs, self.ofs_proj_dim, self.time_proj_flip, self.time_proj_shift)
|
||||
ofs_emb = ofs_emb.to(dtype=x.dtype)
|
||||
ofs_emb = self.ofs_embedding_linear_2(self.ofs_embedding_act(self.ofs_embedding_linear_1(ofs_emb)))
|
||||
emb = emb + ofs_emb
|
||||
|
||||
# Patch embedding
|
||||
hidden_states = self.patch_embed(context, x)
|
||||
|
||||
text_seq_length = context.shape[1]
|
||||
encoder_hidden_states = hidden_states[:, :text_seq_length]
|
||||
hidden_states = hidden_states[:, text_seq_length:]
|
||||
|
||||
# Rotary embeddings (if used)
|
||||
image_rotary_emb = None
|
||||
if self.use_rotary_positional_embeddings:
|
||||
post_patch_height = height // self.patch_size
|
||||
post_patch_width = width // self.patch_size
|
||||
if self.patch_size_t is None:
|
||||
post_time = num_frames
|
||||
else:
|
||||
post_time = num_frames // self.patch_size_t
|
||||
image_rotary_emb = self._get_rotary_emb(post_patch_height, post_patch_width, post_time, device=x.device)
|
||||
|
||||
# Transformer blocks
|
||||
for i, block in enumerate(self.blocks):
|
||||
hidden_states, encoder_hidden_states = block(
|
||||
hidden_states=hidden_states,
|
||||
encoder_hidden_states=encoder_hidden_states,
|
||||
temb=emb,
|
||||
image_rotary_emb=image_rotary_emb,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
|
||||
hidden_states = self.norm_final(hidden_states)
|
||||
|
||||
# Output projection
|
||||
hidden_states = self.norm_out(hidden_states, temb=emb)
|
||||
hidden_states = self.proj_out(hidden_states)
|
||||
|
||||
# Unpatchify
|
||||
p = self.patch_size
|
||||
p_t = self.patch_size_t
|
||||
|
||||
if p_t is None:
|
||||
output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
|
||||
output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
|
||||
else:
|
||||
output = hidden_states.reshape(
|
||||
batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
|
||||
)
|
||||
output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
|
||||
|
||||
# Back to ComfyUI format [B, C, T, H, W] and crop padding
|
||||
output = output.permute(0, 2, 1, 3, 4)[:, :, :t, :h, :w]
|
||||
return output
|
||||
|
||||
def _get_rotary_emb(self, h, w, t, device):
|
||||
"""Compute CogVideoX 3D rotary positional embeddings.
|
||||
|
||||
For CogVideoX 1.5 (patch_size_t != None): uses "slice" mode — grid positions
|
||||
are integer arange computed at max_size, then sliced to actual size.
|
||||
For CogVideoX 1.0 (patch_size_t == None): uses "linspace" mode with crop coords
|
||||
scaled by spatial_interpolation_scale.
|
||||
"""
|
||||
d = self.attention_head_dim
|
||||
dim_t = d // 4
|
||||
dim_h = d // 8 * 3
|
||||
dim_w = d // 8 * 3
|
||||
|
||||
if self.patch_size_t is not None:
|
||||
# CogVideoX 1.5: "slice" mode — positions are simple integer indices
|
||||
# Compute at max(sample_size, actual_size) then slice to actual
|
||||
base_h = self.patch_embed.sample_height // self.patch_size
|
||||
base_w = self.patch_embed.sample_width // self.patch_size
|
||||
max_h = max(base_h, h)
|
||||
max_w = max(base_w, w)
|
||||
|
||||
grid_h = torch.arange(max_h, device=device, dtype=torch.float32)
|
||||
grid_w = torch.arange(max_w, device=device, dtype=torch.float32)
|
||||
grid_t = torch.arange(t, device=device, dtype=torch.float32)
|
||||
else:
|
||||
# CogVideoX 1.0: "linspace" mode with interpolation scale
|
||||
grid_h = torch.linspace(0, h - 1, h, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
|
||||
grid_w = torch.linspace(0, w - 1, w, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
|
||||
grid_t = torch.arange(t, device=device, dtype=torch.float32)
|
||||
|
||||
freqs_t = _get_1d_rotary_pos_embed(dim_t, grid_t)
|
||||
freqs_h = _get_1d_rotary_pos_embed(dim_h, grid_h)
|
||||
freqs_w = _get_1d_rotary_pos_embed(dim_w, grid_w)
|
||||
|
||||
t_cos, t_sin = freqs_t
|
||||
h_cos, h_sin = freqs_h
|
||||
w_cos, w_sin = freqs_w
|
||||
|
||||
# Slice to actual size (for "slice" mode where grids may be larger)
|
||||
t_cos, t_sin = t_cos[:t], t_sin[:t]
|
||||
h_cos, h_sin = h_cos[:h], h_sin[:h]
|
||||
w_cos, w_sin = w_cos[:w], w_sin[:w]
|
||||
|
||||
# Broadcast and concatenate into [T*H*W, head_dim]
|
||||
t_cos = t_cos[:, None, None, :].expand(-1, h, w, -1)
|
||||
t_sin = t_sin[:, None, None, :].expand(-1, h, w, -1)
|
||||
h_cos = h_cos[None, :, None, :].expand(t, -1, w, -1)
|
||||
h_sin = h_sin[None, :, None, :].expand(t, -1, w, -1)
|
||||
w_cos = w_cos[None, None, :, :].expand(t, h, -1, -1)
|
||||
w_sin = w_sin[None, None, :, :].expand(t, h, -1, -1)
|
||||
|
||||
cos = torch.cat([t_cos, h_cos, w_cos], dim=-1).reshape(t * h * w, -1)
|
||||
sin = torch.cat([t_sin, h_sin, w_sin], dim=-1).reshape(t * h * w, -1)
|
||||
return (cos, sin)
|
||||
566
comfy/ldm/cogvideo/vae.py
Normal file
566
comfy/ldm/cogvideo/vae.py
Normal file
@ -0,0 +1,566 @@
|
||||
# CogVideoX VAE - ported to ComfyUI native ops
|
||||
# Architecture reference: diffusers AutoencoderKLCogVideoX
|
||||
# Style reference: comfy/ldm/wan/vae.py
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
import comfy.ops
|
||||
ops = comfy.ops.disable_weight_init
|
||||
|
||||
|
||||
class CausalConv3d(nn.Module):
|
||||
"""Causal 3D convolution with temporal padding.
|
||||
|
||||
Uses comfy.ops.Conv3d with autopad='causal_zero' fast path: when input has
|
||||
a single temporal frame and no cache, the 3D conv weight is sliced to act
|
||||
as a 2D conv, avoiding computation on zero-padded temporal dimensions.
|
||||
"""
|
||||
def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, pad_mode="constant"):
|
||||
super().__init__()
|
||||
if isinstance(kernel_size, int):
|
||||
kernel_size = (kernel_size,) * 3
|
||||
|
||||
time_kernel, height_kernel, width_kernel = kernel_size
|
||||
self.time_kernel_size = time_kernel
|
||||
self.pad_mode = pad_mode
|
||||
|
||||
height_pad = (height_kernel - 1) // 2
|
||||
width_pad = (width_kernel - 1) // 2
|
||||
self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_kernel - 1, 0)
|
||||
|
||||
stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
|
||||
dilation = (dilation, 1, 1)
|
||||
self.conv = ops.Conv3d(
|
||||
in_channels, out_channels, kernel_size,
|
||||
stride=stride, dilation=dilation,
|
||||
padding=(0, height_pad, width_pad),
|
||||
)
|
||||
|
||||
def forward(self, x, conv_cache=None):
|
||||
if self.pad_mode == "replicate":
|
||||
x = F.pad(x, self.time_causal_padding, mode="replicate")
|
||||
conv_cache = None
|
||||
else:
|
||||
kernel_t = self.time_kernel_size
|
||||
if kernel_t > 1:
|
||||
if conv_cache is None and x.shape[2] == 1:
|
||||
# Fast path: single frame, no cache. All temporal padding
|
||||
# frames are copies of the input (replicate-style), so the
|
||||
# 3D conv reduces to a 2D conv with summed temporal kernel.
|
||||
w = comfy.ops.cast_to_input(self.conv.weight, x)
|
||||
b = comfy.ops.cast_to_input(self.conv.bias, x) if self.conv.bias is not None else None
|
||||
w2d = w.sum(dim=2, keepdim=True)
|
||||
out = F.conv3d(x, w2d, b,
|
||||
self.conv.stride, self.conv.padding,
|
||||
self.conv.dilation, self.conv.groups)
|
||||
return out, None
|
||||
cached = [conv_cache] if conv_cache is not None else [x[:, :, :1]] * (kernel_t - 1)
|
||||
x = torch.cat(cached + [x], dim=2)
|
||||
conv_cache = x[:, :, -self.time_kernel_size + 1:].clone() if self.time_kernel_size > 1 else None
|
||||
|
||||
out = self.conv(x)
|
||||
return out, conv_cache
|
||||
|
||||
|
||||
def _interpolate_zq(zq, target_size):
|
||||
"""Interpolate latent z to target (T, H, W), matching CogVideoX's first-frame-special handling."""
|
||||
t = target_size[0]
|
||||
if t > 1 and t % 2 == 1:
|
||||
z_first = F.interpolate(zq[:, :, :1], size=(1, target_size[1], target_size[2]))
|
||||
z_rest = F.interpolate(zq[:, :, 1:], size=(t - 1, target_size[1], target_size[2]))
|
||||
return torch.cat([z_first, z_rest], dim=2)
|
||||
return F.interpolate(zq, size=target_size)
|
||||
|
||||
|
||||
class SpatialNorm3D(nn.Module):
|
||||
"""Spatially conditioned normalization."""
|
||||
def __init__(self, f_channels, zq_channels, groups=32):
|
||||
super().__init__()
|
||||
self.norm_layer = ops.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
|
||||
self.conv_y = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
|
||||
self.conv_b = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
|
||||
|
||||
def forward(self, f, zq, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
|
||||
if zq.shape[-3:] != f.shape[-3:]:
|
||||
zq = _interpolate_zq(zq, f.shape[-3:])
|
||||
|
||||
conv_y, new_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y"))
|
||||
conv_b, new_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b"))
|
||||
|
||||
return self.norm_layer(f) * conv_y + conv_b, new_cache
|
||||
|
||||
|
||||
class ResnetBlock3D(nn.Module):
|
||||
"""3D ResNet block with optional spatial norm."""
|
||||
def __init__(self, in_channels, out_channels=None, temb_channels=512, groups=32,
|
||||
eps=1e-6, act_fn="silu", spatial_norm_dim=None, pad_mode="first"):
|
||||
super().__init__()
|
||||
out_channels = out_channels or in_channels
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.spatial_norm_dim = spatial_norm_dim
|
||||
|
||||
if act_fn == "silu":
|
||||
self.nonlinearity = nn.SiLU()
|
||||
elif act_fn == "swish":
|
||||
self.nonlinearity = nn.SiLU()
|
||||
else:
|
||||
self.nonlinearity = nn.SiLU()
|
||||
|
||||
if spatial_norm_dim is None:
|
||||
self.norm1 = ops.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
|
||||
self.norm2 = ops.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
|
||||
else:
|
||||
self.norm1 = SpatialNorm3D(in_channels, spatial_norm_dim, groups=groups)
|
||||
self.norm2 = SpatialNorm3D(out_channels, spatial_norm_dim, groups=groups)
|
||||
|
||||
self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
if temb_channels > 0:
|
||||
self.temb_proj = ops.Linear(temb_channels, out_channels)
|
||||
|
||||
self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
if in_channels != out_channels:
|
||||
self.conv_shortcut = ops.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
|
||||
else:
|
||||
self.conv_shortcut = None
|
||||
|
||||
def forward(self, x, temb=None, zq=None, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
residual = x
|
||||
|
||||
if zq is not None:
|
||||
x, new_cache["norm1"] = self.norm1(x, zq, conv_cache=conv_cache.get("norm1"))
|
||||
else:
|
||||
x = self.norm1(x)
|
||||
|
||||
x = self.nonlinearity(x)
|
||||
x, new_cache["conv1"] = self.conv1(x, conv_cache=conv_cache.get("conv1"))
|
||||
|
||||
if temb is not None and hasattr(self, "temb_proj"):
|
||||
x = x + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]
|
||||
|
||||
if zq is not None:
|
||||
x, new_cache["norm2"] = self.norm2(x, zq, conv_cache=conv_cache.get("norm2"))
|
||||
else:
|
||||
x = self.norm2(x)
|
||||
|
||||
x = self.nonlinearity(x)
|
||||
x, new_cache["conv2"] = self.conv2(x, conv_cache=conv_cache.get("conv2"))
|
||||
|
||||
if self.conv_shortcut is not None:
|
||||
residual = self.conv_shortcut(residual)
|
||||
|
||||
return x + residual, new_cache
|
||||
|
||||
|
||||
class Downsample3D(nn.Module):
|
||||
"""3D downsampling with optional temporal compression."""
|
||||
def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=0, compress_time=False):
|
||||
super().__init__()
|
||||
self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
|
||||
self.compress_time = compress_time
|
||||
|
||||
def forward(self, x):
|
||||
if self.compress_time:
|
||||
b, c, t, h, w = x.shape
|
||||
x = x.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, t)
|
||||
if t % 2 == 1:
|
||||
x_first, x_rest = x[..., 0], x[..., 1:]
|
||||
if x_rest.shape[-1] > 0:
|
||||
x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
|
||||
x = torch.cat([x_first[..., None], x_rest], dim=-1)
|
||||
x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)
|
||||
else:
|
||||
x = F.avg_pool1d(x, kernel_size=2, stride=2)
|
||||
x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)
|
||||
|
||||
pad = (0, 1, 0, 1)
|
||||
x = F.pad(x, pad, mode="constant", value=0)
|
||||
b, c, t, h, w = x.shape
|
||||
x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
|
||||
x = self.conv(x)
|
||||
x = x.reshape(b, t, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4)
|
||||
return x
|
||||
|
||||
|
||||
class Upsample3D(nn.Module):
|
||||
"""3D upsampling with optional temporal decompression."""
|
||||
def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, compress_time=False):
|
||||
super().__init__()
|
||||
self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
|
||||
self.compress_time = compress_time
|
||||
|
||||
def forward(self, x):
|
||||
if self.compress_time:
|
||||
if x.shape[2] > 1 and x.shape[2] % 2 == 1:
|
||||
x_first, x_rest = x[:, :, 0], x[:, :, 1:]
|
||||
x_first = F.interpolate(x_first, scale_factor=2.0)
|
||||
x_rest = F.interpolate(x_rest, scale_factor=2.0)
|
||||
x = torch.cat([x_first[:, :, None, :, :], x_rest], dim=2)
|
||||
elif x.shape[2] > 1:
|
||||
x = F.interpolate(x, scale_factor=2.0)
|
||||
else:
|
||||
x = x.squeeze(2)
|
||||
x = F.interpolate(x, scale_factor=2.0)
|
||||
x = x[:, :, None, :, :]
|
||||
else:
|
||||
b, c, t, h, w = x.shape
|
||||
x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
|
||||
x = F.interpolate(x, scale_factor=2.0)
|
||||
x = x.reshape(b, t, c, *x.shape[2:]).permute(0, 2, 1, 3, 4)
|
||||
|
||||
b, c, t, h, w = x.shape
|
||||
x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
|
||||
x = self.conv(x)
|
||||
x = x.reshape(b, t, *x.shape[1:]).permute(0, 2, 1, 3, 4)
|
||||
return x
|
||||
|
||||
|
||||
class DownBlock3D(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
|
||||
eps=1e-6, act_fn="silu", groups=32, add_downsample=True,
|
||||
compress_time=False, pad_mode="first"):
|
||||
super().__init__()
|
||||
self.resnets = nn.ModuleList([
|
||||
ResnetBlock3D(
|
||||
in_channels=in_channels if i == 0 else out_channels,
|
||||
out_channels=out_channels,
|
||||
temb_channels=temb_channels,
|
||||
groups=groups, eps=eps, act_fn=act_fn, pad_mode=pad_mode,
|
||||
)
|
||||
for i in range(num_layers)
|
||||
])
|
||||
self.downsamplers = nn.ModuleList([Downsample3D(out_channels, out_channels, compress_time=compress_time)]) if add_downsample else None
|
||||
|
||||
def forward(self, x, temb=None, zq=None, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
for i, resnet in enumerate(self.resnets):
|
||||
x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
|
||||
if self.downsamplers is not None:
|
||||
for ds in self.downsamplers:
|
||||
x = ds(x)
|
||||
return x, new_cache
|
||||
|
||||
|
||||
class MidBlock3D(nn.Module):
|
||||
def __init__(self, in_channels, temb_channels=0, num_layers=1,
|
||||
eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=None, pad_mode="first"):
|
||||
super().__init__()
|
||||
self.resnets = nn.ModuleList([
|
||||
ResnetBlock3D(
|
||||
in_channels=in_channels, out_channels=in_channels,
|
||||
temb_channels=temb_channels, groups=groups, eps=eps,
|
||||
act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
|
||||
)
|
||||
for _ in range(num_layers)
|
||||
])
|
||||
|
||||
def forward(self, x, temb=None, zq=None, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
for i, resnet in enumerate(self.resnets):
|
||||
x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
|
||||
return x, new_cache
|
||||
|
||||
|
||||
class UpBlock3D(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
|
||||
eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=16,
|
||||
add_upsample=True, compress_time=False, pad_mode="first"):
|
||||
super().__init__()
|
||||
self.resnets = nn.ModuleList([
|
||||
ResnetBlock3D(
|
||||
in_channels=in_channels if i == 0 else out_channels,
|
||||
out_channels=out_channels,
|
||||
temb_channels=temb_channels, groups=groups, eps=eps,
|
||||
act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
|
||||
)
|
||||
for i in range(num_layers)
|
||||
])
|
||||
self.upsamplers = nn.ModuleList([Upsample3D(out_channels, out_channels, compress_time=compress_time)]) if add_upsample else None
|
||||
|
||||
def forward(self, x, temb=None, zq=None, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
for i, resnet in enumerate(self.resnets):
|
||||
x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
|
||||
if self.upsamplers is not None:
|
||||
for us in self.upsamplers:
|
||||
x = us(x)
|
||||
return x, new_cache
|
||||
|
||||
|
||||
class Encoder3D(nn.Module):
|
||||
def __init__(self, in_channels=3, out_channels=16,
|
||||
block_out_channels=(128, 256, 256, 512),
|
||||
layers_per_block=3, act_fn="silu",
|
||||
eps=1e-6, groups=32, pad_mode="first",
|
||||
temporal_compression_ratio=4):
|
||||
super().__init__()
|
||||
temporal_compress_level = int(np.log2(temporal_compression_ratio))
|
||||
|
||||
self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
self.down_blocks = nn.ModuleList()
|
||||
output_channel = block_out_channels[0]
|
||||
for i in range(len(block_out_channels)):
|
||||
input_channel = output_channel
|
||||
output_channel = block_out_channels[i]
|
||||
is_final = i == len(block_out_channels) - 1
|
||||
compress_time = i < temporal_compress_level
|
||||
|
||||
self.down_blocks.append(DownBlock3D(
|
||||
in_channels=input_channel, out_channels=output_channel,
|
||||
temb_channels=0, num_layers=layers_per_block,
|
||||
eps=eps, act_fn=act_fn, groups=groups,
|
||||
add_downsample=not is_final, compress_time=compress_time,
|
||||
))
|
||||
|
||||
self.mid_block = MidBlock3D(
|
||||
in_channels=block_out_channels[-1], temb_channels=0,
|
||||
num_layers=2, eps=eps, act_fn=act_fn, groups=groups, pad_mode=pad_mode,
|
||||
)
|
||||
|
||||
self.norm_out = ops.GroupNorm(groups, block_out_channels[-1], eps=1e-6)
|
||||
self.conv_act = nn.SiLU()
|
||||
self.conv_out = CausalConv3d(block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
def forward(self, x, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
|
||||
x, new_cache["conv_in"] = self.conv_in(x, conv_cache=conv_cache.get("conv_in"))
|
||||
|
||||
for i, block in enumerate(self.down_blocks):
|
||||
key = f"down_block_{i}"
|
||||
x, new_cache[key] = block(x, None, None, conv_cache.get(key))
|
||||
|
||||
x, new_cache["mid_block"] = self.mid_block(x, None, None, conv_cache=conv_cache.get("mid_block"))
|
||||
|
||||
x = self.norm_out(x)
|
||||
x = self.conv_act(x)
|
||||
x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))
|
||||
|
||||
return x, new_cache
|
||||
|
||||
|
||||
class Decoder3D(nn.Module):
|
||||
def __init__(self, in_channels=16, out_channels=3,
|
||||
block_out_channels=(128, 256, 256, 512),
|
||||
layers_per_block=3, act_fn="silu",
|
||||
eps=1e-6, groups=32, pad_mode="first",
|
||||
temporal_compression_ratio=4):
|
||||
super().__init__()
|
||||
reversed_channels = list(reversed(block_out_channels))
|
||||
temporal_compress_level = int(np.log2(temporal_compression_ratio))
|
||||
|
||||
self.conv_in = CausalConv3d(in_channels, reversed_channels[0], kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
self.mid_block = MidBlock3D(
|
||||
in_channels=reversed_channels[0], temb_channels=0,
|
||||
num_layers=2, eps=eps, act_fn=act_fn, groups=groups,
|
||||
spatial_norm_dim=in_channels, pad_mode=pad_mode,
|
||||
)
|
||||
|
||||
self.up_blocks = nn.ModuleList()
|
||||
output_channel = reversed_channels[0]
|
||||
for i in range(len(block_out_channels)):
|
||||
prev_channel = output_channel
|
||||
output_channel = reversed_channels[i]
|
||||
is_final = i == len(block_out_channels) - 1
|
||||
compress_time = i < temporal_compress_level
|
||||
|
||||
self.up_blocks.append(UpBlock3D(
|
||||
in_channels=prev_channel, out_channels=output_channel,
|
||||
temb_channels=0, num_layers=layers_per_block + 1,
|
||||
eps=eps, act_fn=act_fn, groups=groups,
|
||||
spatial_norm_dim=in_channels,
|
||||
add_upsample=not is_final, compress_time=compress_time,
|
||||
))
|
||||
|
||||
self.norm_out = SpatialNorm3D(reversed_channels[-1], in_channels, groups=groups)
|
||||
self.conv_act = nn.SiLU()
|
||||
self.conv_out = CausalConv3d(reversed_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode)
|
||||
|
||||
def forward(self, sample, conv_cache=None):
|
||||
new_cache = {}
|
||||
conv_cache = conv_cache or {}
|
||||
|
||||
x, new_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
|
||||
|
||||
x, new_cache["mid_block"] = self.mid_block(x, None, sample, conv_cache=conv_cache.get("mid_block"))
|
||||
|
||||
for i, block in enumerate(self.up_blocks):
|
||||
key = f"up_block_{i}"
|
||||
x, new_cache[key] = block(x, None, sample, conv_cache=conv_cache.get(key))
|
||||
|
||||
x, new_cache["norm_out"] = self.norm_out(x, sample, conv_cache=conv_cache.get("norm_out"))
|
||||
x = self.conv_act(x)
|
||||
x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))
|
||||
|
||||
return x, new_cache
|
||||
|
||||
|
||||
|
||||
class AutoencoderKLCogVideoX(nn.Module):
|
||||
"""CogVideoX VAE. Spatial tiling/slicing handled by ComfyUI's VAE wrapper.
|
||||
|
||||
Uses rolling temporal decode: conv_in + mid_block + temporal up_blocks run
|
||||
on the full (low-res) tensor, then the expensive spatial-only up_blocks +
|
||||
norm_out + conv_out are processed in small temporal chunks with conv_cache
|
||||
carrying causal state between chunks. This keeps peak VRAM proportional to
|
||||
chunk_size rather than total frame count.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_channels=3, out_channels=3,
|
||||
block_out_channels=(128, 256, 256, 512),
|
||||
latent_channels=16, layers_per_block=3,
|
||||
act_fn="silu", eps=1e-6, groups=32,
|
||||
temporal_compression_ratio=4,
|
||||
):
|
||||
super().__init__()
|
||||
self.latent_channels = latent_channels
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
|
||||
self.encoder = Encoder3D(
|
||||
in_channels=in_channels, out_channels=latent_channels,
|
||||
block_out_channels=block_out_channels, layers_per_block=layers_per_block,
|
||||
act_fn=act_fn, eps=eps, groups=groups,
|
||||
temporal_compression_ratio=temporal_compression_ratio,
|
||||
)
|
||||
self.decoder = Decoder3D(
|
||||
in_channels=latent_channels, out_channels=out_channels,
|
||||
block_out_channels=block_out_channels, layers_per_block=layers_per_block,
|
||||
act_fn=act_fn, eps=eps, groups=groups,
|
||||
temporal_compression_ratio=temporal_compression_ratio,
|
||||
)
|
||||
|
||||
self.num_latent_frames_batch_size = 2
|
||||
self.num_sample_frames_batch_size = 8
|
||||
|
||||
def encode(self, x):
|
||||
t = x.shape[2]
|
||||
frame_batch = self.num_sample_frames_batch_size
|
||||
remainder = t % frame_batch
|
||||
conv_cache = None
|
||||
enc = []
|
||||
|
||||
# Process remainder frames first so only the first chunk can have an
|
||||
# odd temporal dimension — where Downsample3D's first-frame-special
|
||||
# handling in temporal compression is actually correct.
|
||||
if remainder > 0:
|
||||
chunk, conv_cache = self.encoder(x[:, :, :remainder], conv_cache=conv_cache)
|
||||
enc.append(chunk.to(x.device))
|
||||
|
||||
for start in range(remainder, t, frame_batch):
|
||||
chunk, conv_cache = self.encoder(x[:, :, start:start + frame_batch], conv_cache=conv_cache)
|
||||
enc.append(chunk.to(x.device))
|
||||
|
||||
enc = torch.cat(enc, dim=2)
|
||||
mean, _ = enc.chunk(2, dim=1)
|
||||
return mean
|
||||
|
||||
def decode(self, z):
|
||||
return self._decode_rolling(z)
|
||||
|
||||
def _decode_batched(self, z):
|
||||
"""Original batched decode - processes 2 latent frames through full decoder."""
|
||||
t = z.shape[2]
|
||||
frame_batch = self.num_latent_frames_batch_size
|
||||
num_batches = max(t // frame_batch, 1)
|
||||
conv_cache = None
|
||||
dec = []
|
||||
for i in range(num_batches):
|
||||
remaining = t % frame_batch
|
||||
start = frame_batch * i + (0 if i == 0 else remaining)
|
||||
end = frame_batch * (i + 1) + remaining
|
||||
chunk, conv_cache = self.decoder(z[:, :, start:end], conv_cache=conv_cache)
|
||||
dec.append(chunk.cpu())
|
||||
return torch.cat(dec, dim=2).to(z.device)
|
||||
|
||||
def _decode_rolling(self, z):
|
||||
"""Rolling decode - processes low-res layers on full tensor, then rolls
|
||||
through expensive high-res layers in temporal chunks."""
|
||||
decoder = self.decoder
|
||||
device = z.device
|
||||
|
||||
# Determine which up_blocks have temporal upsample vs spatial-only.
|
||||
# Temporal up_blocks are cheap (low res), spatial-only are expensive.
|
||||
temporal_compress_level = int(np.log2(self.temporal_compression_ratio))
|
||||
split_at = temporal_compress_level # first N up_blocks do temporal upsample
|
||||
|
||||
# Phase 1: conv_in + mid_block + temporal up_blocks on full tensor (low/medium res)
|
||||
x, _ = decoder.conv_in(z)
|
||||
x, _ = decoder.mid_block(x, None, z)
|
||||
|
||||
for i in range(split_at):
|
||||
x, _ = decoder.up_blocks[i](x, None, z)
|
||||
|
||||
# Phase 2: remaining spatial-only up_blocks + norm_out + conv_out in temporal chunks
|
||||
remaining_blocks = list(range(split_at, len(decoder.up_blocks)))
|
||||
chunk_size = 4 # pixel frames per chunk through high-res layers
|
||||
t_expanded = x.shape[2]
|
||||
|
||||
if t_expanded <= chunk_size or len(remaining_blocks) == 0:
|
||||
# Small enough to process in one go
|
||||
for i in remaining_blocks:
|
||||
x, _ = decoder.up_blocks[i](x, None, z)
|
||||
x, _ = decoder.norm_out(x, z)
|
||||
x = decoder.conv_act(x)
|
||||
x, _ = decoder.conv_out(x)
|
||||
return x
|
||||
|
||||
# Expand z temporally once to match Phase 2's time dimension.
|
||||
# z stays at latent spatial resolution so this is small (~16 MB vs ~1.3 GB
|
||||
# for the old approach of pre-interpolating to every pixel resolution).
|
||||
z_time_expanded = _interpolate_zq(z, (t_expanded, z.shape[3], z.shape[4]))
|
||||
|
||||
# Process in temporal chunks, interpolating spatially per-chunk to avoid
|
||||
# allocating full [B, C, t_expanded, H, W] tensors at each resolution.
|
||||
dec_out = []
|
||||
conv_caches = {}
|
||||
|
||||
for chunk_start in range(0, t_expanded, chunk_size):
|
||||
chunk_end = min(chunk_start + chunk_size, t_expanded)
|
||||
x_chunk = x[:, :, chunk_start:chunk_end]
|
||||
z_t_chunk = z_time_expanded[:, :, chunk_start:chunk_end]
|
||||
z_spatial_cache = {}
|
||||
|
||||
for i in remaining_blocks:
|
||||
block = decoder.up_blocks[i]
|
||||
cache_key = f"up_block_{i}"
|
||||
hw_key = (x_chunk.shape[3], x_chunk.shape[4])
|
||||
if hw_key not in z_spatial_cache:
|
||||
if z_t_chunk.shape[3] == hw_key[0] and z_t_chunk.shape[4] == hw_key[1]:
|
||||
z_spatial_cache[hw_key] = z_t_chunk
|
||||
else:
|
||||
z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
|
||||
x_chunk, new_cache = block(x_chunk, None, z_spatial_cache[hw_key], conv_cache=conv_caches.get(cache_key))
|
||||
conv_caches[cache_key] = new_cache
|
||||
|
||||
hw_key = (x_chunk.shape[3], x_chunk.shape[4])
|
||||
if hw_key not in z_spatial_cache:
|
||||
z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
|
||||
x_chunk, new_cache = decoder.norm_out(x_chunk, z_spatial_cache[hw_key], conv_cache=conv_caches.get("norm_out"))
|
||||
conv_caches["norm_out"] = new_cache
|
||||
x_chunk = decoder.conv_act(x_chunk)
|
||||
x_chunk, new_cache = decoder.conv_out(x_chunk, conv_cache=conv_caches.get("conv_out"))
|
||||
conv_caches["conv_out"] = new_cache
|
||||
|
||||
dec_out.append(x_chunk.cpu())
|
||||
del z_spatial_cache
|
||||
|
||||
del x, z_time_expanded
|
||||
return torch.cat(dec_out, dim=2).to(device)
|
||||
@ -342,6 +342,12 @@ def model_lora_keys_unet(model, key_map={}):
|
||||
key_map["base_model.model.{}".format(key_lora)] = k # Official base model loras
|
||||
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k # LyCORIS/LoKR format
|
||||
|
||||
if isinstance(model, comfy.model_base.ErnieImage):
|
||||
for k in sdk:
|
||||
if k.startswith("diffusion_model.") and k.endswith(".weight"):
|
||||
key_lora = k[len("diffusion_model."):-len(".weight")]
|
||||
key_map["transformer.{}".format(key_lora)] = k
|
||||
|
||||
return key_map
|
||||
|
||||
|
||||
|
||||
@ -52,6 +52,7 @@ import comfy.ldm.qwen_image.model
|
||||
import comfy.ldm.kandinsky5.model
|
||||
import comfy.ldm.anima.model
|
||||
import comfy.ldm.ace.ace_step15
|
||||
import comfy.ldm.cogvideo.model
|
||||
import comfy.ldm.rt_detr.rtdetr_v4
|
||||
import comfy.ldm.ernie.model
|
||||
import comfy.ldm.sam3.detector
|
||||
@ -81,6 +82,7 @@ class ModelType(Enum):
|
||||
IMG_TO_IMG = 9
|
||||
FLOW_COSMOS = 10
|
||||
IMG_TO_IMG_FLOW = 11
|
||||
V_PREDICTION_DDPM = 12
|
||||
|
||||
|
||||
def model_sampling(model_config, model_type):
|
||||
@ -115,6 +117,8 @@ def model_sampling(model_config, model_type):
|
||||
s = comfy.model_sampling.ModelSamplingCosmosRFlow
|
||||
elif model_type == ModelType.IMG_TO_IMG_FLOW:
|
||||
c = comfy.model_sampling.IMG_TO_IMG_FLOW
|
||||
elif model_type == ModelType.V_PREDICTION_DDPM:
|
||||
c = comfy.model_sampling.V_PREDICTION_DDPM
|
||||
|
||||
class ModelSampling(s, c):
|
||||
pass
|
||||
@ -1979,3 +1983,59 @@ class ErnieImage(BaseModel):
|
||||
class SAM3(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.sam3.detector.SAM3Model)
|
||||
|
||||
class CogVideoX(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.V_PREDICTION_DDPM, image_to_video=False, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.cogvideo.model.CogVideoXTransformer3DModel)
|
||||
self.image_to_video = image_to_video
|
||||
|
||||
def concat_cond(self, **kwargs):
|
||||
noise = kwargs.get("noise", None)
|
||||
# Detect extra channels needed (e.g. 32 - 16 = 16 for ref latent)
|
||||
extra_channels = self.diffusion_model.in_channels - noise.shape[1]
|
||||
if extra_channels == 0:
|
||||
return None
|
||||
|
||||
image = kwargs.get("concat_latent_image", None)
|
||||
device = kwargs["device"]
|
||||
|
||||
if image is None:
|
||||
shape = list(noise.shape)
|
||||
shape[1] = extra_channels
|
||||
return torch.zeros(shape, dtype=noise.dtype, layout=noise.layout, device=noise.device)
|
||||
|
||||
latent_dim = self.latent_format.latent_channels
|
||||
image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
|
||||
|
||||
if noise.ndim == 5 and image.ndim == 5:
|
||||
if image.shape[-3] < noise.shape[-3]:
|
||||
image = torch.nn.functional.pad(image, (0, 0, 0, 0, 0, noise.shape[-3] - image.shape[-3]), "constant", 0)
|
||||
elif image.shape[-3] > noise.shape[-3]:
|
||||
image = image[:, :, :noise.shape[-3]]
|
||||
|
||||
for i in range(0, image.shape[1], latent_dim):
|
||||
image[:, i:i + latent_dim] = self.process_latent_in(image[:, i:i + latent_dim])
|
||||
image = utils.resize_to_batch_size(image, noise.shape[0])
|
||||
|
||||
if image.shape[1] > extra_channels:
|
||||
image = image[:, :extra_channels]
|
||||
elif image.shape[1] < extra_channels:
|
||||
repeats = extra_channels // image.shape[1]
|
||||
remainder = extra_channels % image.shape[1]
|
||||
parts = [image] * repeats
|
||||
if remainder > 0:
|
||||
parts.append(image[:, :remainder])
|
||||
image = torch.cat(parts, dim=1)
|
||||
|
||||
return image
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
# OFS embedding (CogVideoX 1.5 I2V), default 2.0 as used by SparkVSR
|
||||
if self.diffusion_model.ofs_proj_dim is not None:
|
||||
ofs = kwargs.get("ofs", None)
|
||||
if ofs is None:
|
||||
noise = kwargs.get("noise", None)
|
||||
ofs = torch.full((noise.shape[0],), 2.0, device=noise.device, dtype=noise.dtype)
|
||||
out['ofs'] = comfy.conds.CONDRegular(ofs)
|
||||
return out
|
||||
|
||||
@ -490,6 +490,54 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}blocks.0.norm1.linear.weight'.format(key_prefix) in state_dict_keys: # CogVideoX
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "cogvideox"
|
||||
|
||||
# Extract config from weight shapes
|
||||
norm1_weight = state_dict['{}blocks.0.norm1.linear.weight'.format(key_prefix)]
|
||||
time_embed_dim = norm1_weight.shape[1]
|
||||
dim = norm1_weight.shape[0] // 6
|
||||
|
||||
dit_config["num_attention_heads"] = dim // 64
|
||||
dit_config["attention_head_dim"] = 64
|
||||
dit_config["time_embed_dim"] = time_embed_dim
|
||||
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
|
||||
|
||||
# Detect in_channels from patch_embed
|
||||
patch_proj_key = '{}patch_embed.proj.weight'.format(key_prefix)
|
||||
if patch_proj_key in state_dict_keys:
|
||||
w = state_dict[patch_proj_key]
|
||||
if w.ndim == 4:
|
||||
# Conv2d: [out, in, kh, kw] — CogVideoX 1.0
|
||||
dit_config["in_channels"] = w.shape[1]
|
||||
dit_config["patch_size"] = w.shape[2]
|
||||
elif w.ndim == 2:
|
||||
# Linear: [out, in_channels * patch_size * patch_size * patch_size_t] — CogVideoX 1.5
|
||||
dit_config["patch_size"] = 2
|
||||
dit_config["patch_size_t"] = 2
|
||||
dit_config["in_channels"] = w.shape[1] // (2 * 2 * 2) # 256 // 8 = 32
|
||||
|
||||
text_proj_key = '{}patch_embed.text_proj.weight'.format(key_prefix)
|
||||
if text_proj_key in state_dict_keys:
|
||||
dit_config["text_embed_dim"] = state_dict[text_proj_key].shape[1]
|
||||
|
||||
# Detect OFS embedding
|
||||
ofs_key = '{}ofs_embedding_linear_1.weight'.format(key_prefix)
|
||||
if ofs_key in state_dict_keys:
|
||||
dit_config["ofs_embed_dim"] = state_dict[ofs_key].shape[1]
|
||||
|
||||
# Detect positional embedding type
|
||||
pos_key = '{}patch_embed.pos_embedding'.format(key_prefix)
|
||||
if pos_key in state_dict_keys:
|
||||
dit_config["use_learned_positional_embeddings"] = True
|
||||
dit_config["use_rotary_positional_embeddings"] = False
|
||||
else:
|
||||
dit_config["use_learned_positional_embeddings"] = False
|
||||
dit_config["use_rotary_positional_embeddings"] = True
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}head.modulation'.format(key_prefix) in state_dict_keys: # Wan 2.1
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "wan2.1"
|
||||
|
||||
@ -54,6 +54,30 @@ class V_PREDICTION(EPS):
|
||||
sigma = reshape_sigma(sigma, model_output.ndim)
|
||||
return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
|
||||
|
||||
class V_PREDICTION_DDPM:
|
||||
"""CogVideoX v-prediction: model receives raw x_t (unscaled), predicts velocity v.
|
||||
x_0 = sqrt(alpha) * x_t - sqrt(1-alpha) * v
|
||||
= x_t / sqrt(sigma^2 + 1) - v * sigma / sqrt(sigma^2 + 1)
|
||||
"""
|
||||
def calculate_input(self, sigma, noise):
|
||||
return noise
|
||||
|
||||
def calculate_denoised(self, sigma, model_output, model_input):
|
||||
sigma = reshape_sigma(sigma, model_output.ndim)
|
||||
return model_input / (sigma ** 2 + 1.0) ** 0.5 - model_output * sigma / (sigma ** 2 + 1.0) ** 0.5
|
||||
|
||||
def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
|
||||
sigma = reshape_sigma(sigma, noise.ndim)
|
||||
if max_denoise:
|
||||
noise = noise * torch.sqrt(1.0 + sigma ** 2.0)
|
||||
else:
|
||||
noise = noise * sigma
|
||||
noise += latent_image
|
||||
return noise
|
||||
|
||||
def inverse_noise_scaling(self, sigma, latent):
|
||||
return latent
|
||||
|
||||
class EDM(V_PREDICTION):
|
||||
def calculate_denoised(self, sigma, model_output, model_input):
|
||||
sigma = reshape_sigma(sigma, model_output.ndim)
|
||||
|
||||
29
comfy/sd.py
29
comfy/sd.py
@ -18,6 +18,7 @@ import comfy.ldm.wan.vae
|
||||
import comfy.ldm.wan.vae2_2
|
||||
import comfy.ldm.hunyuan3d.vae
|
||||
import comfy.ldm.ace.vae.music_dcae_pipeline
|
||||
import comfy.ldm.cogvideo.vae
|
||||
import comfy.ldm.hunyuan_video.vae
|
||||
import comfy.ldm.mmaudio.vae.autoencoder
|
||||
import comfy.pixel_space_convert
|
||||
@ -478,7 +479,10 @@ class VAE:
|
||||
encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': encoder_config},
|
||||
decoder_config={'target': "comfy.ldm.modules.temporal_ae.VideoDecoder", 'params': decoder_config})
|
||||
elif "taesd_decoder.1.weight" in sd:
|
||||
self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
|
||||
if isinstance(metadata, dict) and "tae_latent_channels" in metadata:
|
||||
self.latent_channels = metadata["tae_latent_channels"]
|
||||
else:
|
||||
self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
|
||||
self.first_stage_model = comfy.taesd.taesd.TAESD(latent_channels=self.latent_channels)
|
||||
elif "vquantizer.codebook.weight" in sd: #VQGan: stage a of stable cascade
|
||||
self.first_stage_model = StageA()
|
||||
@ -652,6 +656,17 @@ class VAE:
|
||||
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
|
||||
elif "decoder.conv_in.conv.weight" in sd and "decoder.mid_block.resnets.0.norm1.norm_layer.weight" in sd: # CogVideoX VAE
|
||||
self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
|
||||
self.upscale_index_formula = (4, 8, 8)
|
||||
self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
|
||||
self.downscale_index_formula = (4, 8, 8)
|
||||
self.latent_dim = 3
|
||||
self.latent_channels = sd["encoder.conv_out.conv.weight"].shape[0] // 2
|
||||
self.first_stage_model = comfy.ldm.cogvideo.vae.AutoencoderKLCogVideoX(latent_channels=self.latent_channels)
|
||||
self.memory_used_decode = lambda shape, dtype: (2800 * max(2, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype)
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * max(1, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
|
||||
self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
elif "decoder.conv_in.conv.weight" in sd:
|
||||
ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
|
||||
ddconfig["conv3d"] = True
|
||||
@ -1105,7 +1120,17 @@ class VAE:
|
||||
else:
|
||||
pixel_samples = pixel_samples.unsqueeze(2)
|
||||
|
||||
memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) # TODO: calculate mem required for tile
|
||||
if dims == 2:
|
||||
default_tile_x = 512 if tile_x is None else tile_x
|
||||
default_tile_y = 512 if tile_y is None else tile_y
|
||||
tile_shapes = [
|
||||
(1, pixel_samples.shape[1], min(pixel_samples.shape[2], max(1, default_tile_y)), min(pixel_samples.shape[3], max(1, default_tile_x))),
|
||||
(1, pixel_samples.shape[1], min(pixel_samples.shape[2], max(1, default_tile_y // 2)), min(pixel_samples.shape[3], max(1, default_tile_x * 2))),
|
||||
(1, pixel_samples.shape[1], min(pixel_samples.shape[2], max(1, default_tile_y * 2)), min(pixel_samples.shape[3], max(1, default_tile_x // 2))),
|
||||
]
|
||||
memory_used = max(self.memory_used_encode(shape, self.vae_dtype) for shape in tile_shapes)
|
||||
else:
|
||||
memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
|
||||
model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
|
||||
|
||||
args = {}
|
||||
|
||||
@ -27,6 +27,7 @@ import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.cogvideo
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
@ -1832,6 +1833,52 @@ class SAM31(SAM3):
|
||||
unet_config = {"image_model": "SAM31"}
|
||||
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage, SAM3, SAM31]
|
||||
class CogVideoX_T2V(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "cogvideox",
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"linear_start": 0.00085,
|
||||
"linear_end": 0.012,
|
||||
"beta_schedule": "linear",
|
||||
"zsnr": True,
|
||||
}
|
||||
|
||||
unet_extra_config = {}
|
||||
latent_format = latent_formats.CogVideoX
|
||||
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
|
||||
vae_key_prefix = ["vae."]
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
# CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
|
||||
if self.unet_config.get("patch_size_t") is not None:
|
||||
self.unet_config.setdefault("sample_height", 96)
|
||||
self.unet_config.setdefault("sample_width", 170)
|
||||
self.unet_config.setdefault("sample_frames", 81)
|
||||
out = model_base.CogVideoX(self, device=device)
|
||||
return out
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.cogvideo.CogVideoXT5Tokenizer, comfy.text_encoders.sd3_clip.T5XXLModel)
|
||||
|
||||
class CogVideoX_I2V(CogVideoX_T2V):
|
||||
unet_config = {
|
||||
"image_model": "cogvideox",
|
||||
"in_channels": 32,
|
||||
}
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
if self.unet_config.get("patch_size_t") is not None:
|
||||
self.unet_config.setdefault("sample_height", 96)
|
||||
self.unet_config.setdefault("sample_width", 170)
|
||||
self.unet_config.setdefault("sample_frames", 81)
|
||||
out = model_base.CogVideoX(self, image_to_video=True, device=device)
|
||||
return out
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage, SAM3, SAM31, CogVideoX_I2V, CogVideoX_T2V]
|
||||
|
||||
models += [SVD_img2vid]
|
||||
|
||||
@ -7,6 +7,7 @@ from tqdm.auto import tqdm
|
||||
from collections import namedtuple, deque
|
||||
|
||||
import comfy.ops
|
||||
import comfy.model_management
|
||||
operations=comfy.ops.disable_weight_init
|
||||
|
||||
DecoderResult = namedtuple("DecoderResult", ("frame", "memory"))
|
||||
@ -47,11 +48,14 @@ class TGrow(nn.Module):
|
||||
x = self.conv(x)
|
||||
return x.reshape(-1, C, H, W)
|
||||
|
||||
def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
|
||||
def apply_model_with_memblocks(model, x, parallel, show_progress_bar, output_device=None,
|
||||
patch_size=1, decode=False):
|
||||
|
||||
B, T, C, H, W = x.shape
|
||||
if parallel:
|
||||
x = x.reshape(B*T, C, H, W)
|
||||
if not decode and patch_size > 1:
|
||||
x = F.pixel_unshuffle(x, patch_size)
|
||||
# parallel over input timesteps, iterate over blocks
|
||||
for b in tqdm(model, disable=not show_progress_bar):
|
||||
if isinstance(b, MemBlock):
|
||||
@ -62,20 +66,27 @@ def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
|
||||
x = b(x, mem)
|
||||
else:
|
||||
x = b(x)
|
||||
BT, C, H, W = x.shape
|
||||
T = BT // B
|
||||
x = x.view(B, T, C, H, W)
|
||||
if decode and patch_size > 1:
|
||||
x = F.pixel_shuffle(x, patch_size)
|
||||
x = x.view(B, x.shape[0] // B, *x.shape[1:])
|
||||
x = x.to(output_device)
|
||||
else:
|
||||
out = []
|
||||
work_queue = deque([TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(B, T * C, H, W).chunk(T, dim=1))])
|
||||
# Chunk along the time dim directly (chunks are [B,1,C,H,W] views, squeeze to [B,C,H,W] views).
|
||||
# Avoids forcing a contiguous copy when x is non-contiguous (e.g. after movedim in encode/decode).
|
||||
work_queue = deque([TWorkItem(xt.squeeze(1), 0) for xt in x.chunk(T, dim=1)])
|
||||
progress_bar = tqdm(range(T), disable=not show_progress_bar)
|
||||
mem = [None] * len(model)
|
||||
while work_queue:
|
||||
xt, i = work_queue.popleft()
|
||||
if i == 0:
|
||||
progress_bar.update(1)
|
||||
if not decode and patch_size > 1:
|
||||
xt = F.pixel_unshuffle(xt, patch_size)
|
||||
if i == len(model):
|
||||
out.append(xt)
|
||||
if decode and patch_size > 1:
|
||||
xt = F.pixel_shuffle(xt, patch_size)
|
||||
out.append(xt.to(output_device))
|
||||
del xt
|
||||
else:
|
||||
b = model[i]
|
||||
@ -165,24 +176,20 @@ class TAEHV(nn.Module):
|
||||
|
||||
def encode(self, x, **kwargs):
|
||||
x = x.movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
|
||||
if self.patch_size > 1:
|
||||
B, T, C, H, W = x.shape
|
||||
x = x.reshape(B * T, C, H, W)
|
||||
x = F.pixel_unshuffle(x, self.patch_size)
|
||||
x = x.reshape(B, T, C * self.patch_size ** 2, H // self.patch_size, W // self.patch_size)
|
||||
if x.shape[1] % self.t_downscale != 0:
|
||||
# pad at end to multiple of t_downscale
|
||||
n_pad = self.t_downscale - x.shape[1] % self.t_downscale
|
||||
padding = x[:, -1:].repeat_interleave(n_pad, dim=1)
|
||||
x = torch.cat([x, padding], 1)
|
||||
x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
|
||||
x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar,
|
||||
patch_size=self.patch_size).movedim(2, 1)
|
||||
return self.process_out(x)
|
||||
|
||||
def decode(self, x, **kwargs):
|
||||
x = x.unsqueeze(0) if x.ndim == 4 else x # [T, C, H, W] -> [1, T, C, H, W]
|
||||
x = x.movedim(1, 2) if x.shape[1] != self.latent_channels else x # [B, T, C, H, W] or [B, C, T, H, W]
|
||||
x = self.process_in(x).movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
|
||||
x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
|
||||
if self.patch_size > 1:
|
||||
x = F.pixel_shuffle(x, self.patch_size)
|
||||
x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar,
|
||||
output_device=comfy.model_management.intermediate_device(),
|
||||
patch_size=self.patch_size, decode=True)
|
||||
return x[:, self.frames_to_trim:].movedim(2, 1)
|
||||
|
||||
@ -17,32 +17,79 @@ class Clamp(nn.Module):
|
||||
return torch.tanh(x / 3) * 3
|
||||
|
||||
class Block(nn.Module):
|
||||
def __init__(self, n_in, n_out):
|
||||
def __init__(self, n_in: int, n_out: int, use_midblock_gn: bool = False):
|
||||
super().__init__()
|
||||
self.conv = nn.Sequential(conv(n_in, n_out), nn.ReLU(), conv(n_out, n_out), nn.ReLU(), conv(n_out, n_out))
|
||||
self.skip = comfy.ops.disable_weight_init.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
|
||||
self.fuse = nn.ReLU()
|
||||
def forward(self, x):
|
||||
if not use_midblock_gn:
|
||||
self.pool = None
|
||||
return
|
||||
n_gn = n_in * 4
|
||||
self.pool = nn.Sequential(
|
||||
comfy.ops.disable_weight_init.Conv2d(n_in, n_gn, 1, bias=False),
|
||||
comfy.ops.disable_weight_init.GroupNorm(4, n_gn),
|
||||
nn.ReLU(inplace=True),
|
||||
comfy.ops.disable_weight_init.Conv2d(n_gn, n_in, 1, bias=False),
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
if self.pool is not None:
|
||||
x = x + self.pool(x)
|
||||
return self.fuse(self.conv(x) + self.skip(x))
|
||||
|
||||
def Encoder(latent_channels=4):
|
||||
return nn.Sequential(
|
||||
conv(3, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, latent_channels),
|
||||
)
|
||||
class Encoder(nn.Sequential):
|
||||
def __init__(self, latent_channels: int = 4, use_gn: bool = False):
|
||||
super().__init__(
|
||||
conv(3, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64, use_gn), Block(64, 64, use_gn), Block(64, 64, use_gn),
|
||||
conv(64, latent_channels),
|
||||
)
|
||||
|
||||
class Decoder(nn.Sequential):
|
||||
def __init__(self, latent_channels: int = 4, use_gn: bool = False):
|
||||
super().__init__(
|
||||
Clamp(), conv(latent_channels, 64), nn.ReLU(),
|
||||
Block(64, 64, use_gn), Block(64, 64, use_gn), Block(64, 64, use_gn), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), conv(64, 3),
|
||||
)
|
||||
|
||||
class DecoderFlux2(Decoder):
|
||||
def __init__(self, latent_channels: int = 128, use_gn: bool = True):
|
||||
if latent_channels != 128 or not use_gn:
|
||||
raise ValueError("Unexpected parameters for Flux2 TAE module")
|
||||
super().__init__(latent_channels=32, use_gn=True)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
B, C, H, W = x.shape
|
||||
x = (
|
||||
x
|
||||
.reshape(B, 32, 2, 2, H, W)
|
||||
.permute(0, 1, 4, 2, 5, 3)
|
||||
.reshape(B, 32, H * 2, W * 2)
|
||||
)
|
||||
return super().forward(x)
|
||||
|
||||
class EncoderFlux2(Encoder):
|
||||
def __init__(self, latent_channels: int = 128, use_gn: bool = True):
|
||||
if latent_channels != 128 or not use_gn:
|
||||
raise ValueError("Unexpected parameters for Flux2 TAE module")
|
||||
super().__init__(latent_channels=32, use_gn=True)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
result = super().forward(x)
|
||||
B, C, H, W = result.shape
|
||||
return (
|
||||
result
|
||||
.reshape(B, C, H // 2, 2, W // 2, 2)
|
||||
.permute(0, 1, 3, 5, 2, 4)
|
||||
.reshape(B, 128, H // 2, W // 2)
|
||||
)
|
||||
|
||||
def Decoder(latent_channels=4):
|
||||
return nn.Sequential(
|
||||
Clamp(), conv(latent_channels, 64), nn.ReLU(),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), conv(64, 3),
|
||||
)
|
||||
|
||||
class TAESD(nn.Module):
|
||||
latent_magnitude = 3
|
||||
@ -51,8 +98,15 @@ class TAESD(nn.Module):
|
||||
def __init__(self, encoder_path=None, decoder_path=None, latent_channels=4):
|
||||
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
|
||||
super().__init__()
|
||||
self.taesd_encoder = Encoder(latent_channels=latent_channels)
|
||||
self.taesd_decoder = Decoder(latent_channels=latent_channels)
|
||||
if latent_channels == 128:
|
||||
encoder_class = EncoderFlux2
|
||||
decoder_class = DecoderFlux2
|
||||
else:
|
||||
encoder_class = Encoder
|
||||
decoder_class = Decoder
|
||||
self.taesd_encoder = encoder_class(latent_channels=latent_channels)
|
||||
self.taesd_decoder = decoder_class(latent_channels=latent_channels)
|
||||
|
||||
self.vae_scale = torch.nn.Parameter(torch.tensor(1.0))
|
||||
self.vae_shift = torch.nn.Parameter(torch.tensor(0.0))
|
||||
if encoder_path is not None:
|
||||
@ -61,19 +115,19 @@ class TAESD(nn.Module):
|
||||
self.taesd_decoder.load_state_dict(comfy.utils.load_torch_file(decoder_path, safe_load=True))
|
||||
|
||||
@staticmethod
|
||||
def scale_latents(x):
|
||||
def scale_latents(x: torch.Tensor) -> torch.Tensor:
|
||||
"""raw latents -> [0, 1]"""
|
||||
return x.div(2 * TAESD.latent_magnitude).add(TAESD.latent_shift).clamp(0, 1)
|
||||
|
||||
@staticmethod
|
||||
def unscale_latents(x):
|
||||
def unscale_latents(x: torch.Tensor) -> torch.Tensor:
|
||||
"""[0, 1] -> raw latents"""
|
||||
return x.sub(TAESD.latent_shift).mul(2 * TAESD.latent_magnitude)
|
||||
|
||||
def decode(self, x):
|
||||
def decode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x_sample = self.taesd_decoder((x - self.vae_shift) * self.vae_scale)
|
||||
x_sample = x_sample.sub(0.5).mul(2)
|
||||
return x_sample
|
||||
|
||||
def encode(self, x):
|
||||
def encode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return (self.taesd_encoder(x * 0.5 + 0.5) / self.vae_scale) + self.vae_shift
|
||||
|
||||
6
comfy/text_encoders/cogvideo.py
Normal file
6
comfy/text_encoders/cogvideo.py
Normal file
@ -0,0 +1,6 @@
|
||||
import comfy.text_encoders.sd3_clip
|
||||
|
||||
|
||||
class CogVideoXT5Tokenizer(comfy.text_encoders.sd3_clip.T5XXLTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, min_length=226)
|
||||
@ -251,6 +251,7 @@ class VideoFromFile(VideoInput):
|
||||
container.seek(start_pts, stream=video_stream)
|
||||
|
||||
image_format = 'gbrpf32le'
|
||||
process_image_format = lambda a: a
|
||||
audio = None
|
||||
|
||||
streams = [video_stream]
|
||||
@ -283,11 +284,25 @@ class VideoFromFile(VideoInput):
|
||||
break
|
||||
|
||||
if not checked_alpha:
|
||||
alpha_channel = False
|
||||
for comp in frame.format.components:
|
||||
if comp.is_alpha or frame.format.name == "pal8":
|
||||
alphas = []
|
||||
image_format = 'gbrapf32le'
|
||||
alpha_channel = True
|
||||
break
|
||||
if frame.format.name in ("yuvj420p", "yuvj422p", "yuvj444p", "rgb24", "rgba", "pal8"):
|
||||
process_image_format = lambda a: a.float() / 255.0
|
||||
if alpha_channel:
|
||||
image_format = 'rgba'
|
||||
else:
|
||||
image_format = 'rgb24'
|
||||
else:
|
||||
process_image_format = lambda a: a
|
||||
if alpha_channel:
|
||||
image_format = 'gbrapf32le'
|
||||
else:
|
||||
image_format = 'gbrpf32le'
|
||||
|
||||
checked_alpha = True
|
||||
|
||||
img = frame.to_ndarray(format=image_format) # shape: (H, W, 4)
|
||||
@ -323,9 +338,9 @@ class VideoFromFile(VideoInput):
|
||||
else:
|
||||
audio_frames.append(frame.to_ndarray())
|
||||
|
||||
images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 0, 0, 3)
|
||||
images = process_image_format(torch.stack(frames)) if len(frames) > 0 else torch.zeros(0, 0, 0, 3)
|
||||
if alphas is not None:
|
||||
alphas = torch.stack(alphas) if len(alphas) > 0 else torch.zeros(0, 0, 0, 1)
|
||||
alphas = process_image_format(torch.stack(alphas)) if len(alphas) > 0 else torch.zeros(0, 0, 0, 1)
|
||||
|
||||
# Get frame rate
|
||||
frame_rate = Fraction(video_stream.average_rate) if video_stream.average_rate else Fraction(1)
|
||||
|
||||
@ -157,6 +157,11 @@ class SeedanceCreateAssetResponse(BaseModel):
|
||||
asset_id: str = Field(...)
|
||||
|
||||
|
||||
class SeedanceVirtualLibraryCreateAssetRequest(BaseModel):
|
||||
url: str = Field(..., description="Publicly accessible URL of the image asset to upload.")
|
||||
hash: str = Field(..., description="Dedup key. Re-submitting the same hash returns the existing asset id.")
|
||||
|
||||
|
||||
# Dollars per 1K tokens, keyed by (model_id, has_video_input).
|
||||
SEEDANCE2_PRICE_PER_1K_TOKENS = {
|
||||
("dreamina-seedance-2-0-260128", False): 0.007,
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
@ -20,6 +21,7 @@ from comfy_api_nodes.apis.bytedance import (
|
||||
SeedanceCreateAssetResponse,
|
||||
SeedanceCreateVisualValidateSessionResponse,
|
||||
SeedanceGetVisualValidateSessionResponse,
|
||||
SeedanceVirtualLibraryCreateAssetRequest,
|
||||
Seedream4Options,
|
||||
Seedream4TaskCreationRequest,
|
||||
TaskAudioContent,
|
||||
@ -271,6 +273,30 @@ async def _wait_for_asset_active(cls: type[IO.ComfyNode], asset_id: str, group_i
|
||||
)
|
||||
|
||||
|
||||
async def _seedance_virtual_library_upload_image_asset(
|
||||
cls: type[IO.ComfyNode],
|
||||
image: torch.Tensor,
|
||||
*,
|
||||
wait_label: str = "Uploading image",
|
||||
) -> str:
|
||||
"""Upload an image into the caller's per-customer Seedance virtual library."""
|
||||
public_url = await upload_image_to_comfyapi(cls, image, wait_label=wait_label)
|
||||
normalized = image.detach().cpu().contiguous().to(torch.float32)
|
||||
digest = hashlib.sha256()
|
||||
digest.update(str(tuple(normalized.shape)).encode("utf-8"))
|
||||
digest.update(b"\0")
|
||||
digest.update(normalized.numpy().tobytes())
|
||||
image_hash = digest.hexdigest()
|
||||
create_resp = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/seedance/virtual-library/assets", method="POST"),
|
||||
response_model=SeedanceCreateAssetResponse,
|
||||
data=SeedanceVirtualLibraryCreateAssetRequest(url=public_url, hash=image_hash),
|
||||
)
|
||||
await _wait_for_asset_active(cls, create_resp.asset_id, group_id="virtual-library")
|
||||
return f"asset://{create_resp.asset_id}"
|
||||
|
||||
|
||||
def _seedance2_price_extractor(model_id: str, has_video_input: bool):
|
||||
"""Returns a price_extractor closure for Seedance 2.0 poll_op."""
|
||||
rate = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input))
|
||||
@ -1507,7 +1533,9 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
|
||||
if first_frame_asset_id:
|
||||
first_frame_url = image_assets[first_frame_asset_id]
|
||||
else:
|
||||
first_frame_url = await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame.")
|
||||
first_frame_url = await _seedance_virtual_library_upload_image_asset(
|
||||
cls, first_frame, wait_label="Uploading first frame."
|
||||
)
|
||||
|
||||
content: list[TaskTextContent | TaskImageContent] = [
|
||||
TaskTextContent(text=model["prompt"]),
|
||||
@ -1527,7 +1555,9 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
|
||||
content.append(
|
||||
TaskImageContent(
|
||||
image_url=TaskImageContentUrl(
|
||||
url=await upload_image_to_comfyapi(cls, last_frame, wait_label="Uploading last frame.")
|
||||
url=await _seedance_virtual_library_upload_image_asset(
|
||||
cls, last_frame, wait_label="Uploading last frame."
|
||||
)
|
||||
),
|
||||
role="last_frame",
|
||||
),
|
||||
@ -1805,9 +1835,9 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
content.append(
|
||||
TaskImageContent(
|
||||
image_url=TaskImageContentUrl(
|
||||
url=await upload_image_to_comfyapi(
|
||||
url=await _seedance_virtual_library_upload_image_asset(
|
||||
cls,
|
||||
image=reference_images[key],
|
||||
reference_images[key],
|
||||
wait_label=f"Uploading image {i}",
|
||||
),
|
||||
),
|
||||
|
||||
@ -415,8 +415,9 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
"1152x2048",
|
||||
"3840x2160",
|
||||
"2160x3840",
|
||||
"Custom",
|
||||
],
|
||||
tooltip="Image size",
|
||||
tooltip="Image size. Select 'Custom' to use the custom width and height (GPT Image 2 only).",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
@ -445,6 +446,24 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
default="gpt-image-2",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_width",
|
||||
default=1024,
|
||||
min=1024,
|
||||
max=3840,
|
||||
step=16,
|
||||
tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16 (GPT Image 2 only).",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_height",
|
||||
default=1024,
|
||||
min=1024,
|
||||
max=3840,
|
||||
step=16,
|
||||
tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16 (GPT Image 2 only).",
|
||||
optional=True,
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
IO.Image.Output(),
|
||||
@ -471,9 +490,9 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
"high": [0.133, 0.22]
|
||||
},
|
||||
"gpt-image-2": {
|
||||
"low": [0.0048, 0.012],
|
||||
"medium": [0.041, 0.112],
|
||||
"high": [0.165, 0.43]
|
||||
"low": [0.0048, 0.019],
|
||||
"medium": [0.041, 0.168],
|
||||
"high": [0.165, 0.67]
|
||||
}
|
||||
};
|
||||
$range := $lookup($lookup($ranges, widgets.model), widgets.quality);
|
||||
@ -503,6 +522,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
mask: Input.Image | None = None,
|
||||
n: int = 1,
|
||||
size: str = "1024x1024",
|
||||
custom_width: int = 1024,
|
||||
custom_height: int = 1024,
|
||||
model: str = "gpt-image-1",
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
@ -510,7 +531,25 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
if mask is not None and image is None:
|
||||
raise ValueError("Cannot use a mask without an input image")
|
||||
|
||||
if model in ("gpt-image-1", "gpt-image-1.5"):
|
||||
if size == "Custom":
|
||||
if model != "gpt-image-2":
|
||||
raise ValueError("Custom resolution is only supported by GPT Image 2 model")
|
||||
if custom_width % 16 != 0 or custom_height % 16 != 0:
|
||||
raise ValueError(f"Custom width and height must be multiples of 16, got {custom_width}x{custom_height}")
|
||||
if max(custom_width, custom_height) > 3840:
|
||||
raise ValueError(f"Custom resolution max edge must be <= 3840, got {custom_width}x{custom_height}")
|
||||
ratio = max(custom_width, custom_height) / min(custom_width, custom_height)
|
||||
if ratio > 3:
|
||||
raise ValueError(
|
||||
f"Custom resolution aspect ratio must not exceed 3:1, got {custom_width}x{custom_height}"
|
||||
)
|
||||
total_pixels = custom_width * custom_height
|
||||
if not 655_360 <= total_pixels <= 8_294_400:
|
||||
raise ValueError(
|
||||
f"Custom resolution total pixels must be between 655,360 and 8,294,400, got {total_pixels}"
|
||||
)
|
||||
size = f"{custom_width}x{custom_height}"
|
||||
elif model in ("gpt-image-1", "gpt-image-1.5"):
|
||||
if size not in ("auto", "1024x1024", "1024x1536", "1536x1024"):
|
||||
raise ValueError(f"Resolution {size} is only supported by GPT Image 2 model")
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import asyncio
|
||||
import bisect
|
||||
import gc
|
||||
import itertools
|
||||
import psutil
|
||||
import time
|
||||
@ -17,6 +18,7 @@ NODE_CLASS_CONTAINS_UNIQUE_ID: Dict[str, bool] = {}
|
||||
|
||||
|
||||
def include_unique_id_in_input(class_type: str) -> bool:
|
||||
"""Return whether a node class includes UNIQUE_ID among its hidden inputs."""
|
||||
if class_type in NODE_CLASS_CONTAINS_UNIQUE_ID:
|
||||
return NODE_CLASS_CONTAINS_UNIQUE_ID[class_type]
|
||||
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
|
||||
@ -24,52 +26,412 @@ def include_unique_id_in_input(class_type: str) -> bool:
|
||||
return NODE_CLASS_CONTAINS_UNIQUE_ID[class_type]
|
||||
|
||||
class CacheKeySet(ABC):
|
||||
"""Base helper for building and storing cache keys for prompt nodes."""
|
||||
def __init__(self, dynprompt, node_ids, is_changed_cache):
|
||||
"""Initialize cache-key storage for a dynamic prompt execution pass."""
|
||||
self.keys = {}
|
||||
self.subcache_keys = {}
|
||||
|
||||
@abstractmethod
|
||||
async def add_keys(self, node_ids):
|
||||
"""Populate cache keys for the provided node ids."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def all_node_ids(self):
|
||||
"""Return the set of node ids currently tracked by this key set."""
|
||||
return set(self.keys.keys())
|
||||
|
||||
def get_used_keys(self):
|
||||
"""Return the computed cache keys currently in use."""
|
||||
return self.keys.values()
|
||||
|
||||
def get_used_subcache_keys(self):
|
||||
"""Return the computed subcache keys currently in use."""
|
||||
return self.subcache_keys.values()
|
||||
|
||||
def get_data_key(self, node_id):
|
||||
"""Return the cache key for a node, if present."""
|
||||
return self.keys.get(node_id, None)
|
||||
|
||||
def get_subcache_key(self, node_id):
|
||||
"""Return the subcache key for a node, if present."""
|
||||
return self.subcache_keys.get(node_id, None)
|
||||
|
||||
class Unhashable:
|
||||
def __init__(self):
|
||||
self.value = float("NaN")
|
||||
"""Hashable identity sentinel for values that cannot be represented safely in cache keys."""
|
||||
pass
|
||||
|
||||
def to_hashable(obj):
|
||||
# So that we don't infinitely recurse since frozenset and tuples
|
||||
# are Sequences.
|
||||
if isinstance(obj, (int, float, str, bool, bytes, type(None))):
|
||||
return obj
|
||||
elif isinstance(obj, Mapping):
|
||||
return frozenset([(to_hashable(k), to_hashable(v)) for k, v in sorted(obj.items())])
|
||||
elif isinstance(obj, Sequence):
|
||||
return frozenset(zip(itertools.count(), [to_hashable(i) for i in obj]))
|
||||
else:
|
||||
# TODO - Support other objects like tensors?
|
||||
|
||||
_PRIMITIVE_SIGNATURE_TYPES = (int, float, str, bool, bytes, type(None))
|
||||
_CONTAINER_SIGNATURE_TYPES = (dict, list, tuple, set, frozenset)
|
||||
_MAX_SIGNATURE_DEPTH = 32
|
||||
_MAX_SIGNATURE_CONTAINER_VISITS = 10_000
|
||||
_FAILED_SIGNATURE = object()
|
||||
|
||||
|
||||
def _shallow_is_changed_signature(value):
|
||||
"""Reduce execution-time `is_changed` values through a fail-closed builtin canonicalizer."""
|
||||
value_type = type(value)
|
||||
if value_type in _PRIMITIVE_SIGNATURE_TYPES:
|
||||
return value
|
||||
|
||||
if value_type not in _CONTAINER_SIGNATURE_TYPES:
|
||||
return Unhashable()
|
||||
|
||||
canonical = _signature_to_hashable(value, max_nodes=64)
|
||||
if type(canonical) is Unhashable:
|
||||
return canonical
|
||||
if value_type is list or value_type is tuple:
|
||||
container_tag = "is_changed_list" if value_type is list else "is_changed_tuple"
|
||||
return (container_tag, canonical[1])
|
||||
|
||||
return canonical
|
||||
|
||||
|
||||
def _primitive_signature_sort_key(obj):
|
||||
"""Return a deterministic ordering key for primitive signature values."""
|
||||
obj_type = type(obj)
|
||||
return ("primitive", obj_type.__module__, obj_type.__qualname__, repr(obj))
|
||||
|
||||
|
||||
def _sanitized_sort_key(obj, depth=0, max_depth=_MAX_SIGNATURE_DEPTH, active=None, memo=None):
|
||||
"""Return a deterministic ordering key for sanitized built-in container content."""
|
||||
if depth >= max_depth:
|
||||
return ("MAX_DEPTH",)
|
||||
|
||||
if active is None:
|
||||
active = set()
|
||||
if memo is None:
|
||||
memo = {}
|
||||
|
||||
obj_type = type(obj)
|
||||
if obj_type is Unhashable:
|
||||
return ("UNHASHABLE",)
|
||||
elif obj_type in _PRIMITIVE_SIGNATURE_TYPES:
|
||||
return (obj_type.__module__, obj_type.__qualname__, repr(obj))
|
||||
elif obj_type not in _CONTAINER_SIGNATURE_TYPES:
|
||||
return (obj_type.__module__, obj_type.__qualname__, "OPAQUE")
|
||||
|
||||
obj_id = id(obj)
|
||||
if obj_id in memo:
|
||||
return memo[obj_id]
|
||||
if obj_id in active:
|
||||
return ("CYCLE",)
|
||||
|
||||
active.add(obj_id)
|
||||
try:
|
||||
if obj_type is dict:
|
||||
items = [
|
||||
(
|
||||
_sanitized_sort_key(k, depth + 1, max_depth, active, memo),
|
||||
_sanitized_sort_key(v, depth + 1, max_depth, active, memo),
|
||||
)
|
||||
for k, v in obj.items()
|
||||
]
|
||||
items.sort()
|
||||
result = ("dict", tuple(items))
|
||||
elif obj_type is list:
|
||||
result = ("list", tuple(_sanitized_sort_key(i, depth + 1, max_depth, active, memo) for i in obj))
|
||||
elif obj_type is tuple:
|
||||
result = ("tuple", tuple(_sanitized_sort_key(i, depth + 1, max_depth, active, memo) for i in obj))
|
||||
elif obj_type is set:
|
||||
result = ("set", tuple(sorted(_sanitized_sort_key(i, depth + 1, max_depth, active, memo) for i in obj)))
|
||||
else:
|
||||
result = ("frozenset", tuple(sorted(_sanitized_sort_key(i, depth + 1, max_depth, active, memo) for i in obj)))
|
||||
finally:
|
||||
active.discard(obj_id)
|
||||
|
||||
memo[obj_id] = result
|
||||
return result
|
||||
|
||||
|
||||
def _signature_to_hashable_impl(obj, depth=0, max_depth=_MAX_SIGNATURE_DEPTH, active=None, memo=None, budget=None):
|
||||
"""Canonicalize signature inputs directly into their final hashable form."""
|
||||
if depth >= max_depth:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
if active is None:
|
||||
active = set()
|
||||
if memo is None:
|
||||
memo = {}
|
||||
if budget is None:
|
||||
budget = {"remaining": _MAX_SIGNATURE_CONTAINER_VISITS}
|
||||
|
||||
obj_type = type(obj)
|
||||
if obj_type in _PRIMITIVE_SIGNATURE_TYPES:
|
||||
return obj, _primitive_signature_sort_key(obj)
|
||||
if obj_type is Unhashable or obj_type not in _CONTAINER_SIGNATURE_TYPES:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
obj_id = id(obj)
|
||||
if obj_id in memo:
|
||||
return memo[obj_id]
|
||||
if obj_id in active:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
budget["remaining"] -= 1
|
||||
if budget["remaining"] < 0:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
active.add(obj_id)
|
||||
try:
|
||||
if obj_type is dict:
|
||||
try:
|
||||
items = list(obj.items())
|
||||
except RuntimeError:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
ordered_items = []
|
||||
for key, value in items:
|
||||
if type(key) not in _PRIMITIVE_SIGNATURE_TYPES:
|
||||
return _FAILED_SIGNATURE
|
||||
key_result = (key, _primitive_signature_sort_key(key))
|
||||
value_result = _signature_to_hashable_impl(value, depth + 1, max_depth, active, memo, budget)
|
||||
if value_result is _FAILED_SIGNATURE:
|
||||
return _FAILED_SIGNATURE
|
||||
key_value, key_sort = key_result
|
||||
value_value, value_sort = value_result
|
||||
ordered_items.append((key_sort, value_sort, key_value, value_value))
|
||||
|
||||
ordered_items.sort(key=lambda item: (item[0], item[1]))
|
||||
for index in range(1, len(ordered_items)):
|
||||
previous_key_sort = ordered_items[index - 1][0]
|
||||
current_key_sort = ordered_items[index][0]
|
||||
if previous_key_sort == current_key_sort:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
value = ("dict", tuple((key_value, value_value) for _, _, key_value, value_value in ordered_items))
|
||||
sort_key = ("dict", tuple((key_sort, value_sort) for key_sort, value_sort, _, _ in ordered_items))
|
||||
elif obj_type is list or obj_type is tuple:
|
||||
try:
|
||||
items = list(obj)
|
||||
except RuntimeError:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
child_results = []
|
||||
for item in items:
|
||||
child_result = _signature_to_hashable_impl(item, depth + 1, max_depth, active, memo, budget)
|
||||
if child_result is _FAILED_SIGNATURE:
|
||||
return _FAILED_SIGNATURE
|
||||
child_results.append(child_result)
|
||||
|
||||
container_tag = "list" if obj_type is list else "tuple"
|
||||
value = (container_tag, tuple(child for child, _ in child_results))
|
||||
sort_key = (container_tag, tuple(child_sort for _, child_sort in child_results))
|
||||
else:
|
||||
try:
|
||||
items = list(obj)
|
||||
except RuntimeError:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
ordered_items = []
|
||||
for item in items:
|
||||
child_result = _signature_to_hashable_impl(item, depth + 1, max_depth, active, memo, budget)
|
||||
if child_result is _FAILED_SIGNATURE:
|
||||
return _FAILED_SIGNATURE
|
||||
child_value, child_sort = child_result
|
||||
ordered_items.append((child_sort, child_value))
|
||||
|
||||
ordered_items.sort(key=lambda item: item[0])
|
||||
for index in range(1, len(ordered_items)):
|
||||
previous_sort_key, previous_value = ordered_items[index - 1]
|
||||
current_sort_key, current_value = ordered_items[index]
|
||||
if previous_sort_key == current_sort_key and previous_value != current_value:
|
||||
return _FAILED_SIGNATURE
|
||||
|
||||
container_tag = "set" if obj_type is set else "frozenset"
|
||||
value = (container_tag, tuple(child_value for _, child_value in ordered_items))
|
||||
sort_key = (container_tag, tuple(child_sort for child_sort, _ in ordered_items))
|
||||
finally:
|
||||
active.discard(obj_id)
|
||||
|
||||
memo[obj_id] = (value, sort_key)
|
||||
return memo[obj_id]
|
||||
|
||||
|
||||
def _signature_to_hashable(obj, max_nodes=_MAX_SIGNATURE_CONTAINER_VISITS):
|
||||
"""Build the final cache-signature representation in one fail-closed pass."""
|
||||
try:
|
||||
result = _signature_to_hashable_impl(obj, budget={"remaining": max_nodes})
|
||||
except RuntimeError:
|
||||
return Unhashable()
|
||||
if result is _FAILED_SIGNATURE:
|
||||
return Unhashable()
|
||||
return result[0]
|
||||
|
||||
|
||||
def to_hashable(obj, max_nodes=_MAX_SIGNATURE_CONTAINER_VISITS):
|
||||
"""Convert sanitized prompt inputs into a stable hashable representation.
|
||||
|
||||
The input is expected to already be sanitized to plain built-in containers,
|
||||
but this function still fails safe for anything unexpected. Traversal is
|
||||
iterative and memoized so shared built-in substructures do not trigger
|
||||
exponential re-walks during cache-key construction.
|
||||
"""
|
||||
obj_type = type(obj)
|
||||
if obj_type in _PRIMITIVE_SIGNATURE_TYPES or obj_type is Unhashable:
|
||||
return obj
|
||||
if obj_type not in _CONTAINER_SIGNATURE_TYPES:
|
||||
return Unhashable()
|
||||
|
||||
memo = {}
|
||||
active = set()
|
||||
snapshots = {}
|
||||
sort_memo = {}
|
||||
processed = 0
|
||||
# Keep traversal state separate from container snapshots/results.
|
||||
work_stack = [(obj, False)]
|
||||
|
||||
def resolve_value(value):
|
||||
"""Resolve a child value from the completed memo table when available."""
|
||||
value_type = type(value)
|
||||
if value_type in _PRIMITIVE_SIGNATURE_TYPES or value_type is Unhashable:
|
||||
return value
|
||||
return memo.get(id(value), Unhashable())
|
||||
|
||||
def is_failed(value):
|
||||
"""Return whether a resolved child value represents failed canonicalization."""
|
||||
return type(value) is Unhashable
|
||||
|
||||
def resolve_unordered_values(current_items, container_tag):
|
||||
"""Resolve a set-like container or fail closed if ordering is ambiguous."""
|
||||
try:
|
||||
ordered_items = [
|
||||
(_sanitized_sort_key(item, memo=sort_memo), resolve_value(item))
|
||||
for item in current_items
|
||||
]
|
||||
if any(is_failed(value) for _, value in ordered_items):
|
||||
return Unhashable()
|
||||
ordered_items.sort(key=lambda item: item[0])
|
||||
except RuntimeError:
|
||||
return Unhashable()
|
||||
|
||||
for index in range(1, len(ordered_items)):
|
||||
previous_key, previous_value = ordered_items[index - 1]
|
||||
current_key, current_value = ordered_items[index]
|
||||
if previous_key == current_key and previous_value != current_value:
|
||||
return Unhashable()
|
||||
|
||||
return (container_tag, tuple(value for _, value in ordered_items))
|
||||
|
||||
while work_stack:
|
||||
entry = work_stack.pop()
|
||||
if len(entry) == 3:
|
||||
_, current_id, current_type = entry
|
||||
current = None
|
||||
expanded = True
|
||||
else:
|
||||
current, expanded = entry
|
||||
current_type = type(current)
|
||||
current_id = id(current)
|
||||
|
||||
if not expanded and (current_type in _PRIMITIVE_SIGNATURE_TYPES or current_type is Unhashable):
|
||||
continue
|
||||
if not expanded and current_type not in _CONTAINER_SIGNATURE_TYPES:
|
||||
memo[current_id] = Unhashable()
|
||||
continue
|
||||
|
||||
if current_id in memo:
|
||||
continue
|
||||
|
||||
if expanded:
|
||||
active.discard(current_id)
|
||||
try:
|
||||
items = snapshots.pop(current_id, None)
|
||||
if items is None:
|
||||
memo[current_id] = Unhashable()
|
||||
continue
|
||||
|
||||
if current_type is dict:
|
||||
ordered_items = [
|
||||
(_sanitized_sort_key(k, memo=sort_memo), k, resolve_value(v))
|
||||
for k, v in items
|
||||
]
|
||||
if any(type(key) not in _PRIMITIVE_SIGNATURE_TYPES or is_failed(value) for _, key, value in ordered_items):
|
||||
memo[current_id] = Unhashable()
|
||||
continue
|
||||
ordered_items.sort(key=lambda item: item[0])
|
||||
for index in range(1, len(ordered_items)):
|
||||
if ordered_items[index - 1][0] == ordered_items[index][0]:
|
||||
memo[current_id] = Unhashable()
|
||||
break
|
||||
else:
|
||||
memo[current_id] = (
|
||||
"dict",
|
||||
tuple((key, value) for _, key, value in ordered_items),
|
||||
)
|
||||
elif current_type is list:
|
||||
resolved_items = tuple(resolve_value(item) for item in items)
|
||||
if any(is_failed(item) for item in resolved_items):
|
||||
memo[current_id] = Unhashable()
|
||||
else:
|
||||
memo[current_id] = ("list", resolved_items)
|
||||
elif current_type is tuple:
|
||||
resolved_items = tuple(resolve_value(item) for item in items)
|
||||
if any(is_failed(item) for item in resolved_items):
|
||||
memo[current_id] = Unhashable()
|
||||
else:
|
||||
memo[current_id] = ("tuple", resolved_items)
|
||||
elif current_type is set:
|
||||
memo[current_id] = resolve_unordered_values(items, "set")
|
||||
else:
|
||||
memo[current_id] = resolve_unordered_values(items, "frozenset")
|
||||
except RuntimeError:
|
||||
memo[current_id] = Unhashable()
|
||||
continue
|
||||
|
||||
if current_id in active:
|
||||
memo[current_id] = Unhashable()
|
||||
continue
|
||||
|
||||
processed += 1
|
||||
if processed > max_nodes:
|
||||
return Unhashable()
|
||||
|
||||
active.add(current_id)
|
||||
if current_type is dict:
|
||||
try:
|
||||
items = list(current.items())
|
||||
snapshots[current_id] = items
|
||||
except RuntimeError:
|
||||
memo[current_id] = Unhashable()
|
||||
active.discard(current_id)
|
||||
continue
|
||||
for key, value in items:
|
||||
if type(key) not in _PRIMITIVE_SIGNATURE_TYPES:
|
||||
snapshots.pop(current_id, None)
|
||||
memo[current_id] = Unhashable()
|
||||
active.discard(current_id)
|
||||
break
|
||||
else:
|
||||
work_stack.append(("EXPANDED", current_id, current_type))
|
||||
for _, value in reversed(items):
|
||||
work_stack.append((value, False))
|
||||
continue
|
||||
continue
|
||||
else:
|
||||
try:
|
||||
items = list(current)
|
||||
snapshots[current_id] = items
|
||||
except RuntimeError:
|
||||
memo[current_id] = Unhashable()
|
||||
active.discard(current_id)
|
||||
continue
|
||||
work_stack.append(("EXPANDED", current_id, current_type))
|
||||
for item in reversed(items):
|
||||
work_stack.append((item, False))
|
||||
|
||||
return memo.get(id(obj), Unhashable())
|
||||
|
||||
class CacheKeySetID(CacheKeySet):
|
||||
"""Cache-key strategy that keys nodes by node id and class type."""
|
||||
def __init__(self, dynprompt, node_ids, is_changed_cache):
|
||||
"""Initialize identity-based cache keys for the supplied dynamic prompt."""
|
||||
super().__init__(dynprompt, node_ids, is_changed_cache)
|
||||
self.dynprompt = dynprompt
|
||||
|
||||
async def add_keys(self, node_ids):
|
||||
"""Populate identity-based keys for nodes that exist in the dynamic prompt."""
|
||||
for node_id in node_ids:
|
||||
if node_id in self.keys:
|
||||
continue
|
||||
@ -80,15 +442,19 @@ class CacheKeySetID(CacheKeySet):
|
||||
self.subcache_keys[node_id] = (node_id, node["class_type"])
|
||||
|
||||
class CacheKeySetInputSignature(CacheKeySet):
|
||||
"""Cache-key strategy that hashes a node's immediate inputs plus ancestor references."""
|
||||
def __init__(self, dynprompt, node_ids, is_changed_cache):
|
||||
"""Initialize input-signature-based cache keys for the supplied dynamic prompt."""
|
||||
super().__init__(dynprompt, node_ids, is_changed_cache)
|
||||
self.dynprompt = dynprompt
|
||||
self.is_changed_cache = is_changed_cache
|
||||
|
||||
def include_node_id_in_input(self) -> bool:
|
||||
"""Return whether node ids should be included in computed input signatures."""
|
||||
return False
|
||||
|
||||
async def add_keys(self, node_ids):
|
||||
"""Populate input-signature-based keys for nodes in the dynamic prompt."""
|
||||
for node_id in node_ids:
|
||||
if node_id in self.keys:
|
||||
continue
|
||||
@ -99,21 +465,37 @@ class CacheKeySetInputSignature(CacheKeySet):
|
||||
self.subcache_keys[node_id] = (node_id, node["class_type"])
|
||||
|
||||
async def get_node_signature(self, dynprompt, node_id):
|
||||
"""Build the full cache signature for a node and its ordered ancestors."""
|
||||
signature = []
|
||||
ancestors, order_mapping = self.get_ordered_ancestry(dynprompt, node_id)
|
||||
signature.append(await self.get_immediate_node_signature(dynprompt, node_id, order_mapping))
|
||||
immediate = await self.get_immediate_node_signature(dynprompt, node_id, order_mapping)
|
||||
if type(immediate) is Unhashable:
|
||||
return immediate
|
||||
signature.append(immediate)
|
||||
for ancestor_id in ancestors:
|
||||
signature.append(await self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping))
|
||||
return to_hashable(signature)
|
||||
immediate = await self.get_immediate_node_signature(dynprompt, ancestor_id, order_mapping)
|
||||
if type(immediate) is Unhashable:
|
||||
return immediate
|
||||
signature.append(immediate)
|
||||
return tuple(signature)
|
||||
|
||||
async def get_immediate_node_signature(self, dynprompt, node_id, ancestor_order_mapping):
|
||||
"""Build the immediate cache-signature fragment for a node.
|
||||
|
||||
Link inputs are reduced to ancestor references here. Non-link values
|
||||
are canonicalized or failed closed before being appended so the final
|
||||
node signature is assembled from already-hashable fragments.
|
||||
"""
|
||||
if not dynprompt.has_node(node_id):
|
||||
# This node doesn't exist -- we can't cache it.
|
||||
return [float("NaN")]
|
||||
return Unhashable()
|
||||
node = dynprompt.get_node(node_id)
|
||||
class_type = node["class_type"]
|
||||
class_def = nodes.NODE_CLASS_MAPPINGS[class_type]
|
||||
signature = [class_type, await self.is_changed_cache.get(node_id)]
|
||||
is_changed_signature = _shallow_is_changed_signature(await self.is_changed_cache.get(node_id))
|
||||
if type(is_changed_signature) is Unhashable:
|
||||
return is_changed_signature
|
||||
signature = [class_type, is_changed_signature]
|
||||
if self.include_node_id_in_input() or (hasattr(class_def, "NOT_IDEMPOTENT") and class_def.NOT_IDEMPOTENT) or include_unique_id_in_input(class_type):
|
||||
signature.append(node_id)
|
||||
inputs = node["inputs"]
|
||||
@ -123,18 +505,23 @@ class CacheKeySetInputSignature(CacheKeySet):
|
||||
ancestor_index = ancestor_order_mapping[ancestor_id]
|
||||
signature.append((key,("ANCESTOR", ancestor_index, ancestor_socket)))
|
||||
else:
|
||||
signature.append((key, inputs[key]))
|
||||
return signature
|
||||
value_signature = to_hashable(inputs[key])
|
||||
if type(value_signature) is Unhashable:
|
||||
return value_signature
|
||||
signature.append((key, value_signature))
|
||||
return tuple(signature)
|
||||
|
||||
# This function returns a list of all ancestors of the given node. The order of the list is
|
||||
# deterministic based on which specific inputs the ancestor is connected by.
|
||||
def get_ordered_ancestry(self, dynprompt, node_id):
|
||||
"""Return ancestors in deterministic traversal order and their index mapping."""
|
||||
ancestors = []
|
||||
order_mapping = {}
|
||||
self.get_ordered_ancestry_internal(dynprompt, node_id, ancestors, order_mapping)
|
||||
return ancestors, order_mapping
|
||||
|
||||
def get_ordered_ancestry_internal(self, dynprompt, node_id, ancestors, order_mapping):
|
||||
"""Recursively collect ancestors in input order without revisiting prior nodes."""
|
||||
if not dynprompt.has_node(node_id):
|
||||
return
|
||||
inputs = dynprompt.get_node(node_id)["inputs"]
|
||||
|
||||
@ -1,11 +1,17 @@
|
||||
def is_link(obj):
|
||||
if not isinstance(obj, list):
|
||||
"""Return whether obj is a plain prompt link of the form [node_id, output_index]."""
|
||||
# Prompt links produced by the frontend / GraphBuilder are plain Python
|
||||
# lists in the form [node_id, output_index]. Some custom-node paths can
|
||||
# inject foreign runtime objects into prompt inputs during on-prompt graph
|
||||
# rewriting or subgraph construction. Be strict here so cache signature
|
||||
# building never tries to treat list-like proxy objects as links.
|
||||
if type(obj) is not list:
|
||||
return False
|
||||
if len(obj) != 2:
|
||||
return False
|
||||
if not isinstance(obj[0], str):
|
||||
if type(obj[0]) is not str:
|
||||
return False
|
||||
if not isinstance(obj[1], int) and not isinstance(obj[1], float):
|
||||
if type(obj[1]) is not int:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
55
nodes.py
55
nodes.py
@ -728,50 +728,26 @@ class LoraLoaderModelOnly(LoraLoader):
|
||||
|
||||
class VAELoader:
|
||||
video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5", "taeltx_2"]
|
||||
image_taes = ["taesd", "taesdxl", "taesd3", "taef1"]
|
||||
image_taes = ["taesd", "taesdxl", "taesd3", "taef1", "taef2"]
|
||||
|
||||
@staticmethod
|
||||
def vae_list(s):
|
||||
vaes = folder_paths.get_filename_list("vae")
|
||||
approx_vaes = folder_paths.get_filename_list("vae_approx")
|
||||
sdxl_taesd_enc = False
|
||||
sdxl_taesd_dec = False
|
||||
sd1_taesd_enc = False
|
||||
sd1_taesd_dec = False
|
||||
sd3_taesd_enc = False
|
||||
sd3_taesd_dec = False
|
||||
f1_taesd_enc = False
|
||||
f1_taesd_dec = False
|
||||
|
||||
have_img_encoder, have_img_decoder = set(), set()
|
||||
for v in approx_vaes:
|
||||
if v.startswith("taesd_decoder."):
|
||||
sd1_taesd_dec = True
|
||||
elif v.startswith("taesd_encoder."):
|
||||
sd1_taesd_enc = True
|
||||
elif v.startswith("taesdxl_decoder."):
|
||||
sdxl_taesd_dec = True
|
||||
elif v.startswith("taesdxl_encoder."):
|
||||
sdxl_taesd_enc = True
|
||||
elif v.startswith("taesd3_decoder."):
|
||||
sd3_taesd_dec = True
|
||||
elif v.startswith("taesd3_encoder."):
|
||||
sd3_taesd_enc = True
|
||||
elif v.startswith("taef1_encoder."):
|
||||
f1_taesd_dec = True
|
||||
elif v.startswith("taef1_decoder."):
|
||||
f1_taesd_enc = True
|
||||
else:
|
||||
parts = v.split("_", 1)
|
||||
if len(parts) != 2 or parts[0] not in s.image_taes:
|
||||
for tae in s.video_taes:
|
||||
if v.startswith(tae):
|
||||
vaes.append(v)
|
||||
|
||||
if sd1_taesd_dec and sd1_taesd_enc:
|
||||
vaes.append("taesd")
|
||||
if sdxl_taesd_dec and sdxl_taesd_enc:
|
||||
vaes.append("taesdxl")
|
||||
if sd3_taesd_dec and sd3_taesd_enc:
|
||||
vaes.append("taesd3")
|
||||
if f1_taesd_dec and f1_taesd_enc:
|
||||
vaes.append("taef1")
|
||||
break
|
||||
continue
|
||||
if parts[1].startswith("encoder."):
|
||||
have_img_encoder.add(parts[0])
|
||||
elif parts[1].startswith("decoder."):
|
||||
have_img_decoder.add(parts[0])
|
||||
vaes += [k for k in have_img_decoder if k in have_img_encoder]
|
||||
vaes.append("pixel_space")
|
||||
return vaes
|
||||
|
||||
@ -827,6 +803,11 @@ class VAELoader:
|
||||
else:
|
||||
vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
|
||||
sd, metadata = comfy.utils.load_torch_file(vae_path, return_metadata=True)
|
||||
if vae_name == "taef2":
|
||||
if metadata is None:
|
||||
metadata = {"tae_latent_channels": 128}
|
||||
else:
|
||||
metadata["tae_latent_channels"] = 128
|
||||
vae = comfy.sd.VAE(sd=sd, metadata=metadata)
|
||||
vae.throw_exception_if_invalid()
|
||||
return (vae,)
|
||||
@ -2463,7 +2444,7 @@ async def init_builtin_extra_nodes():
|
||||
"nodes_curve.py",
|
||||
"nodes_rtdetr.py",
|
||||
"nodes_frame_interpolation.py",
|
||||
"nodes_sam3.py"
|
||||
"nodes_sam3.py",
|
||||
]
|
||||
|
||||
import_failed = []
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
comfyui-frontend-package==1.42.15
|
||||
comfyui-workflow-templates==0.9.63
|
||||
comfyui-workflow-templates==0.9.65
|
||||
comfyui-embedded-docs==0.4.4
|
||||
torch
|
||||
torchsde
|
||||
@ -19,7 +19,7 @@ scipy
|
||||
tqdm
|
||||
psutil
|
||||
alembic
|
||||
SQLAlchemy>=2.0
|
||||
SQLAlchemy>=2.0.0
|
||||
filelock
|
||||
av>=14.2.0
|
||||
comfy-kitchen>=0.2.8
|
||||
|
||||
473
tests-unit/execution_test/caching_test.py
Normal file
473
tests-unit/execution_test/caching_test.py
Normal file
@ -0,0 +1,473 @@
|
||||
"""Unit tests for cache-signature canonicalization hardening."""
|
||||
|
||||
import asyncio
|
||||
import importlib
|
||||
import sys
|
||||
import types
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class _DummyNode:
|
||||
"""Minimal node stub used to satisfy cache-signature class lookups."""
|
||||
|
||||
@staticmethod
|
||||
def INPUT_TYPES():
|
||||
"""Return a minimal empty input schema for unit tests."""
|
||||
return {"required": {}}
|
||||
|
||||
|
||||
class _FakeDynPrompt:
|
||||
"""Small DynamicPrompt stand-in with only the methods these tests need."""
|
||||
|
||||
def __init__(self, nodes_by_id):
|
||||
"""Store test nodes by id."""
|
||||
self._nodes_by_id = nodes_by_id
|
||||
|
||||
def has_node(self, node_id):
|
||||
"""Return whether the fake prompt contains the requested node."""
|
||||
return node_id in self._nodes_by_id
|
||||
|
||||
def get_node(self, node_id):
|
||||
"""Return the stored node payload for the requested id."""
|
||||
return self._nodes_by_id[node_id]
|
||||
|
||||
|
||||
class _FakeIsChangedCache:
|
||||
"""Async stub for `is_changed` lookups used by cache-key generation."""
|
||||
|
||||
def __init__(self, values):
|
||||
"""Store canned `is_changed` responses keyed by node id."""
|
||||
self._values = values
|
||||
|
||||
async def get(self, node_id):
|
||||
"""Return the canned `is_changed` value for a node."""
|
||||
return self._values[node_id]
|
||||
|
||||
|
||||
class _OpaqueValue:
|
||||
"""Hashable opaque object used to exercise fail-closed unordered hashing paths."""
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def caching_module(monkeypatch):
|
||||
"""Import `comfy_execution.caching` with lightweight stub dependencies."""
|
||||
torch_module = types.ModuleType("torch")
|
||||
psutil_module = types.ModuleType("psutil")
|
||||
nodes_module = types.ModuleType("nodes")
|
||||
nodes_module.NODE_CLASS_MAPPINGS = {}
|
||||
graph_module = types.ModuleType("comfy_execution.graph")
|
||||
|
||||
class DynamicPrompt:
|
||||
"""Placeholder graph type so the caching module can import cleanly."""
|
||||
|
||||
pass
|
||||
|
||||
graph_module.DynamicPrompt = DynamicPrompt
|
||||
|
||||
monkeypatch.setitem(sys.modules, "torch", torch_module)
|
||||
monkeypatch.setitem(sys.modules, "psutil", psutil_module)
|
||||
monkeypatch.setitem(sys.modules, "nodes", nodes_module)
|
||||
monkeypatch.setitem(sys.modules, "comfy_execution.graph", graph_module)
|
||||
monkeypatch.delitem(sys.modules, "comfy_execution.caching", raising=False)
|
||||
|
||||
module = importlib.import_module("comfy_execution.caching")
|
||||
module = importlib.reload(module)
|
||||
return module, nodes_module
|
||||
|
||||
|
||||
def test_signature_to_hashable_handles_shared_builtin_substructures(caching_module):
|
||||
"""Shared built-in substructures should canonicalize without collapsing to Unhashable."""
|
||||
caching, _ = caching_module
|
||||
shared = [{"value": 1}, {"value": 2}]
|
||||
|
||||
signature = caching._signature_to_hashable([shared, shared])
|
||||
|
||||
assert signature[0] == "list"
|
||||
assert signature[1][0] == signature[1][1]
|
||||
assert signature[1][0][0] == "list"
|
||||
assert signature[1][0][1][0] == ("dict", (("value", 1),))
|
||||
assert signature[1][0][1][1] == ("dict", (("value", 2),))
|
||||
|
||||
|
||||
def test_signature_to_hashable_fails_closed_on_opaque_values(caching_module):
|
||||
"""Opaque values should collapse the full signature to Unhashable immediately."""
|
||||
caching, _ = caching_module
|
||||
|
||||
signature = caching._signature_to_hashable(["safe", object()])
|
||||
|
||||
assert isinstance(signature, caching.Unhashable)
|
||||
|
||||
|
||||
def test_signature_to_hashable_stops_descending_after_failure(caching_module, monkeypatch):
|
||||
"""Once canonicalization fails, later recursive descent should stop immediately."""
|
||||
caching, _ = caching_module
|
||||
original = caching._signature_to_hashable_impl
|
||||
marker = object()
|
||||
marker_seen = False
|
||||
|
||||
def tracking_canonicalize(obj, *args, **kwargs):
|
||||
"""Track whether recursion reaches the nested marker after failure."""
|
||||
nonlocal marker_seen
|
||||
if obj is marker:
|
||||
marker_seen = True
|
||||
return original(obj, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(caching, "_signature_to_hashable_impl", tracking_canonicalize)
|
||||
|
||||
signature = caching._signature_to_hashable([object(), [marker]])
|
||||
|
||||
assert isinstance(signature, caching.Unhashable)
|
||||
assert marker_seen is False
|
||||
|
||||
|
||||
def test_signature_to_hashable_snapshots_list_before_recursing(caching_module, monkeypatch):
|
||||
"""List canonicalization should read a point-in-time snapshot before recursive descent."""
|
||||
caching, _ = caching_module
|
||||
original = caching._signature_to_hashable_impl
|
||||
marker = ("marker",)
|
||||
values = [marker, 2]
|
||||
|
||||
def mutating_canonicalize(obj, *args, **kwargs):
|
||||
"""Mutate the live list during recursion to verify snapshot-based traversal."""
|
||||
if obj is marker:
|
||||
values[1] = 3
|
||||
return original(obj, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(caching, "_signature_to_hashable_impl", mutating_canonicalize)
|
||||
|
||||
signature = caching._signature_to_hashable(values)
|
||||
|
||||
assert signature == ("list", (("tuple", ("marker",)), 2))
|
||||
assert values[1] == 3
|
||||
|
||||
|
||||
def test_signature_to_hashable_snapshots_dict_before_recursing(caching_module, monkeypatch):
|
||||
"""Dict canonicalization should read a point-in-time snapshot before recursive descent."""
|
||||
caching, _ = caching_module
|
||||
original = caching._signature_to_hashable_impl
|
||||
marker = ("marker",)
|
||||
values = {"first": marker, "second": 2}
|
||||
|
||||
def mutating_canonicalize(obj, *args, **kwargs):
|
||||
"""Mutate the live dict during recursion to verify snapshot-based traversal."""
|
||||
if obj is marker:
|
||||
values["second"] = 3
|
||||
return original(obj, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(caching, "_signature_to_hashable_impl", mutating_canonicalize)
|
||||
|
||||
signature = caching._signature_to_hashable(values)
|
||||
|
||||
assert signature == ("dict", (("first", ("tuple", ("marker",))), ("second", 2)))
|
||||
assert values["second"] == 3
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"container_factory",
|
||||
[
|
||||
lambda marker: [marker],
|
||||
lambda marker: (marker,),
|
||||
lambda marker: {marker},
|
||||
lambda marker: frozenset({marker}),
|
||||
lambda marker: {"key": marker},
|
||||
],
|
||||
)
|
||||
def test_signature_to_hashable_fails_closed_on_runtimeerror(caching_module, monkeypatch, container_factory):
|
||||
"""Traversal RuntimeError should degrade canonicalization to Unhashable."""
|
||||
caching, _ = caching_module
|
||||
original = caching._signature_to_hashable_impl
|
||||
marker = object()
|
||||
|
||||
def raising_canonicalize(obj, *args, **kwargs):
|
||||
"""Raise a traversal RuntimeError for the marker value and delegate otherwise."""
|
||||
if obj is marker:
|
||||
raise RuntimeError("container changed during iteration")
|
||||
return original(obj, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(caching, "_signature_to_hashable_impl", raising_canonicalize)
|
||||
|
||||
signature = caching._signature_to_hashable(container_factory(marker))
|
||||
|
||||
assert isinstance(signature, caching.Unhashable)
|
||||
|
||||
|
||||
def test_to_hashable_handles_shared_builtin_substructures(caching_module):
|
||||
"""The legacy helper should still hash sanitized built-ins stably when used directly."""
|
||||
caching, _ = caching_module
|
||||
shared = [{"value": 1}, {"value": 2}]
|
||||
|
||||
sanitized = [shared, shared]
|
||||
hashable = caching.to_hashable(sanitized)
|
||||
|
||||
assert hashable[0] == "list"
|
||||
assert hashable[1][0] == hashable[1][1]
|
||||
assert hashable[1][0][0] == "list"
|
||||
|
||||
|
||||
def test_to_hashable_uses_parent_snapshot_during_expanded_phase(caching_module, monkeypatch):
|
||||
"""Expanded-phase assembly should not reread a live parent container after snapshotting."""
|
||||
caching, _ = caching_module
|
||||
original_sort_key = caching._sanitized_sort_key
|
||||
outer = [{"marker"}, 2]
|
||||
|
||||
def mutating_sort_key(obj, *args, **kwargs):
|
||||
"""Mutate the live parent while a child container is being canonicalized."""
|
||||
if obj == "marker":
|
||||
outer[1] = 3
|
||||
return original_sort_key(obj, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(caching, "_sanitized_sort_key", mutating_sort_key)
|
||||
|
||||
hashable = caching.to_hashable(outer)
|
||||
|
||||
assert hashable == ("list", (("set", ("marker",)), 2))
|
||||
assert outer[1] == 3
|
||||
|
||||
|
||||
def test_to_hashable_fails_closed_for_ordered_container_with_opaque_child(caching_module):
|
||||
"""Ordered containers should fail closed when a child cannot be canonicalized."""
|
||||
caching, _ = caching_module
|
||||
|
||||
result = caching.to_hashable([object()])
|
||||
|
||||
assert isinstance(result, caching.Unhashable)
|
||||
|
||||
|
||||
def test_to_hashable_canonicalizes_dict_insertion_order(caching_module):
|
||||
"""Dicts with the same content should hash identically regardless of insertion order."""
|
||||
caching, _ = caching_module
|
||||
|
||||
first = {"b": 2, "a": 1}
|
||||
second = {"a": 1, "b": 2}
|
||||
|
||||
assert caching.to_hashable(first) == ("dict", (("a", 1), ("b", 2)))
|
||||
assert caching.to_hashable(first) == caching.to_hashable(second)
|
||||
|
||||
|
||||
def test_to_hashable_fails_closed_for_opaque_dict_key(caching_module):
|
||||
"""Opaque dict keys should fail closed instead of being traversed during hashing."""
|
||||
caching, _ = caching_module
|
||||
|
||||
hashable = caching.to_hashable({_OpaqueValue(): 1})
|
||||
|
||||
assert isinstance(hashable, caching.Unhashable)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"container_factory",
|
||||
[
|
||||
set,
|
||||
frozenset,
|
||||
],
|
||||
)
|
||||
def test_to_hashable_fails_closed_on_runtimeerror(caching_module, monkeypatch, container_factory):
|
||||
"""Traversal RuntimeError should degrade unordered hash conversion to Unhashable."""
|
||||
caching, _ = caching_module
|
||||
|
||||
def raising_sort_key(obj, *args, **kwargs):
|
||||
"""Raise a traversal RuntimeError while unordered values are canonicalized."""
|
||||
raise RuntimeError("container changed during iteration")
|
||||
|
||||
monkeypatch.setattr(caching, "_sanitized_sort_key", raising_sort_key)
|
||||
|
||||
hashable = caching.to_hashable(container_factory({"value"}))
|
||||
|
||||
assert isinstance(hashable, caching.Unhashable)
|
||||
|
||||
|
||||
def test_to_hashable_fails_closed_for_ambiguous_dict_ordering(caching_module, monkeypatch):
|
||||
"""Ambiguous dict key ordering should fail closed instead of using insertion order."""
|
||||
caching, _ = caching_module
|
||||
original_sort_key = caching._sanitized_sort_key
|
||||
ambiguous = {"a": 1, "b": 1}
|
||||
|
||||
def colliding_sort_key(obj, *args, **kwargs):
|
||||
"""Force two distinct primitive keys to share the same ordering key."""
|
||||
if obj == "a" or obj == "b":
|
||||
return ("COLLIDE",)
|
||||
return original_sort_key(obj, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(caching, "_sanitized_sort_key", colliding_sort_key)
|
||||
|
||||
hashable = caching.to_hashable(ambiguous)
|
||||
|
||||
assert isinstance(hashable, caching.Unhashable)
|
||||
|
||||
|
||||
def test_signature_to_hashable_fails_closed_for_ambiguous_dict_ordering(caching_module, monkeypatch):
|
||||
"""Ambiguous dict sort ties should fail closed instead of depending on input order."""
|
||||
caching, _ = caching_module
|
||||
original_sort_key = caching._primitive_signature_sort_key
|
||||
ambiguous = {"a": 1, "b": 1}
|
||||
|
||||
def colliding_sort_key(obj):
|
||||
"""Force two distinct primitive keys to share the same ordering key."""
|
||||
if obj == "a" or obj == "b":
|
||||
return ("COLLIDE",)
|
||||
return original_sort_key(obj)
|
||||
|
||||
monkeypatch.setattr(caching, "_primitive_signature_sort_key", colliding_sort_key)
|
||||
|
||||
sanitized = caching._signature_to_hashable(ambiguous)
|
||||
|
||||
assert isinstance(sanitized, caching.Unhashable)
|
||||
|
||||
|
||||
def test_signature_to_hashable_fails_closed_for_opaque_dict_key(caching_module):
|
||||
"""Opaque dict keys should fail closed instead of being recursively canonicalized."""
|
||||
caching, _ = caching_module
|
||||
|
||||
sanitized = caching._signature_to_hashable({_OpaqueValue(): 1})
|
||||
|
||||
assert isinstance(sanitized, caching.Unhashable)
|
||||
|
||||
|
||||
def test_signature_to_hashable_fails_closed_on_dict_key_sort_collisions_even_with_distinct_values(caching_module, monkeypatch):
|
||||
"""Different values must not mask dict key-sort collisions during canonicalization."""
|
||||
caching, _ = caching_module
|
||||
original_sort_key = caching._primitive_signature_sort_key
|
||||
|
||||
def colliding_sort_key(obj):
|
||||
"""Force two distinct primitive keys to share the same ordering key."""
|
||||
if obj == "a" or obj == "b":
|
||||
return ("COLLIDE",)
|
||||
return original_sort_key(obj)
|
||||
|
||||
monkeypatch.setattr(caching, "_primitive_signature_sort_key", colliding_sort_key)
|
||||
|
||||
sanitized = caching._signature_to_hashable({"a": 1, "b": 2})
|
||||
|
||||
assert isinstance(sanitized, caching.Unhashable)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"container_factory",
|
||||
[
|
||||
set,
|
||||
frozenset,
|
||||
],
|
||||
)
|
||||
def test_to_hashable_fails_closed_for_ambiguous_unordered_values(caching_module, monkeypatch, container_factory):
|
||||
"""Ambiguous unordered values should fail closed instead of depending on iteration order."""
|
||||
caching, _ = caching_module
|
||||
original_sort_key = caching._sanitized_sort_key
|
||||
container = container_factory({"a", "b"})
|
||||
|
||||
def colliding_sort_key(obj, *args, **kwargs):
|
||||
"""Force two distinct primitive values to share the same ordering key."""
|
||||
if obj == "a" or obj == "b":
|
||||
return ("COLLIDE",)
|
||||
return original_sort_key(obj, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(caching, "_sanitized_sort_key", colliding_sort_key)
|
||||
|
||||
hashable = caching.to_hashable(container)
|
||||
|
||||
assert isinstance(hashable, caching.Unhashable)
|
||||
|
||||
|
||||
def test_get_node_signature_returns_top_level_unhashable_for_tainted_signature(caching_module, monkeypatch):
|
||||
"""Tainted full signatures should fail closed before `to_hashable()` runs."""
|
||||
caching, nodes_module = caching_module
|
||||
monkeypatch.setitem(nodes_module.NODE_CLASS_MAPPINGS, "UnitTestNode", _DummyNode)
|
||||
monkeypatch.setattr(
|
||||
caching,
|
||||
"to_hashable",
|
||||
lambda *_args, **_kwargs: pytest.fail("to_hashable should not run for tainted signatures"),
|
||||
)
|
||||
|
||||
is_changed_value = []
|
||||
is_changed_value.append(is_changed_value)
|
||||
|
||||
dynprompt = _FakeDynPrompt(
|
||||
{
|
||||
"node": {
|
||||
"class_type": "UnitTestNode",
|
||||
"inputs": {"value": 5},
|
||||
}
|
||||
}
|
||||
)
|
||||
key_set = caching.CacheKeySetInputSignature(
|
||||
dynprompt,
|
||||
["node"],
|
||||
_FakeIsChangedCache({"node": is_changed_value}),
|
||||
)
|
||||
|
||||
signature = asyncio.run(key_set.get_node_signature(dynprompt, "node"))
|
||||
|
||||
assert isinstance(signature, caching.Unhashable)
|
||||
|
||||
|
||||
def test_shallow_is_changed_signature_accepts_primitive_lists(caching_module):
|
||||
"""Primitive-only `is_changed` lists should stay hashable without deep descent."""
|
||||
caching, _ = caching_module
|
||||
|
||||
sanitized = caching._shallow_is_changed_signature([1, "two", None, True])
|
||||
|
||||
assert sanitized == ("is_changed_list", (1, "two", None, True))
|
||||
|
||||
|
||||
def test_shallow_is_changed_signature_accepts_structured_builtin_fingerprint_lists(caching_module):
|
||||
"""Structured built-in `is_changed` fingerprints should remain representable."""
|
||||
caching, _ = caching_module
|
||||
|
||||
sanitized = caching._shallow_is_changed_signature([("seed", 42), {"cfg": 8}])
|
||||
|
||||
assert sanitized == (
|
||||
"is_changed_list",
|
||||
(
|
||||
("tuple", ("seed", 42)),
|
||||
("dict", (("cfg", 8),)),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def test_shallow_is_changed_signature_fails_closed_for_opaque_payload(caching_module):
|
||||
"""Opaque `is_changed` payloads should still fail closed."""
|
||||
caching, _ = caching_module
|
||||
|
||||
sanitized = caching._shallow_is_changed_signature([_OpaqueValue()])
|
||||
|
||||
assert isinstance(sanitized, caching.Unhashable)
|
||||
|
||||
|
||||
def test_get_immediate_node_signature_fails_closed_for_unhashable_is_changed(caching_module, monkeypatch):
|
||||
"""Recursive `is_changed` payloads should fail the full fragment closed."""
|
||||
caching, nodes_module = caching_module
|
||||
monkeypatch.setitem(nodes_module.NODE_CLASS_MAPPINGS, "UnitTestNode", _DummyNode)
|
||||
|
||||
is_changed_value = []
|
||||
is_changed_value.append(is_changed_value)
|
||||
dynprompt = _FakeDynPrompt(
|
||||
{
|
||||
"node": {
|
||||
"class_type": "UnitTestNode",
|
||||
"inputs": {"value": 5},
|
||||
}
|
||||
}
|
||||
)
|
||||
key_set = caching.CacheKeySetInputSignature(
|
||||
dynprompt,
|
||||
["node"],
|
||||
_FakeIsChangedCache({"node": is_changed_value}),
|
||||
)
|
||||
|
||||
signature = asyncio.run(key_set.get_immediate_node_signature(dynprompt, "node", {}))
|
||||
|
||||
assert isinstance(signature, caching.Unhashable)
|
||||
|
||||
|
||||
def test_get_immediate_node_signature_fails_closed_for_missing_node(caching_module):
|
||||
"""Missing nodes should return the fail-closed sentinel instead of a NaN tuple."""
|
||||
caching, _ = caching_module
|
||||
dynprompt = _FakeDynPrompt({})
|
||||
key_set = caching.CacheKeySetInputSignature(
|
||||
dynprompt,
|
||||
[],
|
||||
_FakeIsChangedCache({}),
|
||||
)
|
||||
|
||||
signature = asyncio.run(key_set.get_immediate_node_signature(dynprompt, "missing", {}))
|
||||
|
||||
assert isinstance(signature, caching.Unhashable)
|
||||
242
tests/execution/test_caching.py
Normal file
242
tests/execution/test_caching.py
Normal file
@ -0,0 +1,242 @@
|
||||
import asyncio
|
||||
|
||||
from comfy_execution import caching
|
||||
|
||||
|
||||
class _StubDynPrompt:
|
||||
def __init__(self, nodes):
|
||||
self._nodes = nodes
|
||||
|
||||
def has_node(self, node_id):
|
||||
return node_id in self._nodes
|
||||
|
||||
def get_node(self, node_id):
|
||||
return self._nodes[node_id]
|
||||
|
||||
|
||||
class _StubIsChangedCache:
|
||||
async def get(self, node_id):
|
||||
return None
|
||||
|
||||
|
||||
class _StubNode:
|
||||
@classmethod
|
||||
def INPUT_TYPES(cls):
|
||||
return {"required": {}}
|
||||
|
||||
|
||||
def test_shallow_is_changed_signature_keeps_primitive_only_list_shallow():
|
||||
assert caching._shallow_is_changed_signature([1, "two", None, True]) == (
|
||||
"is_changed_list",
|
||||
(1, "two", None, True),
|
||||
)
|
||||
|
||||
|
||||
def test_shallow_is_changed_signature_keeps_primitive_only_tuple_shallow():
|
||||
assert caching._shallow_is_changed_signature((1, "two", None, True)) == (
|
||||
"is_changed_tuple",
|
||||
(1, "two", None, True),
|
||||
)
|
||||
|
||||
|
||||
def test_shallow_is_changed_signature_keeps_structured_builtin_fingerprint_list():
|
||||
assert caching._shallow_is_changed_signature([("seed", 42), {"cfg": 8}]) == (
|
||||
"is_changed_list",
|
||||
(
|
||||
("tuple", ("seed", 42)),
|
||||
("dict", (("cfg", 8),)),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def test_shallow_is_changed_signature_does_not_use_to_hashable(monkeypatch):
|
||||
monkeypatch.setattr(
|
||||
caching,
|
||||
"to_hashable",
|
||||
lambda *_args, **_kwargs: (_ for _ in ()).throw(
|
||||
AssertionError("is_changed signature must not deep-canonicalize")
|
||||
),
|
||||
)
|
||||
|
||||
signature = caching._shallow_is_changed_signature([("seed", 42), {"cfg": 8}])
|
||||
|
||||
assert signature == (
|
||||
"is_changed_list",
|
||||
(
|
||||
("tuple", ("seed", 42)),
|
||||
("dict", (("cfg", 8),)),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def test_get_immediate_node_signature_canonicalizes_non_link_inputs(monkeypatch):
|
||||
live_value = [1, {"nested": [2, 3]}]
|
||||
dynprompt = _StubDynPrompt(
|
||||
{
|
||||
"1": {
|
||||
"class_type": "TestCacheNode",
|
||||
"inputs": {"value": live_value},
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
|
||||
monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
|
||||
|
||||
keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
|
||||
signature = asyncio.run(keyset.get_immediate_node_signature(dynprompt, "1", {}))
|
||||
|
||||
assert signature == (
|
||||
"TestCacheNode",
|
||||
None,
|
||||
("value", ("list", (1, ("dict", (("nested", ("list", (2, 3))),))))),
|
||||
)
|
||||
|
||||
|
||||
def test_to_hashable_walks_dicts_without_rebinding_traversal_stack():
|
||||
live_value = {
|
||||
"outer": {"nested": [2, 3]},
|
||||
"items": [{"leaf": 4}],
|
||||
}
|
||||
|
||||
assert caching.to_hashable(live_value) == (
|
||||
"dict",
|
||||
(
|
||||
("items", ("list", (("dict", (("leaf", 4),)),))),
|
||||
("outer", ("dict", (("nested", ("list", (2, 3))),))),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def test_get_immediate_node_signature_fails_closed_for_opaque_non_link_input(monkeypatch):
|
||||
class OpaqueRuntimeValue:
|
||||
pass
|
||||
|
||||
live_value = OpaqueRuntimeValue()
|
||||
dynprompt = _StubDynPrompt(
|
||||
{
|
||||
"1": {
|
||||
"class_type": "TestCacheNode",
|
||||
"inputs": {"value": live_value},
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
|
||||
monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
|
||||
|
||||
keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
|
||||
signature = asyncio.run(keyset.get_immediate_node_signature(dynprompt, "1", {}))
|
||||
|
||||
assert isinstance(signature, caching.Unhashable)
|
||||
|
||||
|
||||
def test_get_node_signature_propagates_unhashable_immediate_fragment(monkeypatch):
|
||||
class OpaqueRuntimeValue:
|
||||
pass
|
||||
|
||||
dynprompt = _StubDynPrompt(
|
||||
{
|
||||
"1": {
|
||||
"class_type": "TestCacheNode",
|
||||
"inputs": {"value": OpaqueRuntimeValue()},
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
|
||||
monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
|
||||
|
||||
keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
|
||||
signature = asyncio.run(keyset.get_node_signature(dynprompt, "1"))
|
||||
|
||||
assert isinstance(signature, caching.Unhashable)
|
||||
|
||||
|
||||
def test_get_node_signature_never_visits_raw_non_link_input(monkeypatch):
|
||||
live_value = [1, 2, 3]
|
||||
dynprompt = _StubDynPrompt(
|
||||
{
|
||||
"1": {
|
||||
"class_type": "TestCacheNode",
|
||||
"inputs": {"value": live_value},
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
|
||||
monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
|
||||
monkeypatch.setattr(
|
||||
caching,
|
||||
"_signature_to_hashable",
|
||||
lambda *_args, **_kwargs: (_ for _ in ()).throw(
|
||||
AssertionError("outer signature canonicalizer should not run")
|
||||
),
|
||||
)
|
||||
|
||||
keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
|
||||
signature = asyncio.run(keyset.get_node_signature(dynprompt, "1"))
|
||||
|
||||
assert isinstance(signature, tuple)
|
||||
|
||||
|
||||
def test_get_node_signature_keeps_deep_canonicalized_input_fragment(monkeypatch):
|
||||
live_value = 1
|
||||
for _ in range(8):
|
||||
live_value = [live_value]
|
||||
expected = caching.to_hashable(live_value)
|
||||
|
||||
dynprompt = _StubDynPrompt(
|
||||
{
|
||||
"1": {
|
||||
"class_type": "TestCacheNode",
|
||||
"inputs": {"value": live_value},
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
|
||||
monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
|
||||
|
||||
keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
|
||||
signature = asyncio.run(keyset.get_node_signature(dynprompt, "1"))
|
||||
|
||||
assert isinstance(signature, tuple)
|
||||
assert signature[0][2][0] == "value"
|
||||
assert signature[0][2][1] == expected
|
||||
|
||||
|
||||
def test_get_node_signature_keeps_large_precanonicalized_fragment(monkeypatch):
|
||||
live_value = object()
|
||||
canonical_fragment = ("tuple", tuple(("list", (index, index + 1)) for index in range(256)))
|
||||
dynprompt = _StubDynPrompt(
|
||||
{
|
||||
"1": {
|
||||
"class_type": "TestCacheNode",
|
||||
"inputs": {"value": live_value},
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
monkeypatch.setitem(caching.nodes.NODE_CLASS_MAPPINGS, "TestCacheNode", _StubNode)
|
||||
monkeypatch.setattr(caching, "NODE_CLASS_CONTAINS_UNIQUE_ID", {})
|
||||
monkeypatch.setattr(
|
||||
caching,
|
||||
"to_hashable",
|
||||
lambda value, max_nodes=caching._MAX_SIGNATURE_CONTAINER_VISITS: (
|
||||
canonical_fragment if value is live_value else caching.Unhashable()
|
||||
),
|
||||
)
|
||||
monkeypatch.setattr(
|
||||
caching,
|
||||
"_signature_to_hashable",
|
||||
lambda *_args, **_kwargs: (_ for _ in ()).throw(
|
||||
AssertionError("outer signature canonicalizer should not run")
|
||||
),
|
||||
)
|
||||
|
||||
keyset = caching.CacheKeySetInputSignature(dynprompt, [], _StubIsChangedCache())
|
||||
signature = asyncio.run(keyset.get_node_signature(dynprompt, "1"))
|
||||
|
||||
assert isinstance(signature, tuple)
|
||||
assert signature[0][2] == ("value", canonical_fragment)
|
||||
Loading…
Reference in New Issue
Block a user