# CogVideoX VAE - ported to ComfyUI native ops
# Architecture reference: diffusers AutoencoderKLCogVideoX
# Style reference: comfy/ldm/wan/vae.py

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import comfy.ops
ops = comfy.ops.disable_weight_init


class SafeConv3d(nn.Conv3d):
    """3D convolution that splits large inputs along temporal dim to avoid OOM.

    Inputs whose estimated size exceeds 2 GiB are cut into chunks along
    dim 2, each chunk is convolved separately, and the results are
    concatenated back along dim 2.
    """

    def _temporal_split_is_exact(self):
        """Return True if chunking along dim 2 reproduces the unchunked output.

        The chunking in ``forward`` overlaps neighbouring chunks by
        ``kernel_t - 1`` frames, which is only correct when the convolution
        has no padding, unit stride and unit dilation along dim 2.
        """
        if isinstance(self.padding, str):
            no_temporal_padding = self.padding == "valid"
        else:
            no_temporal_padding = self.padding[0] == 0
        return no_temporal_padding and self.stride[0] == 1 and self.dilation[0] == 1

    def forward(self, x):
        """Convolve ``x``, chunking along dim 2 when the input is large.

        Args:
            x: 5-D input tensor (all five dimensions are indexed below).

        Returns:
            The convolution output.
        """
        conv = super().forward
        # Size estimate in GiB; the factor 2 assumes 2 bytes per element.
        mem = x.shape[0] * x.shape[1] * x.shape[2] * x.shape[3] * x.shape[4] * 2 / 1024**3
        kernel_t = self.kernel_size[0]
        if mem <= 2 or x.shape[2] < kernel_t or not self._temporal_split_is_exact():
            return conv(x)

        parts = int(mem / 2) + 1
        # Ensure each chunk has at least kernel_t frames
        max_parts = max(1, x.shape[2] // kernel_t)
        parts = min(parts, max_parts)
        if parts <= 1:
            return conv(x)

        chunks = torch.chunk(x, parts, dim=2)
        if kernel_t > 1:
            # Prepend the last kernel_t - 1 frames of the previous chunk so
            # the outputs at chunk boundaries see the same context as they
            # would in the unchunked convolution.
            chunks = [chunks[0]] + [
                torch.cat((chunks[i - 1][:, :, -kernel_t + 1:], chunks[i]), dim=2)
                for i in range(1, len(chunks))
            ]
        return torch.cat([conv(chunk) for chunk in chunks], dim=2)


class CausalConv3d(nn.Module):
    """Causal 3D convolution with temporal padding.

    All temporal padding is placed before the first frame. In "replicate"
    mode the input is padded with ``F.pad``; in every other mode the frames
    to prepend come from ``conv_cache`` (or from repeating the first frame
    when no cache is given) and spatial padding is done by the convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, pad_mode="constant"):
        """Build the convolution.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            kernel_size: Int, or a (time, height, width) tuple.
            stride: A 3-tuple used as is, or an int applied to the temporal
                dimension only (spatial stride is then 1).
            dilation: Temporal dilation; spatial dilation is always 1.
            pad_mode: "replicate" selects ``F.pad`` replicate padding; any
                other value selects the cache-based causal padding.
        """
        super().__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size,) * 3
        time_kernel, height_kernel, width_kernel = kernel_size

        time_pad = time_kernel - 1
        height_pad = (height_kernel - 1) // 2
        width_pad = (width_kernel - 1) // 2

        self.pad_mode = pad_mode
        self.time_pad = time_pad
        # F.pad order: last dimension first (W, W, H, H, T_front, T_back).
        self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
        # nn.Conv3d padding order is (T, H, W). Previously height and width
        # were swapped, which only matters for non-square kernels.
        self.const_padding = (0, height_pad, width_pad)
        self.time_kernel_size = time_kernel

        stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
        dilation = (dilation, 1, 1)
        self.conv = SafeConv3d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            dilation=dilation,
            padding=0 if pad_mode == "replicate" else self.const_padding,
        )

    def forward(self, x, conv_cache=None):
        """Apply the causal convolution.

        Args:
            x: Input tensor; dim 2 is treated as time.
            conv_cache: Trailing frames returned by the previous call, or
                None for the first chunk. Ignored in "replicate" mode.

        Returns:
            Tuple ``(output, new_cache)``. ``new_cache`` holds the last
            ``time_kernel_size - 1`` frames of the padded input, or None in
            "replicate" mode or when the temporal kernel size is 1.
        """
        if self.pad_mode == "replicate":
            x = F.pad(x, self.time_causal_padding, mode="replicate")
            conv_cache = None
        else:
            kernel_t = self.time_kernel_size
            if kernel_t > 1:
                # Without a cache, repeat the first frame kernel_t - 1 times.
                cached = [conv_cache] if conv_cache is not None else [x[:, :, :1]] * (kernel_t - 1)
                x = torch.cat(cached + [x], dim=2)
            conv_cache = x[:, :, -self.time_kernel_size + 1:].clone() if self.time_kernel_size > 1 else None

        out = self.conv(x)
        return out, conv_cache


class SpatialNorm3D(nn.Module):
    """Spatially conditioned normalization.

    Group-normalizes ``f`` and then scales and shifts it with two 1x1x1
    causal convolutions of the conditioning tensor ``zq``.
    """

    def __init__(self, f_channels, zq_channels, groups=32):
        """Create the group norm and the two conditioning convolutions.

        Args:
            f_channels: Channel count of the normalized feature tensor.
            zq_channels: Channel count of the conditioning tensor.
            groups: Number of groups for the group norm.
        """
        super().__init__()
        self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
        self.conv_y = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
        self.conv_b = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)

    def forward(self, f, zq, conv_cache=None):
        """Normalize ``f`` conditioned on ``zq``.

        Args:
            f: Feature tensor to normalize.
            zq: Conditioning tensor; resized to the last three dims of ``f``.
            conv_cache: Optional dict with "conv_y" / "conv_b" entries.

        Returns:
            Tuple ``(normalized, new_cache)``.
        """
        new_cache = {}
        conv_cache = conv_cache or {}

        if f.shape[2] > 1 and f.shape[2] % 2 == 1:
            # Odd frame count: resize the first frame separately from the rest.
            f_first, f_rest = f[:, :, :1], f[:, :, 1:]
            z_first, z_rest = zq[:, :, :1], zq[:, :, 1:]
            z_first = F.interpolate(z_first, size=f_first.shape[-3:])
            z_rest = F.interpolate(z_rest, size=f_rest.shape[-3:])
            zq = torch.cat([z_first, z_rest], dim=2)
        else:
            zq = F.interpolate(zq, size=f.shape[-3:])

        conv_y, new_cache["conv_y"] = self.conv_y(zq, conv_cache=conv_cache.get("conv_y"))
        conv_b, new_cache["conv_b"] = self.conv_b(zq, conv_cache=conv_cache.get("conv_b"))
        return self.norm_layer(f) * conv_y + conv_b, new_cache


class ResnetBlock3D(nn.Module):
    """3D ResNet block with optional spatial norm."""

    def __init__(self, in_channels, out_channels=None, temb_channels=512, groups=32, eps=1e-6,
                 act_fn="silu", spatial_norm_dim=None, pad_mode="first"):
        """Build the block.

        Args:
            in_channels: Input channel count.
            out_channels: Output channel count; falls back to ``in_channels``.
            temb_channels: Width of the optional embedding projected and
                added after the first convolution; no projection layer is
                created when it is not positive.
            groups: Number of groups for the normalization layers.
            eps: Epsilon for the plain group norms.
            act_fn: Accepted for interface compatibility. Every value
                ("silu", "swish" or anything else) selects SiLU.
            spatial_norm_dim: When not None, use ``SpatialNorm3D`` with this
                many conditioning channels instead of plain group norm.
            pad_mode: Forwarded to both causal convolutions.
        """
        super().__init__()
        out_channels = out_channels or in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.spatial_norm_dim = spatial_norm_dim

        # The original if/elif/else chose nn.SiLU() in all three branches.
        self.nonlinearity = nn.SiLU()

        if spatial_norm_dim is None:
            self.norm1 = nn.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
            self.norm2 = nn.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
        else:
            self.norm1 = SpatialNorm3D(in_channels, spatial_norm_dim, groups=groups)
            self.norm2 = SpatialNorm3D(out_channels, spatial_norm_dim, groups=groups)

        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
        if temb_channels > 0:
            self.temb_proj = nn.Linear(temb_channels, out_channels)
        self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, pad_mode=pad_mode)

        if in_channels != out_channels:
            # 1x1x1 projection so the residual matches the output channels.
            self.conv_shortcut = SafeConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        else:
            self.conv_shortcut = None

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        """Run the residual block.

        Args:
            x: Input tensor.
            temb: Optional embedding; used only when given and the block
                has a ``temb_proj`` layer.
            zq: Conditioning tensor. When not None the norms are called as
                spatial norms, so it must match how the block was built.
            conv_cache: Optional dict of caches from the previous chunk.

        Returns:
            Tuple ``(output, new_cache)``.
        """
        new_cache = {}
        conv_cache = conv_cache or {}
        residual = x

        if zq is not None:
            x, new_cache["norm1"] = self.norm1(x, zq, conv_cache=conv_cache.get("norm1"))
        else:
            x = self.norm1(x)
        x = self.nonlinearity(x)
        x, new_cache["conv1"] = self.conv1(x, conv_cache=conv_cache.get("conv1"))

        if temb is not None and hasattr(self, "temb_proj"):
            x = x + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]

        if zq is not None:
            x, new_cache["norm2"] = self.norm2(x, zq, conv_cache=conv_cache.get("norm2"))
        else:
            x = self.norm2(x)
        x = self.nonlinearity(x)
        x, new_cache["conv2"] = self.conv2(x, conv_cache=conv_cache.get("conv2"))

        if self.conv_shortcut is not None:
            residual = self.conv_shortcut(residual)
        return x + residual, new_cache


class Downsample3D(nn.Module):
    """3D downsampling with optional temporal compression."""

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=0, compress_time=False):
        """Create the per-frame 2D convolution.

        Args:
            in_channels: Input channel count.
            out_channels: Output channel count.
            kernel_size: Kernel size of the 2D convolution.
            stride: Stride of the 2D convolution.
            padding: Padding of the 2D convolution.
            compress_time: Average-pool pairs of frames before the
                spatial convolution.
        """
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.compress_time = compress_time

    def forward(self, x):
        """Downsample a (b, c, t, h, w) tensor.

        Args:
            x: 5-D input tensor.

        Returns:
            The downsampled 5-D tensor.
        """
        if self.compress_time:
            b, c, t, h, w = x.shape
            # Fold the spatial dims into the batch so time is the pooled axis.
            x = x.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, t)
            if t % 2 == 1:
                # Odd frame count: keep the first frame and pool the rest.
                x_first, x_rest = x[..., 0], x[..., 1:]
                if x_rest.shape[-1] > 0:
                    x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
                x = torch.cat([x_first[..., None], x_rest], dim=-1)
            else:
                x = F.avg_pool1d(x, kernel_size=2, stride=2)
            x = x.reshape(b, h, w, c, x.shape[-1]).permute(0, 3, 4, 1, 2)

        # Zero-pad one pixel on the right and bottom before the 2D conv.
        pad = (0, 1, 0, 1)
        x = F.pad(x, pad, mode="constant", value=0)
        b, c, t, h, w = x.shape
        # Apply the 2D convolution to every frame independently.
        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
        x = self.conv(x)
        x = x.reshape(b, t, x.shape[1], x.shape[2], x.shape[3]).permute(0, 2, 1, 3, 4)
        return x


class Upsample3D(nn.Module):
    """3D upsampling with optional temporal decompression."""

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, compress_time=False):
        """Create the per-frame 2D convolution.

        Args:
            in_channels: Input channel count.
            out_channels: Output channel count.
            kernel_size: Kernel size of the 2D convolution.
            stride: Stride of the 2D convolution.
            padding: Padding of the 2D convolution.
            compress_time: Also upsample along time when there is more
                than one frame.
        """
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.compress_time = compress_time

    def forward(self, x):
        """Upsample a (b, c, t, h, w) tensor by a factor of 2.

        Args:
            x: 5-D input tensor.

        Returns:
            The upsampled 5-D tensor.
        """
        if self.compress_time:
            if x.shape[2] > 1 and x.shape[2] % 2 == 1:
                # Odd frame count: the first frame is upsampled spatially
                # only (4-D interpolate); the rest in time and space.
                x_first, x_rest = x[:, :, 0], x[:, :, 1:]
                x_first = F.interpolate(x_first, scale_factor=2.0)
                x_rest = F.interpolate(x_rest, scale_factor=2.0)
                x = torch.cat([x_first[:, :, None, :, :], x_rest], dim=2)
            elif x.shape[2] > 1:
                x = F.interpolate(x, scale_factor=2.0)
            else:
                # Single frame: spatial upsampling only.
                x = x.squeeze(2)
                x = F.interpolate(x, scale_factor=2.0)
                x = x[:, :, None, :, :]
        else:
            # Spatial-only upsampling applied per frame.
            b, c, t, h, w = x.shape
            x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
            x = F.interpolate(x, scale_factor=2.0)
            x = x.reshape(b, t, c, *x.shape[2:]).permute(0, 2, 1, 3, 4)

        b, c, t, h, w = x.shape
        # Apply the 2D convolution to every frame independently.
        x = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
        x = self.conv(x)
        x = x.reshape(b, t, *x.shape[1:]).permute(0, 2, 1, 3, 4)
        return x


class DownBlock3D(nn.Module):
    """Stack of ``ResnetBlock3D`` layers followed by an optional downsampler."""

    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1, eps=1e-6,
                 act_fn="silu", groups=32, add_downsample=True, compress_time=False, pad_mode="first"):
        """Build the resnet stack and, if requested, one ``Downsample3D``.

        Args:
            in_channels: Input channels of the first resnet.
            out_channels: Output channels of every resnet.
            temb_channels: Forwarded to each resnet.
            num_layers: Number of resnets.
            eps: Forwarded to each resnet.
            act_fn: Forwarded to each resnet.
            groups: Forwarded to each resnet.
            add_downsample: Append a ``Downsample3D`` when True.
            compress_time: Forwarded to the downsampler.
            pad_mode: Forwarded to each resnet.
        """
        super().__init__()
        self.resnets = nn.ModuleList([
            ResnetBlock3D(
                in_channels=in_channels if i == 0 else out_channels,
                out_channels=out_channels,
                temb_channels=temb_channels,
                groups=groups,
                eps=eps,
                act_fn=act_fn,
                pad_mode=pad_mode,
            )
            for i in range(num_layers)
        ])
        if add_downsample:
            self.downsamplers = nn.ModuleList([Downsample3D(out_channels, out_channels, compress_time=compress_time)])
        else:
            self.downsamplers = None

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        """Run the resnets, then the downsamplers.

        Returns:
            Tuple ``(output, new_cache)`` with one "resnet_{i}" cache entry
            per resnet.
        """
        new_cache = {}
        conv_cache = conv_cache or {}
        for i, resnet in enumerate(self.resnets):
            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
        if self.downsamplers is not None:
            for ds in self.downsamplers:
                x = ds(x)
        return x, new_cache


class MidBlock3D(nn.Module):
    """Stack of channel-preserving ``ResnetBlock3D`` layers."""

    def __init__(self, in_channels, temb_channels=0, num_layers=1, eps=1e-6,
                 act_fn="silu", groups=32, spatial_norm_dim=None, pad_mode="first"):
        """Build ``num_layers`` resnets mapping ``in_channels`` to itself.

        The remaining arguments are forwarded to each ``ResnetBlock3D``.
        """
        super().__init__()
        self.resnets = nn.ModuleList([
            ResnetBlock3D(
                in_channels=in_channels,
                out_channels=in_channels,
                temb_channels=temb_channels,
                groups=groups,
                eps=eps,
                act_fn=act_fn,
                spatial_norm_dim=spatial_norm_dim,
                pad_mode=pad_mode,
            )
            for _ in range(num_layers)
        ])

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        """Run the resnets in order.

        Returns:
            Tuple ``(output, new_cache)`` with one "resnet_{i}" cache entry
            per resnet.
        """
        new_cache = {}
        conv_cache = conv_cache or {}
        for i, resnet in enumerate(self.resnets):
            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
        return x, new_cache


class UpBlock3D(nn.Module):
    """Stack of ``ResnetBlock3D`` layers followed by an optional upsampler."""

    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1, eps=1e-6,
                 act_fn="silu", groups=32, spatial_norm_dim=16, add_upsample=True,
                 compress_time=False, pad_mode="first"):
        """Build the resnet stack and, if requested, one ``Upsample3D``.

        Args:
            in_channels: Input channels of the first resnet.
            out_channels: Output channels of every resnet.
            temb_channels: Forwarded to each resnet.
            num_layers: Number of resnets.
            eps: Forwarded to each resnet.
            act_fn: Forwarded to each resnet.
            groups: Forwarded to each resnet.
            spatial_norm_dim: Forwarded to each resnet.
            add_upsample: Append an ``Upsample3D`` when True.
            compress_time: Forwarded to the upsampler.
            pad_mode: Forwarded to each resnet.
        """
        super().__init__()
        self.resnets = nn.ModuleList([
            ResnetBlock3D(
                in_channels=in_channels if i == 0 else out_channels,
                out_channels=out_channels,
                temb_channels=temb_channels,
                groups=groups,
                eps=eps,
                act_fn=act_fn,
                spatial_norm_dim=spatial_norm_dim,
                pad_mode=pad_mode,
            )
            for i in range(num_layers)
        ])
        if add_upsample:
            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, out_channels, compress_time=compress_time)])
        else:
            self.upsamplers = None

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        """Run the resnets, then the upsamplers.

        Returns:
            Tuple ``(output, new_cache)`` with one "resnet_{i}" cache entry
            per resnet.
        """
        new_cache = {}
        conv_cache = conv_cache or {}
        for i, resnet in enumerate(self.resnets):
            x, new_cache[f"resnet_{i}"] = resnet(x, temb, zq, conv_cache=conv_cache.get(f"resnet_{i}"))
        if self.upsamplers is not None:
            for us in self.upsamplers:
                x = us(x)
        return x, new_cache


class Encoder3D(nn.Module):
    """Encoder: causal conv in, down blocks, mid block, norm/act, conv out.

    ``conv_out`` produces ``2 * out_channels`` channels.
    """

    def __init__(self, in_channels=3, out_channels=16,
                 block_out_channels=(128, 256, 256, 512), layers_per_block=3,
                 act_fn="silu", eps=1e-6, groups=32, pad_mode="first",
                 temporal_compression_ratio=4):
        """Build the encoder.

        Args:
            in_channels: Channels of the input tensor.
            out_channels: Latent channels; the output has twice as many.
            block_out_channels: Output channels of each down block.
            layers_per_block: Resnets per down block.
            act_fn: Forwarded to the blocks.
            eps: Forwarded to the blocks.
            groups: Group count for the normalization layers.
            pad_mode: Padding mode for every causal convolution.
            temporal_compression_ratio: The first ``log2(ratio)`` down
                blocks are built with ``compress_time=True``.
        """
        super().__init__()
        temporal_compress_level = int(np.log2(temporal_compression_ratio))

        self.conv_in = CausalConv3d(in_channels, block_out_channels[0], kernel_size=3, pad_mode=pad_mode)

        self.down_blocks = nn.ModuleList()
        output_channel = block_out_channels[0]
        for i in range(len(block_out_channels)):
            input_channel = output_channel
            output_channel = block_out_channels[i]
            is_final = i == len(block_out_channels) - 1
            compress_time = i < temporal_compress_level
            self.down_blocks.append(DownBlock3D(
                in_channels=input_channel,
                out_channels=output_channel,
                temb_channels=0,
                num_layers=layers_per_block,
                eps=eps,
                act_fn=act_fn,
                groups=groups,
                add_downsample=not is_final,
                compress_time=compress_time,
                # Previously omitted, so the blocks silently used "first".
                pad_mode=pad_mode,
            ))

        self.mid_block = MidBlock3D(
            in_channels=block_out_channels[-1],
            temb_channels=0,
            num_layers=2,
            eps=eps,
            act_fn=act_fn,
            groups=groups,
            pad_mode=pad_mode,
        )

        self.norm_out = nn.GroupNorm(groups, block_out_channels[-1], eps=1e-6)
        self.conv_act = nn.SiLU()
        self.conv_out = CausalConv3d(block_out_channels[-1], 2 * out_channels, kernel_size=3, pad_mode=pad_mode)

    def forward(self, x, conv_cache=None):
        """Encode one temporal chunk.

        Args:
            x: Input tensor.
            conv_cache: Cache dict returned by the previous chunk, or None.

        Returns:
            Tuple ``(encoded, new_cache)``.
        """
        new_cache = {}
        conv_cache = conv_cache or {}

        x, new_cache["conv_in"] = self.conv_in(x, conv_cache=conv_cache.get("conv_in"))
        for i, block in enumerate(self.down_blocks):
            key = f"down_block_{i}"
            x, new_cache[key] = block(x, None, None, conv_cache.get(key))
        x, new_cache["mid_block"] = self.mid_block(x, None, None, conv_cache=conv_cache.get("mid_block"))

        x = self.norm_out(x)
        x = self.conv_act(x)
        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))
        return x, new_cache


class Decoder3D(nn.Module):
    """Decoder: causal conv in, mid block, up blocks, spatial norm, conv out.

    The input latent is also passed to every spatial norm as the
    conditioning tensor.
    """

    def __init__(self, in_channels=16, out_channels=3,
                 block_out_channels=(128, 256, 256, 512), layers_per_block=3,
                 act_fn="silu", eps=1e-6, groups=32, pad_mode="first",
                 temporal_compression_ratio=4):
        """Build the decoder.

        Args:
            in_channels: Latent channels; also the spatial-norm
                conditioning width.
            out_channels: Channels of the decoded output.
            block_out_channels: Block widths; used in reverse order.
            layers_per_block: Each up block gets ``layers_per_block + 1``
                resnets.
            act_fn: Forwarded to the blocks.
            eps: Forwarded to the blocks.
            groups: Group count for the normalization layers.
            pad_mode: Padding mode for every causal convolution.
            temporal_compression_ratio: The first ``log2(ratio)`` up
                blocks are built with ``compress_time=True``.
        """
        super().__init__()
        reversed_channels = list(reversed(block_out_channels))
        temporal_compress_level = int(np.log2(temporal_compression_ratio))

        self.conv_in = CausalConv3d(in_channels, reversed_channels[0], kernel_size=3, pad_mode=pad_mode)

        self.mid_block = MidBlock3D(
            in_channels=reversed_channels[0],
            temb_channels=0,
            num_layers=2,
            eps=eps,
            act_fn=act_fn,
            groups=groups,
            spatial_norm_dim=in_channels,
            pad_mode=pad_mode,
        )

        self.up_blocks = nn.ModuleList()
        output_channel = reversed_channels[0]
        for i in range(len(block_out_channels)):
            prev_channel = output_channel
            output_channel = reversed_channels[i]
            is_final = i == len(block_out_channels) - 1
            compress_time = i < temporal_compress_level
            self.up_blocks.append(UpBlock3D(
                in_channels=prev_channel,
                out_channels=output_channel,
                temb_channels=0,
                num_layers=layers_per_block + 1,
                eps=eps,
                act_fn=act_fn,
                groups=groups,
                spatial_norm_dim=in_channels,
                add_upsample=not is_final,
                compress_time=compress_time,
                # Previously omitted, so the blocks silently used "first".
                pad_mode=pad_mode,
            ))

        self.norm_out = SpatialNorm3D(reversed_channels[-1], in_channels, groups=groups)
        self.conv_act = nn.SiLU()
        self.conv_out = CausalConv3d(reversed_channels[-1], out_channels, kernel_size=3, pad_mode=pad_mode)

    def forward(self, sample, conv_cache=None):
        """Decode one temporal chunk.

        Args:
            sample: Latent tensor; used as input and as conditioning.
            conv_cache: Cache dict returned by the previous chunk, or None.

        Returns:
            Tuple ``(decoded, new_cache)``.
        """
        new_cache = {}
        conv_cache = conv_cache or {}

        x, new_cache["conv_in"] = self.conv_in(sample, conv_cache=conv_cache.get("conv_in"))
        x, new_cache["mid_block"] = self.mid_block(x, None, sample, conv_cache=conv_cache.get("mid_block"))
        for i, block in enumerate(self.up_blocks):
            key = f"up_block_{i}"
            x, new_cache[key] = block(x, None, sample, conv_cache=conv_cache.get(key))

        x, new_cache["norm_out"] = self.norm_out(x, sample, conv_cache=conv_cache.get("norm_out"))
        x = self.conv_act(x)
        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=conv_cache.get("conv_out"))
        return x, new_cache


class AutoencoderKLCogVideoX(nn.Module):
    """CogVideoX VAE.

    Spatial tiling/slicing handled by ComfyUI's VAE wrapper.
    Temporal frame batching with conv_cache is kept here since the
    causal 3D convolutions need state passed between temporal chunks.
    """

    def __init__(self,
                 in_channels=3, out_channels=3,
                 block_out_channels=(128, 256, 256, 512),
                 latent_channels=16, layers_per_block=3,
                 act_fn="silu", eps=1e-6, groups=32,
                 temporal_compression_ratio=4,
                 ):
        """Build the encoder and decoder and set the frame batch sizes.

        Args:
            in_channels: Channels of the encoder input.
            out_channels: Channels of the decoder output.
            block_out_channels: Block widths shared by both halves.
            latent_channels: Latent channel count.
            layers_per_block: Resnets per encoder block.
            act_fn: Forwarded to encoder and decoder.
            eps: Forwarded to encoder and decoder.
            groups: Forwarded to encoder and decoder.
            temporal_compression_ratio: Forwarded to encoder and decoder.
        """
        super().__init__()
        self.latent_channels = latent_channels

        self.encoder = Encoder3D(
            in_channels=in_channels,
            out_channels=latent_channels,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
            act_fn=act_fn,
            eps=eps,
            groups=groups,
            temporal_compression_ratio=temporal_compression_ratio,
        )
        self.decoder = Decoder3D(
            in_channels=latent_channels,
            out_channels=out_channels,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
            act_fn=act_fn,
            eps=eps,
            groups=groups,
            temporal_compression_ratio=temporal_compression_ratio,
        )

        # Frames processed per decoder / encoder call.
        self.num_latent_frames_batch_size = 2
        self.num_sample_frames_batch_size = 8

    @staticmethod
    def _temporal_slices(num_frames, frame_batch):
        """Yield ``(start, end)`` frame ranges for chunked processing.

        There are ``max(num_frames // frame_batch, 1)`` ranges. The
        remainder ``num_frames % frame_batch`` is absorbed by the first
        range, so every later range covers exactly ``frame_batch`` frames.
        """
        num_batches = max(num_frames // frame_batch, 1)
        remaining = num_frames % frame_batch
        for i in range(num_batches):
            start = frame_batch * i + (0 if i == 0 else remaining)
            end = frame_batch * (i + 1) + remaining
            yield start, end

    def encode(self, x):
        """Encode ``x`` in temporal chunks and return the latent mean.

        Args:
            x: Input tensor; dim 2 is split into chunks.

        Returns:
            The first half (along dim 1) of the concatenated encoder output.
        """
        conv_cache = None
        enc = []
        for start, end in self._temporal_slices(x.shape[2], self.num_sample_frames_batch_size):
            chunk, conv_cache = self.encoder(x[:, :, start:end], conv_cache=conv_cache)
            enc.append(chunk.to(x.device))
        enc = torch.cat(enc, dim=2)
        # The encoder emits two halves along dim 1; only the first is kept.
        mean, _ = enc.chunk(2, dim=1)
        return mean

    def decode(self, z):
        """Decode latent ``z`` in temporal chunks.

        Args:
            z: Latent tensor; dim 2 is split into chunks.

        Returns:
            The decoded tensor on ``z``'s device.
        """
        conv_cache = None
        dec = []
        for start, end in self._temporal_slices(z.shape[2], self.num_latent_frames_batch_size):
            chunk, conv_cache = self.decoder(z[:, :, start:end], conv_cache=conv_cache)
            # Decoded chunks are held on the CPU until they are joined.
            dec.append(chunk.cpu())
        return torch.cat(dec, dim=2).to(z.device)