Fix the muted video output.

This commit is contained in:
Talmaj Marinc 2026-04-16 18:36:22 +02:00
parent c176f951e8
commit f4240442b6
3 changed files with 34 additions and 8 deletions

View File

@@ -785,8 +785,26 @@ class ZImagePixelSpace(ChromaRadiance):
pass
class CogVideoX(LatentFormat):
    """Latent format for the CogVideoX-2b checkpoint (THUDM/CogVideoX-2b).

    The scale factor mirrors ``scaling_factor`` from the 2b variant's
    vae/config.json. Note that the 5b-class checkpoints (CogVideoX-5b,
    CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*) ship a different value —
    those are handled by CogVideoX1_5.
    """

    # 16-channel 3D (video) latents.
    latent_channels = 16
    latent_dimensions = 3

    def __init__(self):
        # scaling_factor from THUDM/CogVideoX-2b vae/config.json.
        self.scale_factor = 1.15258426
class CogVideoX1_5(CogVideoX):
    """Latent format for the 5b-class CogVideoX checkpoints.

    Applies to THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the
    CogVideoX-Fun V1.5-5b family (VOID inpainting included); each of
    these publishes ``scaling_factor=0.7`` in its vae/config.json.
    Chosen automatically by supported_models.CogVideoX_T2V from the
    transformer hidden dimension.
    """

    def __init__(self):
        # 5b-class scaling_factor (vae/config.json).
        self.scale_factor = 0.7

View File

@@ -1802,6 +1802,14 @@ class CogVideoX_T2V(supported_models_base.BASE):
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def __init__(self, unet_config):
    """Pick the latent format from the transformer config, then init BASE.

    The 2b-class transformer (hidden dim 1920, 30 heads) keeps the default
    latent format (scale_factor 1.15258426), while the 5b-class models
    (hidden dim 3072, 48 heads) — CogVideoX-5b, 1.5-5B, and the Fun-V1.5
    inpainting family — need scale_factor 0.7 per their vae/config.json.
    """
    head_count = unet_config.get("num_attention_heads", 0)
    if head_count >= 48:
        # 5b-class checkpoint: switch to the 0.7-scaled latent format.
        self.latent_format = latent_formats.CogVideoX1_5
    super().__init__(unet_config)
def get_model(self, state_dict, prefix="", device=None):
# CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
if self.unet_config.get("patch_size_t") is not None:

View File

@@ -2,9 +2,10 @@ import nodes
import node_helpers
import torch
import comfy
import comfy.latent_formats
import comfy.model_management
import comfy.samplers
import comfy.utils
from comfy.utils import model_trange as trange
from comfy_api.latest import io, ComfyExtension
from typing_extensions import override
@@ -146,13 +147,12 @@ class VOIDInpaintConditioning(io.ComfyNode):
inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
# CogVideoX-Fun was trained with Diffusers convention where VAE latents
# are scaled by 0.7 (vae.config.scaling_factor). CogVideoX.concat_cond()
# applies process_latent_in (×sf=1.153) to the stored conditioning.
# Pre-multiply by 0.7 so the model sees the correct magnitude:
# stored = vae_output × 0.7 → after process_in: (vae_output×0.7)×sf = raw×0.7
DIFFUSERS_SCALING_FACTOR = 0.7
inpaint_latents = inpaint_latents * DIFFUSERS_SCALING_FACTOR
# No explicit scaling needed here: the model's CogVideoX.concat_cond()
# applies process_latent_in (×latent_format.scale_factor) to each 16-ch
# block of the stored conditioning. For 5b-class checkpoints (incl. the
# VOID/CogVideoX-Fun-V1.5 inpainting model) that scale_factor is auto-
# selected as 0.7 in supported_models.CogVideoX_T2V, which matches the
# diffusers vae/config.json scaling_factor VOID was trained with.
positive = node_helpers.conditioning_set_values(
positive, {"concat_latent_image": inpaint_latents}