Fix the muted video output.

This commit is contained in:
Talmaj Marinc 2026-04-16 18:36:22 +02:00
parent c176f951e8
commit f4240442b6
3 changed files with 34 additions and 8 deletions

View File

@@ -785,8 +785,26 @@ class ZImagePixelSpace(ChromaRadiance):
pass
class CogVideoX(LatentFormat):
    """Latent format for the CogVideoX-2b checkpoint (THUDM/CogVideoX-2b).

    The scale factor mirrors ``scaling_factor`` from the 2b variant's
    vae/config.json. Note that the 5b-class checkpoints (CogVideoX-5b,
    CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*) ship a different value —
    those are handled by CogVideoX1_5.
    """

    # 16-channel 3D (video) latents.
    latent_channels = 16
    latent_dimensions = 3

    def __init__(self):
        # scaling_factor from THUDM/CogVideoX-2b vae/config.json.
        self.scale_factor = 1.15258426
class CogVideoX1_5(CogVideoX):
    """Latent format for the 5b-class CogVideoX checkpoints.

    Applies to THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the
    CogVideoX-Fun V1.5-5b family (VOID inpainting included); each of
    these publishes ``scaling_factor=0.7`` in its vae/config.json.
    Chosen automatically by supported_models.CogVideoX_T2V from the
    transformer hidden dimension.
    """

    def __init__(self):
        # 5b-class scaling_factor (vae/config.json).
        self.scale_factor = 0.7

View File

@@ -1802,6 +1802,14 @@ class CogVideoX_T2V(supported_models_base.BASE):
vae_key_prefix = ["vae."]
text_encoder_key_prefix = ["text_encoders."]
def __init__(self, unet_config):
    """Pick the latent format from the transformer config, then init BASE.

    The 2b-class transformer (hidden dim 1920, 30 heads) keeps the default
    latent format (scale_factor 1.15258426), while the 5b-class models
    (hidden dim 3072, 48 heads) — CogVideoX-5b, 1.5-5B, and the Fun-V1.5
    inpainting family — need scale_factor 0.7 per their vae/config.json.
    """
    head_count = unet_config.get("num_attention_heads", 0)
    if head_count >= 48:
        # 5b-class checkpoint: switch to the 0.7-scaled latent format.
        self.latent_format = latent_formats.CogVideoX1_5
    super().__init__(unet_config)
def get_model(self, state_dict, prefix="", device=None):
# CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
if self.unet_config.get("patch_size_t") is not None:

View File

@@ -2,9 +2,10 @@ import nodes
import node_helpers
import torch
import comfy
import comfy.latent_formats
import comfy.model_management
import comfy.samplers
import comfy.utils
from comfy.utils import model_trange as trange
from comfy_api.latest import io, ComfyExtension
from typing_extensions import override
@@ -146,13 +147,12 @@ class VOIDInpaintConditioning(io.ComfyNode):
inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
# CogVideoX-Fun was trained with Diffusers convention where VAE latents
# are scaled by 0.7 (vae.config.scaling_factor). CogVideoX.concat_cond()
# applies process_latent_in (×sf=1.153) to the stored conditioning.
# Pre-multiply by 0.7 so the model sees the correct magnitude:
# stored = vae_output × 0.7 → after process_in: (vae_output×0.7)×sf = raw×0.7
DIFFUSERS_SCALING_FACTOR = 0.7
inpaint_latents = inpaint_latents * DIFFUSERS_SCALING_FACTOR
# No explicit scaling needed here: the model's CogVideoX.concat_cond()
# applies process_latent_in (×latent_format.scale_factor) to each 16-ch
# block of the stored conditioning. For 5b-class checkpoints (incl. the
# VOID/CogVideoX-Fun-V1.5 inpainting model) that scale_factor is auto-
# selected as 0.7 in supported_models.CogVideoX_T2V, which matches the
# diffusers vae/config.json scaling_factor VOID was trained with.
positive = node_helpers.conditioning_set_values(
positive, {"concat_latent_image": inpaint_latents}