diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index 0f4059ebe..b8ae91e7a 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -785,8 +785,26 @@ class ZImagePixelSpace(ChromaRadiance):
     pass
 
 class CogVideoX(LatentFormat):
+    """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).
+
+    scale_factor matches the vae/config.json scaling_factor for the 2b variant.
+    The 5b-class checkpoints (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*)
+    use a different value; see CogVideoX1_5 below.
+    """
     latent_channels = 16
     latent_dimensions = 3
 
     def __init__(self):
         self.scale_factor = 1.15258426
+
+
+class CogVideoX1_5(CogVideoX):
+    """Latent format for 5b-class CogVideoX checkpoints.
+
+    Covers THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the CogVideoX-Fun
+    V1.5-5b family (including VOID inpainting). All of these have
+    scaling_factor=0.7 in their vae/config.json. Auto-selected in
+    supported_models.CogVideoX_T2V based on transformer hidden dim.
+    """
+    def __init__(self):
+        self.scale_factor = 0.7
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 9d45ee3ac..a190a3202 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1802,6 +1802,14 @@ class CogVideoX_T2V(supported_models_base.BASE):
     vae_key_prefix = ["vae."]
     text_encoder_key_prefix = ["text_encoders."]
 
+    def __init__(self, unet_config):
+        # 2b-class (dim=1920, heads=30) uses scale_factor=1.15258426.
+        # 5b-class (dim=3072, heads=48) — incl. CogVideoX-5b, 1.5-5B, and
+        # Fun-V1.5 inpainting — uses scale_factor=0.7 per vae/config.json.
+        if unet_config.get("num_attention_heads", 0) >= 48:
+            self.latent_format = latent_formats.CogVideoX1_5
+        super().__init__(unet_config)
+
     def get_model(self, state_dict, prefix="", device=None):
         # CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
         if self.unet_config.get("patch_size_t") is not None:
diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py
index b1e095571..4e9f77930 100644
--- a/comfy_extras/nodes_void.py
+++ b/comfy_extras/nodes_void.py
@@ -2,9 +2,8 @@
 import nodes
 import node_helpers
 import torch
 import comfy
-import comfy.latent_formats
 import comfy.model_management
 import comfy.utils
 from comfy_api.latest import io, ComfyExtension
 from typing_extensions import override
@@ -146,13 +145,12 @@ class VOIDInpaintConditioning(io.ComfyNode):
 
         inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
 
-        # CogVideoX-Fun was trained with Diffusers convention where VAE latents
-        # are scaled by 0.7 (vae.config.scaling_factor). CogVideoX.concat_cond()
-        # applies process_latent_in (×sf=1.153) to the stored conditioning.
-        # Pre-multiply by 0.7 so the model sees the correct magnitude:
-        #   stored = vae_output × 0.7 → after process_in: (vae_output×0.7)×sf = raw×0.7
-        DIFFUSERS_SCALING_FACTOR = 0.7
-        inpaint_latents = inpaint_latents * DIFFUSERS_SCALING_FACTOR
+        # No explicit scaling needed here: the model's CogVideoX.concat_cond()
+        # applies process_latent_in (×latent_format.scale_factor) to each 16-ch
+        # block of the stored conditioning. For 5b-class checkpoints (incl. the
+        # VOID/CogVideoX-Fun-V1.5 inpainting model) that scale_factor is auto-
+        # selected as 0.7 in supported_models.CogVideoX_T2V, which matches the
+        # diffusers vae/config.json scaling_factor VOID was trained with.
 
         positive = node_helpers.conditioning_set_values(
             positive, {"concat_latent_image": inpaint_latents}