mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-21 16:02:54 +08:00
Fix the muted video output.
This commit is contained in:
parent
c176f951e8
commit
f4240442b6
@ -785,8 +785,26 @@ class ZImagePixelSpace(ChromaRadiance):
|
||||
pass
|
||||
|
||||
class CogVideoX(LatentFormat):
    """Latent-space description for the CogVideoX-2b checkpoint (THUDM/CogVideoX-2b).

    The scale factor assigned in ``__init__`` mirrors the ``scaling_factor``
    entry of the 2b variant's vae/config.json. The 5b-class checkpoints
    (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*) ship a different
    value and are covered by the ``CogVideoX1_5`` subclass.
    """

    # Shape metadata shared by every CogVideoX variant.
    latent_channels = 16
    latent_dimensions = 3

    def __init__(self):
        # scaling_factor for the 2b VAE; subclasses replace this per checkpoint.
        self.scale_factor = 1.15258426
|
||||
|
||||
|
||||
class CogVideoX1_5(CogVideoX):
    """CogVideoX latent format carrying the 5b-class VAE scale factor.

    Applies to THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the
    CogVideoX-Fun V1.5-5b family (VOID inpainting included); each of these
    declares scaling_factor=0.7 in its vae/config.json.
    supported_models.CogVideoX_T2V selects this class automatically for
    5b-class transformers (it checks the attention head count).
    """

    def __init__(self):
        # Replaces the parent's 2b value outright; the parent initializer does
        # nothing besides assigning scale_factor, so it is not chained to.
        self.scale_factor = 0.7
|
||||
|
||||
@ -1802,6 +1802,14 @@ class CogVideoX_T2V(supported_models_base.BASE):
|
||||
vae_key_prefix = ["vae."]
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
def __init__(self, unet_config):
    """Pick the latent format matching the checkpoint size, then run base setup.

    Args:
        unet_config: Mapping of transformer settings; only
            ``num_attention_heads`` is inspected here (treated as 0 when
            the key is absent).
    """
    # 2b-class (dim=1920, heads=30) checkpoints use scale_factor=1.15258426
    # and are left on whatever latent_format the class already declares.
    # 5b-class (dim=3072, heads=48) checkpoints -- CogVideoX-5b, 1.5-5B and
    # the Fun-V1.5 inpainting model -- use scale_factor=0.7 per their
    # vae/config.json, so they are switched to the CogVideoX1_5 format.
    head_count = unet_config.get("num_attention_heads", 0)
    is_5b_class = head_count >= 48
    if is_5b_class:
        # Assigned before delegating to the base initializer.
        # NOTE(review): presumably the base class consumes latent_format
        # during its own __init__ -- confirm before reordering.
        self.latent_format = latent_formats.CogVideoX1_5
    super().__init__(unet_config)
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
# CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
|
||||
if self.unet_config.get("patch_size_t") is not None:
|
||||
|
||||
@ -2,9 +2,10 @@ import nodes
|
||||
import node_helpers
|
||||
import torch
|
||||
import comfy
|
||||
import comfy.latent_formats
|
||||
import comfy.model_management
|
||||
import comfy.samplers
|
||||
import comfy.utils
|
||||
from comfy.utils import model_trange as trange
|
||||
from comfy_api.latest import io, ComfyExtension
|
||||
from typing_extensions import override
|
||||
|
||||
@ -146,13 +147,12 @@ class VOIDInpaintConditioning(io.ComfyNode):
|
||||
|
||||
inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
|
||||
|
||||
# CogVideoX-Fun was trained with Diffusers convention where VAE latents
|
||||
# are scaled by 0.7 (vae.config.scaling_factor). CogVideoX.concat_cond()
|
||||
# applies process_latent_in (×sf=1.153) to the stored conditioning.
|
||||
# Pre-multiply by 0.7 so the model sees the correct magnitude:
|
||||
# stored = vae_output × 0.7 → after process_in: (vae_output×0.7)×sf = raw×0.7
|
||||
DIFFUSERS_SCALING_FACTOR = 0.7
|
||||
inpaint_latents = inpaint_latents * DIFFUSERS_SCALING_FACTOR
|
||||
# No explicit scaling needed here: the model's CogVideoX.concat_cond()
|
||||
# applies process_latent_in (×latent_format.scale_factor) to each 16-ch
|
||||
# block of the stored conditioning. For 5b-class checkpoints (incl. the
|
||||
# VOID/CogVideoX-Fun-V1.5 inpainting model) that scale_factor is auto-
|
||||
# selected as 0.7 in supported_models.CogVideoX_T2V, which matches the
|
||||
# diffusers vae/config.json scaling_factor VOID was trained with.
|
||||
|
||||
positive = node_helpers.conditioning_set_values(
|
||||
positive, {"concat_latent_image": inpaint_latents}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user