mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-23 00:42:30 +08:00
Fix the muted video output.
This commit is contained in:
parent
c176f951e8
commit
f4240442b6
@ -785,8 +785,26 @@ class ZImagePixelSpace(ChromaRadiance):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
class CogVideoX(LatentFormat):
    """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).

    scale_factor matches the vae/config.json scaling_factor for the 2b variant.
    The 5b-class checkpoints (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*)
    use a different value; see CogVideoX1_5 below.
    """
    latent_channels = 16   # channels in the VAE latent space
    latent_dimensions = 3  # video latents: (t, h, w) spatial-temporal dims

    def __init__(self):
        # scaling_factor from the CogVideoX-2b vae/config.json.
        self.scale_factor = 1.15258426
class CogVideoX1_5(CogVideoX):
    """Latent format for 5b-class CogVideoX checkpoints.

    Covers THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the CogVideoX-Fun
    V1.5-5b family (including VOID inpainting). All of these have
    scaling_factor=0.7 in their vae/config.json. Auto-selected in
    supported_models.CogVideoX_T2V based on transformer hidden dim.
    """

    def __init__(self):
        # Overrides the 2b value (1.15258426) inherited from CogVideoX.
        self.scale_factor = 0.7
@ -1802,6 +1802,14 @@ class CogVideoX_T2V(supported_models_base.BASE):
|
|||||||
vae_key_prefix = ["vae."]
|
vae_key_prefix = ["vae."]
|
||||||
text_encoder_key_prefix = ["text_encoders."]
|
text_encoder_key_prefix = ["text_encoders."]
|
||||||
|
|
||||||
|
def __init__(self, unet_config):
    """Pick the correct latent format before running the base-class init.

    # 2b-class (dim=1920, heads=30) uses scale_factor=1.15258426 (the class
    # default latent_format). 5b-class (dim=3072, heads=48) — incl.
    # CogVideoX-5b, 1.5-5B, and Fun-V1.5 inpainting — uses scale_factor=0.7
    # per vae/config.json, so switch to CogVideoX1_5 for those.
    """
    if unet_config.get("num_attention_heads", 0) >= 48:
        self.latent_format = latent_formats.CogVideoX1_5
    super().__init__(unet_config)
def get_model(self, state_dict, prefix="", device=None):
|
def get_model(self, state_dict, prefix="", device=None):
|
||||||
# CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
|
# CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
|
||||||
if self.unet_config.get("patch_size_t") is not None:
|
if self.unet_config.get("patch_size_t") is not None:
|
||||||
|
|||||||
@ -2,9 +2,10 @@ import nodes
|
|||||||
import node_helpers
|
import node_helpers
|
||||||
import torch
|
import torch
|
||||||
import comfy
|
import comfy
|
||||||
import comfy.latent_formats
|
|
||||||
import comfy.model_management
|
import comfy.model_management
|
||||||
|
import comfy.samplers
|
||||||
import comfy.utils
|
import comfy.utils
|
||||||
|
from comfy.utils import model_trange as trange
|
||||||
from comfy_api.latest import io, ComfyExtension
|
from comfy_api.latest import io, ComfyExtension
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
@ -146,13 +147,12 @@ class VOIDInpaintConditioning(io.ComfyNode):
|
|||||||
|
|
||||||
inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
|
inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
|
||||||
|
|
||||||
# CogVideoX-Fun was trained with Diffusers convention where VAE latents
|
# No explicit scaling needed here: the model's CogVideoX.concat_cond()
|
||||||
# are scaled by 0.7 (vae.config.scaling_factor). CogVideoX.concat_cond()
|
# applies process_latent_in (×latent_format.scale_factor) to each 16-ch
|
||||||
# applies process_latent_in (×sf=1.153) to the stored conditioning.
|
# block of the stored conditioning. For 5b-class checkpoints (incl. the
|
||||||
# Pre-multiply by 0.7 so the model sees the correct magnitude:
|
# VOID/CogVideoX-Fun-V1.5 inpainting model) that scale_factor is auto-
|
||||||
# stored = vae_output × 0.7 → after process_in: (vae_output×0.7)×sf = raw×0.7
|
# selected as 0.7 in supported_models.CogVideoX_T2V, which matches the
|
||||||
DIFFUSERS_SCALING_FACTOR = 0.7
|
# diffusers vae/config.json scaling_factor VOID was trained with.
|
||||||
inpaint_latents = inpaint_latents * DIFFUSERS_SCALING_FACTOR
|
|
||||||
|
|
||||||
positive = node_helpers.conditioning_set_values(
|
positive = node_helpers.conditioning_set_values(
|
||||||
positive, {"concat_latent_image": inpaint_latents}
|
positive, {"concat_latent_image": inpaint_latents}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user