From 9df5fb2ae957efef500c6c63584845fce19f2727 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Thu, 9 Apr 2026 09:08:50 +0200 Subject: [PATCH 01/13] Initial commit for void model CORE-38. --- comfy/supported_models.py | 15 ++++ comfy_extras/nodes_void.py | 172 +++++++++++++++++++++++++++++++++++++ nodes.py | 1 + 3 files changed, 188 insertions(+) create mode 100644 comfy_extras/nodes_void.py diff --git a/comfy/supported_models.py b/comfy/supported_models.py index e6c17fb98..d0e69687e 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1879,6 +1879,20 @@ class CogVideoX_I2V(CogVideoX_T2V): out = model_base.CogVideoX(self, image_to_video=True, device=device) return out +class CogVideoX_Inpaint(CogVideoX_T2V): + unet_config = { + "image_model": "cogvideox", + "in_channels": 48, + } + + def get_model(self, state_dict, prefix="", device=None): + if self.unet_config.get("patch_size_t") is not None: + self.unet_config.setdefault("sample_height", 96) + self.unet_config.setdefault("sample_width", 170) + self.unet_config.setdefault("sample_frames", 81) + out = model_base.CogVideoX(self, image_to_video=True, device=device) + return out + models = [ LotusD, @@ -1958,6 +1972,7 @@ models = [ ErnieImage, SAM3, SAM31, + CogVideoX_Inpaint, CogVideoX_I2V, CogVideoX_T2V, SVD_img2vid, diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py new file mode 100644 index 000000000..79122d013 --- /dev/null +++ b/comfy_extras/nodes_void.py @@ -0,0 +1,172 @@ +import nodes +import node_helpers +import torch +import comfy.model_management +import comfy.utils +from comfy_api.latest import io, ComfyExtension +from typing_extensions import override + + +class VOIDQuadmaskPreprocess(io.ComfyNode): + """Preprocess a quadmask video for VOID inpainting. + + Quantizes mask values to four semantic levels, inverts, and normalizes: + 0 -> primary object to remove + 63 -> overlap of primary + affected + 127 -> affected region (interactions) + 255 -> background (keep) + + After inversion and normalization, the output mask has values in [0, 1] + with four discrete levels: 1.0 (remove), ~0.75, ~0.50, 0.0 (keep). + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="VOIDQuadmaskPreprocess", + category="mask/video", + inputs=[ + io.Mask.Input("mask"), + io.Int.Input("dilate_width", default=0, min=0, max=50, step=1, + tooltip="Dilation radius for the primary mask region (0 = no dilation)"), + ], + outputs=[ + io.Mask.Output(display_name="quadmask"), + ], + ) + + @classmethod + def execute(cls, mask, dilate_width=0) -> io.NodeOutput: + m = mask.clone() + + if m.max() <= 1.0: + m = m * 255.0 + + if dilate_width > 0 and m.ndim >= 3: + binary = (m < 128).float() + kernel_size = dilate_width * 2 + 1 + if binary.ndim == 3: + binary = binary.unsqueeze(1) + dilated = torch.nn.functional.max_pool2d( + binary, kernel_size=kernel_size, stride=1, padding=dilate_width + ) + if dilated.ndim == 4: + dilated = dilated.squeeze(1) + m = torch.where(dilated > 0.5, torch.zeros_like(m), m) + + m = torch.where(m <= 31, torch.zeros_like(m), m) + m = torch.where((m > 31) & (m <= 95), torch.full_like(m, 63), m) + m = torch.where((m > 95) & (m <= 191), torch.full_like(m, 127), m) + m = torch.where(m > 191, torch.full_like(m, 255), m) + + m = (255.0 - m) / 255.0 + + return io.NodeOutput(m) + + +class VOIDInpaintConditioning(io.ComfyNode): + """Build VOID inpainting conditioning for CogVideoX. + + Encodes the processed quadmask and masked source video through the VAE, + producing a 32-channel concat conditioning (16ch mask + 16ch masked video) + that gets concatenated with the 16ch noise latent by the model. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="VOIDInpaintConditioning", + category="conditioning/video_models", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.Vae.Input("vae"), + io.Image.Input("video", tooltip="Source video frames [T, H, W, 3]"), + io.Mask.Input("quadmask", tooltip="Preprocessed quadmask from VOIDQuadmaskPreprocess [T, H, W]"), + io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("length", default=49, min=1, max=nodes.MAX_RESOLUTION, step=1, + tooltip="Number of pixel frames to process"), + io.Int.Input("batch_size", default=1, min=1, max=64), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + io.Latent.Output(display_name="latent"), + ], + ) + + @classmethod + def execute(cls, positive, negative, vae, video, quadmask, + width, height, length, batch_size) -> io.NodeOutput: + + temporal_compression = 4 + latent_t = ((length - 1) // temporal_compression) + 1 + latent_h = height // 8 + latent_w = width // 8 + + vid = video[:length] + vid = comfy.utils.common_upscale( + vid.movedim(-1, 1), width, height, "bilinear", "center" + ).movedim(1, -1) + + qm = quadmask[:length] + if qm.ndim == 3: + qm = qm.unsqueeze(-1) + qm = comfy.utils.common_upscale( + qm.movedim(-1, 1), width, height, "bilinear", "center" + ).movedim(1, -1) + if qm.ndim == 4 and qm.shape[-1] == 1: + qm = qm.squeeze(-1) + + mask_condition = qm + if mask_condition.ndim == 3: + mask_condition_3ch = mask_condition.unsqueeze(-1).expand(-1, -1, -1, 3) + else: + mask_condition_3ch = mask_condition + + inverted_mask_3ch = 1.0 - mask_condition_3ch + masked_video = vid[:, :, :, :3] * (1.0 - mask_condition_3ch) + + mask_latents = vae.encode(inverted_mask_3ch) + masked_video_latents = vae.encode(masked_video) + + def _match_temporal(lat, target_t): + if lat.shape[2] > target_t: + return lat[:, :, :target_t] + elif lat.shape[2] < target_t: + pad = target_t - lat.shape[2] + return torch.cat([lat, lat[:, :, -1:].repeat(1, 1, pad, 1, 1)], dim=2) + return lat + + mask_latents = _match_temporal(mask_latents, latent_t) + masked_video_latents = _match_temporal(masked_video_latents, latent_t) + + inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1) + + positive = node_helpers.conditioning_set_values( + positive, {"concat_latent_image": inpaint_latents} + ) + negative = node_helpers.conditioning_set_values( + negative, {"concat_latent_image": inpaint_latents} + ) + + noise_latent = torch.zeros( + [batch_size, 16, latent_t, latent_h, latent_w], + device=comfy.model_management.intermediate_device() + ) + + return io.NodeOutput(positive, negative, {"samples": noise_latent}) + + +class VOIDExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + VOIDQuadmaskPreprocess, + VOIDInpaintConditioning, + ] + + +async def comfy_entrypoint() -> VOIDExtension: + return VOIDExtension() diff --git a/nodes.py b/nodes.py index 8f8f90cf6..c92429766 100644 --- a/nodes.py +++ b/nodes.py @@ -2428,6 +2428,7 @@ async def init_builtin_extra_nodes(): "nodes_rtdetr.py", "nodes_frame_interpolation.py", "nodes_sam3.py", + "nodes_void.py", ] import_failed = [] From 7a053b5ba719e362e444d3e68f35c8f108db7c9b Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Fri, 10 Apr 2026 11:11:36 +0200 Subject: [PATCH 02/13] Add latents fix. --- comfy_extras/nodes_void.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index 79122d013..30c595f7e 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -1,6 +1,8 @@ import nodes import node_helpers import torch +import comfy +import comfy.latent_formats import comfy.model_management import comfy.utils from comfy_api.latest import io, ComfyExtension @@ -144,6 +146,12 @@ class VOIDInpaintConditioning(io.ComfyNode): inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1) + # CogVideoX.concat_cond() applies process_latent_in (x scale_factor) to + # concat_latent_image before feeding it to the transformer. Pre-divide here + # so the net scaling is identity — the VOID model expects raw VAE latents. + scale_factor = comfy.latent_formats.CogVideoX().scale_factor + inpaint_latents = inpaint_latents / scale_factor + positive = node_helpers.conditioning_set_values( positive, {"concat_latent_image": inpaint_latents} ) From 72618f1657d416b9ca14368c1cc9bbee92b37274 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Wed, 15 Apr 2026 17:36:54 +0200 Subject: [PATCH 03/13] Initial void pass 2 commit. --- comfy_extras/nodes_void.py | 151 +++++++++++++++++++++++++++++++++++-- 1 file changed, 146 insertions(+), 5 deletions(-) diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index 30c595f7e..bdf21ebf6 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -146,11 +146,13 @@ class VOIDInpaintConditioning(io.ComfyNode): inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1) - # CogVideoX.concat_cond() applies process_latent_in (x scale_factor) to - # concat_latent_image before feeding it to the transformer. Pre-divide here - # so the net scaling is identity — the VOID model expects raw VAE latents. - scale_factor = comfy.latent_formats.CogVideoX().scale_factor - inpaint_latents = inpaint_latents / scale_factor + # CogVideoX-Fun was trained with Diffusers convention where VAE latents + # are scaled by 0.7 (vae.config.scaling_factor). CogVideoX.concat_cond() + # applies process_latent_in (×sf=1.153) to the stored conditioning. + # Pre-multiply by 0.7 so the model sees the correct magnitude: + # stored = vae_output × 0.7 → after process_in: (vae_output×0.7)×sf = raw×0.7 + DIFFUSERS_SCALING_FACTOR = 0.7 + inpaint_latents = inpaint_latents * DIFFUSERS_SCALING_FACTOR positive = node_helpers.conditioning_set_values( positive, {"concat_latent_image": inpaint_latents} @@ -167,12 +169,151 @@ class VOIDInpaintConditioning(io.ComfyNode): return io.NodeOutput(positive, negative, {"samples": noise_latent}) +class VOIDWarpedNoise(io.ComfyNode): + """Generate optical-flow warped noise for VOID Pass 2 refinement. + + Takes the Pass 1 output video and produces temporally-correlated noise + by warping Gaussian noise along optical flow vectors. This noise is used + as the initial latent for Pass 2, resulting in better temporal consistency. + + Requires: pip install rp (auto-installs Go-with-the-Flow dependencies) + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="VOIDWarpedNoise", + category="latent/video", + inputs=[ + io.Image.Input("video", tooltip="Pass 1 output video frames [T, H, W, 3]"), + io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8), + io.Int.Input("length", default=49, min=1, max=nodes.MAX_RESOLUTION, step=1, + tooltip="Number of pixel frames"), + io.Int.Input("batch_size", default=1, min=1, max=64), + ], + outputs=[ + io.Latent.Output(display_name="warped_noise"), + ], + ) + + @classmethod + def execute(cls, video, width, height, length, batch_size) -> io.NodeOutput: + import numpy as np + + try: + import rp + rp.r._pip_import_autoyes = True + rp.git_import('CommonSource') + import rp.git.CommonSource.noise_warp as nw + except ImportError: + raise RuntimeError( + "VOIDWarpedNoise requires the 'rp' package. Install with: pip install rp" + ) + + temporal_compression = 4 + latent_t = ((length - 1) // temporal_compression) + 1 + latent_h = height // 8 + latent_w = width // 8 + + vid = video[:length].cpu().numpy() + vid_uint8 = (vid * 255).clip(0, 255).astype(np.uint8) + + frames = [vid_uint8[i] for i in range(vid_uint8.shape[0])] + frames = rp.resize_images_to_hold(frames, height=height, width=width) + frames = rp.crop_images(frames, height=height, width=width, origin='center') + frames = rp.as_numpy_array(frames) + + FRAME = 2**-1 + FLOW = 2**3 + LATENT_SCALE = 8 + + warp_output = nw.get_noise_from_video( + frames, + remove_background=False, + visualize=False, + save_files=False, + noise_channels=16, + output_folder=None, + resize_frames=FRAME, + resize_flow=FLOW, + downscale_factor=round(FRAME * FLOW) * LATENT_SCALE, + ) + + warped_np = warp_output.numpy_noises # (T, H, W, C) + if warped_np.dtype == np.float16: + warped_np = warped_np.astype(np.float32) + + import cv2 + + if warped_np.shape[0] != latent_t: + indices = np.linspace(0, warped_np.shape[0] - 1, latent_t).astype(int) + warped_np = warped_np[indices] + + if warped_np.shape[1] != latent_h or warped_np.shape[2] != latent_w: + resized = [] + for t_idx in range(latent_t): + frame = warped_np[t_idx] + ch_resized = [ + cv2.resize(frame[:, :, c], (latent_w, latent_h), + interpolation=cv2.INTER_LINEAR) + for c in range(frame.shape[2]) + ] + resized.append(np.stack(ch_resized, axis=2)) + warped_np = np.stack(resized, axis=0) + + # (T, H, W, C) -> (B, C, T, H, W) + warped_tensor = torch.from_numpy( + warped_np.transpose(3, 0, 1, 2) + ).float().unsqueeze(0) + + if batch_size > 1: + warped_tensor = warped_tensor.repeat(batch_size, 1, 1, 1, 1) + + warped_tensor = warped_tensor.to(comfy.model_management.intermediate_device()) + + return io.NodeOutput({"samples": warped_tensor}) + + +class Noise_FromLatent: + """Wraps a pre-computed LATENT tensor as a NOISE source.""" + def __init__(self, latent_dict): + self.seed = 0 + self._samples = latent_dict["samples"] + + def generate_noise(self, input_latent): + return self._samples.clone().cpu() + + +class VOIDWarpedNoiseSource(io.ComfyNode): + """Convert a LATENT (e.g. from VOIDWarpedNoise) into a NOISE source + for use with SamplerCustomAdvanced.""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="VOIDWarpedNoiseSource", + category="sampling/custom_sampling/noise", + inputs=[ + io.Latent.Input("warped_noise", + tooltip="Warped noise latent from VOIDWarpedNoise"), + ], + outputs=[io.Noise.Output()], + ) + + @classmethod + def execute(cls, warped_noise) -> io.NodeOutput: + return io.NodeOutput(Noise_FromLatent(warped_noise)) + + class VOIDExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: return [ VOIDQuadmaskPreprocess, VOIDInpaintConditioning, + VOIDWarpedNoise, + VOIDWarpedNoiseSource, ] From b4a7ba83e11354af5889dc8126aad4875f1a0a57 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Thu, 16 Apr 2026 18:23:19 +0200 Subject: [PATCH 04/13] Add VOIDSampler. --- comfy_extras/nodes_void.py | 62 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index bdf21ebf6..b1e095571 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -306,6 +306,67 @@ class VOIDWarpedNoiseSource(io.ComfyNode): return io.NodeOutput(Noise_FromLatent(warped_noise)) +class VOID_DDIM(comfy.samplers.Sampler): + """DDIM sampler for VOID inpainting models. + + VOID was trained with the diffusers CogVideoXDDIMScheduler which operates in + alpha-space (input std ≈ 1). The standard KSampler applies noise_scaling that + multiplies by sqrt(1+sigma^2) ≈ 4500x, which is incompatible with VOID's + training. This sampler skips noise_scaling and implements the DDIM update rule + directly using sigma-to-alpha conversion. + """ + + def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False): + x = noise.to(torch.float32) + model_options = extra_args.get("model_options", {}) + seed = extra_args.get("seed", None) + s_in = x.new_ones([x.shape[0]]) + + for i in trange(len(sigmas) - 1, disable=disable_pbar): + sigma = sigmas[i] + sigma_next = sigmas[i + 1] + + denoised = model_wrap(x, sigma * s_in, model_options=model_options, seed=seed) + + if callback is not None: + callback(i, denoised, x, len(sigmas) - 1) + + if sigma_next == 0: + x = denoised + else: + alpha_t = 1.0 / (1.0 + sigma ** 2) + alpha_prev = 1.0 / (1.0 + sigma_next ** 2) + + pred_eps = (x - (alpha_t ** 0.5) * denoised) / (1.0 - alpha_t) ** 0.5 + x = (alpha_prev ** 0.5) * denoised + (1.0 - alpha_prev) ** 0.5 * pred_eps + + return x + + +class VOIDSampler(io.ComfyNode): + """VOID DDIM sampler for use with SamplerCustom / SamplerCustomAdvanced. + + Required for VOID inpainting models. Implements the same DDIM loop that VOID + was trained with (diffusers CogVideoXDDIMScheduler), without the noise_scaling + that the standard KSampler applies. Use with RandomNoise or VOIDWarpedNoiseSource. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="VOIDSampler", + category="sampling/custom_sampling/samplers", + inputs=[], + outputs=[io.Sampler.Output()], + ) + + @classmethod + def execute(cls) -> io.NodeOutput: + return io.NodeOutput(VOID_DDIM()) + + get_sampler = execute + + class VOIDExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: @@ -314,6 +375,7 @@ class VOIDExtension(ComfyExtension): VOIDInpaintConditioning, VOIDWarpedNoise, VOIDWarpedNoiseSource, + VOIDSampler, ] From cf5c2b9119cc06b85d7eda12cd8b5c7065556b38 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Thu, 16 Apr 2026 18:36:22 +0200 Subject: [PATCH 05/13] Fix the muted video output. --- comfy/latent_formats.py | 18 ++++++++++++++++++ comfy/supported_models.py | 8 ++++++++ comfy_extras/nodes_void.py | 16 ++++++++-------- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 3dac5be18..c278a301e 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -786,8 +786,26 @@ class ZImagePixelSpace(ChromaRadiance): pass class CogVideoX(LatentFormat): + """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b). + + scale_factor matches the vae/config.json scaling_factor for the 2b variant. + The 5b-class checkpoints (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*) + use a different value; see CogVideoX1_5 below. + """ latent_channels = 16 latent_dimensions = 3 def __init__(self): self.scale_factor = 1.15258426 + + +class CogVideoX1_5(CogVideoX): + """Latent format for 5b-class CogVideoX checkpoints. + + Covers THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the CogVideoX-Fun + V1.5-5b family (including VOID inpainting). All of these have + scaling_factor=0.7 in their vae/config.json. Auto-selected in + supported_models.CogVideoX_T2V based on transformer hidden dim. + """ + def __init__(self): + self.scale_factor = 0.7 diff --git a/comfy/supported_models.py b/comfy/supported_models.py index d0e69687e..3c7591c2c 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1853,6 +1853,14 @@ class CogVideoX_T2V(supported_models_base.BASE): vae_key_prefix = ["vae."] text_encoder_key_prefix = ["text_encoders."] + def __init__(self, unet_config): + # 2b-class (dim=1920, heads=30) uses scale_factor=1.15258426. + # 5b-class (dim=3072, heads=48) — incl. CogVideoX-5b, 1.5-5B, and + # Fun-V1.5 inpainting — uses scale_factor=0.7 per vae/config.json. + if unet_config.get("num_attention_heads", 0) >= 48: + self.latent_format = latent_formats.CogVideoX1_5 + super().__init__(unet_config) + def get_model(self, state_dict, prefix="", device=None): # CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE if self.unet_config.get("patch_size_t") is not None: diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index b1e095571..4e9f77930 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -2,9 +2,10 @@ import nodes import node_helpers import torch import comfy -import comfy.latent_formats import comfy.model_management +import comfy.samplers import comfy.utils +from comfy.utils import model_trange as trange from comfy_api.latest import io, ComfyExtension from typing_extensions import override @@ -146,13 +147,12 @@ class VOIDInpaintConditioning(io.ComfyNode): inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1) - # CogVideoX-Fun was trained with Diffusers convention where VAE latents - # are scaled by 0.7 (vae.config.scaling_factor). CogVideoX.concat_cond() - # applies process_latent_in (×sf=1.153) to the stored conditioning. - # Pre-multiply by 0.7 so the model sees the correct magnitude: - # stored = vae_output × 0.7 → after process_in: (vae_output×0.7)×sf = raw×0.7 - DIFFUSERS_SCALING_FACTOR = 0.7 - inpaint_latents = inpaint_latents * DIFFUSERS_SCALING_FACTOR + # No explicit scaling needed here: the model's CogVideoX.concat_cond() + # applies process_latent_in (×latent_format.scale_factor) to each 16-ch + # block of the stored conditioning. For 5b-class checkpoints (incl. the + # VOID/CogVideoX-Fun-V1.5 inpainting model) that scale_factor is auto- + # selected as 0.7 in supported_models.CogVideoX_T2V, which matches the + # diffusers vae/config.json scaling_factor VOID was trained with. positive = node_helpers.conditioning_set_values( positive, {"concat_latent_image": inpaint_latents} From 256fb7ed8f9bdc539a31d0df5d05ca4fff507976 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Thu, 16 Apr 2026 18:51:23 +0200 Subject: [PATCH 06/13] Add custom clip type cogvideox --- comfy/sd.py | 5 ++++ comfy/text_encoders/cogvideo.py | 42 +++++++++++++++++++++++++++++++++ nodes.py | 4 ++-- 3 files changed, 49 insertions(+), 2 deletions(-) diff --git a/comfy/sd.py b/comfy/sd.py index 9fce0e7d0..749bdd710 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -66,6 +66,7 @@ import comfy.text_encoders.longcat_image import comfy.text_encoders.qwen35 import comfy.text_encoders.ernie import comfy.text_encoders.gemma4 +import comfy.text_encoders.cogvideo import comfy.model_patcher import comfy.lora @@ -1224,6 +1225,7 @@ class CLIPType(Enum): NEWBIE = 24 FLUX2 = 25 LONGCAT_IMAGE = 26 + COGVIDEOX = 27 @@ -1428,6 +1430,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data), clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None) clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer + elif clip_type == CLIPType.COGVIDEOX: + clip_target.clip = comfy.text_encoders.cogvideo.cogvideo_te(**t5xxl_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.cogvideo.CogVideoXTokenizer else: #CLIPType.MOCHI clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data)) clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer diff --git a/comfy/text_encoders/cogvideo.py b/comfy/text_encoders/cogvideo.py index f1e8e3f5d..b97310709 100644 --- a/comfy/text_encoders/cogvideo.py +++ b/comfy/text_encoders/cogvideo.py @@ -1,6 +1,48 @@ import comfy.text_encoders.sd3_clip +from comfy import sd1_clip class CogVideoXT5Tokenizer(comfy.text_encoders.sd3_clip.T5XXLTokenizer): + """Inner T5 tokenizer for CogVideoX. + + CogVideoX was trained with T5 embeddings padded to 226 tokens (not 77 like SD3). + Used both directly by supported_models.CogVideoX_T2V.clip_target (paired with + the raw T5XXLModel) and by the CogVideoXTokenizer outer wrapper below. + """ def __init__(self, embedding_directory=None, tokenizer_data={}): super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, min_length=226) + + +class CogVideoXTokenizer(sd1_clip.SD1Tokenizer): + """Outer tokenizer wrapper for CLIPLoader (type="cogvideox").""" + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, + clip_name="t5xxl", tokenizer=CogVideoXT5Tokenizer) + + +class CogVideoXT5XXL(sd1_clip.SD1ClipModel): + """Outer T5XXL model wrapper for CLIPLoader (type="cogvideox"). + + Wraps the raw T5XXL model in the SD1ClipModel interface so that CLIP.__init__ + (which reads self.dtypes) works correctly. The inner model is the standard + sd3_clip.T5XXLModel (no attention_mask change needed for CogVideoX). + """ + def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="t5xxl", + clip_model=comfy.text_encoders.sd3_clip.T5XXLModel, + model_options=model_options) + + +def cogvideo_te(dtype_t5=None, t5_quantization_metadata=None): + """Factory that returns a CogVideoXT5XXL class configured with the detected + T5 dtype and optional quantization metadata, for use in load_text_encoder_state_dicts. + """ + class CogVideoXTEModel_(CogVideoXT5XXL): + def __init__(self, device="cpu", dtype=None, model_options={}): + if t5_quantization_metadata is not None: + model_options = model_options.copy() + model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata + if dtype_t5 is not None: + dtype = dtype_t5 + super().__init__(device=device, dtype=dtype, model_options=model_options) + return CogVideoXTEModel_ diff --git a/nodes.py b/nodes.py index c92429766..6377bfbcc 100644 --- a/nodes.py +++ b/nodes.py @@ -958,7 +958,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -968,7 +968,7 @@ class CLIPLoader: CATEGORY = "advanced/loaders" - DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B" + DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B" def load_clip(self, clip_name, type="stable_diffusion", device="default"): clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION) From 5e7a007157d252cf3bb3acbb95fd5dccb048440b Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Thu, 16 Apr 2026 19:30:20 +0200 Subject: [PATCH 07/13] Fix VOID last-frame glitch by enforcing even latent_t. --- comfy_extras/nodes_void.py | 61 +++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index 4e9f77930..dd03eda6a 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -1,3 +1,5 @@ +import logging + import nodes import node_helpers import torch @@ -9,6 +11,29 @@ from comfy.utils import model_trange as trange from comfy_api.latest import io, ComfyExtension from typing_extensions import override +TEMPORAL_COMPRESSION = 4 +PATCH_SIZE_T = 2 + + +def _valid_void_length(length: int) -> int: + """Round ``length`` down to a value that produces an even latent_t. + + VOID / CogVideoX-Fun-V1.5 uses patch_size_t=2, so the VAE-encoded latent + must have an even temporal dimension. If latent_t is odd, the transformer + pad_to_patch_size circular-wraps an extra latent frame onto the end; after + the post-transformer crop the last real latent frame has been influenced + by the wrapped phantom frame, producing visible jitter and "disappearing" + subjects near the end of the decoded video. Rounding down fixes this. + """ + latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1 + if latent_t % PATCH_SIZE_T == 0: + return length + # Round latent_t down to the nearest multiple of PATCH_SIZE_T, then invert + # the ((length - 1) // TEMPORAL_COMPRESSION) + 1 formula. Floor at 1 frame + # so we never return a non-positive length. + target_latent_t = max(PATCH_SIZE_T, (latent_t // PATCH_SIZE_T) * PATCH_SIZE_T) + return (target_latent_t - 1) * TEMPORAL_COMPRESSION + 1 + class VOIDQuadmaskPreprocess(io.ComfyNode): """Preprocess a quadmask video for VOID inpainting. @@ -88,8 +113,10 @@ class VOIDInpaintConditioning(io.ComfyNode): io.Mask.Input("quadmask", tooltip="Preprocessed quadmask from VOIDQuadmaskPreprocess [T, H, W]"), io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8), io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8), - io.Int.Input("length", default=49, min=1, max=nodes.MAX_RESOLUTION, step=1, - tooltip="Number of pixel frames to process"), + io.Int.Input("length", default=45, min=1, max=nodes.MAX_RESOLUTION, step=1, + tooltip="Number of pixel frames to process. For CogVideoX-Fun-V1.5 " + "(patch_size_t=2), latent_t must be even — lengths that " + "produce odd latent_t are rounded down (e.g. 49 → 45)."), io.Int.Input("batch_size", default=1, min=1, max=64), ], outputs=[ @@ -103,8 +130,17 @@ class VOIDInpaintConditioning(io.ComfyNode): def execute(cls, positive, negative, vae, video, quadmask, width, height, length, batch_size) -> io.NodeOutput: - temporal_compression = 4 - latent_t = ((length - 1) // temporal_compression) + 1 + adjusted_length = _valid_void_length(length) + if adjusted_length != length: + logging.warning( + "VOIDInpaintConditioning: rounding length %d down to %d so that " + "latent_t is even (required by CogVideoX-Fun-V1.5 patch_size_t=2). " + "Using odd latent_t causes the last frame to be corrupted by " + "circular padding.", length, adjusted_length, + ) + length = adjusted_length + + latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1 latent_h = height // 8 latent_w = width // 8 @@ -188,8 +224,9 @@ class VOIDWarpedNoise(io.ComfyNode): io.Image.Input("video", tooltip="Pass 1 output video frames [T, H, W, 3]"), io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8), io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8), - io.Int.Input("length", default=49, min=1, max=nodes.MAX_RESOLUTION, step=1, - tooltip="Number of pixel frames"), + io.Int.Input("length", default=45, min=1, max=nodes.MAX_RESOLUTION, step=1, + tooltip="Number of pixel frames. Rounded down to make latent_t " + "even (patch_size_t=2 requirement), e.g. 49 → 45."), io.Int.Input("batch_size", default=1, min=1, max=64), ], outputs=[ @@ -211,8 +248,16 @@ class VOIDWarpedNoise(io.ComfyNode): "VOIDWarpedNoise requires the 'rp' package. Install with: pip install rp" ) - temporal_compression = 4 - latent_t = ((length - 1) // temporal_compression) + 1 + adjusted_length = _valid_void_length(length) + if adjusted_length != length: + logging.warning( + "VOIDWarpedNoise: rounding length %d down to %d so that " + "latent_t is even (required by CogVideoX-Fun-V1.5 patch_size_t=2).", + length, adjusted_length, + ) + length = adjusted_length + + latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1 latent_h = height // 8 latent_w = width // 8 From 5138ed5326cdbb70ab4c25326dca7251f1709d40 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Thu, 16 Apr 2026 21:33:08 +0200 Subject: [PATCH 08/13] Move imports to the top in nodes_void.py --- comfy_extras/nodes_void.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index dd03eda6a..4b923f4a2 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -1,14 +1,16 @@ import logging -import nodes -import node_helpers +import numpy as np import torch + import comfy import comfy.model_management import comfy.samplers import comfy.utils +import node_helpers +import nodes from comfy.utils import model_trange as trange -from comfy_api.latest import io, ComfyExtension +from comfy_api.latest import ComfyExtension, io from typing_extensions import override TEMPORAL_COMPRESSION = 4 @@ -236,8 +238,6 @@ class VOIDWarpedNoise(io.ComfyNode): @classmethod def execute(cls, video, width, height, length, batch_size) -> io.NodeOutput: - import numpy as np - try: import rp rp.r._pip_import_autoyes = True From ba2d1f0c11a9d19a1c21a41da8e8f0ca68d21c4b Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Thu, 16 Apr 2026 21:45:35 +0200 Subject: [PATCH 09/13] Drop cv2 & numpy dependency, run VOIDWarpedNoise with torch. --- comfy_extras/nodes_void.py | 50 ++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index 4b923f4a2..301163269 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -1,6 +1,5 @@ import logging -import numpy as np import torch import comfy @@ -261,8 +260,9 @@ class VOIDWarpedNoise(io.ComfyNode): latent_h = height // 8 latent_w = width // 8 - vid = video[:length].cpu().numpy() - vid_uint8 = (vid * 255).clip(0, 255).astype(np.uint8) + # rp.get_noise_from_video expects uint8 numpy frames; everything + # downstream of the warp stays on torch. + vid_uint8 = (video[:length].clamp(0, 1) * 255).to(torch.uint8).cpu().numpy() frames = [vid_uint8[i] for i in range(vid_uint8.shape[0])] frames = rp.resize_images_to_hold(frames, height=height, width=width) @@ -285,38 +285,30 @@ class VOIDWarpedNoise(io.ComfyNode): downscale_factor=round(FRAME * FLOW) * LATENT_SCALE, ) - warped_np = warp_output.numpy_noises # (T, H, W, C) - if warped_np.dtype == np.float16: - warped_np = warped_np.astype(np.float32) + # (T, H, W, C) → torch on intermediate device for torchified resize. + warped = torch.from_numpy(warp_output.numpy_noises).float() + device = comfy.model_management.intermediate_device() + warped = warped.to(device) - import cv2 + if warped.shape[0] != latent_t: + indices = torch.linspace(0, warped.shape[0] - 1, latent_t, + device=device).long() + warped = warped[indices] - if warped_np.shape[0] != latent_t: - indices = np.linspace(0, warped_np.shape[0] - 1, latent_t).astype(int) - warped_np = warped_np[indices] - - if warped_np.shape[1] != latent_h or warped_np.shape[2] != latent_w: - resized = [] - for t_idx in range(latent_t): - frame = warped_np[t_idx] - ch_resized = [ - cv2.resize(frame[:, :, c], (latent_w, latent_h), - interpolation=cv2.INTER_LINEAR) - for c in range(frame.shape[2]) - ] - resized.append(np.stack(ch_resized, axis=2)) - warped_np = np.stack(resized, axis=0) - - # (T, H, W, C) -> (B, C, T, H, W) - warped_tensor = torch.from_numpy( - warped_np.transpose(3, 0, 1, 2) - ).float().unsqueeze(0) + if warped.shape[1] != latent_h or warped.shape[2] != latent_w: + # (T, H, W, C) → (T, C, H, W) → bilinear resize → back + warped = warped.permute(0, 3, 1, 2) + warped = torch.nn.functional.interpolate( + warped, size=(latent_h, latent_w), + mode="bilinear", align_corners=False, + ) + warped = warped.permute(0, 2, 3, 1) + # (T, H, W, C) → (B, C, T, H, W) + warped_tensor = warped.permute(3, 0, 1, 2).unsqueeze(0) if batch_size > 1: warped_tensor = warped_tensor.repeat(batch_size, 1, 1, 1, 1) - warped_tensor = warped_tensor.to(comfy.model_management.intermediate_device()) - return io.NodeOutput({"samples": warped_tensor}) From 713b5577ff9b2437994d0a74381c998c09189f81 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Mon, 27 Apr 2026 11:13:33 +0200 Subject: [PATCH 10/13] Add native RaftOpticalFlow code. --- comfy_extras/nodes_void.py | 45 ++-- comfy_extras/void_noise_warp.py | 448 ++++++++++++++++++++++++++++++++ 2 files changed, 464 insertions(+), 29 deletions(-) create mode 100644 comfy_extras/void_noise_warp.py diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index 301163269..aeffb3ee2 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -12,6 +12,8 @@ from comfy.utils import model_trange as trange from comfy_api.latest import ComfyExtension, io from typing_extensions import override +from comfy_extras.void_noise_warp import get_noise_from_video + TEMPORAL_COMPRESSION = 4 PATCH_SIZE_T = 2 @@ -212,8 +214,6 @@ class VOIDWarpedNoise(io.ComfyNode): Takes the Pass 1 output video and produces temporally-correlated noise by warping Gaussian noise along optical flow vectors. This noise is used as the initial latent for Pass 2, resulting in better temporal consistency. - - Requires: pip install rp (auto-installs Go-with-the-Flow dependencies) """ @classmethod @@ -237,15 +237,6 @@ class VOIDWarpedNoise(io.ComfyNode): @classmethod def execute(cls, video, width, height, length, batch_size) -> io.NodeOutput: - try: - import rp - rp.r._pip_import_autoyes = True - rp.git_import('CommonSource') - import rp.git.CommonSource.noise_warp as nw - except ImportError: - raise RuntimeError( - "VOIDWarpedNoise requires the 'rp' package. Install with: pip install rp" - ) adjusted_length = _valid_void_length(length) if adjusted_length != length: @@ -260,36 +251,31 @@ class VOIDWarpedNoise(io.ComfyNode): latent_h = height // 8 latent_w = width // 8 - # rp.get_noise_from_video expects uint8 numpy frames; everything - # downstream of the warp stays on torch. - vid_uint8 = (video[:length].clamp(0, 1) * 255).to(torch.uint8).cpu().numpy() + # RAFT + noise warp is real compute, not an "intermediate" buffer, so + # we want the actual torch device (CUDA/MPS). The final latent is + # moved back to intermediate_device() before returning to match the + # rest of the ComfyUI pipeline. + device = comfy.model_management.get_torch_device() - frames = [vid_uint8[i] for i in range(vid_uint8.shape[0])] - frames = rp.resize_images_to_hold(frames, height=height, width=width) - frames = rp.crop_images(frames, height=height, width=width, origin='center') - frames = rp.as_numpy_array(frames) + vid = video[:length].to(device) + vid = comfy.utils.common_upscale( + vid.movedim(-1, 1), width, height, "bilinear", "center" + ).movedim(1, -1) + vid_uint8 = (vid.clamp(0, 1) * 255).to(torch.uint8) FRAME = 2**-1 FLOW = 2**3 LATENT_SCALE = 8 - warp_output = nw.get_noise_from_video( - frames, - remove_background=False, - visualize=False, - save_files=False, + warped = get_noise_from_video( + vid_uint8, noise_channels=16, - output_folder=None, resize_frames=FRAME, resize_flow=FLOW, downscale_factor=round(FRAME * FLOW) * LATENT_SCALE, + device=device, ) - # (T, H, W, C) → torch on intermediate device for torchified resize. - warped = torch.from_numpy(warp_output.numpy_noises).float() - device = comfy.model_management.intermediate_device() - warped = warped.to(device) - if warped.shape[0] != latent_t: indices = torch.linspace(0, warped.shape[0] - 1, latent_t, device=device).long() @@ -309,6 +295,7 @@ class VOIDWarpedNoise(io.ComfyNode): if batch_size > 1: warped_tensor = warped_tensor.repeat(batch_size, 1, 1, 1, 1) + warped_tensor = warped_tensor.to(comfy.model_management.intermediate_device()) return io.NodeOutput({"samples": warped_tensor}) diff --git a/comfy_extras/void_noise_warp.py b/comfy_extras/void_noise_warp.py new file mode 100644 index 000000000..358ff388e --- /dev/null +++ b/comfy_extras/void_noise_warp.py @@ -0,0 +1,448 @@ +""" +Optical-flow-warped noise for VOID Pass 2 refinement. + +Adapted from RyannDaGreat/CommonSource (MIT License, Ryan Burgert): + https://github.com/RyannDaGreat/CommonSource + - noise_warp.py (NoiseWarper / warp_xyωc / regaussianize / get_noise_from_video) + - raft.py (RaftOpticalFlow) + +Only the code paths that ``comfy_extras/nodes_void.py::VOIDWarpedNoise`` actually +uses (torch THWC uint8 input, no background removal, no visualization, no disk +I/O, default warp/noise params) have been inlined. External ``rp`` utilities +have been replaced with equivalents from torch.nn.functional / einops / +torchvision. +""" + +import logging +from typing import Optional + +import torch +import torch.nn.functional as F +from einops import rearrange + +import comfy.model_management + + +# --------------------------------------------------------------------------- +# Low-level torch image helpers (drop-in replacements for rp.torch_* primitives) +# --------------------------------------------------------------------------- + +def _torch_resize_chw(image, size, interp, copy=True): + """Resize a CHW tensor. + + ``size`` is either a scalar factor or a (h, w) tuple. ``interp`` is one + of ``"bilinear"``, ``"nearest"``, ``"area"``. When ``copy`` is False and + the requested size matches the input, returns the input tensor as is + (faster but callers must not mutate the result). + """ + assert image.ndim == 3, image.shape + _, in_h, in_w = image.shape + if isinstance(size, (int, float)) and not isinstance(size, bool): + new_h = max(1, int(in_h * size)) + new_w = max(1, int(in_w * size)) + else: + new_h, new_w = size + + if (new_h, new_w) == (in_h, in_w): + return image.clone() if copy else image + + kwargs = {} + if interp in ("bilinear", "bicubic"): + kwargs["align_corners"] = False + out = F.interpolate(image[None], size=(new_h, new_w), mode=interp, **kwargs)[0] + return out + + +def _torch_remap_relative(image, dx, dy, interp="bilinear"): + """Relative remap of a CHW image via ``F.grid_sample``. + + Equivalent to ``rp.torch_remap_image(image, dx, dy, relative=True, interp=interp)`` + for ``interp`` in {"bilinear", "nearest"}. Out-of-bounds samples are 0. + """ + assert image.ndim == 3 + assert dx.shape == dy.shape + _, h, w = image.shape + + x_abs = dx + torch.arange(w, device=dx.device, dtype=dx.dtype) + y_abs = dy + torch.arange(h, device=dy.device, dtype=dy.dtype)[:, None] + + x_norm = (x_abs / (w - 1)) * 2 - 1 + y_norm = (y_abs / (h - 1)) * 2 - 1 + + grid = torch.stack([x_norm, y_norm], dim=-1)[None].to(image.dtype) + out = F.grid_sample( + image[None], grid, mode=interp, align_corners=True, padding_mode="zeros" + )[0] + return out + + +def _torch_scatter_add_relative(image, dx, dy): + """Scatter-add a CHW image using relative floor-rounded (dx, dy) offsets. + + Equivalent to ``rp.torch_scatter_add_image(image, dx, dy, relative=True, + interp='floor')``. Out-of-bounds targets are dropped. + """ + assert image.ndim == 3 + in_c, in_h, in_w = image.shape + assert dx.shape == dy.shape == (in_h, in_w) + + x = dx.long() + torch.arange(in_w, device=dx.device, dtype=torch.long) + y = dy.long() + torch.arange(in_h, device=dy.device, dtype=torch.long)[:, None] + + valid = ((y >= 0) & (y < in_h) & (x >= 0) & (x < in_w)).reshape(-1) + indices = (y * in_w + x).reshape(-1)[valid] + + flat_image = rearrange(image, "c h w -> (h w) c")[valid] + out = torch.zeros((in_h * in_w, in_c), dtype=image.dtype, device=image.device) + out.index_add_(0, indices, flat_image) + return rearrange(out, "(h w) c -> c h w", h=in_h, w=in_w) + + +# --------------------------------------------------------------------------- +# Noise warping primitives (ported from noise_warp.py) +# --------------------------------------------------------------------------- + +def unique_pixels(image): + """Find unique pixel values in a CHW tensor. + + Returns ``(unique_colors [U, C], counts [U], index_matrix [H, W])`` where + ``index_matrix[i, j]`` is the index of the unique color at that pixel. + """ + _, h, w = image.shape + flat = rearrange(image, "c h w -> (h w) c") + unique_colors, inverse_indices, counts = torch.unique( + flat, dim=0, return_inverse=True, return_counts=True, sorted=False, + ) + index_matrix = rearrange(inverse_indices, "(h w) -> h w", h=h, w=w) + return unique_colors, counts, index_matrix + + +def sum_indexed_values(image, index_matrix): + """For each unique index, sum the CHW image values at its pixels.""" + _, h, w = image.shape + u = int(index_matrix.max().item()) + 1 + flat = rearrange(image, "c h w -> (h w) c") + out = torch.zeros((u, flat.shape[1]), dtype=flat.dtype, device=flat.device) + out.index_add_(0, index_matrix.view(-1), flat) + return out + + +def indexed_to_image(index_matrix, unique_colors): + """Build a CHW image from an index matrix and a (U, C) color table.""" + h, w = index_matrix.shape + flat = unique_colors[index_matrix.view(-1)] + return rearrange(flat, "(h w) c -> c h w", h=h, w=w) + + +def regaussianize(noise): + """Variance-preserving re-sampling of a CHW noise tensor. + + Wherever the noise contains groups of identical pixel values (e.g. after + a nearest-neighbor warp that duplicated source pixels), adds zero-mean + foreign noise within each group and scales by ``1/sqrt(count)`` so the + output is unit-variance gaussian again. + """ + _, hs, ws = noise.shape + _, counts, index_matrix = unique_pixels(noise[:1]) + + foreign_noise = torch.randn_like(noise) + summed = sum_indexed_values(foreign_noise, index_matrix) + meaned = indexed_to_image(index_matrix, summed / rearrange(counts, "u -> u 1")) + zeroed_foreign = foreign_noise - meaned + + counts_image = indexed_to_image(index_matrix, rearrange(counts, "u -> u 1")) + + output = noise / counts_image ** 0.5 + zeroed_foreign + return output, counts_image + + +def xy_meshgrid_like_image(image): + """Return a (2, H, W) tensor of (x, y) pixel coordinates matching ``image``.""" + _, h, w = image.shape + y, x = torch.meshgrid( + torch.arange(h, device=image.device, dtype=image.dtype), + torch.arange(w, device=image.device, dtype=image.dtype), + indexing="ij", + ) + return torch.stack([x, y]) + + +def noise_to_state(noise): + """Pack a (C, H, W) noise tensor into a state tensor (3+C, H, W) = [dx, dy, ω, noise].""" + zeros = torch.zeros_like(noise[:1]) + ones = torch.ones_like(noise[:1]) + return torch.cat([zeros, zeros, ones, noise]) + + +def state_to_noise(state): + """Unpack the noise channels from a state tensor.""" + return state[3:] + + +def warp_state(state, flow): + """Warp a noise-warper state tensor along the given optical flow. + + ``state`` has shape ``(3+c, h, w)`` (= dx, dy, ω, c noise channels). + ``flow`` has shape ``(2, h, w)`` (= dx, dy). + """ + assert flow.device == state.device + assert flow.ndim == 3 and flow.shape[0] == 2 + assert state.ndim == 3 + xyoc, h, w = state.shape + assert flow.shape == (2, h, w) + device = state.device + + x_ch, y_ch = 0, 1 + xy = 2 # state[:xy] = [dx, dy] + xyw = 3 # state[:xyw] = [dx, dy, ω] + w_ch = 2 # state[w_ch] = ω + c = xyoc - xyw + oc = xyoc - xy + assert c > 0, "state has no noise channels" + assert (state[w_ch] > 0).all(), "all weights must be > 0" + + grid = xy_meshgrid_like_image(state) + + init = torch.empty_like(state) + init[:xy] = 0 + init[w_ch] = 1 + init[-c:] = 0 + + # --- Expansion branch: nearest-neighbor remap with negated flow --- + pre_expand = torch.empty_like(state) + pre_expand[:xy] = _torch_remap_relative(state[:xy], -flow[0], -flow[1], "nearest") + pre_expand[-oc:] = _torch_remap_relative(state[-oc:], -flow[0], -flow[1], "nearest") + pre_expand[w_ch][pre_expand[w_ch] == 0] = 1 + + # --- Shrink branch: scatter-add state into new positions --- + pre_shrink = state.clone() + pre_shrink[:xy] += flow + + pos = (grid + pre_shrink[:xy]).round() + in_bounds = (pos[x_ch] >= 0) & (pos[x_ch] < w) & (pos[y_ch] >= 0) & (pos[y_ch] < h) + pre_shrink = torch.where(~in_bounds[None], init, pre_shrink) + + scat_xy = pre_shrink[:xy].round() + pre_shrink[:xy] -= scat_xy + pre_shrink[:xy] = 0 # xy_mode='none' in upstream + + def scat(tensor): + return _torch_scatter_add_relative(tensor, scat_xy[0], scat_xy[1]) + + # rp.torch_scatter_add_image on a bool tensor errors on modern torch; + # scatter-sum a float ones tensor and threshold to get the mask instead. + shrink_mask = scat(torch.ones(1, h, w, dtype=state.dtype, device=device)) > 0 + + # Drop expansion samples at positions that will be filled by shrink. + pre_expand = torch.where(shrink_mask, init, pre_expand) + + # Regaussianize both branches together so duplicated-source groups are + # counted globally, then split back apart. + concat = torch.cat([pre_shrink, pre_expand], dim=2) # along width + concat[-c:], counts_image = regaussianize(concat[-c:]) + concat[w_ch] = concat[w_ch] / counts_image[0] + concat[w_ch] = concat[w_ch].nan_to_num() + pre_shrink, expand = torch.chunk(concat, chunks=2, dim=2) + + shrink = torch.empty_like(pre_shrink) + shrink[w_ch] = scat(pre_shrink[w_ch][None])[0] + shrink[:xy] = scat(pre_shrink[:xy] * pre_shrink[w_ch][None]) / shrink[w_ch][None] + shrink[-c:] = scat(pre_shrink[-c:] * pre_shrink[w_ch][None]) / scat( + pre_shrink[w_ch][None] ** 2 + ).sqrt() + + output = torch.where(shrink_mask, shrink, expand) + output[w_ch] = output[w_ch] / output[w_ch].mean() + output[w_ch] += 1e-5 + output[w_ch] **= 0.9999 + return output + + +class NoiseWarper: + """Maintain a warpable noise state and emit gaussian noise per frame. + + Simplified from RyannDaGreat/CommonSource/noise_warp.py::NoiseWarper: + ``scale_factor``, ``post_noise_alpha``, ``progressive_noise_alpha``, and + ``warp_kwargs`` are all dropped since VOIDWarpedNoise always uses defaults. + """ + + def __init__(self, c, h, w, device, dtype=torch.float32): + assert c > 0 and h > 0 and w > 0 + self.c = c + self.h = h + self.w = w + self.device = device + self.dtype = dtype + + noise = torch.randn(c, h, w, dtype=dtype, device=device) + self._state = noise_to_state(noise) + + @property + def noise(self): + # With scale_factor=1 the "downsample to respect weights" step is a + # size-preserving no-op; the weight-variance correction math still + # runs to stay faithful to upstream. + n = state_to_noise(self._state) + weights = self._state[2:3] + return n * weights / (weights ** 2).sqrt() + + def __call__(self, dx, dy): + assert dx.shape == dy.shape + flow = torch.stack([dx, dy]).to(self.device, self.dtype) + _, oflowh, ofloww = flow.shape + + flow = _torch_resize_chw(flow, (self.h, self.w), "bilinear", copy=True) + flowh, floww = flow.shape[-2:] + + # Upstream scales flow[0] by flowh/oflowh and flow[1] by floww/ofloww + # (channel-order appears swapped but harmless when H and W are scaled + # by the same factor, which is always the case for our callers). + flow[0] *= flowh / oflowh + flow[1] *= floww / ofloww + + self._state = warp_state(self._state, flow) + return self + + +# --------------------------------------------------------------------------- +# RAFT optical flow wrapper (ported from raft.py) +# --------------------------------------------------------------------------- + +class RaftOpticalFlow: + """Torchvision RAFT-large wrapper. ``__call__`` returns a (2, H, W) flow.""" + + def __init__(self, device=None): + from torchvision.models.optical_flow import raft_large + + if device is None: + device = comfy.model_management.get_torch_device() + device = torch.device(device) if not isinstance(device, torch.device) else device + + model = raft_large(weights="DEFAULT", progress=False).to(device) + model.eval() + self.device = device + self.model = model + + def _preprocess(self, image_chw): + image = image_chw.to(self.device, torch.float32) + _, h, w = image.shape + new_h = (h // 8) * 8 + new_w = (w // 8) * 8 + image = _torch_resize_chw(image, (new_h, new_w), "bilinear", copy=False) + image = image * 2 - 1 + return image[None] + + def __call__(self, from_image, to_image): + """``from_image``, ``to_image``: CHW float tensors in [0, 1].""" + assert from_image.shape == to_image.shape + _, h, w = from_image.shape + with torch.no_grad(): + img1 = self._preprocess(from_image) + img2 = self._preprocess(to_image) + list_of_flows = self.model(img1, img2) + flow = list_of_flows[-1][0] # (2, new_h, new_w) + if flow.shape[-2:] != (h, w): + flow = _torch_resize_chw(flow, (h, w), "bilinear", copy=False) + return flow + + +_raft_cache: dict = {} + + +def _get_raft_model(device): + key = str(device) + if key not in _raft_cache: + _raft_cache[key] = RaftOpticalFlow(device=device) + return _raft_cache[key] + + +# --------------------------------------------------------------------------- +# Narrow entry point used by VOIDWarpedNoise +# --------------------------------------------------------------------------- + +def get_noise_from_video( + video_frames: torch.Tensor, + *, + noise_channels: int = 16, + resize_frames: float = 0.5, + resize_flow: int = 8, + downscale_factor: int = 32, + device: Optional[torch.device] = None, +) -> torch.Tensor: + """Produce optical-flow-warped gaussian noise from a video. + + Args: + video_frames: ``(T, H, W, 3)`` uint8 torch tensor. + noise_channels: Channels in the output noise. + resize_frames: Pre-RAFT frame scale factor. + resize_flow: Post-flow up-scale factor applied to the optical flow; + the internal noise state is allocated at + ``(resize_flow * resize_frames * H, resize_flow * resize_frames * W)``. + downscale_factor: Area-pool factor applied to the noise before return; + should evenly divide the internal noise resolution. + device: Target device. Defaults to ``comfy.model_management.get_torch_device()``. + + Returns: + ``(T, H', W', noise_channels)`` float32 noise tensor on ``device``. + """ + assert isinstance(resize_flow, int) and resize_flow >= 1, resize_flow + assert video_frames.ndim == 4 and video_frames.shape[-1] == 3, video_frames.shape + assert video_frames.dtype == torch.uint8, video_frames.dtype + + if device is None: + device = comfy.model_management.get_torch_device() + device = torch.device(device) if not isinstance(device, torch.device) else device + + if device.type == "cpu": + logging.warning( + "VOIDWarpedNoise: running get_noise_from_video on CPU; this will be " + "slow (minutes for ~45 frames). Use CUDA for interactive use." + ) + + T = video_frames.shape[0] + frames = video_frames.to(device).permute(0, 3, 1, 2).to(torch.float32) / 255.0 + if resize_frames != 1.0: + new_h = max(1, int(frames.shape[2] * resize_frames)) + new_w = max(1, int(frames.shape[3] * resize_frames)) + frames = F.interpolate(frames, size=(new_h, new_w), mode="area") + + _, _, H, W = frames.shape + internal_h = resize_flow * H + internal_w = resize_flow * W + if internal_h % downscale_factor or internal_w % downscale_factor: + logging.warning( + "VOIDWarpedNoise: internal noise size %dx%d is not divisible by " + "downscale_factor %d; output noise may have artifacts.", + internal_h, internal_w, downscale_factor, + ) + + raft = _get_raft_model(device) + + with torch.no_grad(): + warper = NoiseWarper( + c=noise_channels, h=internal_h, w=internal_w, device=device, + ) + down_h = warper.h // downscale_factor + down_w = warper.w // downscale_factor + output = torch.empty( + (T, down_h, down_w, noise_channels), dtype=torch.float32, device=device, + ) + + def downscale(noise_chw): + # Area-pool to 1/downscale_factor then multiply by downscale_factor + # to adjust std (sqrt of pool area == downscale_factor for a + # square pool). + down = _torch_resize_chw(noise_chw, 1.0 / downscale_factor, "area", copy=False) + return down * downscale_factor + + output[0] = downscale(warper.noise).permute(1, 2, 0) + + prev = frames[0] + for i in range(1, T): + curr = frames[i] + flow = raft(prev, curr).to(device) + warper(flow[0], flow[1]) + output[i] = downscale(warper.noise).permute(1, 2, 0) + prev = curr + + return output From 5c11f5d232610dea29432c608f3524b4b9542fb3 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Mon, 27 Apr 2026 11:42:00 +0200 Subject: [PATCH 11/13] Polish imports and modify asserts to raise proper errors with messages. --- comfy_extras/void_noise_warp.py | 87 ++++++++++++++++++++++++++------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/comfy_extras/void_noise_warp.py b/comfy_extras/void_noise_warp.py index 358ff388e..4f7ff470f 100644 --- a/comfy_extras/void_noise_warp.py +++ b/comfy_extras/void_noise_warp.py @@ -19,6 +19,7 @@ from typing import Optional import torch import torch.nn.functional as F from einops import rearrange +from torchvision.models.optical_flow import raft_large import comfy.model_management @@ -35,7 +36,10 @@ def _torch_resize_chw(image, size, interp, copy=True): the requested size matches the input, returns the input tensor as is (faster but callers must not mutate the result). """ - assert image.ndim == 3, image.shape + if image.ndim != 3: + raise ValueError( + f"_torch_resize_chw expects a 3D CHW tensor, got shape {tuple(image.shape)}" + ) _, in_h, in_w = image.shape if isinstance(size, (int, float)) and not isinstance(size, bool): new_h = max(1, int(in_h * size)) @@ -59,8 +63,14 @@ def _torch_remap_relative(image, dx, dy, interp="bilinear"): Equivalent to ``rp.torch_remap_image(image, dx, dy, relative=True, interp=interp)`` for ``interp`` in {"bilinear", "nearest"}. Out-of-bounds samples are 0. """ - assert image.ndim == 3 - assert dx.shape == dy.shape + if image.ndim != 3: + raise ValueError( + f"_torch_remap_relative expects a 3D CHW tensor, got shape {tuple(image.shape)}" + ) + if dx.shape != dy.shape: + raise ValueError( + f"_torch_remap_relative: dx and dy must match, got {tuple(dx.shape)} vs {tuple(dy.shape)}" + ) _, h, w = image.shape x_abs = dx + torch.arange(w, device=dx.device, dtype=dx.dtype) @@ -82,9 +92,16 @@ def _torch_scatter_add_relative(image, dx, dy): Equivalent to ``rp.torch_scatter_add_image(image, dx, dy, relative=True, interp='floor')``. Out-of-bounds targets are dropped. """ - assert image.ndim == 3 + if image.ndim != 3: + raise ValueError( + f"_torch_scatter_add_relative expects a 3D CHW tensor, got shape {tuple(image.shape)}" + ) in_c, in_h, in_w = image.shape - assert dx.shape == dy.shape == (in_h, in_w) + if dx.shape != (in_h, in_w) or dy.shape != (in_h, in_w): + raise ValueError( + f"_torch_scatter_add_relative: dx/dy must be ({in_h}, {in_w}), " + f"got dx={tuple(dx.shape)} dy={tuple(dy.shape)}" + ) x = dx.long() + torch.arange(in_w, device=dx.device, dtype=torch.long) y = dy.long() + torch.arange(in_h, device=dy.device, dtype=torch.long)[:, None] @@ -185,11 +202,20 @@ def warp_state(state, flow): ``state`` has shape ``(3+c, h, w)`` (= dx, dy, ω, c noise channels). ``flow`` has shape ``(2, h, w)`` (= dx, dy). """ - assert flow.device == state.device - assert flow.ndim == 3 and flow.shape[0] == 2 - assert state.ndim == 3 + if flow.device != state.device: + raise ValueError( + f"warp_state: flow and state must be on the same device, " + f"got flow={flow.device} state={state.device}" + ) + if state.ndim != 3: + raise ValueError( + f"warp_state: state must be 3D (3+C, H, W), got shape {tuple(state.shape)}" + ) xyoc, h, w = state.shape - assert flow.shape == (2, h, w) + if flow.shape != (2, h, w): + raise ValueError( + f"warp_state: flow must have shape (2, {h}, {w}), got {tuple(flow.shape)}" + ) device = state.device x_ch, y_ch = 0, 1 @@ -198,8 +224,12 @@ def warp_state(state, flow): w_ch = 2 # state[w_ch] = ω c = xyoc - xyw oc = xyoc - xy - assert c > 0, "state has no noise channels" - assert (state[w_ch] > 0).all(), "all weights must be > 0" + if c <= 0: + raise ValueError( + f"warp_state: state has no noise channels (expected 3+C with C>0, got {xyoc} channels)" + ) + if not (state[w_ch] > 0).all(): + raise ValueError("warp_state: all weights in state[2] must be > 0") grid = xy_meshgrid_like_image(state) @@ -267,7 +297,10 @@ class NoiseWarper: """ def __init__(self, c, h, w, device, dtype=torch.float32): - assert c > 0 and h > 0 and w > 0 + if c <= 0 or h <= 0 or w <= 0: + raise ValueError( + f"NoiseWarper: c/h/w must all be positive, got c={c} h={h} w={w}" + ) self.c = c self.h = h self.w = w @@ -287,7 +320,10 @@ class NoiseWarper: return n * weights / (weights ** 2).sqrt() def __call__(self, dx, dy): - assert dx.shape == dy.shape + if dx.shape != dy.shape: + raise ValueError( + f"NoiseWarper: dx and dy must match, got {tuple(dx.shape)} vs {tuple(dy.shape)}" + ) flow = torch.stack([dx, dy]).to(self.device, self.dtype) _, oflowh, ofloww = flow.shape @@ -312,8 +348,6 @@ class RaftOpticalFlow: """Torchvision RAFT-large wrapper. ``__call__`` returns a (2, H, W) flow.""" def __init__(self, device=None): - from torchvision.models.optical_flow import raft_large - if device is None: device = comfy.model_management.get_torch_device() device = torch.device(device) if not isinstance(device, torch.device) else device @@ -334,7 +368,11 @@ class RaftOpticalFlow: def __call__(self, from_image, to_image): """``from_image``, ``to_image``: CHW float tensors in [0, 1].""" - assert from_image.shape == to_image.shape + if from_image.shape != to_image.shape: + raise ValueError( + f"RaftOpticalFlow: from_image and to_image must match, " + f"got {tuple(from_image.shape)} vs {tuple(to_image.shape)}" + ) _, h, w = from_image.shape with torch.no_grad(): img1 = self._preprocess(from_image) @@ -385,9 +423,20 @@ def get_noise_from_video( Returns: ``(T, H', W', noise_channels)`` float32 noise tensor on ``device``. """ - assert isinstance(resize_flow, int) and resize_flow >= 1, resize_flow - assert video_frames.ndim == 4 and video_frames.shape[-1] == 3, video_frames.shape - assert video_frames.dtype == torch.uint8, video_frames.dtype + if not isinstance(resize_flow, int) or resize_flow < 1: + raise ValueError( + f"get_noise_from_video: resize_flow must be a positive int, got {resize_flow!r}" + ) + if video_frames.ndim != 4 or video_frames.shape[-1] != 3: + raise ValueError( + "get_noise_from_video: video_frames must have shape (T, H, W, 3), " + f"got {tuple(video_frames.shape)}" + ) + if video_frames.dtype != torch.uint8: + raise TypeError( + "get_noise_from_video: video_frames must be uint8 in [0, 255], " + f"got dtype {video_frames.dtype}" + ) if device is None: device = comfy.model_management.get_torch_device() From 0fca6d7225d1716677f9df9653364e0744401df1 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Thu, 30 Apr 2026 15:49:44 +0200 Subject: [PATCH 12/13] Add Optical Flow Loader. --- comfy_extras/nodes_void.py | 80 ++++++++++++++++++++++++++++++++- comfy_extras/void_noise_warp.py | 33 +++++++------- folder_paths.py | 2 + 3 files changed, 95 insertions(+), 20 deletions(-) diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py index aeffb3ee2..e7a8f3757 100644 --- a/comfy_extras/nodes_void.py +++ b/comfy_extras/nodes_void.py @@ -4,15 +4,21 @@ import torch import comfy import comfy.model_management +import comfy.model_patcher import comfy.samplers import comfy.utils +import folder_paths import node_helpers import nodes from comfy.utils import model_trange as trange from comfy_api.latest import ComfyExtension, io +from torchvision.models.optical_flow import raft_large from typing_extensions import override -from comfy_extras.void_noise_warp import get_noise_from_video + +from comfy_extras.void_noise_warp import RaftOpticalFlow, get_noise_from_video + +OpticalFlow = io.Custom("OPTICAL_FLOW") TEMPORAL_COMPRESSION = 4 PATCH_SIZE_T = 2 @@ -38,6 +44,67 @@ def _valid_void_length(length: int) -> int: return (target_latent_t - 1) * TEMPORAL_COMPRESSION + 1 +class OpticalFlowLoader(io.ComfyNode): + """Load an optical flow model from ``models/optical_flow/``. + + Only torchvision's RAFT-large format is recognized today (the model used + by VOIDWarpedNoise). The checkpoint must be placed under + ``models/optical_flow/`` — ComfyUI never downloads optical-flow weights + at runtime. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="OpticalFlowLoader", + display_name="Load Optical Flow Model", + category="loaders", + inputs=[ + io.Combo.Input( + "model_name", + options=folder_paths.get_filename_list("optical_flow"), + tooltip=( + "Optical flow model to load. Files must be placed in the " + "'optical_flow' folder. Today only torchvision's " + "raft_large.pth is supported." + ), + ), + ], + outputs=[ + OpticalFlow.Output(), + ], + ) + + @classmethod + def execute(cls, model_name) -> io.NodeOutput: + + model_path = folder_paths.get_full_path_or_raise("optical_flow", model_name) + sd = comfy.utils.load_torch_file(model_path, safe_load=True) + + has_raft_keys = ( + any(k.startswith("feature_encoder.") for k in sd) + and any(k.startswith("context_encoder.") for k in sd) + and any(k.startswith("update_block.") for k in sd) + ) + if not has_raft_keys: + raise ValueError( + "Unrecognized optical flow model format: expected a torchvision " + "RAFT-large state dict with 'feature_encoder.', 'context_encoder.' " + "and 'update_block.' prefixes." + ) + + model = raft_large(weights=None, progress=False) + model.load_state_dict(sd) + model.eval().to(torch.float32) + + patcher = comfy.model_patcher.ModelPatcher( + model, + load_device=comfy.model_management.get_torch_device(), + offload_device=comfy.model_management.unet_offload_device(), + ) + return io.NodeOutput(patcher) + + class VOIDQuadmaskPreprocess(io.ComfyNode): """Preprocess a quadmask video for VOID inpainting. @@ -222,6 +289,10 @@ class VOIDWarpedNoise(io.ComfyNode): node_id="VOIDWarpedNoise", category="latent/video", inputs=[ + OpticalFlow.Input( + "optical_flow", + tooltip="Optical flow model from OpticalFlowLoader (RAFT-large).", + ), io.Image.Input("video", tooltip="Pass 1 output video frames [T, H, W, 3]"), io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8), io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8), @@ -236,7 +307,7 @@ class VOIDWarpedNoise(io.ComfyNode): ) @classmethod - def execute(cls, video, width, height, length, batch_size) -> io.NodeOutput: + def execute(cls, optical_flow, video, width, height, length, batch_size) -> io.NodeOutput: adjusted_length = _valid_void_length(length) if adjusted_length != length: @@ -257,6 +328,9 @@ class VOIDWarpedNoise(io.ComfyNode): # rest of the ComfyUI pipeline. device = comfy.model_management.get_torch_device() + comfy.model_management.load_model_gpu(optical_flow) + raft = RaftOpticalFlow(optical_flow.model, device=device) + vid = video[:length].to(device) vid = comfy.utils.common_upscale( vid.movedim(-1, 1), width, height, "bilinear", "center" @@ -269,6 +343,7 @@ class VOIDWarpedNoise(io.ComfyNode): warped = get_noise_from_video( vid_uint8, + raft, noise_channels=16, resize_frames=FRAME, resize_flow=FLOW, @@ -395,6 +470,7 @@ class VOIDExtension(ComfyExtension): @override async def get_node_list(self) -> list[type[io.ComfyNode]]: return [ + OpticalFlowLoader, VOIDQuadmaskPreprocess, VOIDInpaintConditioning, VOIDWarpedNoise, diff --git a/comfy_extras/void_noise_warp.py b/comfy_extras/void_noise_warp.py index 4f7ff470f..fcc9a5f8b 100644 --- a/comfy_extras/void_noise_warp.py +++ b/comfy_extras/void_noise_warp.py @@ -9,8 +9,10 @@ Adapted from RyannDaGreat/CommonSource (MIT License, Ryan Burgert): Only the code paths that ``comfy_extras/nodes_void.py::VOIDWarpedNoise`` actually uses (torch THWC uint8 input, no background removal, no visualization, no disk I/O, default warp/noise params) have been inlined. External ``rp`` utilities -have been replaced with equivalents from torch.nn.functional / einops / -torchvision. +have been replaced with equivalents from torch.nn.functional / einops. The +RAFT optical-flow model itself is loaded offline via ``OpticalFlowLoader`` in +``nodes_void.py`` and passed into ``get_noise_from_video`` by the caller; this +module never downloads weights at runtime. """ import logging @@ -19,7 +21,6 @@ from typing import Optional import torch import torch.nn.functional as F from einops import rearrange -from torchvision.models.optical_flow import raft_large import comfy.model_management @@ -345,14 +346,20 @@ class NoiseWarper: # --------------------------------------------------------------------------- class RaftOpticalFlow: - """Torchvision RAFT-large wrapper. ``__call__`` returns a (2, H, W) flow.""" + """RAFT-large wrapper around a pre-loaded torchvision model. - def __init__(self, device=None): + ``model`` must be the ``torchvision.models.optical_flow.raft_large`` module + with its weights already populated; this class is load-agnostic so the + caller owns downloading/offload concerns (see ``OpticalFlowLoader`` in + ``nodes_void.py``). ``__call__`` returns a ``(2, H, W)`` flow. + """ + + def __init__(self, model, device=None): if device is None: device = comfy.model_management.get_torch_device() device = torch.device(device) if not isinstance(device, torch.device) else device - model = raft_large(weights="DEFAULT", progress=False).to(device) + model = model.to(device) model.eval() self.device = device self.model = model @@ -384,22 +391,13 @@ class RaftOpticalFlow: return flow -_raft_cache: dict = {} - - -def _get_raft_model(device): - key = str(device) - if key not in _raft_cache: - _raft_cache[key] = RaftOpticalFlow(device=device) - return _raft_cache[key] - - # --------------------------------------------------------------------------- # Narrow entry point used by VOIDWarpedNoise # --------------------------------------------------------------------------- def get_noise_from_video( video_frames: torch.Tensor, + raft: RaftOpticalFlow, *, noise_channels: int = 16, resize_frames: float = 0.5, @@ -411,6 +409,7 @@ def get_noise_from_video( Args: video_frames: ``(T, H, W, 3)`` uint8 torch tensor. + raft: Pre-loaded RAFT optical-flow wrapper (see ``RaftOpticalFlow``). noise_channels: Channels in the output noise. resize_frames: Pre-RAFT frame scale factor. resize_flow: Post-flow up-scale factor applied to the optical flow; @@ -465,8 +464,6 @@ def get_noise_from_video( internal_h, internal_w, downscale_factor, ) - raft = _get_raft_model(device) - with torch.no_grad(): warper = NoiseWarper( c=noise_channels, h=internal_h, w=internal_w, device=device, diff --git a/folder_paths.py b/folder_paths.py index 80f4b291a..322193aae 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -54,6 +54,8 @@ folder_names_and_paths["audio_encoders"] = ([os.path.join(models_dir, "audio_enc folder_names_and_paths["frame_interpolation"] = ([os.path.join(models_dir, "frame_interpolation")], supported_pt_extensions) +folder_names_and_paths["optical_flow"] = ([os.path.join(models_dir, "optical_flow")], supported_pt_extensions) + output_directory = os.path.join(base_path, "output") temp_directory = os.path.join(base_path, "temp") input_directory = os.path.join(base_path, "input") From 5e287b908e562a14f9318d8ec218148d0236ee11 Mon Sep 17 00:00:00 2001 From: Talmaj Marinc Date: Tue, 5 May 2026 21:33:47 +0200 Subject: [PATCH 13/13] Add placeholder file put_optical_flow_models_here. --- models/optical_flow/put_optical_flow_models_here | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 models/optical_flow/put_optical_flow_models_here diff --git a/models/optical_flow/put_optical_flow_models_here b/models/optical_flow/put_optical_flow_models_here new file mode 100644 index 000000000..e69de29bb