mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-04-23 08:52:32 +08:00
Initial void pass 2 commit.
This commit is contained in:
parent
a0a69c9b34
commit
b4518ea697
@ -146,11 +146,13 @@ class VOIDInpaintConditioning(io.ComfyNode):
|
|||||||
|
|
||||||
inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
|
inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
|
||||||
|
|
||||||
# CogVideoX.concat_cond() applies process_latent_in (x scale_factor) to
|
# CogVideoX-Fun was trained with Diffusers convention where VAE latents
|
||||||
# concat_latent_image before feeding it to the transformer. Pre-divide here
|
# are scaled by 0.7 (vae.config.scaling_factor). CogVideoX.concat_cond()
|
||||||
# so the net scaling is identity — the VOID model expects raw VAE latents.
|
# applies process_latent_in (×sf=1.153) to the stored conditioning.
|
||||||
scale_factor = comfy.latent_formats.CogVideoX().scale_factor
|
# Pre-multiply by 0.7 so the model sees the correct magnitude:
|
||||||
inpaint_latents = inpaint_latents / scale_factor
|
# stored = vae_output × 0.7 → after process_in: (vae_output×0.7)×sf = raw×0.7
|
||||||
|
DIFFUSERS_SCALING_FACTOR = 0.7
|
||||||
|
inpaint_latents = inpaint_latents * DIFFUSERS_SCALING_FACTOR
|
||||||
|
|
||||||
positive = node_helpers.conditioning_set_values(
|
positive = node_helpers.conditioning_set_values(
|
||||||
positive, {"concat_latent_image": inpaint_latents}
|
positive, {"concat_latent_image": inpaint_latents}
|
||||||
@ -167,12 +169,151 @@ class VOIDInpaintConditioning(io.ComfyNode):
|
|||||||
return io.NodeOutput(positive, negative, {"samples": noise_latent})
|
return io.NodeOutput(positive, negative, {"samples": noise_latent})
|
||||||
|
|
||||||
|
|
||||||
|
class VOIDWarpedNoise(io.ComfyNode):
    """Generate optical-flow warped noise for VOID Pass 2 refinement.

    Takes the Pass 1 output video and produces temporally-correlated noise
    by warping Gaussian noise along optical flow vectors. This noise is used
    as the initial latent for Pass 2, resulting in better temporal consistency.

    Requires: pip install rp (auto-installs Go-with-the-Flow dependencies)
    """

    @classmethod
    def define_schema(cls):
        """Declare the node's inputs (video + target dimensions) and its
        single warped-noise LATENT output."""
        return io.Schema(
            node_id="VOIDWarpedNoise",
            category="latent/video",
            inputs=[
                io.Image.Input("video", tooltip="Pass 1 output video frames [T, H, W, 3]"),
                io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8),
                io.Int.Input("length", default=49, min=1, max=nodes.MAX_RESOLUTION, step=1,
                             tooltip="Number of pixel frames"),
                io.Int.Input("batch_size", default=1, min=1, max=64),
            ],
            outputs=[
                io.Latent.Output(display_name="warped_noise"),
            ],
        )

    @classmethod
    def execute(cls, video, width, height, length, batch_size) -> io.NodeOutput:
        """Warp Gaussian noise along the video's optical flow and return it
        as a latent-shaped tensor dict ``{"samples": (B, C, T, H, W)}``.

        Raises:
            RuntimeError: if the optional 'rp' package is not installed.
        """
        import numpy as np

        try:
            import rp
            # Allow rp to auto-install its Go-with-the-Flow dependencies.
            rp.r._pip_import_autoyes = True
            rp.git_import('CommonSource')
            import rp.git.CommonSource.noise_warp as nw
        except ImportError as e:
            # Chain the original error so the actual missing module is
            # visible in the traceback instead of being swallowed.
            raise RuntimeError(
                "VOIDWarpedNoise requires the 'rp' package. Install with: pip install rp"
            ) from e

        # Latent grid size: 8x spatial downscale and 4x temporal compression
        # with one leading key frame (presumably matching the video VAE used
        # by this model family — TODO confirm against the VAE config).
        temporal_compression = 4
        latent_t = ((length - 1) // temporal_compression) + 1
        latent_h = height // 8
        latent_w = width // 8

        # Convert the first `length` frames to uint8 HWC images for rp.
        vid = video[:length].cpu().numpy()
        vid_uint8 = (vid * 255).clip(0, 255).astype(np.uint8)

        frames = [vid_uint8[i] for i in range(vid_uint8.shape[0])]
        frames = rp.resize_images_to_hold(frames, height=height, width=width)
        frames = rp.crop_images(frames, height=height, width=width, origin='center')
        frames = rp.as_numpy_array(frames)

        # Scale factors handed to the warper: frames are processed at 1/2
        # resolution, flow at 8x; downscale_factor folds both back so the
        # warped noise lands at the 8x-downscaled latent resolution.
        FRAME = 2**-1
        FLOW = 2**3
        LATENT_SCALE = 8

        warp_output = nw.get_noise_from_video(
            frames,
            remove_background=False,
            visualize=False,
            save_files=False,
            noise_channels=16,
            output_folder=None,
            resize_frames=FRAME,
            resize_flow=FLOW,
            downscale_factor=round(FRAME * FLOW) * LATENT_SCALE,
        )

        warped_np = warp_output.numpy_noises  # (T, H, W, C)
        if warped_np.dtype == np.float16:
            # cv2.resize below does not accept float16.
            warped_np = warped_np.astype(np.float32)

        import cv2

        # Temporally resample to the latent frame count if they differ.
        if warped_np.shape[0] != latent_t:
            indices = np.linspace(0, warped_np.shape[0] - 1, latent_t).astype(int)
            warped_np = warped_np[indices]

        # Spatially resize, channel by channel, to the latent resolution.
        if warped_np.shape[1] != latent_h or warped_np.shape[2] != latent_w:
            resized = []
            for t_idx in range(latent_t):
                frame = warped_np[t_idx]
                ch_resized = [
                    cv2.resize(frame[:, :, c], (latent_w, latent_h),
                               interpolation=cv2.INTER_LINEAR)
                    for c in range(frame.shape[2])
                ]
                resized.append(np.stack(ch_resized, axis=2))
            warped_np = np.stack(resized, axis=0)

        # (T, H, W, C) -> (B, C, T, H, W)
        warped_tensor = torch.from_numpy(
            warped_np.transpose(3, 0, 1, 2)
        ).float().unsqueeze(0)

        if batch_size > 1:
            warped_tensor = warped_tensor.repeat(batch_size, 1, 1, 1, 1)

        warped_tensor = warped_tensor.to(comfy.model_management.intermediate_device())

        return io.NodeOutput({"samples": warped_tensor})
|
||||||
|
|
||||||
|
|
||||||
|
class Noise_FromLatent:
    """Adapt a pre-computed LATENT dict to the sampler NOISE interface.

    Stores the latent's ``samples`` tensor and hands back a fresh CPU copy
    every time noise is requested.
    """

    def __init__(self, latent_dict):
        # The noise is fully determined by the stored tensor, so the seed
        # attribute (required by the NOISE interface) is a fixed placeholder.
        self.seed = 0
        self._samples = latent_dict["samples"]

    def generate_noise(self, input_latent):
        # The incoming latent is intentionally ignored; return a copy so
        # callers cannot mutate the stored tensor in place.
        samples_copy = self._samples.clone()
        return samples_copy.cpu()
|
||||||
|
|
||||||
|
|
||||||
|
class VOIDWarpedNoiseSource(io.ComfyNode):
    """Convert a LATENT (e.g. from VOIDWarpedNoise) into a NOISE source
    for use with SamplerCustomAdvanced."""

    @classmethod
    def define_schema(cls):
        """Schema: one LATENT input, one NOISE output."""
        latent_input = io.Latent.Input(
            "warped_noise",
            tooltip="Warped noise latent from VOIDWarpedNoise",
        )
        return io.Schema(
            node_id="VOIDWarpedNoiseSource",
            category="sampling/custom_sampling/noise",
            inputs=[latent_input],
            outputs=[io.Noise.Output()],
        )

    @classmethod
    def execute(cls, warped_noise) -> io.NodeOutput:
        """Wrap the latent dict in a Noise_FromLatent adapter."""
        noise_source = Noise_FromLatent(warped_noise)
        return io.NodeOutput(noise_source)
|
||||||
|
|
||||||
|
|
||||||
class VOIDExtension(ComfyExtension):
    """Extension entry point that registers the VOID node classes."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        # Order mirrors the two-pass workflow: preprocessing, conditioning,
        # then the Pass 2 warped-noise nodes.
        node_classes = [
            VOIDQuadmaskPreprocess,
            VOIDInpaintConditioning,
            VOIDWarpedNoise,
            VOIDWarpedNoiseSource,
        ]
        return node_classes
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user