From a16fc7ee98fd598b058f4f0c0a598f3ca4d2f256 Mon Sep 17 00:00:00 2001
From: Talmaj Marinc <talmaj@comfy.org>
Date: Fri, 10 Apr 2026 15:03:05 +0200
Subject: [PATCH] Remove SparkVSR-related code.

---
 comfy_extras/nodes_cogvideox.py | 137 ------------------------------
 convert_sparkvsr_to_comfy.py    | 144 --------------------------------
 nodes.py                        |   1 -
 3 files changed, 282 deletions(-)
 delete mode 100644 comfy_extras/nodes_cogvideox.py
 delete mode 100644 convert_sparkvsr_to_comfy.py

diff --git a/comfy_extras/nodes_cogvideox.py b/comfy_extras/nodes_cogvideox.py
deleted file mode 100644
index 59aa74cee..000000000
--- a/comfy_extras/nodes_cogvideox.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import nodes
-import node_helpers
-import torch
-import comfy.model_management
-import comfy.utils
-from comfy_api.latest import io, ComfyExtension
-from typing_extensions import override
-
-class SparkVSRConditioning(io.ComfyNode):
-    """Conditioning node for SparkVSR video super-resolution.
-
-    Encodes LQ video and optional HR reference frames through the VAE,
-    builds the concat conditioning for the CogVideoX I2V model.
-    """
-
-    @classmethod
-    def define_schema(cls):
-        return io.Schema(
-            node_id="SparkVSRConditioning",
-            category="conditioning/video_models",
-            inputs=[
-                io.Conditioning.Input("positive"),
-                io.Conditioning.Input("negative"),
-                io.Vae.Input("vae"),
-                io.Image.Input("lq_video"),
-                io.Int.Input("width", default=832, min=16, max=nodes.MAX_RESOLUTION, step=8),
-                io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=8),
-                io.Int.Input("length", default=49, min=1, max=nodes.MAX_RESOLUTION, step=1),
-                io.Int.Input("batch_size", default=1, min=1, max=64),
-                io.Image.Input("ref_frames", optional=True),
-                io.Combo.Input("ref_mode", options=["auto", "manual"], default="auto", optional=True),
-                io.String.Input("ref_indices", default="", optional=True),
-                io.Float.Input("ref_guidance_scale", default=1.0, min=0.0, max=10.0, step=0.1, optional=True),
-            ],
-            outputs=[
-                io.Conditioning.Output(display_name="positive"),
-                io.Conditioning.Output(display_name="negative"),
-                io.Latent.Output(display_name="latent"),
-            ],
-        )
-
-    @classmethod
-    def execute(cls, positive, negative, vae, lq_video, width, height, length,
-                batch_size, ref_frames=None, ref_mode="auto", ref_indices="",
-                ref_guidance_scale=1.0) -> io.NodeOutput:
-
-        temporal_compression = 4
-        latent_t = ((length - 1) // temporal_compression) + 1
-        latent_h = height // 8
-        latent_w = width // 8
-
-        # Base latent (noise will be added by KSampler)
-        latent = torch.zeros(
-            [batch_size, 16, latent_t, latent_h, latent_w],
-            device=comfy.model_management.intermediate_device()
-        )
-
-        # Encode LQ video → this becomes the base latent (KSampler adds noise to this)
-        lq = lq_video[:length]
-        lq = comfy.utils.common_upscale(lq.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
-        lq_latent = vae.encode(lq[:, :, :, :3])
-
-        # Ensure temporal dim matches
-        if lq_latent.shape[2] > latent_t:
-            lq_latent = lq_latent[:, :, :latent_t]
-        elif lq_latent.shape[2] < latent_t:
-            pad = latent_t - lq_latent.shape[2]
-            lq_latent = torch.cat([lq_latent, lq_latent[:, :, -1:].repeat(1, 1, pad, 1, 1)], dim=2)
-
-        # Build reference latent (16ch) — goes as concat_latent_image
-        # concat_cond in model_base will concatenate this with the noise (16ch) → 32ch total
-        ref_latent = torch.zeros_like(lq_latent)
-
-        if ref_frames is not None:
-            num_video_frames = lq_video.shape[0]
-
-            # Determine reference indices
-            if ref_mode == "manual" and ref_indices.strip():
-                indices = [int(x.strip()) for x in ref_indices.split(",") if x.strip()]
-            else:
-                indices = _select_indices(num_video_frames)
-
-            # Encode each reference frame and place at its temporal position.
-            # SparkVSR places refs at specific latent indices, rest stays zeros.
-            for ref_idx in indices:
-                if ref_idx >= ref_frames.shape[0]:
-                    continue
-
-                frame = ref_frames[ref_idx:ref_idx + 1]
-                frame = comfy.utils.common_upscale(frame.movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
-                frame_latent = vae.encode(frame[:, :, :, :3])
-
-                target_lat_idx = ref_idx // temporal_compression
-                if target_lat_idx < latent_t:
-                    ref_latent[:, :, target_lat_idx] = frame_latent[:, :, 0]
-
-        # Set ref latent as concat conditioning (16ch, model_base.concat_cond adds it to noise)
-        if ref_guidance_scale != 1.0 and ref_frames is not None:
-            # CFG: positive gets real refs, negative gets zero refs
-            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": ref_latent})
-            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": torch.zeros_like(ref_latent)})
-        else:
-            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": ref_latent})
-            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": ref_latent})
-
-        # LQ latent is the base — KSampler will noise it and denoise
-        out_latent = {"samples": lq_latent}
-        return io.NodeOutput(positive, negative, out_latent)
-
-
-def _select_indices(num_frames, max_refs=None):
-    """Auto-select reference frame indices (first, evenly spaced, last)."""
-    if max_refs is None:
-        max_refs = (num_frames - 1) // 4 + 1
-    max_refs = min(max_refs, 3)
-
-    if num_frames <= 1:
-        return [0]
-    if max_refs == 1:
-        return [0]
-    if max_refs == 2:
-        return [0, num_frames - 1]
-
-    mid = num_frames // 2
-    return [0, mid, num_frames - 1]
-
-
-class CogVideoXExtension(ComfyExtension):
-    @override
-    async def get_node_list(self) -> list[type[io.ComfyNode]]:
-        return [
-            SparkVSRConditioning,
-        ]
-
-
-async def comfy_entrypoint() -> CogVideoXExtension:
-    return CogVideoXExtension()
diff --git a/convert_sparkvsr_to_comfy.py b/convert_sparkvsr_to_comfy.py
deleted file mode 100644
index 891c14428..000000000
--- a/convert_sparkvsr_to_comfy.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-"""Convert SparkVSR/CogVideoX diffusers checkpoint to ComfyUI format.
-
-Usage:
-    python convert_sparkvsr_to_comfy.py --model_dir path/to/sparkvsr-checkpoint \
-        --output_dir ComfyUI/models/
-
-This creates two files:
-    - diffusion_models/cogvideox_sparkvsr.safetensors  (transformer)
-    - vae/cogvideox_vae.safetensors                     (VAE)
-
-T5-XXL text encoder does not need conversion — use existing ComfyUI T5 weights.
-"""
-
-import argparse
-import os
-import torch
-from safetensors.torch import load_file, save_file
-
-
-def remap_transformer_keys(state_dict):
-    """Remap diffusers transformer keys to ComfyUI CogVideoX naming."""
-    new_sd = {}
-    for k, v in state_dict.items():
-        new_key = k
-
-        # Patch embedding
-        new_key = new_key.replace("patch_embed.proj.", "patch_embed.proj.")
-        new_key = new_key.replace("patch_embed.text_proj.", "patch_embed.text_proj.")
-        new_key = new_key.replace("patch_embed.pos_embedding", "patch_embed.pos_embedding")
-
-        # Time embedding: diffusers uses time_embedding.linear_1/2, we use time_embedding_linear_1/2
-        new_key = new_key.replace("time_embedding.linear_1.", "time_embedding_linear_1.")
-        new_key = new_key.replace("time_embedding.linear_2.", "time_embedding_linear_2.")
-
-        # OFS embedding
-        new_key = new_key.replace("ofs_embedding.linear_1.", "ofs_embedding_linear_1.")
-        new_key = new_key.replace("ofs_embedding.linear_2.", "ofs_embedding_linear_2.")
-
-        # Transformer blocks: diffusers uses transformer_blocks, we use blocks
-        new_key = new_key.replace("transformer_blocks.", "blocks.")
-
-        # Attention: diffusers uses attn1.to_q/k/v/out, we use q/k/v/attn_out
-        new_key = new_key.replace(".attn1.to_q.", ".q.")
-        new_key = new_key.replace(".attn1.to_k.", ".k.")
-        new_key = new_key.replace(".attn1.to_v.", ".v.")
-        new_key = new_key.replace(".attn1.to_out.0.", ".attn_out.")
-        new_key = new_key.replace(".attn1.norm_q.", ".norm_q.")
-        new_key = new_key.replace(".attn1.norm_k.", ".norm_k.")
-
-        # Feed-forward: diffusers uses ff.net.0.proj/ff.net.2, we use ff_proj/ff_out
-        new_key = new_key.replace(".ff.net.0.proj.", ".ff_proj.")
-        new_key = new_key.replace(".ff.net.2.", ".ff_out.")
-
-        # Output norms
-        new_key = new_key.replace("norm_final.", "norm_final.")
-        new_key = new_key.replace("norm_out.linear.", "norm_out.linear.")
-        new_key = new_key.replace("norm_out.norm.", "norm_out.norm.")
-
-        new_sd[new_key] = v
-
-    return new_sd
-
-
-def remap_vae_keys(state_dict):
-    """Remap diffusers VAE keys to ComfyUI CogVideoX naming.
-
-    The VAE architecture is directly ported so most keys should match.
-    Main differences are in block naming.
-    """
-    new_sd = {}
-    for k, v in state_dict.items():
-        new_key = k
-
-        # Encoder blocks
-        new_key = new_key.replace("encoder.down_blocks.", "encoder.down_blocks.")
-        new_key = new_key.replace("encoder.mid_block.", "encoder.mid_block.")
-
-        # Decoder blocks
-        new_key = new_key.replace("decoder.up_blocks.", "decoder.up_blocks.")
-        new_key = new_key.replace("decoder.mid_block.", "decoder.mid_block.")
-
-        # Resnet blocks within down/up/mid
-        new_key = new_key.replace(".resnets.", ".resnets.")
-
-        # CausalConv3d: diffusers stores as .conv.weight inside CausalConv3d
-        # Our CausalConv3d also has .conv.weight, so this should match
-
-        # Downsamplers/Upsamplers
-        new_key = new_key.replace(".downsamplers.0.", ".downsamplers.0.")
-        new_key = new_key.replace(".upsamplers.0.", ".upsamplers.0.")
-
-        new_sd[new_key] = v
-
-    return new_sd
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Convert SparkVSR/CogVideoX to ComfyUI format")
-    parser.add_argument("--model_dir", type=str, required=True,
-                        help="Path to diffusers pipeline directory (contains transformer/, vae/, etc.)")
-    parser.add_argument("--output_dir", type=str, default=".",
-                        help="Output base directory (will create diffusion_models/ and vae/ subdirs)")
-    args = parser.parse_args()
-
-    # Load transformer
-    transformer_dir = os.path.join(args.model_dir, "transformer")
-    print(f"Loading transformer from {transformer_dir}...")
-    transformer_sd = {}
-    for f in sorted(os.listdir(transformer_dir)):
-        if f.endswith(".safetensors"):
-            sd = load_file(os.path.join(transformer_dir, f))
-            transformer_sd.update(sd)
-
-    transformer_sd = remap_transformer_keys(transformer_sd)
-
-    out_dir = os.path.join(args.output_dir, "diffusion_models")
-    os.makedirs(out_dir, exist_ok=True)
-    out_path = os.path.join(out_dir, "cogvideox_sparkvsr.safetensors")
-    print(f"Saving transformer to {out_path} ({len(transformer_sd)} keys)")
-    save_file(transformer_sd, out_path)
-
-    # Load VAE
-    vae_dir = os.path.join(args.model_dir, "vae")
-    print(f"Loading VAE from {vae_dir}...")
-    vae_sd = {}
-    for f in sorted(os.listdir(vae_dir)):
-        if f.endswith(".safetensors"):
-            sd = load_file(os.path.join(vae_dir, f))
-            vae_sd.update(sd)
-
-    vae_sd = remap_vae_keys(vae_sd)
-
-    out_dir = os.path.join(args.output_dir, "vae")
-    os.makedirs(out_dir, exist_ok=True)
-    out_path = os.path.join(out_dir, "cogvideox_vae.safetensors")
-    print(f"Saving VAE to {out_path} ({len(vae_sd)} keys)")
-    save_file(vae_sd, out_path)
-
-    print("Done! T5-XXL text encoder does not need conversion.")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/nodes.py b/nodes.py
index f90cee732..ba2fa0246 100644
--- a/nodes.py
+++ b/nodes.py
@@ -2458,7 +2458,6 @@ async def init_builtin_extra_nodes():
         "nodes_painter.py",
         "nodes_curve.py",
         "nodes_rtdetr.py",
-        "nodes_cogvideox.py",
     ]
 
     import_failed = []