test

2026-01-28 23:30:16 +08:00 · 2026-01-11 03:45:35 -08:00
7 changed files with 44 additions and 201 deletions
--- a/README.md
+++ b/README.md
@ -183,7 +183,7 @@ Simply download, extract with [7-Zip](https://7-zip.org) or with the windows exp

 If you have trouble extracting it, right click the file -> properties -> unblock

-The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.
+Update your Nvidia drivers if it doesn't start.

 #### Alternative Downloads:

@ -212,7 +212,7 @@ Python 3.14 works but you may encounter issues with the torch compile node. The

 Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12

-torch 2.4 and above is supported but some features might only work on newer versions. We generally recommend using the latest major version of pytorch with the latest cuda version unless it is less than 2 weeks old.
+torch 2.4 and above is supported but some features might only work on newer versions. We generally recommend using the latest major version of pytorch unless it is less than 2 weeks old.

 ### Instructions:

@ -458,3 +458,4 @@ This will use a snapshot of the legacy frontend preserved in the [ComfyUI Legacy
 ### Which GPU should I buy for this?

 [See this page for some recommendations](https://github.com/comfyanonymous/ComfyUI/wiki/Which-GPU-should-I-buy-for-ComfyUI)
+
--- a/comfy/clip_model.py
+++ b/comfy/clip_model.py
@ -1,7 +1,6 @@
 import torch
 from comfy.ldm.modules.attention import optimized_attention_for_device
 import comfy.ops
-import math

 def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711], crop=True):
    image = image[:, :, :, :3] if image.shape[3] > 3 else image
@ -22,39 +21,6 @@ def clip_preprocess(image, size=224, mean=[0.48145466, 0.4578275, 0.40821073], s
    image = torch.clip((255. * image), 0, 255).round() / 255.0
    return (image - mean.view([3,1,1])) / std.view([3,1,1])

-def siglip2_flex_calc_resolution(oh, ow, patch_size, max_num_patches, eps=1e-5):
-    def scale_dim(size, scale):
-        scaled = math.ceil(size * scale / patch_size) * patch_size
-        return max(patch_size, int(scaled))
-
-    # Binary search for optimal scale
-    lo, hi = eps / 10, 100.0
-    while hi - lo >= eps:
-        mid = (lo + hi) / 2
-        h, w = scale_dim(oh, mid), scale_dim(ow, mid)
-        if (h // patch_size) * (w // patch_size) <= max_num_patches:
-            lo = mid
-        else:
-            hi = mid
-
-    return scale_dim(oh, lo), scale_dim(ow, lo)
-
-def siglip2_preprocess(image, size, patch_size, num_patches, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], crop=True):
-    if size > 0:
-        return clip_preprocess(image, size=size, mean=mean, std=std, crop=crop)
-
-    image = image[:, :, :, :3] if image.shape[3] > 3 else image
-    mean = torch.tensor(mean, device=image.device, dtype=image.dtype)
-    std = torch.tensor(std, device=image.device, dtype=image.dtype)
-    image = image.movedim(-1, 1)
-
-    b, c, h, w = image.shape
-    h, w = siglip2_flex_calc_resolution(h, w, patch_size, num_patches)
-
-    image = torch.nn.functional.interpolate(image, size=(h, w), mode="bilinear", antialias=True)
-    image = torch.clip((255. * image), 0, 255).round() / 255.0
-    return (image - mean.view([3, 1, 1])) / std.view([3, 1, 1])
-
 class CLIPAttention(torch.nn.Module):
    def __init__(self, embed_dim, heads, dtype, device, operations):
        super().__init__()
@ -209,27 +175,6 @@ class CLIPTextModel(torch.nn.Module):
        out = self.text_projection(x[2])
        return (x[0], x[1], out, x[2])

-def siglip2_pos_embed(embed_weight, embeds, orig_shape):
-    embed_weight_len = round(embed_weight.shape[0] ** 0.5)
-    embed_weight = comfy.ops.cast_to_input(embed_weight, embeds).movedim(1, 0).reshape(1, -1, embed_weight_len, embed_weight_len)
-    embed_weight = torch.nn.functional.interpolate(embed_weight, size=orig_shape, mode="bilinear", align_corners=False, antialias=True)
-    embed_weight = embed_weight.reshape(-1, embed_weight.shape[-2] * embed_weight.shape[-1]).movedim(0, 1)
-    return embeds + embed_weight
-
-class Siglip2Embeddings(torch.nn.Module):
-    def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, model_type="", num_patches=None, dtype=None, device=None, operations=None):
-        super().__init__()
-        self.patch_embedding = operations.Linear(num_channels * patch_size * patch_size, embed_dim, dtype=dtype, device=device)
-        self.position_embedding = operations.Embedding(num_patches, embed_dim, dtype=dtype, device=device)
-        self.patch_size = patch_size
-
-    def forward(self, pixel_values):
-        b, c, h, w = pixel_values.shape
-        img = pixel_values.movedim(1, -1).reshape(b, h // self.patch_size, self.patch_size, w // self.patch_size, self.patch_size, c)
-        img = img.permute(0, 1, 3, 2, 4, 5)
-        img = img.reshape(b, img.shape[1] * img.shape[2], -1)
-        img = self.patch_embedding(img)
-        return siglip2_pos_embed(self.position_embedding.weight, img, (h // self.patch_size, w // self.patch_size))

 class CLIPVisionEmbeddings(torch.nn.Module):
    def __init__(self, embed_dim, num_channels=3, patch_size=14, image_size=224, model_type="", dtype=None, device=None, operations=None):
@ -273,11 +218,8 @@ class CLIPVision(torch.nn.Module):
        intermediate_activation = config_dict["hidden_act"]
        model_type = config_dict["model_type"]

-        if model_type in ["siglip2_vision_model"]:
-            self.embeddings = Siglip2Embeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, num_patches=config_dict.get("num_patches", None), dtype=dtype, device=device, operations=operations)
-        else:
-            self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, dtype=dtype, device=device, operations=operations)
-        if model_type in ["siglip_vision_model", "siglip2_vision_model"]:
+        self.embeddings = CLIPVisionEmbeddings(embed_dim, config_dict["num_channels"], config_dict["patch_size"], config_dict["image_size"], model_type=model_type, dtype=dtype, device=device, operations=operations)
+        if model_type == "siglip_vision_model":
            self.pre_layrnorm = lambda a: a
            self.output_layernorm = True
        else:
--- a/comfy/clip_vision.py
+++ b/comfy/clip_vision.py
@ -21,7 +21,6 @@ clip_preprocess = comfy.clip_model.clip_preprocess  # Prevent some stuff from br
 IMAGE_ENCODERS = {
    "clip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
    "siglip_vision_model": comfy.clip_model.CLIPVisionModelProjection,
-    "siglip2_vision_model": comfy.clip_model.CLIPVisionModelProjection,
    "dinov2": comfy.image_encoders.dino2.Dinov2Model,
 }

@ -33,10 +32,9 @@ class ClipVisionModel():
        self.image_size = config.get("image_size", 224)
        self.image_mean = config.get("image_mean", [0.48145466, 0.4578275, 0.40821073])
        self.image_std = config.get("image_std", [0.26862954, 0.26130258, 0.27577711])
-        self.model_type = config.get("model_type", "clip_vision_model")
-        self.config = config.copy()
-        model_class = IMAGE_ENCODERS.get(self.model_type)
-        if self.model_type == "siglip_vision_model":
+        model_type = config.get("model_type", "clip_vision_model")
+        model_class = IMAGE_ENCODERS.get(model_type)
+        if model_type == "siglip_vision_model":
            self.return_all_hidden_states = True
        else:
            self.return_all_hidden_states = False
@ -57,10 +55,7 @@ class ClipVisionModel():

    def encode_image(self, image, crop=True):
        comfy.model_management.load_model_gpu(self.patcher)
-        if self.model_type == "siglip2_vision_model":
-            pixel_values = comfy.clip_model.siglip2_preprocess(image.to(self.load_device), size=self.image_size, patch_size=self.config.get("patch_size", 16), num_patches=self.config.get("num_patches", 256), mean=self.image_mean, std=self.image_std, crop=crop).float()
-        else:
-            pixel_values = comfy.clip_model.clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
+        pixel_values = comfy.clip_model.clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=crop).float()
        out = self.model(pixel_values=pixel_values, intermediate_output='all' if self.return_all_hidden_states else -2)

        outputs = Output()
@ -112,14 +107,10 @@ def load_clipvision_from_sd(sd, prefix="", convert_keys=False):
    elif "vision_model.encoder.layers.22.layer_norm1.weight" in sd:
        embed_shape = sd["vision_model.embeddings.position_embedding.weight"].shape[0]
        if sd["vision_model.encoder.layers.0.layer_norm1.weight"].shape[0] == 1152:
-            patch_embedding_shape = sd["vision_model.embeddings.patch_embedding.weight"].shape
-            if len(patch_embedding_shape) == 2:
-                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip2_base_naflex.json")
-            else:
-                if embed_shape == 729:
-                    json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
-                elif embed_shape == 1024:
-                    json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
+            if embed_shape == 729:
+                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_384.json")
+            elif embed_shape == 1024:
+                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_siglip_512.json")
        elif embed_shape == 577:
            if "multi_modal_projector.linear_1.bias" in sd:
                json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "clip_vision_config_vitl_336_llava.json")
--- a/comfy/clip_vision_siglip2_base_naflex.json
+++ b/comfy/clip_vision_siglip2_base_naflex.json
@ -1,14 +0,0 @@
-{
-  "num_channels": 3,
-  "hidden_act": "gelu_pytorch_tanh",
-  "hidden_size": 1152,
-  "image_size": -1,
-  "intermediate_size": 4304,
-  "model_type": "siglip2_vision_model",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 27,
-  "patch_size": 16,
-  "num_patches": 256,
-  "image_mean": [0.5, 0.5, 0.5],
-  "image_std": [0.5, 0.5, 0.5]
-}
--- a/comfy/ldm/lightricks/av_model.py
+++ b/comfy/ldm/lightricks/av_model.py
@ -11,69 +11,6 @@ from comfy.ldm.lightricks.model import (
 from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
 import comfy.ldm.common_dit

-class CompressedTimestep:
-    """Store video timestep embeddings in compressed form using per-frame indexing."""
-    __slots__ = ('data', 'batch_size', 'num_frames', 'patches_per_frame', 'feature_dim')
-
-    def __init__(self, tensor: torch.Tensor, patches_per_frame: int):
-        """
-        tensor: [batch_size, num_tokens, feature_dim] tensor where num_tokens = num_frames * patches_per_frame
-        patches_per_frame: Number of spatial patches per frame (height * width in latent space)
-        """
-        self.batch_size, num_tokens, self.feature_dim = tensor.shape
-
-        # Check if compression is valid (num_tokens must be divisible by patches_per_frame)
-        if num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
-            self.patches_per_frame = patches_per_frame
-            self.num_frames = num_tokens // patches_per_frame
-
-            # Reshape to [batch, frames, patches_per_frame, feature_dim] and store one value per frame
-            # All patches in a frame are identical, so we only keep the first one
-            reshaped = tensor.view(self.batch_size, self.num_frames, patches_per_frame, self.feature_dim)
-            self.data = reshaped[:, :, 0, :].contiguous()  # [batch, frames, feature_dim]
-        else:
-            # Not divisible or too small - store directly without compression
-            self.patches_per_frame = 1
-            self.num_frames = num_tokens
-            self.data = tensor
-
-    def expand(self):
-        """Expand back to original tensor."""
-        if self.patches_per_frame == 1:
-            return self.data
-
-        # [batch, frames, feature_dim] -> [batch, frames, patches_per_frame, feature_dim] -> [batch, tokens, feature_dim]
-        expanded = self.data.unsqueeze(2).expand(self.batch_size, self.num_frames, self.patches_per_frame, self.feature_dim)
-        return expanded.reshape(self.batch_size, -1, self.feature_dim)
-
-    def expand_for_computation(self, scale_shift_table: torch.Tensor, batch_size: int, indices: slice = slice(None, None)):
-        """Compute ada values on compressed per-frame data, then expand spatially."""
-        num_ada_params = scale_shift_table.shape[0]
-
-        # No compression - compute directly
-        if self.patches_per_frame == 1:
-            num_tokens = self.data.shape[1]
-            dim_per_param = self.feature_dim // num_ada_params
-            reshaped = self.data.reshape(batch_size, num_tokens, num_ada_params, dim_per_param)[:, :, indices, :]
-            table_values = scale_shift_table[indices].unsqueeze(0).unsqueeze(0).to(device=self.data.device, dtype=self.data.dtype)
-            ada_values = (table_values + reshaped).unbind(dim=2)
-            return ada_values
-
-        # Compressed: compute on per-frame data then expand spatially
-        # Reshape: [batch, frames, feature_dim] -> [batch, frames, num_ada_params, dim_per_param]
-        frame_reshaped = self.data.reshape(batch_size, self.num_frames, num_ada_params, -1)[:, :, indices, :]
-        table_values = scale_shift_table[indices].unsqueeze(0).unsqueeze(0).to(
-            device=self.data.device, dtype=self.data.dtype
-        )
-        frame_ada = (table_values + frame_reshaped).unbind(dim=2)
-
-        # Expand each ada parameter spatially: [batch, frames, dim] -> [batch, frames, patches, dim] -> [batch, tokens, dim]
-        return tuple(
-            frame_val.unsqueeze(2).expand(batch_size, self.num_frames, self.patches_per_frame, -1)
-            .reshape(batch_size, -1, frame_val.shape[-1])
-            for frame_val in frame_ada
-        )
-
 class BasicAVTransformerBlock(nn.Module):
    def __init__(
        self,
@ -182,9 +119,6 @@ class BasicAVTransformerBlock(nn.Module):
    def get_ada_values(
        self, scale_shift_table: torch.Tensor, batch_size: int, timestep: torch.Tensor, indices: slice = slice(None, None)
    ):
-        if isinstance(timestep, CompressedTimestep):
-            return timestep.expand_for_computation(scale_shift_table, batch_size, indices)
-
        num_ada_params = scale_shift_table.shape[0]

        ada_values = (
@ -212,7 +146,10 @@ class BasicAVTransformerBlock(nn.Module):
            gate_timestep,
        )

-        return (*scale_shift_ada_values, *gate_ada_values)
+        scale_shift_chunks = [t.squeeze(2) for t in scale_shift_ada_values]
+        gate_ada_values = [t.squeeze(2) for t in gate_ada_values]
+
+        return (*scale_shift_chunks, *gate_ada_values)

    def forward(
        self,
@ -606,80 +543,72 @@ class LTXAVModel(LTXVModel):
        if grid_mask is not None:
            timestep = timestep[:, grid_mask]

-        timestep_scaled = timestep * self.timestep_scale_multiplier
-
+        timestep = timestep * self.timestep_scale_multiplier
        v_timestep, v_embedded_timestep = self.adaln_single(
-            timestep_scaled.flatten(),
+            timestep.flatten(),
            {"resolution": None, "aspect_ratio": None},
            batch_size=batch_size,
            hidden_dtype=hidden_dtype,
        )

-        # Calculate patches_per_frame from orig_shape: [batch, channels, frames, height, width]
-        # Video tokens are arranged as (frames * height * width), so patches_per_frame = height * width
-        orig_shape = kwargs.get("orig_shape")
-        v_patches_per_frame = None
-        if orig_shape is not None and len(orig_shape) == 5:
-            # orig_shape[3] = height, orig_shape[4] = width (in latent space)
-            v_patches_per_frame = orig_shape[3] * orig_shape[4]
-
-        # Reshape to [batch_size, num_tokens, dim] and compress for storage
-        v_timestep = CompressedTimestep(v_timestep.view(batch_size, -1, v_timestep.shape[-1]), v_patches_per_frame)
-        v_embedded_timestep = CompressedTimestep(v_embedded_timestep.view(batch_size, -1, v_embedded_timestep.shape[-1]), v_patches_per_frame)
+        # Second dimension is 1 or number of tokens (if timestep_per_token)
+        v_timestep = v_timestep.view(batch_size, -1, v_timestep.shape[-1])
+        v_embedded_timestep = v_embedded_timestep.view(
+            batch_size, -1, v_embedded_timestep.shape[-1]
+        )

        # Prepare audio timestep
        a_timestep = kwargs.get("a_timestep")
        if a_timestep is not None:
-            a_timestep_scaled = a_timestep * self.timestep_scale_multiplier
-            a_timestep_flat = a_timestep_scaled.flatten()
-            timestep_flat = timestep_scaled.flatten()
+            a_timestep = a_timestep * self.timestep_scale_multiplier
            av_ca_factor = self.av_ca_timestep_scale_multiplier / self.timestep_scale_multiplier

-            # Cross-attention timesteps - compress these too
            av_ca_audio_scale_shift_timestep, _ = self.av_ca_audio_scale_shift_adaln_single(
-                a_timestep_flat,
+                a_timestep.flatten(),
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
            )
            av_ca_video_scale_shift_timestep, _ = self.av_ca_video_scale_shift_adaln_single(
-                timestep_flat,
+                timestep.flatten(),
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
            )
            av_ca_a2v_gate_noise_timestep, _ = self.av_ca_a2v_gate_adaln_single(
-                timestep_flat * av_ca_factor,
+                timestep.flatten() * av_ca_factor,
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
            )
            av_ca_v2a_gate_noise_timestep, _ = self.av_ca_v2a_gate_adaln_single(
-                a_timestep_flat * av_ca_factor,
+                a_timestep.flatten() * av_ca_factor,
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
            )

-            # Compress cross-attention timesteps (only video side, audio is too small to benefit)
-            cross_av_timestep_ss = [
-                av_ca_audio_scale_shift_timestep.view(batch_size, -1, av_ca_audio_scale_shift_timestep.shape[-1]),
-                CompressedTimestep(av_ca_video_scale_shift_timestep.view(batch_size, -1, av_ca_video_scale_shift_timestep.shape[-1]), v_patches_per_frame),  # video - compressed
-                CompressedTimestep(av_ca_a2v_gate_noise_timestep.view(batch_size, -1, av_ca_a2v_gate_noise_timestep.shape[-1]), v_patches_per_frame),  # video - compressed
-                av_ca_v2a_gate_noise_timestep.view(batch_size, -1, av_ca_v2a_gate_noise_timestep.shape[-1]),
-            ]
-
            a_timestep, a_embedded_timestep = self.audio_adaln_single(
-                a_timestep_flat,
+                a_timestep.flatten(),
                {"resolution": None, "aspect_ratio": None},
                batch_size=batch_size,
                hidden_dtype=hidden_dtype,
            )
-            # Audio timesteps
            a_timestep = a_timestep.view(batch_size, -1, a_timestep.shape[-1])
-            a_embedded_timestep = a_embedded_timestep.view(batch_size, -1, a_embedded_timestep.shape[-1])
+            a_embedded_timestep = a_embedded_timestep.view(
+                batch_size, -1, a_embedded_timestep.shape[-1]
+            )
+            cross_av_timestep_ss = [
+                av_ca_audio_scale_shift_timestep,
+                av_ca_video_scale_shift_timestep,
+                av_ca_a2v_gate_noise_timestep,
+                av_ca_v2a_gate_noise_timestep,
+            ]
+            cross_av_timestep_ss = list(
+                [t.view(batch_size, -1, t.shape[-1]) for t in cross_av_timestep_ss]
+            )
        else:
-            a_timestep = timestep_scaled
+            a_timestep = timestep
            a_embedded_timestep = kwargs.get("embedded_timestep")
            cross_av_timestep_ss = []

@ -838,11 +767,6 @@ class LTXAVModel(LTXVModel):
        ax = x[1]
        v_embedded_timestep = embedded_timestep[0]
        a_embedded_timestep = embedded_timestep[1]
-
-        # Expand compressed video timestep if needed
-        if isinstance(v_embedded_timestep, CompressedTimestep):
-            v_embedded_timestep = v_embedded_timestep.expand()
-
        vx = super()._process_output(vx, v_embedded_timestep, keyframe_idxs, **kwargs)

        # Process audio output
--- a/comfy/lora.py
+++ b/comfy/lora.py
@ -322,7 +322,6 @@ def model_lora_keys_unet(model, key_map={}):
                key_map["diffusion_model.{}".format(key_lora)] = to
                key_map["transformer.{}".format(key_lora)] = to
                key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = to
-                key_map[key_lora] = to

    if isinstance(model, comfy.model_base.Kandinsky5):
        for k in sdk:
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
 comfyui-frontend-package==1.36.13
-comfyui-workflow-templates==0.8.4
+comfyui-workflow-templates==0.7.69
 comfyui-embedded-docs==0.4.0
 torch
 torchsde