Support Wan 2.1 lightVAE

Also support video taes in previews
Only first frame for now as live preview playback is currently only available through VHS custom nodes.
2026-05-14 11:07:24 +08:00 · 2025-11-26 17:14:56 +02:00 · 2025-11-26 17:00:35 +02:00
6 changed files with 67 additions and 22 deletions
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@ -382,6 +382,7 @@ class HunyuanVideo(LatentFormat):
    ]

    latent_rgb_factors_bias = [ 0.0259, -0.0192, -0.0761]
+    taesd_decoder_name = "taehv"

 class Cosmos1CV8x8x8(LatentFormat):
    latent_channels = 16
@ -445,7 +446,7 @@ class Wan21(LatentFormat):
        ]).view(1, self.latent_channels, 1, 1, 1)


-        self.taesd_decoder_name = None #TODO
+        self.taesd_decoder_name = "lighttaew2_1"

    def process_in(self, latent):
        latents_mean = self.latents_mean.to(latent.device, latent.dtype)
@ -516,6 +517,7 @@ class Wan22(Wan21):

    def __init__(self):
        self.scale_factor = 1.0
+        self.taesd_decoder_name = "lighttaew2_2"
        self.latents_mean = torch.tensor([
                -0.2289, -0.0052, -0.1323, -0.2339, -0.2799, 0.0174, 0.1838, 0.1557,
                -0.1382, 0.0542, 0.2813, 0.0891, 0.1570, -0.0098, 0.0375, -0.1825,
@ -670,6 +672,7 @@ class HunyuanVideo15(LatentFormat):
    latent_channels = 32
    latent_dimensions = 3
    scale_factor = 1.03682
+    taesd_decoder_name = "lighttaehy1_5"

 class Hunyuan3Dv2(LatentFormat):
    latent_channels = 64
--- a/comfy/ldm/wan/vae.py
+++ b/comfy/ldm/wan/vae.py
@ -231,7 +231,8 @@ class Encoder3d(nn.Module):
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[True, True, False],
-                 dropout=0.0):
+                 dropout=0.0,
+                 pruning_rate=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
@ -242,6 +243,7 @@ class Encoder3d(nn.Module):

        # dimensions
        dims = [dim * u for u in [1] + dim_mult]
+        dims = [int(d * (1 - pruning_rate)) for d in dims]
        scale = 1.0

        # init block
@ -335,7 +337,8 @@ class Decoder3d(nn.Module):
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_upsample=[False, True, True],
-                 dropout=0.0):
+                 dropout=0.0,
+                 pruning_rate=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
@ -346,6 +349,7 @@ class Decoder3d(nn.Module):

        # dimensions
        dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
+        dims = [int(d * (1 - pruning_rate)) for d in dims]
        scale = 1.0 / 2**(len(dim_mult) - 2)

        # init block
@ -449,7 +453,8 @@ class WanVAE(nn.Module):
                 num_res_blocks=2,
                 attn_scales=[],
                 temperal_downsample=[True, True, False],
-                 dropout=0.0):
+                 dropout=0.0,
+                 pruning_rate=0.0):
        super().__init__()
        self.dim = dim
        self.z_dim = z_dim
@ -461,11 +466,11 @@ class WanVAE(nn.Module):

        # modules
        self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
-                                 attn_scales, self.temperal_downsample, dropout)
+                                 attn_scales, self.temperal_downsample, dropout, pruning_rate)
        self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
        self.conv2 = CausalConv3d(z_dim, z_dim, 1)
        self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
-                                 attn_scales, self.temperal_upsample, dropout)
+                                 attn_scales, self.temperal_upsample, dropout, pruning_rate)

    def encode(self, x):
        conv_idx = [0]
--- a/comfy/sd.py
+++ b/comfy/sd.py
@ -528,13 +528,16 @@ class VAE:
                    self.memory_used_encode = lambda shape, dtype: 3300 * shape[3] * shape[4] * model_management.dtype_size(dtype)
                    self.memory_used_decode = lambda shape, dtype: 8000 * shape[3] * shape[4] * (16 * 16) * model_management.dtype_size(dtype)
                else:  # Wan 2.1 VAE
+                    pruning_rate = 0.0
+                    if sd["decoder.middle.0.residual.0.gamma"].shape[0] == 96: # lightx2v lightvae
+                        pruning_rate = 0.75
                    self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
                    self.upscale_index_formula = (4, 8, 8)
                    self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
                    self.downscale_index_formula = (4, 8, 8)
                    self.latent_dim = 3
                    self.latent_channels = 16
-                    ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0}
+                    ddconfig = {"dim": 96, "z_dim": self.latent_channels, "dim_mult": [1, 2, 4, 4], "num_res_blocks": 2, "attn_scales": [], "temperal_downsample": [False, True, True], "dropout": 0.0, "pruning_rate": pruning_rate}
                    self.first_stage_model = comfy.ldm.wan.vae.WanVAE(**ddconfig)
                    self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
                    self.memory_used_encode = lambda shape, dtype: 6000 * shape[3] * shape[4] * model_management.dtype_size(dtype)
--- a/comfy/taesd/taehv.py
+++ b/comfy/taesd/taehv.py
@ -112,13 +112,14 @@ def apply_model_with_memblocks(model, x, parallel, show_progress_bar):


 class TAEHV(nn.Module):
-    def __init__(self, latent_channels, parallel=False, decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True), latent_format=None):
+    def __init__(self, latent_channels, parallel=False, decoder_time_upscale=(True, True), decoder_space_upscale=(True, True, True), latent_format=None, show_progress_bar=True):
        super().__init__()
        self.image_channels = 3
        self.patch_size = 1
        self.latent_channels = latent_channels
        self.parallel = parallel
        self.latent_format = latent_format
+        self.show_progress_bar = show_progress_bar
        self.process_in = latent_format().process_in if latent_format is not None else (lambda x: x)
        self.process_out = latent_format().process_out if latent_format is not None else (lambda x: x)
        if self.latent_channels in [48, 32]: # Wan 2.2 and HunyuanVideo1.5
@ -144,8 +145,15 @@ class TAEHV(nn.Module):
            MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), MemBlock(n_f[2], n_f[2], act_func), nn.Upsample(scale_factor=2 if decoder_space_upscale[2] else 1), TGrow(n_f[2], 2 if decoder_time_upscale[1] else 1), conv(n_f[2], n_f[3], bias=False),
            act_func, conv(n_f[3], self.image_channels*self.patch_size**2),
        )
+        @property
+        def show_progress_bar(self):
+            return self._show_progress_bar

-    def encode(self, x, show_progress_bar=True, **kwargs):
+        @show_progress_bar.setter
+        def show_progress_bar(self, value):
+            self._show_progress_bar = value
+
+    def encode(self, x, **kwargs):
        if self.patch_size > 1: x = F.pixel_unshuffle(x, self.patch_size)
        x = x.movedim(2, 1)  # [B, C, T, H, W] -> [B, T, C, H, W]
        if x.shape[1] % 4 != 0:
@ -153,11 +161,11 @@ class TAEHV(nn.Module):
            n_pad = 4 - x.shape[1] % 4
            padding = x[:, -1:].repeat_interleave(n_pad, dim=1)
            x = torch.cat([x, padding], 1)
-        x = apply_model_with_memblocks(self.encoder, x, self.parallel, show_progress_bar).movedim(2, 1)
+        x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
        return self.process_out(x)

-    def decode(self, x, show_progress_bar=True, **kwargs):
+    def decode(self, x, **kwargs):
        x = self.process_in(x).movedim(2, 1)  # [B, C, T, H, W] -> [B, T, C, H, W]
-        x = apply_model_with_memblocks(self.decoder, x, self.parallel, show_progress_bar)
+        x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
        if self.patch_size > 1: x = F.pixel_shuffle(x, self.patch_size)
        return x[:, self.frames_to_trim:].movedim(2, 1)
--- a/latent_preview.py
+++ b/latent_preview.py
@ -2,17 +2,24 @@ import torch
 from PIL import Image
 from comfy.cli_args import args, LatentPreviewMethod
 from comfy.taesd.taesd import TAESD
+from comfy.sd import VAE
 import comfy.model_management
 import folder_paths
 import comfy.utils
 import logging

 MAX_PREVIEW_RESOLUTION = args.preview_size
+VIDEO_TAES = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"]

-def preview_to_image(latent_image):
-        latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1)  # change scale from -1..1 to 0..1
-                            .mul(0xFF)  # to 0..255
-                            )
+def preview_to_image(latent_image, do_scale=True):
+        if do_scale:
+            latents_ubyte = (((latent_image + 1.0) / 2.0).clamp(0, 1)  # change scale from -1..1 to 0..1
+                                .mul(0xFF)  # to 0..255
+                                )
+        else:
+            latents_ubyte = (latent_image.clamp(0, 1)
+                                .mul(0xFF)  # to 0..255
+                                )
        if comfy.model_management.directml_enabled:
                latents_ubyte = latents_ubyte.to(dtype=torch.uint8)
        latents_ubyte = latents_ubyte.to(device="cpu", dtype=torch.uint8, non_blocking=comfy.model_management.device_supports_non_blocking(latent_image.device))
@ -35,6 +42,10 @@ class TAESDPreviewerImpl(LatentPreviewer):
        x_sample = self.taesd.decode(x0[:1])[0].movedim(0, 2)
        return preview_to_image(x_sample)

+class TAEHVPreviewerImpl(TAESDPreviewerImpl):
+    def decode_latent_to_preview(self, x0):
+        x_sample = self.taesd.decode(x0[:1, :, :1])[0][0]
+        return preview_to_image(x_sample, do_scale=False)

 class Latent2RGBPreviewer(LatentPreviewer):
    def __init__(self, latent_rgb_factors, latent_rgb_factors_bias=None):
@ -78,8 +89,13 @@ def get_previewer(device, latent_format):

        if method == LatentPreviewMethod.TAESD:
            if taesd_decoder_path:
-                taesd = TAESD(None, taesd_decoder_path, latent_channels=latent_format.latent_channels).to(device)
-                previewer = TAESDPreviewerImpl(taesd)
+                if latent_format.taesd_decoder_name in VIDEO_TAES:
+                    taesd = VAE(comfy.utils.load_torch_file(taesd_decoder_path))
+                    taesd.first_stage_model.show_progress_bar = False
+                    previewer = TAEHVPreviewerImpl(taesd)
+                else:
+                    taesd = TAESD(None, taesd_decoder_path, latent_channels=latent_format.latent_channels).to(device)
+                    previewer = TAESDPreviewerImpl(taesd)
            else:
                logging.warning("Warning: TAESD previews enabled, but could not find models/vae_approx/{}".format(latent_format.taesd_decoder_name))

--- a/nodes.py
+++ b/nodes.py
@ -692,8 +692,10 @@ class LoraLoaderModelOnly(LoraLoader):
        return (self.load_lora(model, None, lora_name, strength_model, 0)[0],)

 class VAELoader:
+    video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5"]
+    image_taes = ["taesd", "taesdxl", "taesd3", "taef1"]
    @staticmethod
-    def vae_list():
+    def vae_list(s):
        vaes = folder_paths.get_filename_list("vae")
        approx_vaes = folder_paths.get_filename_list("vae_approx")
        sdxl_taesd_enc = False
@ -722,6 +724,11 @@ class VAELoader:
                f1_taesd_dec = True
            elif v.startswith("taef1_decoder."):
                f1_taesd_enc = True
+            else:
+                for tae in s.video_taes:
+                    if v.startswith(tae):
+                        vaes.append(v)
+
        if sd1_taesd_dec and sd1_taesd_enc:
            vaes.append("taesd")
        if sdxl_taesd_dec and sdxl_taesd_enc:
@ -765,7 +772,7 @@ class VAELoader:

    @classmethod
    def INPUT_TYPES(s):
-        return {"required": { "vae_name": (s.vae_list(), )}}
+        return {"required": { "vae_name": (s.vae_list(s), )}}
    RETURN_TYPES = ("VAE",)
    FUNCTION = "load_vae"

@ -776,10 +783,13 @@ class VAELoader:
        if vae_name == "pixel_space":
            sd = {}
            sd["pixel_space_vae"] = torch.tensor(1.0)
-        elif vae_name in ["taesd", "taesdxl", "taesd3", "taef1"]:
+        elif vae_name in self.image_taes:
            sd = self.load_taesd(vae_name)
        else:
-            vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
+            if os.path.splitext(vae_name)[0] in self.video_taes:
+                vae_path = folder_paths.get_full_path_or_raise("vae_approx", vae_name)
+            else:
+                vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
            sd = comfy.utils.load_torch_file(vae_path)
        vae = comfy.sd.VAE(sd=sd)
        vae.throw_exception_if_invalid()
Author	SHA1	Message	Date
kijai	8e93a15857	Support Wan 2.1 lightVAE	2025-11-26 17:14:56 +02:00
kijai	7ef46f66ff	Also support video taes in previews Only first frame for now as live preview playback is currently only available through VHS custom nodes.	2025-11-26 17:00:35 +02:00