Allow callers to override a VAE's video/image batch handling per encode call.

comfy/sd.py: VAE.encode and VAE.encode_tiled take a new not_video=None
argument; None falls back to self.not_video. load_state_dict_guess_config
copies model_config.vae_not_video onto the VAE when that attribute exists and
is not None.
comfy/supported_models.py: QwenImage sets vae_not_video = True.
nodes.py: VAEEncode and VAEEncodeTiled gain an "encode_as" input whose choices
"Auto" / "Video Frames" / "Individual Images" map to not_video = None / False /
True.

diff --git a/comfy/sd.py b/comfy/sd.py
index 610c4e2b8..acbfb6716 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1131,13 +1131,14 @@ class VAE:
             output = self.decode_tiled_3d(samples, **args)
         return output.movedim(1, -1)
 
-    def encode(self, pixel_samples):
+    def encode(self, pixel_samples, not_video=None):
         self.throw_exception_if_invalid()
         pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
         pixel_samples = pixel_samples.movedim(-1, 1)
         do_tile = False
+        _not_video = self.not_video if not_video is None else not_video
         if self.latent_dim == 3 and pixel_samples.ndim < 5:
-            if not self.not_video:
+            if not _not_video:
                 pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
             else:
                 pixel_samples = pixel_samples.unsqueeze(2)
@@ -1184,13 +1185,14 @@ class VAE:
 
         return samples
 
-    def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
+    def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None, not_video=None):
         self.throw_exception_if_invalid()
         pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
         dims = self.latent_dim
         pixel_samples = pixel_samples.movedim(-1, 1)
+        _not_video = self.not_video if not_video is None else not_video
         if dims == 3:
-            if not self.not_video:
+            if not _not_video:
                 pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
             else:
                 pixel_samples = pixel_samples.unsqueeze(2)
@@ -1909,6 +1911,8 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
         vae_sd = model_config.process_vae_state_dict(vae_sd)
         vae_device = model_options.get("load_device", None)
         vae = VAE(sd=vae_sd, metadata=metadata, device=vae_device)
+        if getattr(model_config, 'vae_not_video', None) is not None:
+            vae.not_video = model_config.vae_not_video
 
     if output_clip:
         if te_model_options.get("custom_operations", None) is None:
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index afb66e6f3..d4ff490e4 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -1867,6 +1867,7 @@ class QwenImage(supported_models_base.BASE):
 
     vae_key_prefix = ["vae."]
     text_encoder_key_prefix = ["text_encoders."]
+    vae_not_video = True
 
     def get_model(self, state_dict, prefix="", device=None):
         out = model_base.QwenImage(self, device=device)
diff --git a/nodes.py b/nodes.py
index 9043a8d0a..1038fe09d 100644
--- a/nodes.py
+++ b/nodes.py
@@ -373,15 +373,22 @@ class VAEDecodeTiled:
 class VAEEncode:
     @classmethod
     def INPUT_TYPES(s):
-        return {"required": { "pixels": ("IMAGE", ), "vae": ("VAE", )}}
+        return {"required": { "pixels": ("IMAGE", ), "vae": ("VAE", ),
+                              "encode_as": (["Auto", "Video Frames", "Individual Images"], {"default": "Auto", "advanced": True, "tooltip": "For 3D/video VAEs: 'Video Frames' merges the batch into a temporal sequence, 'Individual Images' encodes each image independently. 'Auto' uses the VAE default."}),
+                            }}
     RETURN_TYPES = ("LATENT",)
     FUNCTION = "encode"
 
     CATEGORY = "model/latent"
     SEARCH_ALIASES = ["encode", "encode image", "image to latent"]
 
-    def encode(self, vae, pixels):
-        t = vae.encode(pixels)
+    def encode(self, vae, pixels, encode_as="Auto"):
+        not_video = None
+        if encode_as == "Individual Images":
+            not_video = True
+        elif encode_as == "Video Frames":
+            not_video = False
+        t = vae.encode(pixels, not_video=not_video)
         return ({"samples":t}, )
 
 class VAEEncodeTiled:
@@ -392,14 +399,20 @@ class VAEEncodeTiled:
                              "overlap": ("INT", {"default": 64, "min": 0, "max": 4096, "step": 32, "advanced": True}),
                              "temporal_size": ("INT", {"default": 64, "min": 8, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to encode at a time.", "advanced": True}),
                              "temporal_overlap": ("INT", {"default": 8, "min": 4, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to overlap.", "advanced": True}),
+                             "encode_as": (["Auto", "Video Frames", "Individual Images"], {"default": "Auto", "advanced": True, "tooltip": "For 3D/video VAEs: 'Video Frames' merges the batch into a temporal sequence, 'Individual Images' encodes each image independently. 'Auto' uses the VAE default."}),
                             }}
     RETURN_TYPES = ("LATENT",)
     FUNCTION = "encode"
 
     CATEGORY = "model/latent"
 
-    def encode(self, vae, pixels, tile_size, overlap, temporal_size=64, temporal_overlap=8):
-        t = vae.encode_tiled(pixels, tile_x=tile_size, tile_y=tile_size, overlap=overlap, tile_t=temporal_size, overlap_t=temporal_overlap)
+    def encode(self, vae, pixels, tile_size, overlap, temporal_size=64, temporal_overlap=8, encode_as="Auto"):
+        not_video = None
+        if encode_as == "Individual Images":
+            not_video = True
+        elif encode_as == "Video Frames":
+            not_video = False
+        t = vae.encode_tiled(pixels, tile_x=tile_size, tile_y=tile_size, overlap=overlap, tile_t=temporal_size, overlap_t=temporal_overlap, not_video=not_video)
         return ({"samples": t}, )
 
 class VAEEncodeForInpaint: