mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-07-03 21:20:49 +08:00
Merge 77cb84873a into 694815f498
This commit is contained in:
commit
b4a857b7aa
12
comfy/sd.py
12
comfy/sd.py
@ -1131,13 +1131,14 @@ class VAE:
|
|||||||
output = self.decode_tiled_3d(samples, **args)
|
output = self.decode_tiled_3d(samples, **args)
|
||||||
return output.movedim(1, -1)
|
return output.movedim(1, -1)
|
||||||
|
|
||||||
def encode(self, pixel_samples):
|
def encode(self, pixel_samples, not_video=None):
|
||||||
self.throw_exception_if_invalid()
|
self.throw_exception_if_invalid()
|
||||||
pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
|
pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
|
||||||
pixel_samples = pixel_samples.movedim(-1, 1)
|
pixel_samples = pixel_samples.movedim(-1, 1)
|
||||||
do_tile = False
|
do_tile = False
|
||||||
|
_not_video = self.not_video if not_video is None else not_video
|
||||||
if self.latent_dim == 3 and pixel_samples.ndim < 5:
|
if self.latent_dim == 3 and pixel_samples.ndim < 5:
|
||||||
if not self.not_video:
|
if not _not_video:
|
||||||
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
|
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
|
||||||
else:
|
else:
|
||||||
pixel_samples = pixel_samples.unsqueeze(2)
|
pixel_samples = pixel_samples.unsqueeze(2)
|
||||||
@ -1184,13 +1185,14 @@ class VAE:
|
|||||||
|
|
||||||
return samples
|
return samples
|
||||||
|
|
||||||
def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None):
|
def encode_tiled(self, pixel_samples, tile_x=None, tile_y=None, overlap=None, tile_t=None, overlap_t=None, not_video=None):
|
||||||
self.throw_exception_if_invalid()
|
self.throw_exception_if_invalid()
|
||||||
pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
|
pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
|
||||||
dims = self.latent_dim
|
dims = self.latent_dim
|
||||||
pixel_samples = pixel_samples.movedim(-1, 1)
|
pixel_samples = pixel_samples.movedim(-1, 1)
|
||||||
|
_not_video = self.not_video if not_video is None else not_video
|
||||||
if dims == 3:
|
if dims == 3:
|
||||||
if not self.not_video:
|
if not _not_video:
|
||||||
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
|
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
|
||||||
else:
|
else:
|
||||||
pixel_samples = pixel_samples.unsqueeze(2)
|
pixel_samples = pixel_samples.unsqueeze(2)
|
||||||
@ -1909,6 +1911,8 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
|
|||||||
vae_sd = model_config.process_vae_state_dict(vae_sd)
|
vae_sd = model_config.process_vae_state_dict(vae_sd)
|
||||||
vae_device = model_options.get("load_device", None)
|
vae_device = model_options.get("load_device", None)
|
||||||
vae = VAE(sd=vae_sd, metadata=metadata, device=vae_device)
|
vae = VAE(sd=vae_sd, metadata=metadata, device=vae_device)
|
||||||
|
if getattr(model_config, 'vae_not_video', None) is not None: # <-- add
|
||||||
|
vae.not_video = model_config.vae_not_video
|
||||||
|
|
||||||
if output_clip:
|
if output_clip:
|
||||||
if te_model_options.get("custom_operations", None) is None:
|
if te_model_options.get("custom_operations", None) is None:
|
||||||
|
|||||||
@ -1867,6 +1867,7 @@ class QwenImage(supported_models_base.BASE):
|
|||||||
|
|
||||||
vae_key_prefix = ["vae."]
|
vae_key_prefix = ["vae."]
|
||||||
text_encoder_key_prefix = ["text_encoders."]
|
text_encoder_key_prefix = ["text_encoders."]
|
||||||
|
vae_not_video = True
|
||||||
|
|
||||||
def get_model(self, state_dict, prefix="", device=None):
|
def get_model(self, state_dict, prefix="", device=None):
|
||||||
out = model_base.QwenImage(self, device=device)
|
out = model_base.QwenImage(self, device=device)
|
||||||
|
|||||||
23
nodes.py
23
nodes.py
@ -373,15 +373,22 @@ class VAEDecodeTiled:
|
|||||||
class VAEEncode:
|
class VAEEncode:
|
||||||
@classmethod
|
@classmethod
|
||||||
def INPUT_TYPES(s):
|
def INPUT_TYPES(s):
|
||||||
return {"required": { "pixels": ("IMAGE", ), "vae": ("VAE", )}}
|
return {"required": { "pixels": ("IMAGE", ), "vae": ("VAE", ),
|
||||||
|
"encode_as": (["Auto", "Video Frames", "Individual Images"], {"default": "Auto", "advanced": True, "tooltip": "For 3D/video VAEs: 'Video Frames' merges the batch into a temporal sequence, 'Individual Images' encodes each image independently. 'auto' uses the VAE default."}),
|
||||||
|
}}
|
||||||
RETURN_TYPES = ("LATENT",)
|
RETURN_TYPES = ("LATENT",)
|
||||||
FUNCTION = "encode"
|
FUNCTION = "encode"
|
||||||
|
|
||||||
CATEGORY = "model/latent"
|
CATEGORY = "model/latent"
|
||||||
SEARCH_ALIASES = ["encode", "encode image", "image to latent"]
|
SEARCH_ALIASES = ["encode", "encode image", "image to latent"]
|
||||||
|
|
||||||
def encode(self, vae, pixels):
|
def encode(self, vae, pixels, encode_as="Auto"):
|
||||||
t = vae.encode(pixels)
|
not_video = None
|
||||||
|
if encode_as == "Individual Images":
|
||||||
|
not_video = True
|
||||||
|
elif encode_as == "Video Frames":
|
||||||
|
not_video = False
|
||||||
|
t = vae.encode(pixels, not_video=not_video)
|
||||||
return ({"samples":t}, )
|
return ({"samples":t}, )
|
||||||
|
|
||||||
class VAEEncodeTiled:
|
class VAEEncodeTiled:
|
||||||
@ -392,14 +399,20 @@ class VAEEncodeTiled:
|
|||||||
"overlap": ("INT", {"default": 64, "min": 0, "max": 4096, "step": 32, "advanced": True}),
|
"overlap": ("INT", {"default": 64, "min": 0, "max": 4096, "step": 32, "advanced": True}),
|
||||||
"temporal_size": ("INT", {"default": 64, "min": 8, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to encode at a time.", "advanced": True}),
|
"temporal_size": ("INT", {"default": 64, "min": 8, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to encode at a time.", "advanced": True}),
|
||||||
"temporal_overlap": ("INT", {"default": 8, "min": 4, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to overlap.", "advanced": True}),
|
"temporal_overlap": ("INT", {"default": 8, "min": 4, "max": 4096, "step": 4, "tooltip": "Only used for video VAEs: Amount of frames to overlap.", "advanced": True}),
|
||||||
|
"encode_as": (["Auto", "Video Frames", "Individual Images"], {"default": "Auto", "advanced": True, "tooltip": "For 3D/video VAEs: 'Video Frames' merges the batch into a temporal sequence, 'Individual Images' encodes each image independently. 'auto' uses the VAE default."}),
|
||||||
}}
|
}}
|
||||||
RETURN_TYPES = ("LATENT",)
|
RETURN_TYPES = ("LATENT",)
|
||||||
FUNCTION = "encode"
|
FUNCTION = "encode"
|
||||||
|
|
||||||
CATEGORY = "model/latent"
|
CATEGORY = "model/latent"
|
||||||
|
|
||||||
def encode(self, vae, pixels, tile_size, overlap, temporal_size=64, temporal_overlap=8):
|
def encode(self, vae, pixels, tile_size, overlap, temporal_size=64, temporal_overlap=8, encode_as="Auto"):
|
||||||
t = vae.encode_tiled(pixels, tile_x=tile_size, tile_y=tile_size, overlap=overlap, tile_t=temporal_size, overlap_t=temporal_overlap)
|
not_video = None
|
||||||
|
if encode_as == "Individual Images":
|
||||||
|
not_video = True
|
||||||
|
elif encode_as == "Video Frames":
|
||||||
|
not_video = False
|
||||||
|
t = vae.encode_tiled(pixels, tile_x=tile_size, tile_y=tile_size, overlap=overlap, tile_t=temporal_size, overlap_t=temporal_overlap, not_video=not_video)
|
||||||
return ({"samples": t}, )
|
return ({"samples": t}, )
|
||||||
|
|
||||||
class VAEEncodeForInpaint:
|
class VAEEncodeForInpaint:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user