diff --git a/comfy/ldm/seedvr/model.py b/comfy/ldm/seedvr/model.py
index 0825a12ba..c1b8a1738 100644
--- a/comfy/ldm/seedvr/model.py
+++ b/comfy/ldm/seedvr/model.py
@@ -1377,7 +1377,7 @@ class NaDiT(nn.Module):
         out = torch.stack(vid)
         try:
             pos, neg = out.chunk(2)
-            ut = torch.cat([neg, pos])
+            out = torch.cat([neg, pos])
             out = out.movedim(-1, 1)
             return out
         except:
diff --git a/comfy/ldm/seedvr/vae.py b/comfy/ldm/seedvr/vae.py
index ac5e20b8d..d3786e85d 100644
--- a/comfy/ldm/seedvr/vae.py
+++ b/comfy/ldm/seedvr/vae.py
@@ -1541,9 +1541,10 @@ class VideoAutoencoderKLWrapper(VideoAutoencoderKL):
         x = self.decode(z).sample
         return x, z, p
 
-    def encode(self, x: torch.FloatTensor):
+    def encode(self, x, orig_dims):
         # we need to keep a reference to the image/video so we later can do a colour fix later
         self.original_image_video = x
+        self.img_dims = orig_dims
         if x.ndim == 4:
             x = x.unsqueeze(2)
         x = x.to(next(self.parameters()).dtype)
@@ -1570,6 +1571,8 @@ class VideoAutoencoderKLWrapper(VideoAutoencoderKL):
             input = rearrange(self.original_image_video[0], "c t h w -> t c h w")
             x = wavelet_reconstruction(x, input)
 
+        o_h, o_w = self.img_dims
+        x = x[..., :o_h, :o_w]
         return x
 
     def set_memory_limit(self, conv_max_mem: Optional[float], norm_max_mem: Optional[float]):
diff --git a/comfy/sd.py b/comfy/sd.py
index 86b5ff2ad..be2ce30a8 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -386,6 +386,8 @@ class VAE:
             self.downscale_index_formula = (4, 8, 8)
             self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
             self.upscale_index_formula = (4, 8, 8)
+            self.process_input = lambda image: image
+            self.crop_input = False
         elif "decoder.conv_in.weight" in sd:
             if sd['decoder.conv_in.weight'].shape[1] == 64:
                 ddconfig = {"block_out_channels": [128, 256, 512, 512, 1024, 1024], "in_channels": 3, "out_channels": 3, "num_res_blocks": 2, "ffactor_spatial": 32, "downsample_match_channel": True, "upsample_match_channel": True}
diff --git a/comfy_extras/nodes_seedvr.py b/comfy_extras/nodes_seedvr.py
index bff358424..2b1d41174 100644
--- a/comfy_extras/nodes_seedvr.py
+++ b/comfy_extras/nodes_seedvr.py
@@ -68,14 +68,21 @@ def area_resize(image, max_area):
         interpolation=InterpolationMode.BICUBIC,
     )
 
-def crop(image, factor):
+def div_pad(image, factor):
+    height_factor, width_factor = factor
     height, width = image.shape[-2:]
 
-    cropped_height = height - (height % height_factor)
-    cropped_width = width - (width % width_factor)
+    pad_height = (height_factor - (height % height_factor)) % height_factor
+    pad_width = (width_factor - (width % width_factor)) % width_factor
+
+    if pad_height == 0 and pad_width == 0:
+        return image
+
+    if isinstance(image, torch.Tensor):
+        padding = (0, pad_width, 0, pad_height)
+        image = torch.nn.functional.pad(image, padding, mode='constant', value=0.0)
 
-    image = TVF.center_crop(img=image, output_size=(cropped_height, cropped_width))
     return image
 
 def cut_videos(videos):
@@ -120,6 +127,8 @@ class SeedVR2InputProcessing(io.ComfyNode):
         device = vae.patcher.load_device
         offload_device = comfy.model_management.intermediate_device()
+        main_device = comfy.model_management.get_torch_device()
+        images = images.to(main_device)
         vae_model = vae.first_stage_model
         scale = 0.9152; shift = 0
         if images.dim() != 5: # add the t dim
@@ -135,7 +144,8 @@ class SeedVR2InputProcessing(io.ComfyNode):
         images = area_resize(images, max_area)
         images = clip(images)
-        images = crop(images, (16, 16))
+        o_h, o_w = images.shape[-2:]
+        images = div_pad(images, (16, 16))
         images = normalize(images)
 
         _, _, new_h, new_w = images.shape
@@ -145,7 +155,7 @@ class SeedVR2InputProcessing(io.ComfyNode):
         images = rearrange(images, "b t c h w -> b c t h w")
         images = images.to(device)
         vae_model = vae_model.to(device)
-        latent = vae_model.encode(images)[0]
+        latent = vae_model.encode(images, [o_h, o_w])[0]
         vae_model = vae_model.to(offload_device)
         latent = latent.unsqueeze(2) if latent.ndim == 4 else latent
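For reference, a minimal standalone sketch of the pad/crop round trip this patch switches to: instead of center-cropping the input to a multiple of 16, `div_pad` pads the bottom/right edges before VAE encode, and the original height/width recorded at encode time are used to crop the decoded output back. The shapes and the `frames`/`decoded` placeholders below are illustrative only, not taken from the node code.

```python
import torch
import torch.nn.functional as F

def div_pad(image, factor):
    # Pad the last two dims up to the next multiple of (height_factor, width_factor),
    # mirroring the div_pad helper introduced in comfy_extras/nodes_seedvr.py.
    height_factor, width_factor = factor
    height, width = image.shape[-2:]
    pad_height = (height_factor - (height % height_factor)) % height_factor
    pad_width = (width_factor - (width % width_factor)) % width_factor
    if pad_height == 0 and pad_width == 0:
        return image
    # (left, right, top, bottom) padding on the last two dims, zeros on bottom/right
    return F.pad(image, (0, pad_width, 0, pad_height), mode='constant', value=0.0)

# A 1080x1920 frame pads to 1088x1920 (both multiples of 16) before encoding.
frames = torch.rand(1, 3, 1080, 1920)   # stand-in for the node's input images
o_h, o_w = frames.shape[-2:]            # original dims, as the node records them
padded = div_pad(frames, (16, 16))
assert padded.shape[-2:] == (1088, 1920)

# After decoding, the stored dims crop the padding away again,
# as done in VideoAutoencoderKLWrapper with x[..., :o_h, :o_w].
decoded = padded                        # stand-in for the VAE decode output
restored = decoded[..., :o_h, :o_w]
assert restored.shape[-2:] == (1080, 1920)
```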