diff --git a/comfy/ldm/seedvr/vae.py b/comfy/ldm/seedvr/vae.py
index c9fef0677..292958a88 100644
--- a/comfy/ldm/seedvr/vae.py
+++ b/comfy/ldm/seedvr/vae.py
@@ -1878,8 +1878,6 @@ class VideoAutoencoderKLWrapper(VideoAutoencoderKL):
         if latent.ndim == 4:
             latent = latent.unsqueeze(2)
 
-        target_device = comfy.model_management.get_torch_device()
-        self.decoder.to(target_device)
         if self.tiled_args.get("enable_tiling", None) is not None:
             self.enable_tiling = self.tiled_args.pop("enable_tiling", False)
 
diff --git a/comfy/sd.py b/comfy/sd.py
index be2ce30a8..5f89d2c82 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -379,8 +379,8 @@ class VAE:
             self.latent_channels = 16
         elif "decoder.up_blocks.2.upsamplers.0.upscale_conv.weight" in sd: # seedvr2
             self.first_stage_model = comfy.ldm.seedvr.vae.VideoAutoencoderKLWrapper()
-            self.memory_used_decode = lambda shape, dtype: (2000 * shape[1] * shape[2] * shape[3] * (4 * 8 * 8)) * model_management.dtype_size(dtype)
-            self.memory_used_encode = lambda shape, dtype: (1000 * max(shape[1], 5) * shape[2] * shape[3]) * model_management.dtype_size(dtype)
+            self.memory_used_decode = lambda shape, dtype: (10 * shape[1] * shape[2] * shape[3] * (4 * 8 * 8)) * model_management.dtype_size(dtype)
+            self.memory_used_encode = lambda shape, dtype: (10 * max(shape[1], 5) * shape[2] * shape[3]) * model_management.dtype_size(dtype)
             self.working_dtypes = [torch.bfloat16, torch.float32]
             self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
             self.downscale_index_formula = (4, 8, 8)
diff --git a/comfy_extras/nodes_seedvr.py b/comfy_extras/nodes_seedvr.py
index 314100324..945cf966b 100644
--- a/comfy_extras/nodes_seedvr.py
+++ b/comfy_extras/nodes_seedvr.py
@@ -332,11 +332,8 @@ class SeedVR2InputProcessing(io.ComfyNode):
 
     @classmethod
     def execute(cls, images, vae, resolution, spatial_tile_size, temporal_tile_size, spatial_overlap, enable_tiling):
-        device = vae.patcher.load_device
-        offload_device = comfy.model_management.intermediate_device()
-        main_device = comfy.model_management.get_torch_device()
-        images = images.to(main_device)
+        comfy.model_management.load_models_gpu([vae.patcher])
         vae_model = vae.first_stage_model
         scale = 0.9152; shift = 0
 
         if images.dim() != 5: # add the t dim
@@ -360,8 +357,6 @@ class SeedVR2InputProcessing(io.ComfyNode):
         images = cut_videos(images)
 
         images = rearrange(images, "b t c h w -> b c t h w")
-        images = images.to(device)
-        vae_model = vae_model.to(device)
 
         # in case users a non-compatiable number for tiling
         def make_divisible(val, divisor):
@@ -393,7 +388,6 @@ class SeedVR2InputProcessing(io.ComfyNode):
 
         latent = rearrange(latent, "b c ... -> b ... c")
         latent = (latent - shift) * scale
-        latent = latent.to(offload_device)
 
         return io.NodeOutput({"samples": latent})
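
Note (not part of the patch): the nodes_seedvr.py and vae.py hunks replace hand-rolled device shuffling (get_torch_device / intermediate_device plus manual .to() calls on the model and tensors) with a single call into ComfyUI's model management, which handles placement and offload centrally. A minimal sketch of the resulting pattern, assuming `vae` is a comfy.sd.VAE as in the node (the helper name is hypothetical):

    import comfy.model_management

    def encode_with_managed_vae(vae, images):
        # load_models_gpu moves the patched model to its load device,
        # offloading other loaded models first if VRAM is tight; an explicit
        # memory_required estimate can also be passed.
        comfy.model_management.load_models_gpu([vae.patcher])
        vae_model = vae.first_stage_model
        # Inputs follow the managed model's device; no manual offload_device
        # round-trip is needed afterwards.
        images = images.to(vae.device)
        return vae_model.encode(images)

This is also why the vae.py hunk can drop self.decoder.to(target_device): once the wrapper is loaded through the patcher, decode no longer needs to move itself.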
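
Note (not part of the patch): the comfy/sd.py hunk cuts the per-element constants in the seedvr2 VAE memory estimators from 2000 to 10 for decode and from 1000 to 10 for encode, so the loader no longer budgets two to three orders of magnitude more VRAM than needed. A back-of-the-envelope comparison with illustrative shape values in bf16 (2 bytes per element; the index semantics are whatever VAE.decode passes in):

    # Hypothetical latent with shape[1] * shape[2] * shape[3] == 4 * 128 * 128.
    elems = 4 * 128 * 128
    old = 2000 * elems * (4 * 8 * 8) * 2   # ~62.5 GiB: effectively forced offload/tiling
    new = 10 * elems * (4 * 8 * 8) * 2     # ~320 MiB
    print(old / 1024**3, new / 1024**2)    # 62.5  320.0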