From 911331c06c16aa80633c5438c58edb32dbfdff50 Mon Sep 17 00:00:00 2001
From: rattus128 <46076784+rattus128@users.noreply.github.com>
Date: Thu, 2 Oct 2025 08:40:28 +1000
Subject: [PATCH 1/2] sd: fix VAE tiled fallback VRAM leak (#10139)

When the VAE catches this VRAM OOM, it launches the fallback logic
straight from the exception context.

Python however refs the entire call stack that caused the exception
including any local variables for the sake of exception report and
debugging. In the case of tensors, this can hold on the references
to GBs of VRAM and inhibit the VRAM allocated from freeing them.

So dump the except context completely before going back to the VAE
via the tiler by getting out of the except block with nothing but
a flag.

The greately increases the reliability of the tiler fallback,
especially on low VRAM cards, as with the bug, if the leak randomly
leaked more than the headroom needed for a single tile, the tiler
would fallback would OOM and fail the flow.
---
 comfy/sd.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/comfy/sd.py b/comfy/sd.py
index 873ad20f2..be225ad03 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -652,6 +652,7 @@ class VAE:
     def decode(self, samples_in, vae_options={}):
         self.throw_exception_if_invalid()
         pixel_samples = None
+        do_tile = False
         try:
             memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
             model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
@@ -667,6 +668,13 @@ class VAE:
                 pixel_samples[x:x+batch_number] = out
         except model_management.OOM_EXCEPTION:
             logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
+            #NOTE: We don't know what tensors were allocated to stack variables at the time of the
+            #exception and the exception itself refs them all until we get out of this except block.
+            #So we just set a flag for tiler fallback so that tensor gc can happen once the
+            #exception is fully off the books.
+            do_tile = True
+
+        if do_tile:
             dims = samples_in.ndim - 2
             if dims == 1 or self.extra_1d_channel is not None:
                 pixel_samples = self.decode_tiled_1d(samples_in)
@@ -713,6 +721,7 @@ class VAE:
         self.throw_exception_if_invalid()
         pixel_samples = self.vae_encode_crop_pixels(pixel_samples)
         pixel_samples = pixel_samples.movedim(-1, 1)
+        do_tile = False
         if self.latent_dim == 3 and pixel_samples.ndim < 5:
             if not self.not_video:
                 pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
@@ -734,6 +743,13 @@ class VAE:
 
         except model_management.OOM_EXCEPTION:
             logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
+            #NOTE: We don't know what tensors were allocated to stack variables at the time of the
+            #exception and the exception itself refs them all until we get out of this except block.
+            #So we just set a flag for tiler fallback so that tensor gc can happen once the
+            #exception is fully off the books.
+            do_tile = True
+
+        if do_tile:
             if self.latent_dim == 3:
                 tile = 256
                 overlap = tile // 4

From 4965c0e2acf39d84e82cb63dd6cc4400299d0a61 Mon Sep 17 00:00:00 2001
From: rattus128 <46076784+rattus128@users.noreply.github.com>
Date: Thu, 2 Oct 2025 08:42:16 +1000
Subject: [PATCH 2/2] WAN: Fix cache VRAM leak on error (#10141)

If this suffers an exception (such as a VRAM oom) it will leave the
encode() and decode() methods which skips the cleanup of the WAN
feature cache. The comfy node cache then ultimately keeps a reference
this object which is in turn reffing large tensors from the failed
execution.

The feature cache is currently setup at a class variable on the
encoder/decoder however, the encode and decode functions always clear
it on both entry and exit of normal execution.

Its likely the design intent is this is usable as a streaming encoder
where the input comes in batches, however the functions as they are
today don't support that.

So simplify by bringing the cache back to local variable, so that if
it does VRAM OOM the cache itself is properly garbage when the
encode()/decode() functions dissappear from the stack.
---
 comfy/ldm/wan/vae.py | 37 ++++++++++++++-----------------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/comfy/ldm/wan/vae.py b/comfy/ldm/wan/vae.py
index 791596938..ccbb25822 100644
--- a/comfy/ldm/wan/vae.py
+++ b/comfy/ldm/wan/vae.py
@@ -468,55 +468,46 @@ class WanVAE(nn.Module):
                                  attn_scales, self.temperal_upsample, dropout)
 
     def encode(self, x):
-        self.clear_cache()
+        conv_idx = [0]
+        feat_map = [None] * count_conv3d(self.decoder)
         ## cache
         t = x.shape[2]
         iter_ = 1 + (t - 1) // 4
         ## 对encode输入的x，按时间拆分为1、4、4、4....
         for i in range(iter_):
-            self._enc_conv_idx = [0]
+            conv_idx = [0]
             if i == 0:
                 out = self.encoder(
                     x[:, :, :1, :, :],
-                    feat_cache=self._enc_feat_map,
-                    feat_idx=self._enc_conv_idx)
+                    feat_cache=feat_map,
+                    feat_idx=conv_idx)
             else:
                 out_ = self.encoder(
                     x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
-                    feat_cache=self._enc_feat_map,
-                    feat_idx=self._enc_conv_idx)
+                    feat_cache=feat_map,
+                    feat_idx=conv_idx)
                 out = torch.cat([out, out_], 2)
         mu, log_var = self.conv1(out).chunk(2, dim=1)
-        self.clear_cache()
         return mu
 
     def decode(self, z):
-        self.clear_cache()
+        conv_idx = [0]
+        feat_map = [None] * count_conv3d(self.decoder)
         # z: [b,c,t,h,w]
 
         iter_ = z.shape[2]
         x = self.conv2(z)
         for i in range(iter_):
-            self._conv_idx = [0]
+            conv_idx = [0]
             if i == 0:
                 out = self.decoder(
                     x[:, :, i:i + 1, :, :],
-                    feat_cache=self._feat_map,
-                    feat_idx=self._conv_idx)
+                    feat_cache=feat_map,
+                    feat_idx=conv_idx)
             else:
                 out_ = self.decoder(
                     x[:, :, i:i + 1, :, :],
-                    feat_cache=self._feat_map,
-                    feat_idx=self._conv_idx)
+                    feat_cache=feat_map,
+                    feat_idx=conv_idx)
                 out = torch.cat([out, out_], 2)
-        self.clear_cache()
         return out
-
-    def clear_cache(self):
-        self._conv_num = count_conv3d(self.decoder)
-        self._conv_idx = [0]
-        self._feat_map = [None] * self._conv_num
-        #cache encode
-        self._enc_conv_num = count_conv3d(self.encoder)
-        self._enc_conv_idx = [0]
-        self._enc_feat_map = [None] * self._enc_conv_num