Workaround for nvidia issue where VAE uses 3x more memory on torch 2.9 (#10373)

2025-12-17 01:52:59 +08:00 · 2025-10-16 15:16:03 -07:00 · 2025-10-16 15:16:03 -07:00 · 19b466160c
commit 19b466160c
parent bc0ad9bb49
1 changed files with 19 additions and 0 deletions
--- a/comfy/ops.py
+++ b/comfy/ops.py
@ -52,6 +52,16 @@ try:
 except (ModuleNotFoundError, TypeError):
    logging.warning("Could not set sdpa backend priority.")

+NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False
+try:
+    if comfy.model_management.is_nvidia():
+        if torch.backends.cudnn.version() >= 91300 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10):
+            #TODO: change upper bound version once it's fixed'
+            NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True
+            logging.info("working around nvidia conv3d memory bug.")
+except:
+    pass
+
 cast_to = comfy.model_management.cast_to #TODO: remove once no more references

 if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
@ -151,6 +161,15 @@ class disable_weight_init:
        def reset_parameters(self):
            return None

+        def _conv_forward(self, input, weight, bias, *args, **kwargs):
+            if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16):
+                out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True)
+                if bias is not None:
+                    out += bias.reshape((1, -1) + (1,) * (out.ndim - 2))
+                return out
+            else:
+                return super()._conv_forward(input, weight, bias, *args, **kwargs)
+
        def forward_comfy_cast_weights(self, input):
            weight, bias = cast_bias_weight(self, input)
            return self._conv_forward(input, weight, bias)