Merge branch 'master' into dr-support-pip-cm

Dr.Lt.Data 2025-10-19 11:39:42 +09:00
commit 8f59e2a341
3 changed files with 7 additions and 6 deletions

View File

@@ -371,6 +371,9 @@ try:
 except:
     pass
 
+if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
+    torch.backends.cudnn.benchmark = True
+
 try:
     if torch_version_numeric >= (2, 5):
         torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
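
The hunk above gates cuDNN autotuning behind the AutoTune performance feature: benchmark mode is only switched on when CUDA and cuDNN are actually usable and the user opted in via --fast autotune. A minimal standalone sketch of that guard, with an illustrative function name (PerformanceFeature and args.fast are the names used in the diff; here the feature set is passed in as parameters):

import torch

def maybe_enable_cudnn_autotune(fast_features, autotune_feature) -> bool:
    # Enable cuDNN benchmark mode (per-shape conv algorithm autotuning)
    # only when CUDA and cuDNN are available and the feature was requested.
    if torch.cuda.is_available() and torch.backends.cudnn.is_available() and autotune_feature in fast_features:
        torch.backends.cudnn.benchmark = True
        return True
    return False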

View File

@@ -67,9 +67,6 @@ except:
 cast_to = comfy.model_management.cast_to #TODO: remove once no more references
 
-if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
-    torch.backends.cudnn.benchmark = True
-
 def cast_to_input(weight, input, non_blocking=False, copy=True):
     return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)

View File

@@ -1,6 +1,6 @@
 import os
 import importlib.util
-from comfy.cli_args import args
+from comfy.cli_args import args, PerformanceFeature
 import subprocess
 
 #Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import.
@@ -75,7 +75,8 @@ if not args.cuda_malloc:
                 spec.loader.exec_module(module)
                 version = module.__version__
 
-        if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch
-            args.cuda_malloc = cuda_malloc_supported()
+        if int(version[0]) >= 2 and "+cu" in version: # enable by default for torch version 2.0 and up only on cuda torch
+            if PerformanceFeature.AutoTune not in args.fast: # Autotune has issues with cuda malloc
+                args.cuda_malloc = cuda_malloc_supported()
     except:
         pass
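
Net effect of the last two hunks: the cuda malloc default stays enabled for torch 2.x CUDA builds, but is now skipped when the AutoTune feature is requested, because autotune has issues with cuda malloc. A self-contained sketch of the resulting decision logic (function and parameter names are illustrative, not taken from the repository):

from typing import Callable

def should_enable_cuda_malloc(torch_version: str,
                              autotune_requested: bool,
                              cuda_malloc_supported: Callable[[], bool]) -> bool:
    # Enable by default only for torch 2.x CUDA builds, e.g. "2.5.1+cu124".
    if not (int(torch_version[0]) >= 2 and "+cu" in torch_version):
        return False
    # New behaviour: requesting autotune opts out of cuda malloc.
    if autotune_requested:
        return False
    return cuda_malloc_supported()

# A torch 2.x CUDA build keeps the old default unless autotune is requested.
print(should_enable_cuda_malloc("2.5.1+cu124", False, lambda: True))  # True
print(should_enable_cuda_malloc("2.5.1+cu124", True, lambda: True))   # False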