From ea6cdd2631fbca6ed81b95796150c32c9a029f0d Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Fri, 5 Sep 2025 22:05:05 -0700 Subject: [PATCH 1/7] Print all fast options in --help (#9737) --- comfy/cli_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 72eeaea9a..cc1f12482 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -145,7 +145,7 @@ class PerformanceFeature(enum.Enum): CublasOps = "cublas_ops" AutoTune = "autotune" -parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops") +parser.add_argument("--fast", nargs="*", type=PerformanceFeature, help="Enable some untested and potentially quality deteriorating optimizations. --fast with no arguments enables everything. You can pass a list specific optimizations if you only want to enable specific ones. Current valid optimizations: {}".format(" ".join(map(lambda c: c.value, PerformanceFeature)))) parser.add_argument("--mmap-torch-files", action="store_true", help="Use mmap when loading ckpt/pt files.") parser.add_argument("--disable-mmap", action="store_true", help="Don't use mmap when loading safetensors.") From 27a0fcccc376fef6f035ed97664db8aa7e2e6117 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 6 Sep 2025 20:25:22 -0700 Subject: [PATCH 2/7] Enable bf16 VAE on RDNA4. (#9746) --- comfy/model_management.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index d08aee1fe..17516b6ed 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -289,6 +289,21 @@ def is_amd(): return True return False +def amd_min_version(device=None, min_rdna_version=0): + if not is_amd(): + return False + + arch = torch.cuda.get_device_properties(device).gcnArchName + if arch.startswith('gfx') and len(arch) == 7: + try: + cmp_rdna_version = int(arch[4]) + 2 + except: + cmp_rdna_version = 0 + if cmp_rdna_version >= min_rdna_version: + return True + + return False + MIN_WEIGHT_MEMORY_RATIO = 0.4 if is_nvidia(): MIN_WEIGHT_MEMORY_RATIO = 0.0 @@ -905,7 +920,9 @@ def vae_dtype(device=None, allowed_dtypes=[]): # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32 # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3 - if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device): + # also a problem on RDNA4 except fp32 is also slow there. + # This is due to large bf16 convolutions being extremely slow. + if d == torch.bfloat16 and ((not is_amd()) or amd_min_version(device, min_rdna_version=4)) and should_use_bf16(device): return d return torch.float32 From bcbd7884e3af5ee8b6ab848da2a3123f247d6114 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sat, 6 Sep 2025 21:29:38 -0700 Subject: [PATCH 3/7] Don't enable pytorch attention on AMD if triton isn't available. 
(#9747) --- comfy/model_management.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/comfy/model_management.py b/comfy/model_management.py index 17516b6ed..cb6580f73 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -22,6 +22,7 @@ from enum import Enum from comfy.cli_args import args, PerformanceFeature import torch import sys +import importlib import platform import weakref import gc @@ -336,12 +337,13 @@ try: logging.info("AMD arch: {}".format(arch)) logging.info("ROCm version: {}".format(rocm_version)) if args.use_split_cross_attention == False and args.use_quad_cross_attention == False: - if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much - if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 - ENABLE_PYTORCH_ATTENTION = True -# if torch_version_numeric >= (2, 8): -# if any((a in arch) for a in ["gfx1201"]): -# ENABLE_PYTORCH_ATTENTION = True + if importlib.util.find_spec('triton') is not None: # AMD efficient attention implementation depends on triton. TODO: better way of detecting if it's compiled in or not. + if torch_version_numeric >= (2, 7): # works on 2.6 but doesn't actually seem to improve much + if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]): # TODO: more arches, TODO: gfx950 + ENABLE_PYTORCH_ATTENTION = True +# if torch_version_numeric >= (2, 8): +# if any((a in arch) for a in ["gfx1201"]): +# ENABLE_PYTORCH_ATTENTION = True if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4): if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]): # TODO: more arches SUPPORT_FP8_OPS = True From fb763d43332aaf15e96350cf1c25e2a1927423f1 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Sun, 7 Sep 2025 18:16:29 -0700 Subject: [PATCH 4/7] Fix amd_min_version crash when cpu device. 
(#9754) --- comfy/model_management.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/comfy/model_management.py b/comfy/model_management.py index cb6580f73..bbfc3c7a1 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -294,6 +294,9 @@ def amd_min_version(device=None, min_rdna_version=0): if not is_amd(): return False + if is_device_cpu(device): + return False + arch = torch.cuda.get_device_properties(device).gcnArchName if arch.startswith('gfx') and len(arch) == 7: try: From bd1d9bcd5fcdb8379ce5a8020cb2b8f42de1b7c7 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Mon, 8 Sep 2025 12:07:04 -0700 Subject: [PATCH 5/7] Add ZeroDivisionError catch for EasyCache logging statement (#9768) --- comfy_extras/nodes_easycache.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py index 9d2988f5f..c170e9fd9 100644 --- a/comfy_extras/nodes_easycache.py +++ b/comfy_extras/nodes_easycache.py @@ -162,7 +162,12 @@ def easycache_sample_wrapper(executor, *args, **kwargs): logging.info(f"{easycache.name} [verbose] - output_change_rates {len(output_change_rates)}: {output_change_rates}") logging.info(f"{easycache.name} [verbose] - approx_output_change_rates {len(approx_output_change_rates)}: {approx_output_change_rates}") total_steps = len(args[3])-1 - logging.info(f"{easycache.name} - skipped {easycache.total_steps_skipped}/{total_steps} steps ({total_steps/(total_steps-easycache.total_steps_skipped):.2f}x speedup).") + # catch division by zero for log statement; sucks to crash after all sampling is done + try: + speedup = total_steps/(total_steps-easycache.total_steps_skipped) + except ZeroDivisionError: + speedup = 1.0 + logging.info(f"{easycache.name} - skipped {easycache.total_steps_skipped}/{total_steps} steps ({speedup:.2f}x speedup).") easycache.reset() guider.model_options = orig_model_options From 97652d26b81f83fc9a3675be55ede7762fafb7bd Mon Sep 17 00:00:00 2001 From: contentis Date: Mon, 8 Sep 2025 21:08:18 +0200 Subject: [PATCH 6/7] Add explicit casting in apply_rope for Qwen VL (#9759) --- comfy/text_encoders/llama.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py index 4c976058f..5e11956b5 100644 --- a/comfy/text_encoders/llama.py +++ b/comfy/text_encoders/llama.py @@ -128,11 +128,12 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_dims=None, device=N def apply_rope(xq, xk, freqs_cis): + org_dtype = xq.dtype cos = freqs_cis[0] sin = freqs_cis[1] q_embed = (xq * cos) + (rotate_half(xq) * sin) k_embed = (xk * cos) + (rotate_half(xk) * sin) - return q_embed, k_embed + return q_embed.to(org_dtype), k_embed.to(org_dtype) class Attention(nn.Module): From 103a12cb668303f197b22f52bb2981bb1539beea Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 8 Sep 2025 14:30:26 -0700 Subject: [PATCH 7/7] Support qwen inpaint controlnet. 
(#9772) --- comfy/controlnet.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/comfy/controlnet.py b/comfy/controlnet.py index e3dfedf55..f08ff4b36 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -253,7 +253,10 @@ class ControlNet(ControlBase): to_concat = [] for c in self.extra_concat_orig: c = c.to(self.cond_hint.device) - c = comfy.utils.common_upscale(c, self.cond_hint.shape[3], self.cond_hint.shape[2], self.upscale_algorithm, "center") + c = comfy.utils.common_upscale(c, self.cond_hint.shape[-1], self.cond_hint.shape[-2], self.upscale_algorithm, "center") + if c.ndim < self.cond_hint.ndim: + c = c.unsqueeze(2) + c = comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[2], dim=2) to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0])) self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1) @@ -585,11 +588,18 @@ def load_controlnet_flux_instantx(sd, model_options={}): def load_controlnet_qwen_instantx(sd, model_options={}): model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options) - control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config) + control_latent_channels = sd.get("controlnet_x_embedder.weight").shape[1] + + extra_condition_channels = 0 + concat_mask = False + if control_latent_channels == 68: #inpaint controlnet + extra_condition_channels = control_latent_channels - 64 + concat_mask = True + control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(extra_condition_channels=extra_condition_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config) control_model = controlnet_load_state_dict(control_model, sd) latent_format = comfy.latent_formats.Wan21() extra_conds = [] - control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) + control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds) return control def convert_mistoline(sd):
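
A note on PATCH 1/7: the old --fast help text hard-coded the list of optimizations and had gone stale (autotune was missing), so the patch derives the list from PerformanceFeature instead. A minimal standalone sketch of the same idea; only CublasOps and AutoTune appear in the diff context, the other enum member names below are assumptions, and just their string values are confirmed by the old help text:

    import enum

    class PerformanceFeature(enum.Enum):
        # Stand-in for the enum in comfy/cli_args.py; member names other than
        # CublasOps and AutoTune are assumed for illustration.
        Fp16Accumulation = "fp16_accumulation"
        Fp8MatrixMult = "fp8_matrix_mult"
        CublasOps = "cublas_ops"
        AutoTune = "autotune"

    # What PATCH 1/7 changes: the --help text is built from the enum values,
    # so newly added features (like autotune) can no longer be missing from it.
    valid = " ".join(c.value for c in PerformanceFeature)
    print("Current valid optimizations: " + valid)
    # Current valid optimizations: fp16_accumulation fp8_matrix_mult cublas_ops autotune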
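
On PATCH 2/7 and PATCH 4/7: amd_min_version() gates bf16 VAE decoding on RDNA4-or-newer by reading the device's gcnArchName, and the follow-up guards against CPU devices where that property does not exist. A self-contained sketch of the same arch-to-RDNA heuristic; it deliberately mirrors the coarse upstream mapping rather than a full architecture table:

    def rdna_version_from_arch(arch: str) -> int:
        """Map an AMD gcnArchName like 'gfx1201' to an approximate RDNA generation.

        Returns 0 for names the heuristic does not recognise (CDNA parts such as
        gfx942, shorter/older names, or a non-numeric digit in position 4).
        """
        if arch.startswith("gfx") and len(arch) == 7:
            try:
                # coarse mapping used upstream: gfx11xx -> 3 (RDNA3), gfx12xx -> 4 (RDNA4)
                return int(arch[4]) + 2
            except ValueError:
                return 0
        return 0

    assert rdna_version_from_arch("gfx1100") == 3   # RDNA3
    assert rdna_version_from_arch("gfx1201") == 4   # RDNA4
    assert rdna_version_from_arch("gfx942") == 0    # CDNA part, not matched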
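
Background for the explicit cast in PATCH 6/7: with fp32 rotary cos/sin tables, PyTorch type promotion silently upcasts bf16/fp16 queries and keys to fp32 inside apply_rope, so the function now casts its outputs back to the input dtype. A minimal reproduction of the promotion behaviour, with shapes invented for illustration:

    import torch

    xq = torch.randn(1, 8, 32, 64, dtype=torch.bfloat16)    # bf16 activations
    cos = torch.randn(1, 1, 32, 64, dtype=torch.float32)    # fp32 rotary table

    rotated = xq * cos                   # type promotion kicks in here
    print(rotated.dtype)                 # torch.float32 -- silently upcast
    print(rotated.to(xq.dtype).dtype)    # torch.bfloat16 -- what apply_rope now returns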
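
On PATCH 7/7: the Qwen inpaint controlnet is recognised purely from the input width of controlnet_x_embedder.weight; 68 columns are read as the 64 baseline channels plus 4 extra condition channels, and mask concatenation is switched on. A toy sketch of that detection, with the constants taken from the patch and the tensor shape invented:

    import torch

    def detect_qwen_inpaint(sd: dict):
        """Mirror the heuristic in load_controlnet_qwen_instantx.

        Returns (extra_condition_channels, concat_mask).
        """
        control_latent_channels = sd["controlnet_x_embedder.weight"].shape[1]
        if control_latent_channels == 68:   # inpaint controlnet
            return control_latent_channels - 64, True
        return 0, False

    # toy state dict standing in for a real checkpoint (output width is made up)
    sd = {"controlnet_x_embedder.weight": torch.zeros(3072, 68)}
    print(detect_qwen_inpaint(sd))   # (4, True)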