Merge branch 'master' of https://github.com/siliconflow/ComfyUI into refine_offload

strint 2025-10-22 21:15:28 +08:00
commit f3c673d086
13 changed files with 140 additions and 69 deletions

View File

@@ -197,7 +197,7 @@ comfy install

 ## Manual Install (Windows, Linux)

-Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (breaks the canny node) and install pytorch nightly but it is not recommended.
+Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (breaks the canny node) but it is not recommended.

 Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12

View File

@@ -189,15 +189,15 @@ class ChromaRadiance(Chroma):
         nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size)
         nerf_pixels = nerf_pixels.transpose(1, 2)  # -> [B, NumPatches, C * P * P]

+        # Reshape for per-patch processing
+        nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
+        nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
+
         if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size:
             # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than
             # the tile size.
-            img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params)
+            img_dct = self.forward_tiled_nerf(nerf_hidden, nerf_pixels, B, C, num_patches, patch_size, params)
         else:
-            # Reshape for per-patch processing
-            nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
-            nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
-
             # Get DCT-encoded pixel embeddings [pixel-dct]
             img_dct = self.nerf_image_embedder(nerf_pixels)
@@ -240,17 +240,8 @@ class ChromaRadiance(Chroma):
             end = min(i + tile_size, num_patches)

             # Slice the current tile from the input tensors
-            nerf_hidden_tile = nerf_hidden[:, i:end, :]
-            nerf_pixels_tile = nerf_pixels[:, i:end, :]
-
-            # Get the actual number of patches in this tile (can be smaller for the last tile)
-            num_patches_tile = nerf_hidden_tile.shape[1]
-
-            # Reshape the tile for per-patch processing
-            # [B, NumPatches_tile, D] -> [B * NumPatches_tile, D]
-            nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size)
-            # [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C]
-            nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2)
+            nerf_hidden_tile = nerf_hidden[i * batch:end * batch]
+            nerf_pixels_tile = nerf_pixels[i * batch:end * batch]

             # get DCT-encoded pixel embeddings [pixel-dct]
             img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile)
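
The unfold/transpose/reshape chain in this file is what turns an image into one row per patch before the NeRF embedder runs. A quick shape check with toy sizes (illustrative only, not the model's real dimensions) shows the layout the new code assumes:

```python
# Toy shape check for the patch flattening above; sizes are illustrative only.
import torch
import torch.nn.functional as F

B, C, H, W, P = 2, 3, 8, 8, 4
img = torch.randn(B, C, H, W)

patches = F.unfold(img, kernel_size=P, stride=P)  # [B, C*P*P, NumPatches] = [2, 48, 4]
patches = patches.transpose(1, 2)                 # [B, NumPatches, C*P*P]
num_patches = patches.shape[1]

# Same reshape as the new code: one row per patch, pixels along dim 1, channels last.
patches = patches.reshape(B * num_patches, C, P**2).transpose(1, 2)
print(patches.shape)  # torch.Size([8, 16, 3]) -> [B*NumPatches, P*P, C]
```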

View File

@@ -213,7 +213,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["nerf_mlp_ratio"] = 4
         dit_config["nerf_depth"] = 4
         dit_config["nerf_max_freqs"] = 8
-        dit_config["nerf_tile_size"] = 32
+        dit_config["nerf_tile_size"] = 512
         dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
         dit_config["nerf_embedder_dtype"] = torch.float32
     else:

View File

@@ -330,14 +330,21 @@ except:
 SUPPORT_FP8_OPS = args.supports_fp8_compute

+AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]
+
 try:
     if is_amd():
-        torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
+        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+        if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)):
+            torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
+            logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
         try:
             rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))
         except:
             rocm_version = (6, -1)
-        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
         logging.info("AMD arch: {}".format(arch))
         logging.info("ROCm version: {}".format(rocm_version))
         if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
@@ -371,6 +378,9 @@ try:
 except:
     pass

+if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
+    torch.backends.cudnn.benchmark = True
+
 try:
     if torch_version_numeric >= (2, 5):
         torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
@@ -1358,7 +1368,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
     if is_amd():
         arch = torch.cuda.get_device_properties(device).gcnArchName
-        if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16
+        if any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH): # RDNA2 and older don't support bf16
             if manual_cast:
                 return True
             return False
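
The new `AMD_RDNA2_AND_OLDER_ARCH` list is matched by substring against the ROCm `gcnArchName` string, which typically carries extra feature flags. A small standalone sketch of that check (the arch strings below are just examples, not taken from this diff):

```python
# Standalone sketch of the substring arch check used above; example strings only.
AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]

def is_rdna2_or_older(arch_name: str) -> bool:
    # gcnArchName often looks like "gfx1030:sramecc-:xnack-", so substring matching is used.
    return any(a in arch_name for a in AMD_RDNA2_AND_OLDER_ARCH)

print(is_rdna2_or_older("gfx1030:sramecc-:xnack-"))  # True  -> cudnn stays enabled, bf16 avoided
print(is_rdna2_or_older("gfx1100"))                  # False -> RDNA3 or newer
```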

View File

@@ -25,6 +25,9 @@ import comfy.rmsnorm
 import contextlib

 def run_every_op():
+    if torch.compiler.is_compiling():
+        return
     comfy.model_management.throw_exception_if_processing_interrupted()

 def scaled_dot_product_attention(q, k, v, *args, **kwargs):
@@ -52,14 +55,22 @@ try:
 except (ModuleNotFoundError, TypeError):
     logging.warning("Could not set sdpa backend priority.")

-cast_to = comfy.model_management.cast_to #TODO: remove once no more references
+NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False
+try:
+    if comfy.model_management.is_nvidia():
+        if torch.backends.cudnn.version() >= 91002 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10):
+            #TODO: change upper bound version once it's fixed'
+            NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True
+            logging.info("working around nvidia conv3d memory bug.")
+except:
+    pass

-if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
-    torch.backends.cudnn.benchmark = True
+cast_to = comfy.model_management.cast_to #TODO: remove once no more references

 def cast_to_input(weight, input, non_blocking=False, copy=True):
     return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)

+@torch.compiler.disable()
 def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
     if input is not None:
         if dtype is None:
@@ -151,6 +162,15 @@ class disable_weight_init:
         def reset_parameters(self):
             return None

+        def _conv_forward(self, input, weight, bias, *args, **kwargs):
+            if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16):
+                out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True)
+                if bias is not None:
+                    out += bias.reshape((1, -1) + (1,) * (out.ndim - 2))
+                return out
+            else:
+                return super()._conv_forward(input, weight, bias, *args, **kwargs)
+
         def forward_comfy_cast_weights(self, input):
             weight, bias = cast_bias_weight(self, input)
             return self._conv_forward(input, weight, bias)
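
In the `_conv_forward` workaround above, the bias is added manually after the low-level convolution call; the reshape just broadcasts a per-channel bias over every other dimension of the output. A minimal sketch with assumed shapes:

```python
# Minimal sketch of the bias broadcast used in the workaround; shapes are assumed.
import torch

out = torch.zeros(2, 8, 4, 16, 16)                    # e.g. a conv3d output [N, C_out, D, H, W]
bias = torch.arange(8, dtype=torch.float32)           # one value per output channel
out += bias.reshape((1, -1) + (1,) * (out.ndim - 2))  # -> [1, 8, 1, 1, 1], broadcast over N, D, H, W
print(out[0, 3].unique())                             # tensor([3.])
```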

View File

@@ -393,7 +393,9 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
                 ),
                 IO.Combo.Input(
                     "model",
-                    options=list(MODELS_MAP.keys()),
+                    options=[
+                        "veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.0-generate-001", "veo-3.0-fast-generate-001"
+                    ],
                     default="veo-3.0-generate-001",
                     tooltip="Veo 3 model to use for video generation",
                     optional=True,

View File

@@ -1,20 +1,26 @@
 from comfy.cldm.control_types import UNION_CONTROLNET_TYPES
 import nodes
 import comfy.utils
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io

-class SetUnionControlNetType:
+class SetUnionControlNetType(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"control_net": ("CONTROL_NET", ),
-                             "type": (["auto"] + list(UNION_CONTROLNET_TYPES.keys()),)
-                             }}
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SetUnionControlNetType",
+            category="conditioning/controlnet",
+            inputs=[
+                io.ControlNet.Input("control_net"),
+                io.Combo.Input("type", options=["auto"] + list(UNION_CONTROLNET_TYPES.keys())),
+            ],
+            outputs=[
+                io.ControlNet.Output(),
+            ],
+        )

-    CATEGORY = "conditioning/controlnet"
-    RETURN_TYPES = ("CONTROL_NET",)
-    FUNCTION = "set_controlnet_type"
-
-    def set_controlnet_type(self, control_net, type):
+    @classmethod
+    def execute(cls, control_net, type) -> io.NodeOutput:
         control_net = control_net.copy()
         type_number = UNION_CONTROLNET_TYPES.get(type, -1)
         if type_number >= 0:
@@ -22,27 +28,36 @@ class SetUnionControlNetType:
         else:
             control_net.set_extra_arg("control_type", [])

-        return (control_net,)
+        return io.NodeOutput(control_net)
+
+    set_controlnet_type = execute # TODO: remove


-class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced):
+class ControlNetInpaintingAliMamaApply(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "negative": ("CONDITIONING", ),
-                             "control_net": ("CONTROL_NET", ),
-                             "vae": ("VAE", ),
-                             "image": ("IMAGE", ),
-                             "mask": ("MASK", ),
-                             "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
-                             "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
-                             "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001})
-                             }}
-
-    FUNCTION = "apply_inpaint_controlnet"
-
-    CATEGORY = "conditioning/controlnet"
-
-    def apply_inpaint_controlnet(self, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent):
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ControlNetInpaintingAliMamaApply",
+            category="conditioning/controlnet",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.ControlNet.Input("control_net"),
+                io.Vae.Input("vae"),
+                io.Image.Input("image"),
+                io.Mask.Input("mask"),
+                io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
+                io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001),
+                io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent) -> io.NodeOutput:
         extra_concat = []
         if control_net.concat_mask:
             mask = 1.0 - mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1]))
@@ -50,11 +65,20 @@ class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced):
             image = image * mask_apply.movedim(1, -1).repeat(1, 1, 1, image.shape[3])
             extra_concat = [mask]

-        return self.apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat)
+        result = nodes.ControlNetApplyAdvanced().apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat)
+        return io.NodeOutput(result[0], result[1])
+
+    apply_inpaint_controlnet = execute # TODO: remove
+
+
+class ControlNetExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            SetUnionControlNetType,
+            ControlNetInpaintingAliMamaApply,
+        ]

-NODE_CLASS_MAPPINGS = {
-    "SetUnionControlNetType": SetUnionControlNetType,
-    "ControlNetInpaintingAliMamaApply": ControlNetInpaintingAliMamaApply,
-}
+
+async def comfy_entrypoint() -> ControlNetExtension:
+    return ControlNetExtension()

View File

@@ -244,6 +244,8 @@ class EasyCacheHolder:
         self.total_steps_skipped += 1
         batch_offset = x.shape[0] // len(uuids)
         for i, uuid in enumerate(uuids):
+            # slice out only what is relevant to this cond
+            batch_slice = [slice(i*batch_offset,(i+1)*batch_offset)]
             # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
             if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]:
                 if not self.allow_mismatch:
@@ -261,9 +263,8 @@ class EasyCacheHolder:
                         slicing.append(slice(None, dim_u))
                     else:
                         slicing.append(slice(None))
-                slicing = [slice(i*batch_offset,(i+1)*batch_offset)] + slicing
-                x = x[slicing]
-            x += self.uuid_cache_diffs[uuid].to(x.device)
+                batch_slice = batch_slice + slicing
+            x[batch_slice] += self.uuid_cache_diffs[uuid].to(x.device)
         return x

     def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]):
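
The switch from `x = x[slicing]` to `x[batch_slice] += ...` matters because the old form replaced the whole tensor with one cond's slice inside the loop, while the new form adds each cached diff in place to its own batch slice and leaves the rest of `x` intact for the remaining conds. A toy standalone illustration (shapes and values invented):

```python
# Toy illustration of per-cond in-place slice updates; values are invented.
import torch

x = torch.zeros(4, 2, 3)                       # two conds stacked along the batch dim
diffs = [torch.ones(2, 2, 3), 2 * torch.ones(2, 2, 3)]
batch_offset = x.shape[0] // len(diffs)

for i, diff in enumerate(diffs):
    # in-place add touches only this cond's rows; x keeps its full shape for the next cond
    x[i * batch_offset:(i + 1) * batch_offset] += diff

print(x[0, 0, 0].item(), x[3, 0, 0].item())    # 1.0 2.0
```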

View File

@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.65"
+__version__ = "0.3.66"

View File

@@ -1,6 +1,6 @@
 import os
 import importlib.util
-from comfy.cli_args import args
+from comfy.cli_args import args, PerformanceFeature
 import subprocess

 #Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import.
@@ -75,8 +75,9 @@ if not args.cuda_malloc:
                 spec.loader.exec_module(module)
                 version = module.__version__

-        if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch
-            args.cuda_malloc = cuda_malloc_supported()
+        if int(version[0]) >= 2 and "+cu" in version: # enable by default for torch version 2.0 and up only on cuda torch
+            if PerformanceFeature.AutoTune not in args.fast: # Autotune has issues with cuda malloc
+                args.cuda_malloc = cuda_malloc_supported()
     except:
         pass

View File

@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.65"
+version = "0.3.66"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"

View File

@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.28.7
-comfyui-workflow-templates==0.1.95
+comfyui-workflow-templates==0.2.1
 comfyui-embedded-docs==0.3.0
 torch
 torchsde

View File

@@ -48,6 +48,28 @@ async def send_socket_catch_exception(function, message):
     except (aiohttp.ClientError, aiohttp.ClientPayloadError, ConnectionResetError, BrokenPipeError, ConnectionError) as err:
         logging.warning("send error: {}".format(err))

+# Track deprecated paths that have been warned about to only warn once per file
+_deprecated_paths_warned = set()
+
+@web.middleware
+async def deprecation_warning(request: web.Request, handler):
+    """Middleware to warn about deprecated frontend API paths"""
+    path = request.path
+
+    if path.startswith("/scripts/ui") or path.startswith("/extensions/core/"):
+        # Only warn once per unique file path
+        if path not in _deprecated_paths_warned:
+            _deprecated_paths_warned.add(path)
+            logging.warning(
+                f"[DEPRECATION WARNING] Detected import of deprecated legacy API: {path}. "
+                f"This is likely caused by a custom node extension using outdated APIs. "
+                f"Please update your extensions or contact the extension author for an updated version."
+            )
+
+    response: web.Response = await handler(request)
+    return response
+
 @web.middleware
 async def compress_body(request: web.Request, handler):
     accept_encoding = request.headers.get("Accept-Encoding", "")
@@ -159,7 +181,7 @@ class PromptServer():
         self.client_session:Optional[aiohttp.ClientSession] = None
         self.number = 0

-        middlewares = [cache_control]
+        middlewares = [cache_control, deprecation_warning]
         if args.enable_compress_response_body:
             middlewares.append(compress_body)
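
For reference, this is roughly how an aiohttp middleware like `deprecation_warning` plugs into an application. The snippet below is a self-contained toy, not ComfyUI's actual server wiring; the handler name and route are made up:

```python
# Self-contained toy showing aiohttp middleware registration; not ComfyUI's setup.
import logging
from aiohttp import web

@web.middleware
async def deprecation_warning(request: web.Request, handler):
    # Same idea as the diff above: warn when a deprecated frontend path is requested.
    if request.path.startswith("/scripts/ui") or request.path.startswith("/extensions/core/"):
        logging.warning("deprecated frontend path requested: %s", request.path)
    return await handler(request)

async def index(request: web.Request) -> web.Response:
    return web.Response(text="ok")

app = web.Application(middlewares=[deprecation_warning])
app.add_routes([web.get("/", index)])
# web.run_app(app)  # start the server; middlewares run on every request in list order
```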