diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py
index 47aa11b04..7d7be80f5 100644
--- a/comfy/ldm/chroma_radiance/model.py
+++ b/comfy/ldm/chroma_radiance/model.py
@@ -189,15 +189,15 @@ class ChromaRadiance(Chroma):
         nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size)
         nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P]

+        # Reshape for per-patch processing
+        nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
+        nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
+
         if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size:
             # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than
             # the tile size.
-            img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params)
+            img_dct = self.forward_tiled_nerf(nerf_hidden, nerf_pixels, B, C, num_patches, patch_size, params)
         else:
-            # Reshape for per-patch processing
-            nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
-            nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
-
             # Get DCT-encoded pixel embeddings [pixel-dct]
             img_dct = self.nerf_image_embedder(nerf_pixels)

@@ -240,17 +240,8 @@ class ChromaRadiance(Chroma):
             end = min(i + tile_size, num_patches)

             # Slice the current tile from the input tensors
-            nerf_hidden_tile = nerf_hidden[:, i:end, :]
-            nerf_pixels_tile = nerf_pixels[:, i:end, :]
-
-            # Get the actual number of patches in this tile (can be smaller for the last tile)
-            num_patches_tile = nerf_hidden_tile.shape[1]
-
-            # Reshape the tile for per-patch processing
-            # [B, NumPatches_tile, D] -> [B * NumPatches_tile, D]
-            nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size)
-            # [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C]
-            nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2)
+            nerf_hidden_tile = nerf_hidden[i * batch:end * batch]
+            nerf_pixels_tile = nerf_pixels[i * batch:end * batch]

             # get DCT-encoded pixel embeddings [pixel-dct]
             img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile)
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 7677617c0..141f1e164 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -213,7 +213,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
         dit_config["nerf_mlp_ratio"] = 4
         dit_config["nerf_depth"] = 4
         dit_config["nerf_max_freqs"] = 8
-        dit_config["nerf_tile_size"] = 32
+        dit_config["nerf_tile_size"] = 512
         dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
         dit_config["nerf_embedder_dtype"] = torch.float32
     else:
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 709ebc40b..79d6ff9d4 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -330,14 +330,21 @@

 SUPPORT_FP8_OPS = args.supports_fp8_compute

+
+AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]
+
 try:
     if is_amd():
-        torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
+        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+        if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)):
+            torch.backends.cudnn.enabled = False # Seems to improve things a lot on AMD
+            logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
+
         try:
             rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))
         except:
             rocm_version = (6, -1)
-        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+
         logging.info("AMD arch: {}".format(arch))
         logging.info("ROCm version: {}".format(rocm_version))
         if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
@@ -349,7 +356,7 @@ try:
                 if any((a in arch) for a in ["gfx1201"]):
                     ENABLE_PYTORCH_ATTENTION = True
             if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
-                if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx942", "gfx950"]): # TODO: more arches
+                if any((a in arch) for a in ["gfx1200", "gfx1201", "gfx950"]): # TODO: more arches, "gfx942" gives error on pytorch nightly 2.10 1013 rocm7.0
                     SUPPORT_FP8_OPS = True

 except:
@@ -371,6 +378,9 @@
 except:
     pass

+if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
+    torch.backends.cudnn.benchmark = True
+
 try:
     if torch_version_numeric >= (2, 5):
         torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
@@ -1327,7 +1337,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma

     if is_amd():
         arch = torch.cuda.get_device_properties(device).gcnArchName
-        if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]): # RDNA2 and older don't support bf16
+        if any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH): # RDNA2 and older don't support bf16
             if manual_cast:
                 return True
             return False
diff --git a/comfy/ops.py b/comfy/ops.py
index b2096b40e..934e21261 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -25,6 +25,9 @@ import comfy.rmsnorm
 import contextlib

 def run_every_op():
+    if torch.compiler.is_compiling():
+        return
+
     comfy.model_management.throw_exception_if_processing_interrupted()

 def scaled_dot_product_attention(q, k, v, *args, **kwargs):
@@ -52,14 +55,22 @@ try:
 except (ModuleNotFoundError, TypeError):
     logging.warning("Could not set sdpa backend priority.")

-cast_to = comfy.model_management.cast_to #TODO: remove once no more references
+NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False
+try:
+    if comfy.model_management.is_nvidia():
+        if torch.backends.cudnn.version() >= 91002 and comfy.model_management.torch_version_numeric >= (2, 9) and comfy.model_management.torch_version_numeric <= (2, 10):
+            #TODO: change upper bound version once it's fixed
+            NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True
+            logging.info("working around nvidia conv3d memory bug.")
+except:
+    pass

-if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
-    torch.backends.cudnn.benchmark = True
+cast_to = comfy.model_management.cast_to #TODO: remove once no more references

 def cast_to_input(weight, input, non_blocking=False, copy=True):
     return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)

+@torch.compiler.disable()
 def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
     if input is not None:
         if dtype is None:
@@ -151,6 +162,15 @@ class disable_weight_init:
         def reset_parameters(self):
             return None

+        def _conv_forward(self, input, weight, bias, *args, **kwargs):
+            if NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16):
+                out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True)
+                if bias is not None:
+                    out += bias.reshape((1, -1) + (1,) * (out.ndim - 2))
+                return out
+            else:
+                return super()._conv_forward(input, weight, bias, *args, **kwargs)
+
         def forward_comfy_cast_weights(self, input):
             weight, bias = cast_bias_weight(self, input)
             return self._conv_forward(input, weight, bias)
diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py
index 46cc7b2a8..5ee4d5ee5 100644
--- a/comfy/patcher_extension.py
+++ b/comfy/patcher_extension.py
@@ -150,7 +150,7 @@ def merge_nested_dicts(dict1: dict, dict2: dict, copy_dict1=True):
     for key, value in dict2.items():
         if isinstance(value, dict):
             curr_value = merged_dict.setdefault(key, {})
-            merged_dict[key] = merge_nested_dicts(value, curr_value)
+            merged_dict[key] = merge_nested_dicts(curr_value, value)
         elif isinstance(value, list):
             merged_dict.setdefault(key, []).extend(value)
         else:
diff --git a/comfy/samplers.py b/comfy/samplers.py
index c59e296a1..e7efaf470 100755
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -306,17 +306,10 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens
                                                                               copy_dict1=False)

         if patches is not None:
-            # TODO: replace with merge_nested_dicts function
-            if "patches" in transformer_options:
-                cur_patches = transformer_options["patches"].copy()
-                for p in patches:
-                    if p in cur_patches:
-                        cur_patches[p] = cur_patches[p] + patches[p]
-                    else:
-                        cur_patches[p] = patches[p]
-                transformer_options["patches"] = cur_patches
-            else:
-                transformer_options["patches"] = patches
+            transformer_options["patches"] = comfy.patcher_extension.merge_nested_dicts(
+                transformer_options.get("patches", {}),
+                patches
+            )

         transformer_options["cond_or_uncond"] = cond_or_uncond[:]
         transformer_options["uuids"] = uuids[:]
diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py
index 4588a7991..daeaa823e 100644
--- a/comfy_api_nodes/nodes_veo2.py
+++ b/comfy_api_nodes/nodes_veo2.py
@@ -27,6 +27,13 @@ from comfy_api_nodes.apinode_utils import (
 )

 AVERAGE_DURATION_VIDEO_GEN = 32
+MODELS_MAP = {
+    "veo-2.0-generate-001": "veo-2.0-generate-001",
+    "veo-3.1-generate": "veo-3.1-generate-preview",
+    "veo-3.1-fast-generate": "veo-3.1-fast-generate-preview",
+    "veo-3.0-generate-001": "veo-3.0-generate-001",
+    "veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001",
+}

 def convert_image_to_base64(image: torch.Tensor):
     if image is None:
@@ -158,6 +165,7 @@ class VeoVideoGenerationNode(IO.ComfyNode):
         model="veo-2.0-generate-001",
         generate_audio=False,
     ):
+        model = MODELS_MAP[model]
         # Prepare the instances for the request
         instances = []

@@ -385,7 +393,9 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
             ),
             IO.Combo.Input(
                 "model",
-                options=["veo-3.0-generate-001", "veo-3.0-fast-generate-001"],
+                options=[
+                    "veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.0-generate-001", "veo-3.0-fast-generate-001"
+                ],
                 default="veo-3.0-generate-001",
                 tooltip="Veo 3 model to use for video generation",
                 optional=True,
diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py
index 2d20e1fed..e835feed7 100644
--- a/comfy_extras/nodes_controlnet.py
+++ b/comfy_extras/nodes_controlnet.py
@@ -1,20 +1,26 @@
 from comfy.cldm.control_types import UNION_CONTROLNET_TYPES
 import nodes
 import comfy.utils
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io

-class SetUnionControlNetType:
+class SetUnionControlNetType(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"control_net": ("CONTROL_NET", ),
("CONTROL_NET", ), - "type": (["auto"] + list(UNION_CONTROLNET_TYPES.keys()),) - }} + def define_schema(cls): + return io.Schema( + node_id="SetUnionControlNetType", + category="conditioning/controlnet", + inputs=[ + io.ControlNet.Input("control_net"), + io.Combo.Input("type", options=["auto"] + list(UNION_CONTROLNET_TYPES.keys())), + ], + outputs=[ + io.ControlNet.Output(), + ], + ) - CATEGORY = "conditioning/controlnet" - RETURN_TYPES = ("CONTROL_NET",) - - FUNCTION = "set_controlnet_type" - - def set_controlnet_type(self, control_net, type): + @classmethod + def execute(cls, control_net, type) -> io.NodeOutput: control_net = control_net.copy() type_number = UNION_CONTROLNET_TYPES.get(type, -1) if type_number >= 0: @@ -22,27 +28,36 @@ class SetUnionControlNetType: else: control_net.set_extra_arg("control_type", []) - return (control_net,) + return io.NodeOutput(control_net) -class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced): + set_controlnet_type = execute # TODO: remove + + +class ControlNetInpaintingAliMamaApply(io.ComfyNode): @classmethod - def INPUT_TYPES(s): - return {"required": {"positive": ("CONDITIONING", ), - "negative": ("CONDITIONING", ), - "control_net": ("CONTROL_NET", ), - "vae": ("VAE", ), - "image": ("IMAGE", ), - "mask": ("MASK", ), - "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}), - "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}), - "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001}) - }} + def define_schema(cls): + return io.Schema( + node_id="ControlNetInpaintingAliMamaApply", + category="conditioning/controlnet", + inputs=[ + io.Conditioning.Input("positive"), + io.Conditioning.Input("negative"), + io.ControlNet.Input("control_net"), + io.Vae.Input("vae"), + io.Image.Input("image"), + io.Mask.Input("mask"), + io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01), + io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001), + io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001), + ], + outputs=[ + io.Conditioning.Output(display_name="positive"), + io.Conditioning.Output(display_name="negative"), + ], + ) - FUNCTION = "apply_inpaint_controlnet" - - CATEGORY = "conditioning/controlnet" - - def apply_inpaint_controlnet(self, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent): + @classmethod + def execute(cls, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent) -> io.NodeOutput: extra_concat = [] if control_net.concat_mask: mask = 1.0 - mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1])) @@ -50,11 +65,20 @@ class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced): image = image * mask_apply.movedim(1, -1).repeat(1, 1, 1, image.shape[3]) extra_concat = [mask] - return self.apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat) + result = nodes.ControlNetApplyAdvanced().apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat) + return io.NodeOutput(result[0], result[1]) + + apply_inpaint_controlnet = execute # TODO: remove +class ControlNetExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + SetUnionControlNetType, + ControlNetInpaintingAliMamaApply, + ] -NODE_CLASS_MAPPINGS = { - "SetUnionControlNetType": 
-    "ControlNetInpaintingAliMamaApply": ControlNetInpaintingAliMamaApply,
-}
+
+async def comfy_entrypoint() -> ControlNetExtension:
+    return ControlNetExtension()
diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py
index c170e9fd9..1359e2f99 100644
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@@ -244,6 +244,8 @@ class EasyCacheHolder:
         self.total_steps_skipped += 1
         batch_offset = x.shape[0] // len(uuids)
         for i, uuid in enumerate(uuids):
+            # slice out only what is relevant to this cond
+            batch_slice = [slice(i*batch_offset,(i+1)*batch_offset)]
             # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
             if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]:
                 if not self.allow_mismatch:
@@ -261,9 +263,8 @@ class EasyCacheHolder:
                         slicing.append(slice(None, dim_u))
                     else:
                         slicing.append(slice(None))
-                slicing = [slice(i*batch_offset,(i+1)*batch_offset)] + slicing
-                x = x[slicing]
-            x += self.uuid_cache_diffs[uuid].to(x.device)
+                batch_slice = batch_slice + slicing
+            x[batch_slice] += self.uuid_cache_diffs[uuid].to(x.device)
         return x

     def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]):
diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py
index 7852d85e5..4d8061741 100644
--- a/comfy_extras/nodes_eps.py
+++ b/comfy_extras/nodes_eps.py
@@ -1,5 +1,7 @@
+import torch
 from typing_extensions import override

+from comfy.k_diffusion.sampling import sigma_to_half_log_snr
 from comfy_api.latest import ComfyExtension, io


@@ -63,12 +65,105 @@ class EpsilonScaling(io.ComfyNode):
         return io.NodeOutput(model_clone)


+def compute_tsr_rescaling_factor(
+    snr: torch.Tensor, tsr_k: float, tsr_variance: float
+) -> torch.Tensor:
+    """Compute the rescaling score ratio in Temporal Score Rescaling.
+
+    See equation (6) in https://arxiv.org/pdf/2510.01184v1.
+    """
+    posinf_mask = torch.isposinf(snr)
+    rescaling_factor = (snr * tsr_variance + 1) / (snr * tsr_variance / tsr_k + 1)
+    return torch.where(posinf_mask, tsr_k, rescaling_factor) # when snr → inf, r = tsr_k
+
+
+class TemporalScoreRescaling(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="TemporalScoreRescaling",
+            display_name="TSR - Temporal Score Rescaling",
+            category="model_patches/unet",
+            inputs=[
+                io.Model.Input("model"),
+                io.Float.Input(
+                    "tsr_k",
+                    tooltip=(
+                        "Controls the rescaling strength.\n"
+                        "Lower k produces more detailed results; higher k produces smoother results in image generation. Setting k = 1 disables rescaling."
+                    ),
+                    default=0.95,
+                    min=0.01,
+                    max=100.0,
+                    step=0.001,
+                    display_mode=io.NumberDisplay.number,
+                ),
+                io.Float.Input(
+                    "tsr_sigma",
+                    tooltip=(
+                        "Controls how early rescaling takes effect.\n"
+                        "Larger values take effect earlier."
+                    ),
+                    default=1.0,
+                    min=0.01,
+                    max=100.0,
+                    step=0.001,
+                    display_mode=io.NumberDisplay.number,
+                ),
+            ],
+            outputs=[
+                io.Model.Output(
+                    display_name="patched_model",
+                ),
+            ],
+            description=(
+                "[Post-CFG Function]\n"
+                "TSR - Temporal Score Rescaling (2510.01184)\n\n"
+                "Rescaling the model's score or noise to steer the sampling diversity.\n"
+            ),
+        )
+
+    @classmethod
+    def execute(cls, model, tsr_k, tsr_sigma) -> io.NodeOutput:
+        tsr_variance = tsr_sigma**2
+
+        def temporal_score_rescaling(args):
+            denoised = args["denoised"]
+            x = args["input"]
+            sigma = args["sigma"]
+            curr_model = args["model"]
+
+            # No rescaling (r = 1) or no noise
+            if tsr_k == 1 or sigma == 0:
+                return denoised
+
+            model_sampling = curr_model.current_patcher.get_model_object("model_sampling")
+            half_log_snr = sigma_to_half_log_snr(sigma, model_sampling)
+            snr = (2 * half_log_snr).exp()
+
+            # No rescaling needed (r = 1)
+            if snr == 0:
+                return denoised
+
+            rescaling_r = compute_tsr_rescaling_factor(snr, tsr_k, tsr_variance)
+
+            # Derived from scaled_denoised = (x - r * sigma * noise) / alpha
+            alpha = sigma * half_log_snr.exp()
+            return torch.lerp(x / alpha, denoised, rescaling_r)
+
+        m = model.clone()
+        m.set_model_sampler_post_cfg_function(temporal_score_rescaling)
+        return io.NodeOutput(m)
+
+
 class EpsilonScalingExtension(ComfyExtension):
     @override
     async def get_node_list(self) -> list[type[io.ComfyNode]]:
         return [
             EpsilonScaling,
+            TemporalScoreRescaling,
         ]

+
 async def comfy_entrypoint() -> EpsilonScalingExtension:
     return EpsilonScalingExtension()
diff --git a/comfyui_version.py b/comfyui_version.py
index d39c1fdc4..33a06bbb0 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.65"
+__version__ = "0.3.66"
diff --git a/cuda_malloc.py b/cuda_malloc.py
index c1d9ae3ca..6520d5123 100644
--- a/cuda_malloc.py
+++ b/cuda_malloc.py
@@ -1,6 +1,6 @@
 import os
 import importlib.util
-from comfy.cli_args import args
+from comfy.cli_args import args, PerformanceFeature
 import subprocess

 #Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import.
@@ -75,8 +75,9 @@ if not args.cuda_malloc:
         spec.loader.exec_module(module)
         version = module.__version__

-        if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch
-            args.cuda_malloc = cuda_malloc_supported()
+        if int(version[0]) >= 2 and "+cu" in version: # enable by default for torch version 2.0 and up only on cuda torch
+            if PerformanceFeature.AutoTune not in args.fast: # Autotune has issues with cuda malloc
+                args.cuda_malloc = cuda_malloc_supported()
     except:
         pass
diff --git a/pyproject.toml b/pyproject.toml
index 653604e24..0c6b23a25 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.65"
+version = "0.3.66"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"
diff --git a/requirements.txt b/requirements.txt
index bbb22364f..dd2afcab0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
-comfyui-frontend-package==1.27.10
-comfyui-workflow-templates==0.1.95
+comfyui-frontend-package==1.28.7
+comfyui-workflow-templates==0.2.1
 comfyui-embedded-docs==0.3.0
 torch
 torchsde
diff --git a/server.py b/server.py
index 80e9d3fa7..10c2698b5 100644
--- a/server.py
+++ b/server.py
@@ -48,6 +48,28 @@ async def send_socket_catch_exception(function, message):
     except (aiohttp.ClientError, aiohttp.ClientPayloadError, ConnectionResetError, BrokenPipeError, ConnectionError) as err:
         logging.warning("send error: {}".format(err))

+# Track deprecated paths that have been warned about to only warn once per file
+_deprecated_paths_warned = set()
+
+@web.middleware
+async def deprecation_warning(request: web.Request, handler):
+    """Middleware to warn about deprecated frontend API paths"""
+    path = request.path
+
+    if path.startswith("/scripts/ui") or path.startswith("/extensions/core/"):
+        # Only warn once per unique file path
+        if path not in _deprecated_paths_warned:
+            _deprecated_paths_warned.add(path)
+            logging.warning(
+                f"[DEPRECATION WARNING] Detected import of deprecated legacy API: {path}. "
+                f"This is likely caused by a custom node extension using outdated APIs. "
+                f"Please update your extensions or contact the extension author for an updated version."
+            )
+
+    response: web.Response = await handler(request)
+    return response
+
+
 @web.middleware
 async def compress_body(request: web.Request, handler):
     accept_encoding = request.headers.get("Accept-Encoding", "")
@@ -159,7 +181,7 @@ class PromptServer():
         self.client_session:Optional[aiohttp.ClientSession] = None
         self.number = 0

-        middlewares = [cache_control]
+        middlewares = [cache_control, deprecation_warning]
         if args.enable_compress_response_body:
             middlewares.append(compress_body)
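
Note on the comfy/ops.py _conv_forward workaround: torch.cudnn_convolution does not take a bias argument, so the patched path re-applies the bias by hand; the reshape((1, -1) + (1,) * (out.ndim - 2)) expression just builds a [1, C, 1, ...] view that broadcasts across the batch and all spatial dimensions regardless of conv rank. A minimal standalone sketch of that broadcasting step (the shapes below are hypothetical, not taken from the patch):

    import torch

    out = torch.randn(2, 8, 5, 4, 4)  # e.g. a conv3d output: [N, C, D, H, W]
    bias = torch.randn(8)             # one value per output channel C

    # Same expression as in the workaround: [1, C] followed by one singleton
    # dimension per remaining axis, so it broadcasts over N and D/H/W.
    view = bias.reshape((1, -1) + (1,) * (out.ndim - 2))  # -> shape [1, 8, 1, 1, 1]
    out_with_bias = out + view
    assert torch.allclose(out_with_bias[0, 3], out[0, 3] + bias[3])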
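
Note on the comfy/patcher_extension.py one-liner: before the fix, the recursive call passed (value, curr_value), so for nested dicts the values already present in dict1 silently won over dict2, the opposite of the precedence used by the non-dict branch. With the arguments flipped, dict2 wins at every depth and lists still accumulate, which is what lets the samplers.py hunk collapse its hand-rolled patch merging into a single merge_nested_dicts call. A standalone mirror of the corrected semantics (the sample dicts are made up for illustration):

    def merge_nested_dicts(dict1: dict, dict2: dict, copy_dict1: bool = True) -> dict:
        # Mirrors the fixed helper: dict2's leaves override dict1's, lists extend.
        merged = dict1.copy() if copy_dict1 else dict1
        for key, value in dict2.items():
            if isinstance(value, dict):
                merged[key] = merge_nested_dicts(merged.setdefault(key, {}), value)
            elif isinstance(value, list):
                merged.setdefault(key, []).extend(value)
            else:
                merged[key] = value
        return merged

    base = {"patches": {"attn1_patch": ["a"]}, "scale": 1.0}
    extra = {"patches": {"attn1_patch": ["b"]}, "scale": 2.0}
    assert merge_nested_dicts(base, extra) == {"patches": {"attn1_patch": ["a", "b"]}, "scale": 2.0}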
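
Note on the TemporalScoreRescaling node: compute_tsr_rescaling_factor implements r(snr) = (snr * sigma_tsr^2 + 1) / (snr * sigma_tsr^2 / k + 1) from eq. (6) of arXiv:2510.01184. The ratio tends to 1 as snr -> 0 (the noisy start of sampling is left untouched) and to k as snr -> +inf (low-noise steps get the full rescaling), which matches the special cases handled in execute. A quick self-contained check of those limits (tsr_ratio is a standalone copy of the helper, and the sample values are illustrative):

    import torch

    def tsr_ratio(snr: torch.Tensor, tsr_k: float, tsr_variance: float) -> torch.Tensor:
        # Same formula as compute_tsr_rescaling_factor in the diff above.
        r = (snr * tsr_variance + 1) / (snr * tsr_variance / tsr_k + 1)
        # At snr = +inf the expression evaluates to inf/inf; substitute the limit k.
        return torch.where(torch.isposinf(snr), torch.full_like(r, tsr_k), r)

    snr = torch.tensor([0.0, 1.0, float("inf")])
    print(tsr_ratio(snr, tsr_k=0.95, tsr_variance=1.0))
    # tensor([1.0000, 0.9744, 0.9500])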