diff --git a/README.md b/README.md
index b0731db33..c9a0644e3 100644
--- a/README.md
+++ b/README.md
@@ -197,7 +197,7 @@ comfy install
 
 ## Manual Install (Windows, Linux)
 
-Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (breaks the canny node) and install pytorch nightly but it is not recommended.
+Python 3.14 will work if you comment out the `kornia` dependency in the requirements.txt file (this breaks the canny node), but it is not recommended.
 
 Python 3.13 is very well supported. If you have trouble with some custom node dependencies on 3.13 you can try 3.12
 
diff --git a/comfy/ldm/chroma_radiance/model.py b/comfy/ldm/chroma_radiance/model.py
index 47aa11b04..7d7be80f5 100644
--- a/comfy/ldm/chroma_radiance/model.py
+++ b/comfy/ldm/chroma_radiance/model.py
@@ -189,15 +189,15 @@ class ChromaRadiance(Chroma):
         nerf_pixels = nn.functional.unfold(img_orig, kernel_size=patch_size, stride=patch_size)
         nerf_pixels = nerf_pixels.transpose(1, 2) # -> [B, NumPatches, C * P * P]
 
+        # Reshape for per-patch processing
+        nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
+        nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
+
         if params.nerf_tile_size > 0 and num_patches > params.nerf_tile_size:
             # Enable tiling if nerf_tile_size isn't 0 and we actually have more patches than
             # the tile size.
-            img_dct = self.forward_tiled_nerf(img_out, nerf_pixels, B, C, num_patches, patch_size, params)
+            img_dct = self.forward_tiled_nerf(nerf_hidden, nerf_pixels, B, C, num_patches, patch_size, params)
         else:
-            # Reshape for per-patch processing
-            nerf_hidden = img_out.reshape(B * num_patches, params.hidden_size)
-            nerf_pixels = nerf_pixels.reshape(B * num_patches, C, patch_size**2).transpose(1, 2)
-
             # Get DCT-encoded pixel embeddings [pixel-dct]
             img_dct = self.nerf_image_embedder(nerf_pixels)
 
@@ -240,17 +240,8 @@ class ChromaRadiance(Chroma):
             end = min(i + tile_size, num_patches)
 
             # Slice the current tile from the input tensors
-            nerf_hidden_tile = nerf_hidden[:, i:end, :]
-            nerf_pixels_tile = nerf_pixels[:, i:end, :]
-
-            # Get the actual number of patches in this tile (can be smaller for the last tile)
-            num_patches_tile = nerf_hidden_tile.shape[1]
-
-            # Reshape the tile for per-patch processing
-            # [B, NumPatches_tile, D] -> [B * NumPatches_tile, D]
-            nerf_hidden_tile = nerf_hidden_tile.reshape(batch * num_patches_tile, params.hidden_size)
-            # [B, NumPatches_tile, C*P*P] -> [B*NumPatches_tile, C, P*P] -> [B*NumPatches_tile, P*P, C]
-            nerf_pixels_tile = nerf_pixels_tile.reshape(batch * num_patches_tile, channels, patch_size**2).transpose(1, 2)
+            nerf_hidden_tile = nerf_hidden[i * batch:end * batch]
+            nerf_pixels_tile = nerf_pixels[i * batch:end * batch]
 
             # get DCT-encoded pixel embeddings [pixel-dct]
             img_dct_tile = self.nerf_image_embedder(nerf_pixels_tile)
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index 7677617c0..141f1e164 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -213,7 +213,7 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
                 dit_config["nerf_mlp_ratio"] = 4
                 dit_config["nerf_depth"] = 4
                 dit_config["nerf_max_freqs"] = 8
-                dit_config["nerf_tile_size"] = 32
+                dit_config["nerf_tile_size"] = 512
                 dit_config["nerf_final_head_type"] = "conv" if f"{key_prefix}nerf_final_layer_conv.norm.scale" in state_dict_keys else "linear"
                 dit_config["nerf_embedder_dtype"] = torch.float32
         else:
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 70a5039ef..0dc471fb8 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -330,14 +330,21 @@ except:
 
 
 SUPPORT_FP8_OPS = args.supports_fp8_compute
+
+AMD_RDNA2_AND_OLDER_ARCH = ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]
+
 try:
     if is_amd():
-        torch.backends.cudnn.enabled = False  # Seems to improve things a lot on AMD
+        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+        if not (any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH)):
+            torch.backends.cudnn.enabled = False  # Seems to improve things a lot on AMD
+            logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
+
         try:
             rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))
         except:
             rocm_version = (6, -1)
-        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+
         logging.info("AMD arch: {}".format(arch))
         logging.info("ROCm version: {}".format(rocm_version))
         if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
@@ -371,6 +378,9 @@ try:
 except:
     pass
 
+if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
+    torch.backends.cudnn.benchmark = True
+
 try:
     if torch_version_numeric >= (2, 5):
         torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
@@ -1358,7 +1368,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
 
     if is_amd():
         arch = torch.cuda.get_device_properties(device).gcnArchName
-        if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]):  # RDNA2 and older don't support bf16
+        if any((a in arch) for a in AMD_RDNA2_AND_OLDER_ARCH):  # RDNA2 and older don't support bf16
             if manual_cast:
                 return True
             return False
diff --git a/comfy/ops.py b/comfy/ops.py
index b2096b40e..934e21261 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -25,6 +25,9 @@ import comfy.rmsnorm
 import contextlib
 
 def run_every_op():
+    if torch.compiler.is_compiling():
+        return
+
     comfy.model_management.throw_exception_if_processing_interrupted()
 
 def scaled_dot_product_attention(q, k, v, *args, **kwargs):
@@ -52,14 +55,22 @@ try:
 except (ModuleNotFoundError, TypeError):
     logging.warning("Could not set sdpa backend priority.")
 
-cast_to = comfy.model_management.cast_to #TODO: remove once no more references
+# Module-level switch read by _conv_forward: True only on NVIDIA with cuDNN >= 91002 and torch 2.9 through 2.10.
+NVIDIA_MEMORY_CONV_BUG_WORKAROUND = False
+try:
+    if comfy.model_management.is_nvidia() and torch.backends.cudnn.version() >= 91002 and (2, 9) <= comfy.model_management.torch_version_numeric <= (2, 10):
+        #TODO: change upper bound version once it's fixed
+        NVIDIA_MEMORY_CONV_BUG_WORKAROUND = True
+        logging.info("working around nvidia conv3d memory bug.")
+except Exception as e:  # best-effort probe: e.g. cudnn.version() may be None without cuDNN, which makes the comparison raise
+    logging.debug("Skipping nvidia conv3d memory bug check: {}".format(e))
 
-if torch.cuda.is_available() and torch.backends.cudnn.is_available() and PerformanceFeature.AutoTune in args.fast:
-    torch.backends.cudnn.benchmark = True
+cast_to = comfy.model_management.cast_to #TODO: remove once no more references
 
 def cast_to_input(weight, input, non_blocking=False, copy=True):
     return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
 
+@torch.compiler.disable()
 def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
     if input is not None:
         if dtype is None:
@@ -151,6 +162,15 @@ class disable_weight_init:
         def reset_parameters(self):
             return None
 
+        def _conv_forward(self, input, weight, bias, *args, **kwargs):
+            """Convolve via a direct cuDNN call when the NVIDIA memory-bug workaround applies; otherwise use the parent path."""
+            use_cudnn = NVIDIA_MEMORY_CONV_BUG_WORKAROUND and weight.dtype in (torch.float16, torch.bfloat16)
+            # The raw cuDNN op is given self.padding as-is, so it is only valid for CUDA inputs with integer zero padding.
+            if not (use_cudnn and input.is_cuda and getattr(self, "padding_mode", "zeros") == "zeros" and not isinstance(self.padding, str)):
+                return super()._conv_forward(input, weight, bias, *args, **kwargs)
+            out = torch.cudnn_convolution(input, weight, self.padding, self.stride, self.dilation, self.groups, benchmark=False, deterministic=False, allow_tf32=True)
+            return out if bias is None else out.add_(bias.reshape((1, -1) + (1,) * (out.ndim - 2)))  # in-place add; bias reshaped to (1, C, 1, ...)
+
         def forward_comfy_cast_weights(self, input):
             weight, bias = cast_bias_weight(self, input)
             return self._conv_forward(input, weight, bias)
diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py
index 4ab5c5186..daeaa823e 100644
--- a/comfy_api_nodes/nodes_veo2.py
+++ b/comfy_api_nodes/nodes_veo2.py
@@ -393,7 +393,9 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
                 ),
                 IO.Combo.Input(
                     "model",
-                    options=list(MODELS_MAP.keys()),
+                    options=[
+                        "veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.0-generate-001", "veo-3.0-fast-generate-001"
+                    ],
                     default="veo-3.0-generate-001",
                     tooltip="Veo 3 model to use for video generation",
                     optional=True,
diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py
index 2d20e1fed..e835feed7 100644
--- a/comfy_extras/nodes_controlnet.py
+++ b/comfy_extras/nodes_controlnet.py
@@ -1,20 +1,26 @@
 from comfy.cldm.control_types import UNION_CONTROLNET_TYPES
 import nodes
 import comfy.utils
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
 
-class SetUnionControlNetType:
+class SetUnionControlNetType(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"control_net": ("CONTROL_NET", ),
-                             "type": (["auto"] + list(UNION_CONTROLNET_TYPES.keys()),)
-                             }}
+    def define_schema(cls):
+        return io.Schema(
+            node_id="SetUnionControlNetType",
+            category="conditioning/controlnet",
+            inputs=[
+                io.ControlNet.Input("control_net"),
+                io.Combo.Input("type", options=["auto"] + list(UNION_CONTROLNET_TYPES.keys())),
+            ],
+            outputs=[
+                io.ControlNet.Output(),
+            ],
+        )
 
-    CATEGORY = "conditioning/controlnet"
-    RETURN_TYPES = ("CONTROL_NET",)
-
-    FUNCTION = "set_controlnet_type"
-
-    def set_controlnet_type(self, control_net, type):
+    @classmethod
+    def execute(cls, control_net, type) -> io.NodeOutput:
         control_net = control_net.copy()
         type_number = UNION_CONTROLNET_TYPES.get(type, -1)
         if type_number >= 0:
@@ -22,27 +28,36 @@ class SetUnionControlNetType:
         else:
             control_net.set_extra_arg("control_type", [])
 
-        return (control_net,)
+        return io.NodeOutput(control_net)
 
-class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced):
+    set_controlnet_type = execute  # TODO: remove
+
+
+class ControlNetInpaintingAliMamaApply(io.ComfyNode):
     @classmethod
-    def INPUT_TYPES(s):
-        return {"required": {"positive": ("CONDITIONING", ),
-                             "negative": ("CONDITIONING", ),
-                             "control_net": ("CONTROL_NET", ),
-                             "vae": ("VAE", ),
-                             "image": ("IMAGE", ),
-                             "mask": ("MASK", ),
-                             "strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 10.0, "step": 0.01}),
-                             "start_percent": ("FLOAT", {"default": 0.0, "min": 0.0, "max": 1.0, "step": 0.001}),
-                             "end_percent": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.001})
-                             }}
+    def define_schema(cls):
+        return io.Schema(
+            node_id="ControlNetInpaintingAliMamaApply",
+            category="conditioning/controlnet",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.ControlNet.Input("control_net"),
+                io.Vae.Input("vae"),
+                io.Image.Input("image"),
+                io.Mask.Input("mask"),
+                io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
+                io.Float.Input("start_percent", default=0.0, min=0.0, max=1.0, step=0.001),
+                io.Float.Input("end_percent", default=1.0, min=0.0, max=1.0, step=0.001),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+            ],
+        )
 
-    FUNCTION = "apply_inpaint_controlnet"
-
-    CATEGORY = "conditioning/controlnet"
-
-    def apply_inpaint_controlnet(self, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent):
+    @classmethod
+    def execute(cls, positive, negative, control_net, vae, image, mask, strength, start_percent, end_percent) -> io.NodeOutput:
         extra_concat = []
         if control_net.concat_mask:
             mask = 1.0 - mask.reshape((-1, 1, mask.shape[-2], mask.shape[-1]))
@@ -50,11 +65,20 @@ class ControlNetInpaintingAliMamaApply(nodes.ControlNetApplyAdvanced):
             image = image * mask_apply.movedim(1, -1).repeat(1, 1, 1, image.shape[3])
             extra_concat = [mask]
 
-        return self.apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat)
+        result = nodes.ControlNetApplyAdvanced().apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent, vae=vae, extra_concat=extra_concat)
+        return io.NodeOutput(result[0], result[1])
+
+    apply_inpaint_controlnet = execute  # TODO: remove
 
 
+class ControlNetExtension(ComfyExtension):
+    """Extension whose node list is the two ControlNet helper nodes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        node_types = [SetUnionControlNetType, ControlNetInpaintingAliMamaApply]
+        return node_types
 
-NODE_CLASS_MAPPINGS = {
-    "SetUnionControlNetType": SetUnionControlNetType,
-    "ControlNetInpaintingAliMamaApply": ControlNetInpaintingAliMamaApply,
-}
+
+async def comfy_entrypoint() -> ControlNetExtension:
+    return ControlNetExtension()
diff --git a/comfy_extras/nodes_easycache.py b/comfy_extras/nodes_easycache.py
index c170e9fd9..1359e2f99 100644
--- a/comfy_extras/nodes_easycache.py
+++ b/comfy_extras/nodes_easycache.py
@@ -244,6 +244,8 @@ class EasyCacheHolder:
             self.total_steps_skipped += 1
         batch_offset = x.shape[0] // len(uuids)
         for i, uuid in enumerate(uuids):
+            # slice out only what is relevant to this cond
+            batch_slice = [slice(i*batch_offset,(i+1)*batch_offset)]
             # if cached dims don't match x dims, cut off excess and hope for the best (cosmos world2video)
             if x.shape[1:] != self.uuid_cache_diffs[uuid].shape[1:]:
                 if not self.allow_mismatch:
@@ -261,9 +263,8 @@ class EasyCacheHolder:
                             slicing.append(slice(None, dim_u))
                     else:
                         slicing.append(slice(None))
-                slicing = [slice(i*batch_offset,(i+1)*batch_offset)] + slicing
-                x = x[slicing]
-            x += self.uuid_cache_diffs[uuid].to(x.device)
+                batch_slice = batch_slice + slicing
+            x[tuple(batch_slice)] += self.uuid_cache_diffs[uuid].to(x.device)  # tuple index: indexing with a list of slices is deprecated in newer PyTorch
         return x
 
     def update_cache_diff(self, output: torch.Tensor, x: torch.Tensor, uuids: list[UUID]):
diff --git a/comfyui_version.py b/comfyui_version.py
index d39c1fdc4..33a06bbb0 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.65"
+__version__ = "0.3.66"
diff --git a/cuda_malloc.py b/cuda_malloc.py
index c1d9ae3ca..6520d5123 100644
--- a/cuda_malloc.py
+++ b/cuda_malloc.py
@@ -1,6 +1,6 @@
 import os
 import importlib.util
-from comfy.cli_args import args
+from comfy.cli_args import args, PerformanceFeature
 import subprocess
 
 #Can't use pytorch to get the GPU names because the cuda malloc has to be set before the first import.
@@ -75,8 +75,9 @@ if not args.cuda_malloc:
                 spec.loader.exec_module(module)
                 version = module.__version__
 
-        if int(version[0]) >= 2 and "+cu" in version: #enable by default for torch version 2.0 and up only on cuda torch
-            args.cuda_malloc = cuda_malloc_supported()
+        if int(version[0]) >= 2 and "+cu" in version:  # enable by default for torch version 2.0 and up only on cuda torch
+            if PerformanceFeature.AutoTune not in args.fast:  # Autotune has issues with cuda malloc
+                args.cuda_malloc = cuda_malloc_supported()
     except:
         pass
 
diff --git a/pyproject.toml b/pyproject.toml
index 653604e24..0c6b23a25 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.3.65"
+version = "0.3.66"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.9"
diff --git a/requirements.txt b/requirements.txt
index 82457df54..dd2afcab0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 comfyui-frontend-package==1.28.7
-comfyui-workflow-templates==0.1.95
+comfyui-workflow-templates==0.2.1
 comfyui-embedded-docs==0.3.0
 torch
 torchsde
diff --git a/server.py b/server.py
index 80e9d3fa7..10c2698b5 100644
--- a/server.py
+++ b/server.py
@@ -48,6 +48,28 @@ async def send_socket_catch_exception(function, message):
     except (aiohttp.ClientError, aiohttp.ClientPayloadError, ConnectionResetError, BrokenPipeError, ConnectionError) as err:
         logging.warning("send error: {}".format(err))
 
+# Track deprecated paths that have been warned about to only warn once per file
+_deprecated_paths_warned = set()
+
+@web.middleware
+async def deprecation_warning(request: web.Request, handler):
+    """Log a warning the first time each deprecated legacy frontend path is requested, then pass the request on."""
+    path = request.path
+
+    # str.startswith accepts a tuple of prefixes, so one call covers both legacy roots.
+    if path.startswith(("/scripts/ui", "/extensions/core/")):
+        # The module-level set remembers paths already reported, so each file is logged once.
+        if path not in _deprecated_paths_warned:
+            _deprecated_paths_warned.add(path)
+            logging.warning(
+                f"[DEPRECATION WARNING] Detected import of deprecated legacy API: {path}. "
+                f"This is likely caused by a custom node extension using outdated APIs. "
+                f"Please update your extensions or contact the extension author for an updated version."
+            )
+
+    return await handler(request)
+
+
 @web.middleware
 async def compress_body(request: web.Request, handler):
     accept_encoding = request.headers.get("Accept-Encoding", "")
@@ -159,7 +181,7 @@ class PromptServer():
         self.client_session:Optional[aiohttp.ClientSession] = None
         self.number = 0
 
-        middlewares = [cache_control]
+        middlewares = [cache_control, deprecation_warning]
         if args.enable_compress_response_body:
             middlewares.append(compress_body)