diff --git a/comfy/__init__.py b/comfy/__init__.py
index ae3b89fad..ba0c5a01a 100644
--- a/comfy/__init__.py
+++ b/comfy/__init__.py
@@ -1,6 +1,6 @@
 # This file is automatically generated by the build process when version is
 # updated in pyproject.toml.
-__version__ = "0.3.65"
+__version__ = "0.3.66"
 
 # This deals with workspace issues
 from comfy_compatibility.workspace import auto_patch_workspace_and_restart
diff --git a/comfy/cmd/execution.py b/comfy/cmd/execution.py
index b2dbe7c11..9ad735af7 100644
--- a/comfy/cmd/execution.py
+++ b/comfy/cmd/execution.py
@@ -16,14 +16,14 @@ from enum import Enum
 from os import PathLike
 from typing import List, Optional, Tuple, Literal
 
-import torch
-from opentelemetry.trace import get_current_span, StatusCode, Status
-
 # order matters
 from .main_pre import tracer
 
+import torch
+from opentelemetry.trace import get_current_span, StatusCode, Status
+
 from comfy_execution.caching import HierarchicalCache, LRUCache, CacheKeySetInputSignature, CacheKeySetID, \
-    NullCache, \
+    DependencyAwareCache, \
     BasicCache
 from comfy_execution.graph import get_input_info, ExecutionList, DynamicPrompt, ExecutionBlocker
 from comfy_execution.graph_utils import is_link, GraphBuilder
@@ -105,13 +105,13 @@ class IsChangedCache:
 class CacheType(Enum):
     CLASSIC = 0
     LRU = 1
-    NONE = 2
+    DEPENDENCY_AWARE = 2
 
 
 class CacheSet:
     def __init__(self, cache_type=None, cache_size=None):
-        if cache_type == CacheType.NONE:
-            self.init_null_cache()
+        if cache_type == CacheType.DEPENDENCY_AWARE:
+            self.init_dependency_aware_cache()
             logger.info("Disabling intermediate node cache.")
         elif cache_type == CacheType.LRU:
             if cache_size is None:
@@ -134,12 +134,11 @@ class CacheSet:
         self.ui = LRUCache(CacheKeySetInputSignature, max_size=cache_size)
         self.objects = HierarchicalCache(CacheKeySetID)
 
-    def init_null_cache(self):
-        self.outputs = NullCache()
-        #The UI cache is expected to be iterable at the end of each workflow
-        #so it must cache at least a full workflow. Use Heirachical
-        self.ui = HierarchicalCache(CacheKeySetInputSignature)
-        self.objects = NullCache()
+    # only hold cached items while the decendents have not executed
+    def init_dependency_aware_cache(self):
+        self.outputs = DependencyAwareCache(CacheKeySetInputSignature)
+        self.ui = DependencyAwareCache(CacheKeySetInputSignature)
+        self.objects = DependencyAwareCache(CacheKeySetID)
 
     def recursive_debug_dump(self):
         result = {
@@ -152,7 +151,7 @@ class CacheSet:
 SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org")
 
 
-def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data=None):
+def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data=None):
     if extra_data is None:
         extra_data = {}
     is_v3 = issubclass(class_def, _ComfyNodeInternal)
@@ -174,10 +173,10 @@ def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=
         if is_link(input_data) and (not input_info or not input_info.get("rawLink", False)):
             input_unique_id = input_data[0]
             output_index = input_data[1]
-            if execution_list is None:
+            if outputs is None:
                 mark_missing()
                 continue  # This might be a lazily-evaluated input
-            cached_output = execution_list.get_output_cache(input_unique_id, unique_id)
+            cached_output = outputs.get(input_unique_id)
             if cached_output is None:
                 mark_missing()
                 continue
@@ -485,7 +484,6 @@ async def _execute(server, dynprompt, caches: CacheSet, current_item: str, extra
             cached_output = caches.ui.get(unique_id) or {}
             server.send_sync("executed", {"node": unique_id, "display_node": display_node_id, "output": cached_output.get("output", None), "prompt_id": prompt_id}, server.client_id)
         get_progress_state().finish_progress(unique_id)
-        execution_list.cache_update(unique_id, caches.outputs.get(unique_id))
         return RecursiveExecutionTuple(ExecutionResult.SUCCESS, None, None)
 
     input_data_all = None
@@ -515,7 +513,7 @@ async def _execute(server, dynprompt, caches: CacheSet, current_item: str, extra
                     for r in result:
                         if is_link(r):
                             source_node, source_output = r[0], r[1]
-                            node_output = execution_list.get_output_cache(source_node, unique_id)[source_output]
+                            node_output = caches.outputs.get(source_node)[source_output]
                             for o in node_output:
                                 resolved_output.append(o)
 
@@ -527,7 +525,7 @@ async def _execute(server, dynprompt, caches: CacheSet, current_item: str, extra
             has_subgraph = False
         else:
             get_progress_state().start_progress(unique_id)
-            input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data)
+            input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data)
             if server.client_id is not None:
                 server.last_node_id = display_node_id
                 server.send_sync("executing", {"node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id}, server.client_id)
@@ -635,14 +633,11 @@ async def _execute(server, dynprompt, caches: CacheSet, current_item: str, extra
                 subcache.clean_unused()
             for node_id in new_output_ids:
                 execution_list.add_node(node_id)
-                execution_list.cache_link(node_id, unique_id)
             for link in new_output_links:
                 execution_list.add_strong_link(link[0], link[1], unique_id)
             pending_subgraph_results[unique_id] = cached_outputs
             return RecursiveExecutionTuple(ExecutionResult.PENDING, None, None)
         caches.outputs.set(unique_id, output_data)
-        execution_list.cache_update(unique_id, output_data)
-
     except interruption.InterruptProcessingException as iex:
         logger.info("Processing interrupted")
 
diff --git a/comfy/model_management.py b/comfy/model_management.py
index ea6723195..6f464e8f9 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -383,6 +383,7 @@ SUPPORT_FP8_OPS = args.supports_fp8_compute
 try:
     if is_amd():
         torch.backends.cudnn.enabled = False  # Seems to improve things a lot on AMD
+        logging.info("Set: torch.backends.cudnn.enabled = False for better AMD performance.")
         try:
             rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))
         except:
diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py
index ca140ca3f..85481451e 100644
--- a/comfy_execution/caching.py
+++ b/comfy_execution/caching.py
@@ -271,28 +271,6 @@ class HierarchicalCache(BasicCache):
         assert cache is not None
         return await cache._ensure_subcache(node_id, children_ids)
 
-
-class NullCache:
-
-    async def set_prompt(self, dynprompt, node_ids, is_changed_cache):
-        pass
-
-    def all_node_ids(self):
-        return []
-
-    def clean_unused(self):
-        pass
-
-    def get(self, node_id):
-        return None
-
-    def set(self, node_id, value):
-        pass
-
-    async def ensure_subcache_for(self, node_id, children_ids):
-        return self
-
-
 class LRUCache(BasicCache):
     def __init__(self, key_class, max_size=100):
         super().__init__(key_class)
@@ -344,3 +322,157 @@ class LRUCache(BasicCache):
             self._mark_used(child_id)
             self.children[cache_key].append(self.cache_key_set.get_data_key(child_id))
         return self
+
+
+class DependencyAwareCache(BasicCache):
+    """
+    A cache implementation that tracks dependencies between nodes and manages
+    their execution and caching accordingly. It extends the BasicCache class.
+    Nodes are removed from this cache once all of their descendants have been
+    executed.
+    """
+
+    def __init__(self, key_class):
+        """
+        Initialize the DependencyAwareCache.
+
+        Args:
+            key_class: The class used for generating cache keys.
+        """
+        super().__init__(key_class)
+        self.descendants = {}  # Maps node_id -> set of descendant node_ids
+        self.ancestors = {}    # Maps node_id -> set of ancestor node_ids
+        self.executed_nodes = set()  # Tracks nodes that have been executed
+
+    async def set_prompt(self, dynprompt, node_ids, is_changed_cache):
+        """
+        Clear the entire cache and rebuild the dependency graph.
+
+        Args:
+            dynprompt: The dynamic prompt object containing node information.
+            node_ids: List of node IDs to initialize the cache for.
+            is_changed_cache: Flag indicating if the cache has changed.
+        """
+        # Clear all existing cache data
+        self.cache.clear()
+        self.subcaches.clear()
+        self.descendants.clear()
+        self.ancestors.clear()
+        self.executed_nodes.clear()
+
+        # Call the parent method to initialize the cache with the new prompt
+        await super().set_prompt(dynprompt, node_ids, is_changed_cache)
+
+        # Rebuild the dependency graph
+        self._build_dependency_graph(dynprompt, node_ids)
+
+    def _build_dependency_graph(self, dynprompt, node_ids):
+        """
+        Build the dependency graph for all nodes.
+
+        Args:
+            dynprompt: The dynamic prompt object containing node information.
+            node_ids: List of node IDs to build the graph for.
+        """
+        self.descendants.clear()
+        self.ancestors.clear()
+        for node_id in node_ids:
+            self.descendants[node_id] = set()
+            self.ancestors[node_id] = set()
+
+        for node_id in node_ids:
+            inputs = dynprompt.get_node(node_id)["inputs"]
+            for input_data in inputs.values():
+                if is_link(input_data):  # Check if the input is a link to another node
+                    ancestor_id = input_data[0]
+                    self.descendants[ancestor_id].add(node_id)
+                    self.ancestors[node_id].add(ancestor_id)
+
+    def set(self, node_id, value):
+        """
+        Mark a node as executed and store its value in the cache.
+
+        Args:
+            node_id: The ID of the node to store.
+            value: The value to store for the node.
+        """
+        self._set_immediate(node_id, value)
+        self.executed_nodes.add(node_id)
+        self._cleanup_ancestors(node_id)
+
+    def get(self, node_id):
+        """
+        Retrieve the cached value for a node.
+
+        Args:
+            node_id: The ID of the node to retrieve.
+
+        Returns:
+            The cached value for the node.
+        """
+        return self._get_immediate(node_id)
+
+    async def ensure_subcache_for(self, node_id, children_ids):
+        """
+        Ensure a subcache exists for a node and update dependencies.
+
+        Args:
+            node_id: The ID of the parent node.
+            children_ids: List of child node IDs to associate with the parent node.
+
+        Returns:
+            The subcache object for the node.
+        """
+        subcache = await super()._ensure_subcache(node_id, children_ids)
+        for child_id in children_ids:
+            self.descendants[node_id].add(child_id)
+            self.ancestors[child_id].add(node_id)
+        return subcache
+
+    def _cleanup_ancestors(self, node_id):
+        """
+        Check if ancestors of a node can be removed from the cache.
+
+        Args:
+            node_id: The ID of the node whose ancestors are to be checked.
+        """
+        for ancestor_id in self.ancestors.get(node_id, []):
+            if ancestor_id in self.executed_nodes:
+                # Remove ancestor if all its descendants have been executed
+                if all(descendant in self.executed_nodes for descendant in self.descendants[ancestor_id]):
+                    self._remove_node(ancestor_id)
+
+    def _remove_node(self, node_id):
+        """
+        Remove a node from the cache.
+
+        Args:
+            node_id: The ID of the node to remove.
+        """
+        cache_key = self.cache_key_set.get_data_key(node_id)
+        if cache_key in self.cache:
+            del self.cache[cache_key]
+        subcache_key = self.cache_key_set.get_subcache_key(node_id)
+        if subcache_key in self.subcaches:
+            del self.subcaches[subcache_key]
+
+    def clean_unused(self):
+        """
+        Clean up unused nodes. This is a no-op for this cache implementation.
+        """
+        pass
+
+    def recursive_debug_dump(self):
+        """
+        Dump the cache and dependency graph for debugging.
+
+        Returns:
+            A list containing the cache state and dependency graph.
+        """
+        result = super().recursive_debug_dump()
+        result.append({
+            "descendants": self.descendants,
+            "ancestors": self.ancestors,
+            "executed_nodes": list(self.executed_nodes),
+        })
+        return result
diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py
index 9611fb051..b3dfcf49b 100644
--- a/comfy_execution/graph.py
+++ b/comfy_execution/graph.py
@@ -161,9 +161,8 @@ class TopologicalSort:
                         continue
                     _, _, input_info = self.get_input_info(unique_id, input_name)
                     is_lazy = input_info is not None and "lazy" in input_info and input_info["lazy"]
-                    if (include_lazy or not is_lazy):
-                        if not self.is_cached(from_node_id):
-                            node_ids.append(from_node_id)
+                    if (include_lazy or not is_lazy) and not self.is_cached(from_node_id):
+                        node_ids.append(from_node_id)
                         links.append((from_node_id, from_socket, unique_id))
 
         for link in links:
@@ -207,34 +206,10 @@ class ExecutionList(TopologicalSort):
         super().__init__(dynprompt)
         self.output_cache = output_cache
         self.staged_node_id = None
-        self.execution_cache = {}
-        self.execution_cache_listeners = {}
 
     def is_cached(self, node_id):
         return self.output_cache.get(node_id) is not None
 
-    def cache_link(self, from_node_id, to_node_id):
-        if not to_node_id in self.execution_cache:
-            self.execution_cache[to_node_id] = {}
-        self.execution_cache[to_node_id][from_node_id] = self.output_cache.get(from_node_id)
-        if not from_node_id in self.execution_cache_listeners:
-            self.execution_cache_listeners[from_node_id] = set()
-        self.execution_cache_listeners[from_node_id].add(to_node_id)
-
-    def get_output_cache(self, from_node_id, to_node_id):
-        if not to_node_id in self.execution_cache:
-            return None
-        return self.execution_cache[to_node_id].get(from_node_id)
-
-    def cache_update(self, node_id, value):
-        if node_id in self.execution_cache_listeners:
-            for to_node_id in self.execution_cache_listeners[node_id]:
-                self.execution_cache[to_node_id][node_id] = value
-
-    def add_strong_link(self, from_node_id, from_socket, to_node_id):
-        super().add_strong_link(from_node_id, from_socket, to_node_id)
-        self.cache_link(from_node_id, to_node_id)
-
     async def stage_node_execution(self) -> tuple[Optional[str], Optional[DependencyExecutionErrorMessage], Optional[DependencyCycleError]]:
         assert self.staged_node_id is None
         if self.is_empty():
@@ -314,8 +289,6 @@ class ExecutionList(TopologicalSort):
     def complete_node_execution(self):
         node_id = self.staged_node_id
         self.pop_node(node_id)
-        self.execution_cache.pop(node_id, None)
-        self.execution_cache_listeners.pop(node_id, None)
         self.staged_node_id = None
 
     def get_nodes_in_cycle(self):
diff --git a/pyproject.toml b/pyproject.toml
index 98adf712e..6501e5797 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "comfyui"
-version = "0.3.65"
+version = "0.3.66"
 description = "An installable version of ComfyUI"
 readme = "README.md"
 authors = [
diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py
index d1edfe9a5..25d5e86ae 100644
--- a/tests/execution/test_execution.py
+++ b/tests/execution/test_execution.py
@@ -174,10 +174,7 @@ class TestExecution:
         await client.run(g)
         result2 = await client.run(g)
         for node_id, node in g.nodes.items():
-            # if server["should_cache_results"]:
             assert not result2.did_run(node), f"Node {node_id} ran, but should have been cached"
-            # else:
-            #     assert result2.did_run(node), f"Node {node_id} was cached, but should have been run"
 
     async def test_partial_cache(self, client: ComfyClient, builder: GraphBuilder):
         g = builder
@@ -190,13 +187,9 @@ class TestExecution:
 
         await client.run(g)
         mask.inputs['value'] = 0.4
-        result2 = await client.run(g)
-        # if server["should_cache_results"]:
+        result2 = client.run(g)
         assert not result2.did_run(input1), "Input1 should have been cached"
         assert not result2.did_run(input2), "Input2 should have been cached"
-        # else:
-        #     assert result2.did_run(input1), "Input1 should have been rerun"
-        #     assert result2.did_run(input2), "Input2 should have been rerun"
 
     async def test_error(self, client: ComfyClient, builder: GraphBuilder):
         g = builder
@@ -336,10 +329,7 @@ class TestExecution:
         result3 = await client.run(g)
         result4 = await client.run(g)
         assert result1.did_run(is_changed), "is_changed should have been run"
-        # if server["should_cache_results"]:
         assert not result2.did_run(is_changed), "is_changed should have been cached"
-        # else:
-        #     assert result2.did_run(is_changed), "is_changed should have been re-run"
         assert result3.did_run(is_changed), "is_changed should have been re-run"
         assert result4.did_run(is_changed), "is_changed should not have been cached"
 
@@ -461,10 +451,7 @@ class TestExecution:
         images = result.get_images(output)
         assert len(images) == 1, "Should have 1 image"
         assert numpy.array(images[0]).min() == 63 and numpy.array(images[0]).max() == 63, "Image should have value 0.25"
-        # if server["should_cache_results"]:
         assert not result.did_run(test_node), "The execution should have been cached"
-        # else:
-        #     assert result.did_run(test_node), "The execution should have been re-run"
 
     async def test_parallel_sleep_nodes(self, client: ComfyClient, builder: GraphBuilder, skip_timing_checks):
         # Warmup execution to ensure server is fully initialized