execution: fold in dependency aware caching

This makes --cache-none compatiable with lazy and expanded
subgraphs.

Currently the --cache-none option is powered by the
DependencyAwareCache. The cache attempts to maintain a parallel
copy of the execution list data structure, however it is only
setup once at the start of execution and does not get meaninigful
updates to the execution list.

This causes multiple problems when --cache-none is used with lazy
and expanded subgraphs as the DAC does not accurately update its
copy of the execution data structure.

DAC has an attempt to handle subgraphs ensure_subcache however
this does not accurately connect to nodes outside the subgraph.
The current semantics of DAC are to free a node ASAP after the
dependent nodes are executed.

This means that if a subgraph refs such a node it will be requed
and re-executed by the execution_list but DAC wont see it in
its to-free lists anymore and leak memory.

Rather than try and cover all the cases where the execution list
changes from inside the cache, move the while problem to the
executor which maintains an always up-to-date copy of the wanted
data-structure.

The executor now has a fast-moving run-local cache of its own.
Each _to node has its own mini cache, and the cache is unconditionally
primed at the time of add_strong_link.

add_strong_link is called for all of static workflows, lazy links
and expanded subgraphs so its the singular source of truth for
output dependendencies.

In the case of a cache-hit, the executor cache will hold the non-none
value (it will respect updates if they happen somehow as well).

In the case of a cache-miss, the executor caches a None and will
wait for a notification to update the value when the node completes.

When a node completes execution, it simply releases its mini-cache
and in turn its strong refs on its direct anscestor outputs, allowing
for ASAP freeing (same as the DependencyAwareCache but a little more
automatic).

This now allows for re-implementation of --cache-none with no cache
at all. The dependency aware cache was also observing the dependency
sematics for the objects and UI cache which is not accurate (this
entire logic was always outputs specific).

This also prepares for more complex caching strategies (such as RAM
pressure based caching), where a cache can implement any freeing
strategy completely independently of the DepedancyAwareness
requirement.
This commit is contained in:
Rattus 2025-10-16 21:30:18 +10:00
parent b7992f871a
commit 943dda5f94
2 changed files with 40 additions and 7 deletions

View File

@ -153,8 +153,9 @@ class TopologicalSort:
continue
_, _, input_info = self.get_input_info(unique_id, input_name)
is_lazy = input_info is not None and "lazy" in input_info and input_info["lazy"]
if (include_lazy or not is_lazy) and not self.is_cached(from_node_id):
node_ids.append(from_node_id)
if (include_lazy or not is_lazy):
if not self.is_cached(from_node_id):
node_ids.append(from_node_id)
links.append((from_node_id, from_socket, unique_id))
for link in links:
@ -194,10 +195,35 @@ class ExecutionList(TopologicalSort):
super().__init__(dynprompt)
self.output_cache = output_cache
self.staged_node_id = None
self.execution_cache = {}
self.execution_cache_listeners = {}
def is_cached(self, node_id):
return self.output_cache.get(node_id) is not None
def cache_link(self, from_node_id, to_node_id):
if not to_node_id in self.execution_cache:
self.execution_cache[to_node_id] = {}
self.execution_cache[to_node_id][from_node_id] = self.output_cache.get(from_node_id)
if not from_node_id in self.execution_cache_listeners:
self.execution_cache_listeners[from_node_id] = set()
self.execution_cache_listeners[from_node_id].add(to_node_id)
def get_output_cache(self, from_node_id, to_node_id):
if not to_node_id in self.execution_cache:
return None
return self.execution_cache[to_node_id].get(from_node_id)
def cache_update(self, node_id, value):
if node_id in self.execution_cache_listeners:
for to_node_id in self.execution_cache_listeners[node_id]:
if to_node_id in self.execution_cache:
self.execution_cache[to_node_id][node_id] = value
def add_strong_link(self, from_node_id, from_socket, to_node_id):
super().add_strong_link(from_node_id, from_socket, to_node_id)
self.cache_link(from_node_id, to_node_id)
async def stage_node_execution(self):
assert self.staged_node_id is None
if self.is_empty():
@ -277,6 +303,8 @@ class ExecutionList(TopologicalSort):
def complete_node_execution(self):
node_id = self.staged_node_id
self.pop_node(node_id)
self.execution_cache.pop(node_id, None)
self.execution_cache_listeners.pop(node_id, None)
self.staged_node_id = None
def get_nodes_in_cycle(self):

View File

@ -135,7 +135,7 @@ class CacheSet:
SENSITIVE_EXTRA_DATA_KEYS = ("auth_token_comfy_org", "api_key_comfy_org")
def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, extra_data={}):
def get_input_data(inputs, class_def, unique_id, execution_list=None, dynprompt=None, extra_data={}):
is_v3 = issubclass(class_def, _ComfyNodeInternal)
if is_v3:
valid_inputs, schema = class_def.INPUT_TYPES(include_hidden=False, return_schema=True)
@ -153,10 +153,10 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e
if is_link(input_data) and (not input_info or not input_info.get("rawLink", False)):
input_unique_id = input_data[0]
output_index = input_data[1]
if outputs is None:
if execution_list is None:
mark_missing()
continue # This might be a lazily-evaluated input
cached_output = outputs.get(input_unique_id)
cached_output = execution_list.get_output_cache(input_unique_id, unique_id)
if cached_output is None:
mark_missing()
continue
@ -405,6 +405,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
cached_output = caches.ui.get(unique_id) or {}
server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": cached_output.get("output",None), "prompt_id": prompt_id }, server.client_id)
get_progress_state().finish_progress(unique_id)
execution_list.cache_update(unique_id, caches.outputs.get(unique_id))
return (ExecutionResult.SUCCESS, None, None)
input_data_all = None
@ -434,7 +435,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
for r in result:
if is_link(r):
source_node, source_output = r[0], r[1]
node_output = caches.outputs.get(source_node)[source_output]
node_output = execution_list.get_output_cache(source_node, unique_id)[source_output]
for o in node_output:
resolved_output.append(o)
@ -446,7 +447,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
has_subgraph = False
else:
get_progress_state().start_progress(unique_id)
input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, caches.outputs, dynprompt, extra_data)
input_data_all, missing_keys, hidden_inputs = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data)
if server.client_id is not None:
server.last_node_id = display_node_id
server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id)
@ -549,11 +550,15 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
subcache.clean_unused()
for node_id in new_output_ids:
execution_list.add_node(node_id)
execution_list.cache_link(node_id, unique_id)
for link in new_output_links:
execution_list.add_strong_link(link[0], link[1], unique_id)
pending_subgraph_results[unique_id] = cached_outputs
return (ExecutionResult.PENDING, None, None)
caches.outputs.set(unique_id, output_data)
execution_list.cache_update(unique_id, output_data)
except comfy.model_management.InterruptProcessingException as iex:
logging.info("Processing interrupted")