mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-03-08 02:37:42 +08:00
refactor: enhance isolation handling in notify_execution_graph and process_latent_in methods
This commit is contained in:
parent
26edd5663d
commit
4ae6f77064
@ -188,22 +188,33 @@ async def notify_execution_graph(needed_class_types: Set[str]) -> None:
|
|||||||
scan_shm_forensics("ISO:stop_extension", refresh_model_context=True)
|
scan_shm_forensics("ISO:stop_extension", refresh_model_context=True)
|
||||||
|
|
||||||
scan_shm_forensics("ISO:notify_graph_start", refresh_model_context=True)
|
scan_shm_forensics("ISO:notify_graph_start", refresh_model_context=True)
|
||||||
|
isolated_class_types_in_graph = needed_class_types.intersection(
|
||||||
|
{spec.node_name for spec in _ISOLATED_NODE_SPECS}
|
||||||
|
)
|
||||||
|
graph_uses_isolation = bool(isolated_class_types_in_graph)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"%s ISO:notify_graph_start running=%d needed=%d",
|
"%s ISO:notify_graph_start running=%d needed=%d",
|
||||||
LOG_PREFIX,
|
LOG_PREFIX,
|
||||||
len(_RUNNING_EXTENSIONS),
|
len(_RUNNING_EXTENSIONS),
|
||||||
len(needed_class_types),
|
len(needed_class_types),
|
||||||
)
|
)
|
||||||
for ext_name, extension in list(_RUNNING_EXTENSIONS.items()):
|
if graph_uses_isolation:
|
||||||
ext_class_types = _get_class_types_for_extension(ext_name)
|
for ext_name, extension in list(_RUNNING_EXTENSIONS.items()):
|
||||||
|
ext_class_types = _get_class_types_for_extension(ext_name)
|
||||||
|
|
||||||
# If NONE of this extension's nodes are in the execution graph → evict
|
# If NONE of this extension's nodes are in the execution graph -> evict.
|
||||||
if not ext_class_types.intersection(needed_class_types):
|
if not ext_class_types.intersection(needed_class_types):
|
||||||
await _stop_extension(
|
await _stop_extension(
|
||||||
ext_name,
|
ext_name,
|
||||||
extension,
|
extension,
|
||||||
"isolated custom_node not in execution graph, evicting",
|
"isolated custom_node not in execution graph, evicting",
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
"%s ISO:notify_graph_skip_evict running=%d reason=no isolated nodes in graph",
|
||||||
|
LOG_PREFIX,
|
||||||
|
len(_RUNNING_EXTENSIONS),
|
||||||
|
)
|
||||||
|
|
||||||
# Isolated child processes add steady VRAM pressure; reclaim host-side models
|
# Isolated child processes add steady VRAM pressure; reclaim host-side models
|
||||||
# at workflow boundaries so subsequent host nodes (e.g. CLIP encode) keep headroom.
|
# at workflow boundaries so subsequent host nodes (e.g. CLIP encode) keep headroom.
|
||||||
@ -217,7 +228,7 @@ async def notify_execution_graph(needed_class_types: Set[str]) -> None:
|
|||||||
_WORKFLOW_BOUNDARY_MIN_FREE_VRAM_BYTES,
|
_WORKFLOW_BOUNDARY_MIN_FREE_VRAM_BYTES,
|
||||||
)
|
)
|
||||||
free_before = model_management.get_free_memory(device)
|
free_before = model_management.get_free_memory(device)
|
||||||
if free_before < required and _RUNNING_EXTENSIONS:
|
if free_before < required and _RUNNING_EXTENSIONS and graph_uses_isolation:
|
||||||
for ext_name, extension in list(_RUNNING_EXTENSIONS.items()):
|
for ext_name, extension in list(_RUNNING_EXTENSIONS.items()):
|
||||||
await _stop_extension(
|
await _stop_extension(
|
||||||
ext_name,
|
ext_name,
|
||||||
|
|||||||
@ -1082,15 +1082,32 @@ class ModelPatcherRegistry(BaseRegistry[Any]):
|
|||||||
async def process_latent_in(
|
async def process_latent_in(
|
||||||
self, instance_id: str, args: tuple, kwargs: dict
|
self, instance_id: str, args: tuple, kwargs: dict
|
||||||
) -> Any:
|
) -> Any:
|
||||||
return self._run_operation_with_lease(
|
import torch
|
||||||
instance_id,
|
|
||||||
"process_latent_in",
|
def _invoke() -> Any:
|
||||||
lambda: detach_if_grad(
|
instance = self._get_instance(instance_id)
|
||||||
self._get_instance(instance_id).model.process_latent_in(
|
result = detach_if_grad(instance.model.process_latent_in(*args, **kwargs))
|
||||||
*args, **kwargs
|
|
||||||
)
|
# DynamicVRAM + isolation: returning CUDA tensors across RPC can stall
|
||||||
),
|
# at the transport boundary. Marshal dynamic-path results as CPU and let
|
||||||
)
|
# the proxy restore placement when needed.
|
||||||
|
is_dynamic_fn = getattr(instance, "is_dynamic", None)
|
||||||
|
if callable(is_dynamic_fn) and is_dynamic_fn():
|
||||||
|
def _to_cpu(obj: Any) -> Any:
|
||||||
|
if torch.is_tensor(obj):
|
||||||
|
return obj.detach().cpu() if obj.device.type != "cpu" else obj
|
||||||
|
if isinstance(obj, dict):
|
||||||
|
return {k: _to_cpu(v) for k, v in obj.items()}
|
||||||
|
if isinstance(obj, list):
|
||||||
|
return [_to_cpu(v) for v in obj]
|
||||||
|
if isinstance(obj, tuple):
|
||||||
|
return tuple(_to_cpu(v) for v in obj)
|
||||||
|
return obj
|
||||||
|
|
||||||
|
return _to_cpu(result)
|
||||||
|
return result
|
||||||
|
|
||||||
|
return self._run_operation_with_lease(instance_id, "process_latent_in", _invoke)
|
||||||
|
|
||||||
async def process_latent_out(
|
async def process_latent_out(
|
||||||
self, instance_id: str, args: tuple, kwargs: dict
|
self, instance_id: str, args: tuple, kwargs: dict
|
||||||
|
|||||||
@ -831,7 +831,11 @@ def archive_model_dtypes(model):
|
|||||||
def cleanup_models():
|
def cleanup_models():
|
||||||
to_delete = []
|
to_delete = []
|
||||||
for i in range(len(current_loaded_models)):
|
for i in range(len(current_loaded_models)):
|
||||||
if current_loaded_models[i].real_model() is None:
|
real_model_ref = current_loaded_models[i].real_model
|
||||||
|
if real_model_ref is None:
|
||||||
|
to_delete = [i] + to_delete
|
||||||
|
continue
|
||||||
|
if callable(real_model_ref) and real_model_ref() is None:
|
||||||
to_delete = [i] + to_delete
|
to_delete = [i] + to_delete
|
||||||
|
|
||||||
for i in to_delete:
|
for i in to_delete:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user