Users can now configure their workers to panic (exit the process) when a workflow raises an out-of-memory exception, which can occur due to complex failures in custom nodes

doctorpangloss 2025-02-18 10:57:23 -08:00
parent d04288ce8d
commit 684d180446
7 changed files with 269 additions and 2 deletions


@@ -190,6 +190,17 @@ def _create_parser() -> EnhancedConfigArgParser:
        """,
    )
    parser.add_argument(
        '--panic-when',
        action='append',
        help="""
        List of fully qualified exception class names to panic (sys.exit(1)) when a workflow raises it.
        Example: --panic-when=torch.cuda.OutOfMemoryError. Can be specified multiple times or as a
        comma-separated list.""",
        type=str,
        default=[]
    )

def is_valid_directory(path: Optional[str]) -> Optional[str]:
    """Validate if the given path is a directory."""
    if path is None:
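
The append action plus comma-splitting means repeated flags and a single comma-separated value produce the same configuration. A minimal sketch, with plain argparse standing in for the repo's EnhancedConfigArgParser; the expansion step mirrors what should_panic_on_exception (added below) does:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--panic-when', action='append', type=str, default=[])

# Repeated flags accumulate; a comma-separated value arrives as one string.
ns = parser.parse_args(['--panic-when=torch.cuda.OutOfMemoryError',
                        '--panic-when=builtins.MemoryError,builtins.RuntimeError'])

# should_panic_on_exception later splits each entry on commas:
expanded = [name.strip() for spec in ns.panic_when for name in spec.split(',')]
assert expanded == ['torch.cuda.OutOfMemoryError',
                    'builtins.MemoryError', 'builtins.RuntimeError']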


@@ -122,6 +122,7 @@ class Configuration(dict):
        anthropic_api_key (str): Configures the Anthropic API key for its nodes related to Claude functionality. Visit https://console.anthropic.com/settings/keys to create this key.
        user_directory (Optional[str]): Set the ComfyUI user directory with an absolute path.
        log_stdout (bool): Send normal process output to stdout instead of stderr (default)
        panic_when (list[str]): List of fully qualified exception class names to panic (sys.exit(1)) when a workflow raises it.
    """

    def __init__(self, **kwargs):
@@ -220,6 +221,7 @@ class Configuration(dict):
        self.ideogram_api_key: Optional[str] = None
        self.anthropic_api_key: Optional[str] = None
        self.user_directory: Optional[str] = None
        self.panic_when: list[str] = []

    def __getattr__(self, item):
        if item not in self:
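
Embedded workers can set the same option programmatically instead of via the CLI flag, as the tests below do; a minimal sketch:

from comfy.cli_args_types import Configuration

config = Configuration()
# Same semantics as --panic-when: fully qualified exception class names,
# given as separate list entries or comma-separated within one entry.
config.panic_when = ["torch.cuda.OutOfMemoryError"]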


@@ -41,12 +41,14 @@ def _execute_prompt(
    span_context: Context = propagate.extract(span_context)
    token = attach(span_context)
    try:
-       return __execute_prompt(prompt, prompt_id, client_id, span_context, progress_handler, configuration)
+       # there is never an event loop running on a thread or process pool thread here
+       # this also guarantees nodes will be able to successfully call await
+       return asyncio.run(__execute_prompt(prompt, prompt_id, client_id, span_context, progress_handler, configuration))
    finally:
        detach(token)

-def __execute_prompt(
+async def __execute_prompt(
        prompt: dict,
        prompt_id: str,
        client_id: str,
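
The comment above describes a thread-per-loop pattern: no loop runs on a pool thread, so asyncio.run can create a private event loop there and node code can await freely. A self-contained illustration of the pattern, not the project's actual executor:

import asyncio
from concurrent.futures import ThreadPoolExecutor

async def run_workflow(name: str) -> str:
    await asyncio.sleep(0)  # nodes may await; this thread owns its own loop
    return f"done: {name}"

def worker(name: str) -> str:
    # asyncio.run is safe here: it creates, runs, and tears down a fresh
    # event loop on this pool thread for each call
    return asyncio.run(run_workflow(name))

with ThreadPoolExecutor(max_workers=2) as pool:
    print(list(pool.map(worker, ["a", "b"])))  # ['done: a', 'done: b']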


@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
import copy
import heapq
import inspect
@@ -20,6 +21,7 @@ from opentelemetry.trace import get_current_span, StatusCode, Status
from .main_pre import tracer
from .. import interruption
from .. import model_management
from ..cli_args import args
from ..component_model.abstract_prompt_queue import AbstractPromptQueue
from ..component_model.executor_types import ExecutorToClientProgress, ValidationTuple, ValidateInputsTuple, \
    ValidationErrorDict, NodeErrorsDictValue, ValidationErrorExtraInfoDict, FormattedValue, RecursiveExecutionTuple, \
@@ -28,6 +30,7 @@ from ..component_model.executor_types import ExecutorToClientProgress, Validatio
from ..component_model.files import canonicalize_path
from ..component_model.queue_types import QueueTuple, HistoryEntry, QueueItem, MAXIMUM_HISTORY_SIZE, ExecutionStatus
from ..execution_context import context_execute_node, context_execute_prompt
from ..execution_ext import should_panic_on_exception
from ..nodes.package import import_all_nodes_in_workspace
from ..nodes.package_typing import ExportedNodes, InputTypeSpec, FloatSpecOptions, IntSpecOptions, CustomNode
@@ -110,6 +113,7 @@ def get_input_data(inputs, class_def, unique_id, outputs=None, dynprompt=None, e
    for x in inputs:
        input_data = inputs[x]
        input_type, input_category, input_info = get_input_info(class_def, x, valid_inputs)

        def mark_missing():
            missing_keys[x] = True
            input_data_all[x] = (None,)
@@ -480,6 +484,14 @@ def _execute(server, dynprompt, caches: CacheSet, current_item: str, extra_data,
            logging.error("Got an OOM, unloading all loaded models.")
            model_management.unload_all_models()

        if should_panic_on_exception(ex, args.panic_when):
            logging.error(f"The exception {ex} was configured as unrecoverable, scheduling an exit")

            def sys_exit(*args):
                sys.exit(1)

            asyncio.get_event_loop().call_soon_threadsafe(sys_exit, ())

        return RecursiveExecutionTuple(ExecutionResult.FAILURE, error_details, ex)

    executed.add(unique_id)
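
Because _execute runs on a worker thread, the exit is handed to the main thread's event loop with call_soon_threadsafe, the loop method documented as safe to call from other threads. A minimal illustration of that handoff, with print standing in for the real sys.exit(1):

import asyncio
import threading

def risky_worker(loop: asyncio.AbstractEventLoop):
    try:
        raise MemoryError("unrecoverable")
    except MemoryError:
        # raising SystemExit here would only end this worker thread;
        # scheduling the exit on the loop's thread takes down the process
        loop.call_soon_threadsafe(print, "would sys.exit(1) here")

async def main():
    loop = asyncio.get_running_loop()
    t = threading.Thread(target=risky_worker, args=(loop,))
    t.start()
    t.join()
    await asyncio.sleep(0)  # let the scheduled callback run

asyncio.run(main())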


@@ -1,5 +1,7 @@
import asyncio
import concurrent
import contextvars
import threading
import typing
from concurrent.futures import Future, ThreadPoolExecutor
from functools import partial

comfy/execution_ext.py (new file, 56 lines)

@@ -0,0 +1,56 @@
import importlib


def import_exception_class(fqn: str):
    """
    Imports an exception class from its fully qualified name.

    Example: 'torch.cuda.OutOfMemoryError' -> torch.cuda.OutOfMemoryError

    Args:
        fqn: Fully qualified name of the exception class

    Returns:
        The exception class

    Raises:
        ValueError: If the class cannot be imported or is not a subclass of Exception
    """
    try:
        module_path, class_name = fqn.rsplit('.', 1)
        module = importlib.import_module(module_path)
        exc_class = getattr(module, class_name)
        if not isinstance(exc_class, type) or not issubclass(exc_class, Exception):
            raise ValueError(f"{fqn} is not an exception class")
        return exc_class
    except (ImportError, AttributeError) as e:
        raise ValueError(f"Could not import exception class {fqn}: {str(e)}")


def should_panic_on_exception(exc: Exception, panic_classes: list[str]) -> bool:
    """
    Checks if the given exception matches any of the specified panic classes.

    Args:
        exc: The exception to check
        panic_classes: List of fully qualified exception class names

    Returns:
        True if the exception is an instance of one of the specified classes
    """
    # Handle comma-separated lists (from config files or env vars)
    expanded_classes = []
    for class_spec in panic_classes:
        expanded_classes.extend(name.strip() for name in class_spec.split(','))

    # Import all exception classes
    try:
        exception_types = [import_exception_class(name)
                           for name in expanded_classes if name]
    except ValueError as e:
        print(f"Warning: {str(e)}")
        return False

    # Check if exception matches any of the specified types
    return any(isinstance(exc, exc_type) for exc_type in exception_types)
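
A quick sketch of the two helpers in use; it relies only on builtins, so it runs without torch installed:

from comfy.execution_ext import import_exception_class, should_panic_on_exception

assert import_exception_class('builtins.MemoryError') is MemoryError

# List entries and comma-separated entries are equivalent:
assert should_panic_on_exception(MemoryError(), ['builtins.MemoryError', 'builtins.RuntimeError'])
assert should_panic_on_exception(MemoryError(), ['builtins.MemoryError,builtins.RuntimeError'])

# A class that cannot be imported logs a warning and disables the panic check:
assert not should_panic_on_exception(MemoryError(), ['no.such.Error'])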

tests/unit/test_panics.py (new file, 182 lines)

@@ -0,0 +1,182 @@
import asyncio
import threading
from unittest.mock import patch

import pytest
import torch

from comfy.cli_args_types import Configuration
from comfy.client.embedded_comfy_client import EmbeddedComfyClient
from comfy.cmd.execution import nodes
from comfy.component_model.make_mutable import make_mutable
from comfy.component_model.tensor_types import RGBImageBatch
from comfy.nodes.package_typing import CustomNode, ExportedNodes


@pytest.mark.asyncio
async def test_event_loop_callbacks():
    """Test to understand event loop callback behavior in pytest-asyncio"""
    callback_executed = False
    current_thread = threading.current_thread()
    current_loop = asyncio.get_running_loop()

    def callback(*args):
        nonlocal callback_executed
        print(f"Callback executing in thread: {threading.current_thread()}")
        print(f"Original thread was: {current_thread}")
        callback_executed = True

    print(f"Test running in thread: {current_thread}")
    print(f"Test using event loop: {current_loop}")

    # Try different ways of scheduling the callback
    current_loop.call_soon(callback)
    await asyncio.sleep(0)
    print(f"After sleep(0), callback_executed: {callback_executed}")

    if not callback_executed:
        current_loop.call_soon_threadsafe(callback)
        await asyncio.sleep(0)
        print(f"After threadsafe callback, callback_executed: {callback_executed}")

    if not callback_executed:
        # Try running callback in event loop directly
        await asyncio.get_event_loop().run_in_executor(None, callback)
        print(f"After run_in_executor, callback_executed: {callback_executed}")

    assert callback_executed, "Callback was never executed"


@pytest.mark.asyncio
async def test_separate_thread_callback():
    """Test callbacks scheduled from a separate thread"""
    callback_executed = False
    event = threading.Event()
    main_loop = asyncio.get_running_loop()

    def thread_func():
        print(f"Thread function running in: {threading.current_thread()}")
        main_loop.call_soon_threadsafe(lambda *_: event.set())

    print(f"Test running in thread: {threading.current_thread()}")
    print(f"Test using event loop: {main_loop}")

    # Start thread that will schedule callback
    thread = threading.Thread(target=thread_func)
    thread.start()

    # Wait for event with timeout
    try:
        await asyncio.wait_for(
            asyncio.get_event_loop().run_in_executor(None, event.wait),
            timeout=1.0
        )
        print("Event was set!")
    except asyncio.TimeoutError:
        print("Timed out waiting for event!")
        assert False, "Event was never set"

    thread.join()


# Custom test exception that we'll configure to panic on
class TestUnrecoverableError(Exception):
    pass


class TestExceptionNode(CustomNode):
    """Node that raises a specific exception for testing"""

    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "should_raise": ("BOOL", {"default": True}),
            },
        }

    RETURN_TYPES = ("IMAGE",)  # Make it an output node by returning IMAGE
    FUNCTION = "raise_exception"
    CATEGORY = "Testing/Nodes"
    OUTPUT_NODE = True

    def raise_exception(self, should_raise=True) -> tuple[RGBImageBatch]:
        if should_raise:
            raise TestUnrecoverableError("Test exception from node")
        else:
            # Return a dummy image if not raising
            return (torch.zeros([1, 64, 64, 3]),)


# Export the node mappings
TEST_NODE_CLASS_MAPPINGS = {
    "TestExceptionNode": TestExceptionNode,
}

TEST_NODE_DISPLAY_NAME_MAPPINGS = {
    "TestExceptionNode": "Test Exception Node",
}


def create_failing_workflow():
    """Create a workflow that uses our test node to raise an exception"""
    return make_mutable({
        "1": {
            "class_type": "TestExceptionNode",
            "inputs": {
                "should_raise": True
            }
        }
    })


@pytest.mark.asyncio
async def test_panic_on_exception():
    # Set up the test nodes
    nodes.update(ExportedNodes(NODE_CLASS_MAPPINGS=TEST_NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS=TEST_NODE_DISPLAY_NAME_MAPPINGS))

    # Create configuration with our test exception in panic_when
    config = Configuration()
    config.panic_when = [f"{__name__}.TestUnrecoverableError"]

    # Mock sys.exit to prevent actual exit and verify it's called
    with patch('sys.exit') as mock_exit:
        try:
            async with EmbeddedComfyClient(configuration=config) as client:
                # Queue our failing workflow
                await client.queue_prompt(create_failing_workflow())
        except TestUnrecoverableError:
            # We expect the exception to be raised here
            pass

        # Give the event loop a chance to process the exit callback
        await asyncio.sleep(0)

        # Verify sys.exit was called with code 1
        mock_exit.assert_called_once_with(1)


@pytest.mark.asyncio
async def test_no_panic_when_disabled():
    """Verify that the same exception doesn't trigger exit when not in panic_when"""
    # Set up the test nodes
    nodes.update(ExportedNodes(NODE_CLASS_MAPPINGS=TEST_NODE_CLASS_MAPPINGS, NODE_DISPLAY_NAME_MAPPINGS=TEST_NODE_DISPLAY_NAME_MAPPINGS))

    # Create configuration without the exception in panic_when
    config = Configuration()

    # Mock sys.exit to verify it's not called
    with patch('sys.exit') as mock_exit:
        try:
            async with EmbeddedComfyClient(configuration=config) as client:
                # Queue our failing workflow
                await client.queue_prompt(create_failing_workflow())
        except TestUnrecoverableError:
            # We expect the exception to be raised here
            pass

        # Give the event loop a chance to process any callbacks
        await asyncio.sleep(0.1)

        # Verify sys.exit was not called
        mock_exit.assert_not_called()