From 3d870ff51fe73b58a340df4a87f423f3b65b3afa Mon Sep 17 00:00:00 2001 From: "Daxiong (Lin)" Date: Fri, 15 May 2026 01:25:18 +0800 Subject: [PATCH 1/9] chore: update workflow templates to v0.9.77 (#13895) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 36b248a4f..f499a10ae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ comfyui-frontend-package==1.43.18 -comfyui-workflow-templates==0.9.75 +comfyui-workflow-templates==0.9.77 comfyui-embedded-docs==0.5.0 torch torchsde From 3f9bdc70ee39f07894a2353abdb0926d9f31ce40 Mon Sep 17 00:00:00 2001 From: Robin Huang Date: Thu, 14 May 2026 10:32:40 -0700 Subject: [PATCH 2/9] Add careers link to README and startup log (#13897) --- README.md | 2 ++ server.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/README.md b/README.md index 0fd317d0a..64d494f20 100644 --- a/README.md +++ b/README.md @@ -429,6 +429,8 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w See also: [https://www.comfy.org/](https://www.comfy.org/) +> _psst — we're hiring!_ Help build ComfyUI: [comfy.org/careers](https://www.comfy.org/careers) + ## Frontend Development As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory. diff --git a/server.py b/server.py index 2f3b438bb..18fb4064e 100644 --- a/server.py +++ b/server.py @@ -1266,6 +1266,9 @@ class PromptServer(): if verbose: logging.info("To see the GUI go to: {}://{}:{}".format(scheme, address_print, port)) + if verbose: + logging.info("psst — we're hiring! https://www.comfy.org/careers") + if call_on_start is not None: call_on_start(scheme, self.address, self.port) From 7a063e83a7fd2c9d8770bb20e0a7547af7ec080b Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 May 2026 12:26:13 -0700 Subject: [PATCH 3/9] Remove annoying message. (#13899) --- server.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/server.py b/server.py index 18fb4064e..2f3b438bb 100644 --- a/server.py +++ b/server.py @@ -1266,9 +1266,6 @@ class PromptServer(): if verbose: logging.info("To see the GUI go to: {}://{}:{}".format(scheme, address_print, port)) - if verbose: - logging.info("psst — we're hiring! https://www.comfy.org/careers") - if call_on_start is not None: call_on_start(scheme, self.address, self.port) From 4f6018982dcd27258da3e1c50b9b50d464fcaed2 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Thu, 14 May 2026 15:11:34 -0700 Subject: [PATCH 4/9] Include workflow_id in all execution WebSocket messages (CORE-198) (#13684) --- comfy_execution/jobs.py | 24 +- comfy_execution/progress.py | 13 +- execution.py | 33 +- main.py | 11 +- server.py | 6 +- .../test_workflow_id_in_ws_messages.py | 297 ++++++++++++++++++ tests/execution/test_jobs.py | 35 +++ 7 files changed, 398 insertions(+), 21 deletions(-) create mode 100644 tests-unit/execution_test/test_workflow_id_in_ws_messages.py diff --git a/comfy_execution/jobs.py b/comfy_execution/jobs.py index fcd7ef735..24dd1ffd0 100644 --- a/comfy_execution/jobs.py +++ b/comfy_execution/jobs.py @@ -93,6 +93,27 @@ def _create_text_preview(value: str) -> dict: } +def extract_workflow_id(extra_data: Optional[dict]) -> Optional[str]: + """Extract the workflow id from a prompt's ``extra_data``. + + The frontend stores the id at ``extra_data["extra_pnginfo"]["workflow"]["id"]`` + when a prompt is queued. Any value that is not a non-empty string is treated as + missing so callers can rely on the return being either ``None`` or a string. + """ + if not isinstance(extra_data, dict): + return None + extra_pnginfo = extra_data.get('extra_pnginfo') + if not isinstance(extra_pnginfo, dict): + return None + workflow = extra_pnginfo.get('workflow') + if not isinstance(workflow, dict): + return None + workflow_id = workflow.get('id') + if isinstance(workflow_id, str) and workflow_id: + return workflow_id + return None + + def _extract_job_metadata(extra_data: dict) -> tuple[Optional[int], Optional[str]]: """Extract create_time and workflow_id from extra_data. @@ -100,8 +121,7 @@ def _extract_job_metadata(extra_data: dict) -> tuple[Optional[int], Optional[str tuple: (create_time, workflow_id) """ create_time = extra_data.get('create_time') - extra_pnginfo = extra_data.get('extra_pnginfo', {}) - workflow_id = extra_pnginfo.get('workflow', {}).get('id') + workflow_id = extract_workflow_id(extra_data) return create_time, workflow_id diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index f951a3350..b6d3bd3e4 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -164,6 +164,8 @@ class WebUIProgressHandler(ProgressHandler): if self.server_instance is None: return + workflow_id = self.registry.workflow_id if self.registry else None + # Only send info for non-pending nodes active_nodes = { node_id: { @@ -172,6 +174,7 @@ class WebUIProgressHandler(ProgressHandler): "state": state["state"].value, "node_id": node_id, "prompt_id": prompt_id, + "workflow_id": workflow_id, "display_node_id": self.registry.dynprompt.get_display_node_id(node_id), "parent_node_id": self.registry.dynprompt.get_parent_node_id(node_id), "real_node_id": self.registry.dynprompt.get_real_node_id(node_id), @@ -183,7 +186,7 @@ class WebUIProgressHandler(ProgressHandler): # Send a combined progress_state message with all node states # Include client_id to ensure message is only sent to the initiating client self.server_instance.send_sync( - "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes}, self.server_instance.client_id + "progress_state", {"prompt_id": prompt_id, "workflow_id": workflow_id, "nodes": active_nodes}, self.server_instance.client_id ) @override @@ -215,6 +218,7 @@ class WebUIProgressHandler(ProgressHandler): metadata = { "node_id": node_id, "prompt_id": prompt_id, + "workflow_id": self.registry.workflow_id if self.registry else None, "display_node_id": self.registry.dynprompt.get_display_node_id( node_id ), @@ -240,9 +244,10 @@ class ProgressRegistry: Registry that maintains node progress state and notifies registered handlers. """ - def __init__(self, prompt_id: str, dynprompt: "DynamicPrompt"): + def __init__(self, prompt_id: str, dynprompt: "DynamicPrompt", workflow_id: Optional[str] = None): self.prompt_id = prompt_id self.dynprompt = dynprompt + self.workflow_id = workflow_id self.nodes: Dict[str, NodeProgressState] = {} self.handlers: Dict[str, ProgressHandler] = {} @@ -322,7 +327,7 @@ class ProgressRegistry: # Global registry instance global_progress_registry: ProgressRegistry | None = None -def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: +def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt", workflow_id: Optional[str] = None) -> None: global global_progress_registry # Reset existing handlers if registry exists @@ -330,7 +335,7 @@ def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: global_progress_registry.reset_handlers() # Create new registry - global_progress_registry = ProgressRegistry(prompt_id, dynprompt) + global_progress_registry = ProgressRegistry(prompt_id, dynprompt, workflow_id) def add_progress_handler(handler: ProgressHandler) -> None: diff --git a/execution.py b/execution.py index f37d0360d..ff8240588 100644 --- a/execution.py +++ b/execution.py @@ -38,6 +38,7 @@ from comfy_execution.graph import ( from comfy_execution.graph_utils import GraphBuilder, is_link from comfy_execution.validation import validate_node_input from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler +from comfy_execution.jobs import extract_workflow_id from comfy_execution.utils import CurrentNodeContext from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, make_locked_method_func from comfy_api.latest import io, _io @@ -417,15 +418,15 @@ def _is_intermediate_output(dynprompt, node_id): class_def = nodes.NODE_CLASS_MAPPINGS[class_type] return getattr(class_def, 'HAS_INTERMEDIATE_OUTPUT', False) -def _send_cached_ui(server, node_id, display_node_id, cached, prompt_id, ui_outputs): +def _send_cached_ui(server, node_id, display_node_id, cached, prompt_id, workflow_id, ui_outputs): if server.client_id is None: return cached_ui = cached.ui or {} - server.send_sync("executed", { "node": node_id, "display_node": display_node_id, "output": cached_ui.get("output", None), "prompt_id": prompt_id }, server.client_id) + server.send_sync("executed", { "node": node_id, "display_node": display_node_id, "output": cached_ui.get("output", None), "prompt_id": prompt_id, "workflow_id": workflow_id }, server.client_id) if cached.ui is not None: ui_outputs[node_id] = cached.ui -async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs): +async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, workflow_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs): unique_id = current_item real_node_id = dynprompt.get_real_node_id(unique_id) display_node_id = dynprompt.get_display_node_id(unique_id) @@ -435,7 +436,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, class_def = nodes.NODE_CLASS_MAPPINGS[class_type] cached = await caches.outputs.get(unique_id) if cached is not None: - _send_cached_ui(server, unique_id, display_node_id, cached, prompt_id, ui_outputs) + _send_cached_ui(server, unique_id, display_node_id, cached, prompt_id, workflow_id, ui_outputs) get_progress_state().finish_progress(unique_id) execution_list.cache_update(unique_id, cached) return (ExecutionResult.SUCCESS, None, None) @@ -483,7 +484,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, input_data_all, missing_keys, v3_data = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id - server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) + server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id, "workflow_id": workflow_id }, server.client_id) obj = await caches.objects.get(unique_id) if obj is None: @@ -513,6 +514,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, if block.message is not None: mes = { "prompt_id": prompt_id, + "workflow_id": workflow_id, "node_id": unique_id, "node_type": class_type, "executed": list(executed), @@ -561,7 +563,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, "output": output_ui } if server.client_id is not None: - server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": output_ui, "prompt_id": prompt_id }, server.client_id) + server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": output_ui, "prompt_id": prompt_id, "workflow_id": workflow_id }, server.client_id) if has_subgraph: cached_outputs = [] new_node_ids = [] @@ -658,6 +660,7 @@ class PromptExecutor: self.caches = CacheSet(cache_type=self.cache_type, cache_args=self.cache_args) self.status_messages = [] self.success = True + self.workflow_id = None def add_message(self, event, data: dict, broadcast: bool): data = { @@ -677,6 +680,7 @@ class PromptExecutor: if isinstance(ex, comfy.model_management.InterruptProcessingException): mes = { "prompt_id": prompt_id, + "workflow_id": self.workflow_id, "node_id": node_id, "node_type": class_type, "executed": list(executed), @@ -685,6 +689,7 @@ class PromptExecutor: else: mes = { "prompt_id": prompt_id, + "workflow_id": self.workflow_id, "node_id": node_id, "node_type": class_type, "executed": list(executed), @@ -723,7 +728,9 @@ class PromptExecutor: self.server.client_id = None self.status_messages = [] - self.add_message("execution_start", { "prompt_id": prompt_id}, broadcast=False) + self.workflow_id = extract_workflow_id(extra_data) + self.server.last_workflow_id = self.workflow_id + self.add_message("execution_start", { "prompt_id": prompt_id, "workflow_id": self.workflow_id }, broadcast=False) self._notify_prompt_lifecycle("start", prompt_id) ram_headroom = int(self.cache_args["ram"] * (1024 ** 3)) @@ -733,7 +740,7 @@ class PromptExecutor: try: with torch.inference_mode(): dynamic_prompt = DynamicPrompt(prompt) - reset_progress_state(prompt_id, dynamic_prompt) + reset_progress_state(prompt_id, dynamic_prompt, self.workflow_id) add_progress_handler(WebUIProgressHandler(self.server)) is_changed_cache = IsChangedCache(prompt_id, dynamic_prompt, self.caches.outputs) for cache in self.caches.all: @@ -751,7 +758,7 @@ class PromptExecutor: comfy.model_management.cleanup_models_gc() self.add_message("execution_cached", - { "nodes": cached_nodes, "prompt_id": prompt_id}, + { "nodes": cached_nodes, "prompt_id": prompt_id, "workflow_id": self.workflow_id }, broadcast=False) pending_subgraph_results = {} pending_async_nodes = {} # TODO - Unify this with pending_subgraph_results @@ -769,7 +776,7 @@ class PromptExecutor: break assert node_id is not None, "Node ID should not be None at this point" - result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_node_outputs) + result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, self.workflow_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_node_outputs) self.success = result != ExecutionResult.FAILURE if result == ExecutionResult.FAILURE: self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex) @@ -793,8 +800,8 @@ class PromptExecutor: cached = await self.caches.outputs.get(node_id) if cached is not None: display_node_id = dynamic_prompt.get_display_node_id(node_id) - _send_cached_ui(self.server, node_id, display_node_id, cached, prompt_id, ui_node_outputs) - self.add_message("execution_success", { "prompt_id": prompt_id }, broadcast=False) + _send_cached_ui(self.server, node_id, display_node_id, cached, prompt_id, self.workflow_id, ui_node_outputs) + self.add_message("execution_success", { "prompt_id": prompt_id, "workflow_id": self.workflow_id }, broadcast=False) ui_outputs = {} meta_outputs = {} @@ -811,6 +818,8 @@ class PromptExecutor: finally: comfy.memory_management.set_ram_cache_release_state(None, 0) self._notify_prompt_lifecycle("end", prompt_id) + self.server.last_workflow_id = None + self.workflow_id = None async def validate_inputs(prompt_id, prompt, item, validated, visiting=None): diff --git a/main.py b/main.py index a6fdaf43c..3ac8395b1 100644 --- a/main.py +++ b/main.py @@ -29,6 +29,7 @@ import logging import sys from comfy_execution.progress import get_progress_state from comfy_execution.utils import get_executing_context +from comfy_execution.jobs import extract_workflow_id from comfy_api import feature_flags from app.database.db import init_db, dependencies_available @@ -317,6 +318,12 @@ def prompt_worker(q, server_instance): for k in sensitive: extra_data[k] = sensitive[k] + # Capture the workflow id for this prompt before execution: the + # executor clears server.last_workflow_id in its finally block, so + # reading it after e.execute() returns would emit workflow_id=None + # on the terminal "executing" reset below. + workflow_id = extract_workflow_id(extra_data) + asset_seeder.pause() e.execute(item[2], prompt_id, extra_data, item[4]) @@ -330,7 +337,7 @@ def prompt_worker(q, server_instance): completed=e.success, messages=e.status_messages), process_item=remove_sensitive) if server_instance.client_id is not None: - server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id) + server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id, "workflow_id": workflow_id}, server_instance.client_id) current_time = time.perf_counter() execution_time = current_time - execution_start_time @@ -393,7 +400,7 @@ def hijack_progress(server_instance): prompt_id = server_instance.last_prompt_id if node_id is None: node_id = server_instance.last_node_id - progress = {"value": value, "max": total, "prompt_id": prompt_id, "node": node_id} + progress = {"value": value, "max": total, "prompt_id": prompt_id, "workflow_id": getattr(server_instance, 'last_workflow_id', None), "node": node_id} get_progress_state().update_progress(node_id, value, total, preview_image) server_instance.send_sync("progress", progress, server_instance.client_id) diff --git a/server.py b/server.py index 2f3b438bb..08eea1160 100644 --- a/server.py +++ b/server.py @@ -275,7 +275,11 @@ class PromptServer(): await self.send("status", {"status": self.get_queue_info(), "sid": sid}, sid) # On reconnect if we are the currently executing client send the current node if self.client_id == sid and self.last_node_id is not None: - await self.send("executing", { "node": self.last_node_id }, sid) + await self.send("executing", { + "node": self.last_node_id, + "prompt_id": getattr(self, "last_prompt_id", None), + "workflow_id": getattr(self, "last_workflow_id", None), + }, sid) # Flag to track if we've received the first message first_message = True diff --git a/tests-unit/execution_test/test_workflow_id_in_ws_messages.py b/tests-unit/execution_test/test_workflow_id_in_ws_messages.py new file mode 100644 index 000000000..cf1ff71e9 --- /dev/null +++ b/tests-unit/execution_test/test_workflow_id_in_ws_messages.py @@ -0,0 +1,297 @@ +"""Tests that workflow_id is included alongside prompt_id in WebSocket payloads +emitted by the progress handler and the prompt executor. + +Frontend stores extra_data["extra_pnginfo"]["workflow"]["id"] when queueing a +prompt; we propagate that as `workflow_id` on every execution event so a +multi-tab UI can scope progress state by workflow even when terminal +WebSocket frames are dropped. +""" + +from unittest.mock import MagicMock + +import pytest + +from comfy_execution.progress import ( + NodeState, + ProgressRegistry, + WebUIProgressHandler, + reset_progress_state, + get_progress_state, +) + + +class _DummyDynPrompt: + def get_display_node_id(self, node_id): + return node_id + + def get_parent_node_id(self, node_id): + return None + + def get_real_node_id(self, node_id): + return node_id + + +@pytest.fixture +def server(): + s = MagicMock() + s.client_id = "client-1" + return s + + +def _registry(workflow_id): + return ProgressRegistry( + prompt_id="prompt-1", + dynprompt=_DummyDynPrompt(), + workflow_id=workflow_id, + ) + + +class TestProgressStatePayload: + def test_progress_state_includes_workflow_id(self, server): + registry = _registry("wf-abc") + registry.nodes["n1"] = { + "state": NodeState.Running, + "value": 1.0, + "max": 5.0, + } + + handler = WebUIProgressHandler(server) + handler.set_registry(registry) + handler._send_progress_state("prompt-1", registry.nodes) + + server.send_sync.assert_called_once() + event, payload, sid = server.send_sync.call_args.args + assert event == "progress_state" + assert payload["prompt_id"] == "prompt-1" + assert payload["workflow_id"] == "wf-abc" + assert payload["nodes"]["n1"]["workflow_id"] == "wf-abc" + assert payload["nodes"]["n1"]["prompt_id"] == "prompt-1" + assert sid == "client-1" + + def test_progress_state_workflow_id_none_when_missing(self, server): + registry = _registry(None) + registry.nodes["n1"] = { + "state": NodeState.Running, + "value": 0.5, + "max": 1.0, + } + + handler = WebUIProgressHandler(server) + handler.set_registry(registry) + handler._send_progress_state("prompt-1", registry.nodes) + + _, payload, _ = server.send_sync.call_args.args + assert payload["workflow_id"] is None + assert payload["nodes"]["n1"]["workflow_id"] is None + + +class TestProgressRegistryConstruction: + def test_workflow_id_default_is_none(self): + registry = ProgressRegistry( + prompt_id="prompt-1", dynprompt=_DummyDynPrompt() + ) + assert registry.workflow_id is None + + def test_workflow_id_stored_on_registry(self): + registry = ProgressRegistry( + prompt_id="prompt-1", + dynprompt=_DummyDynPrompt(), + workflow_id="wf-xyz", + ) + assert registry.workflow_id == "wf-xyz" + + +class TestResetProgressState: + def test_reset_threads_workflow_id(self): + reset_progress_state("prompt-1", _DummyDynPrompt(), "wf-456") + assert get_progress_state().workflow_id == "wf-456" + + def test_reset_default_workflow_id_none(self): + reset_progress_state("prompt-2", _DummyDynPrompt()) + assert get_progress_state().workflow_id is None + + +class TestExecutionMessagePayloadsContainWorkflowId: + """Static-analysis guard ensuring every WebSocket message payload that + carries `prompt_id` also carries `workflow_id`. This is a regression net + for future refactors of execution.py / main.py / progress.py and avoids + the GPU/torch dependency of importing `execution.py` directly. + """ + + @staticmethod + def _emitting_dicts(source: str): + """Yield every dict literal in `source` that contains a 'prompt_id' key.""" + import ast + + tree = ast.parse(source) + for node in ast.walk(tree): + if not isinstance(node, ast.Dict): + continue + keys = [ + k.value + for k in node.keys + if isinstance(k, ast.Constant) and isinstance(k.value, str) + ] + if "prompt_id" in keys: + yield node, keys + + def _assert_workflow_id_in_every_prompt_id_dict(self, file_path: str): + from pathlib import Path + + repo_root = Path(__file__).resolve().parents[2] + source = (repo_root / file_path).read_text() + offenders = [] + for node, keys in self._emitting_dicts(source): + if "workflow_id" not in keys: + offenders.append((node.lineno, keys)) + assert not offenders, ( + f"{file_path}: dict literals with 'prompt_id' but no 'workflow_id': {offenders}" + ) + + def test_execution_py_payloads_include_workflow_id(self): + self._assert_workflow_id_in_every_prompt_id_dict("execution.py") + + def test_main_py_payloads_include_workflow_id(self): + self._assert_workflow_id_in_every_prompt_id_dict("main.py") + + def test_progress_py_payloads_include_workflow_id(self): + self._assert_workflow_id_in_every_prompt_id_dict("comfy_execution/progress.py") + + +class TestPreviewImageMetadataPayload: + """Verify PREVIEW_IMAGE_WITH_METADATA metadata carries workflow_id.""" + + def test_preview_metadata_includes_workflow_id(self): + from unittest.mock import MagicMock, patch + from PIL import Image + + from comfy_execution.progress import ( + NodeState, + ProgressRegistry, + WebUIProgressHandler, + ) + + class _DynPrompt: + def get_display_node_id(self, n): + return n + + def get_parent_node_id(self, n): + return None + + def get_real_node_id(self, n): + return n + + server = MagicMock() + server.client_id = "cid" + server.sockets_metadata = {} + + registry = ProgressRegistry( + prompt_id="p1", dynprompt=_DynPrompt(), workflow_id="wf-1" + ) + handler = WebUIProgressHandler(server) + handler.set_registry(registry) + + image = ("PNG", Image.new("RGB", (1, 1)), None) + + with patch( + "comfy_execution.progress.feature_flags.supports_feature", + return_value=True, + ): + handler.update_handler( + node_id="n1", + value=1.0, + max_value=1.0, + state={ + "state": NodeState.Running, + "value": 1.0, + "max": 1.0, + }, + prompt_id="p1", + image=image, + ) + + preview_calls = [ + c + for c in server.send_sync.call_args_list + if c.args[0] != "progress_state" + ] + assert len(preview_calls) == 1 + _, payload, _ = preview_calls[0].args + _, metadata = payload + assert metadata["prompt_id"] == "p1" + assert metadata["workflow_id"] == "wf-1" + + + +class TestTerminalExecutingResetInMainPy: + """Regression test for the main.py prompt_worker terminal 'executing' reset. + + The executor clears server.last_workflow_id in its finally block, so + main.py must capture the workflow id *before* calling e.execute() and use + that local value, not read server.last_workflow_id afterwards. + + Rather than importing main.py (which triggers torch CUDA init in this + environment), we statically assert the contract via AST: somewhere + between the `extra_data = item[3].copy()` line and the + `e.execute(item[2], ...)` call, the function must extract workflow_id + from extra_data into a local, and the subsequent send_sync("executing", + ...) must reference that local rather than server.last_workflow_id. + """ + + def test_terminal_executing_uses_locally_captured_workflow_id(self): + import ast + from pathlib import Path + + repo_root = Path(__file__).resolve().parents[2] + source = (repo_root / "main.py").read_text() + tree = ast.parse(source) + + worker = next( + ( + n + for n in ast.walk(tree) + if isinstance(n, ast.FunctionDef) and n.name == "prompt_worker" + ), + None, + ) + assert worker is not None, "prompt_worker function not found in main.py" + + worker_src = ast.get_source_segment(source, worker) or "" + + assert "extract_workflow_id(extra_data)" in worker_src, ( + "main.py:prompt_worker must capture workflow_id locally from extra_data " + "before calling e.execute() (the executor clears server.last_workflow_id " + "in finally)." + ) + + matched_terminal_executing_send = False + for node in ast.walk(worker): + if not isinstance(node, ast.Call): + continue + func = node.func + if not ( + isinstance(func, ast.Attribute) + and func.attr == "send_sync" + and node.args + and isinstance(node.args[0], ast.Constant) + and node.args[0].value == "executing" + and len(node.args) >= 2 + and isinstance(node.args[1], ast.Dict) + ): + continue + matched_terminal_executing_send = True + payload = node.args[1] + for key, value in zip(payload.keys, payload.values): + if isinstance(key, ast.Constant) and key.value == "workflow_id": + rendered = ast.unparse(value) + assert "last_workflow_id" not in rendered, ( + "main.py terminal 'executing' must not read " + "server.last_workflow_id; the executor clears it in its " + "finally block. Use a locally captured workflow_id instead." + ) + + assert matched_terminal_executing_send, ( + "main.py:prompt_worker no longer has an inline " + 'send_sync("executing", {...}) payload; update this regression test ' + "so it still verifies the terminal workflow_id source." + ) diff --git a/tests/execution/test_jobs.py b/tests/execution/test_jobs.py index 814af5c13..6afa6cd9c 100644 --- a/tests/execution/test_jobs.py +++ b/tests/execution/test_jobs.py @@ -10,9 +10,44 @@ from comfy_execution.jobs import ( get_outputs_summary, apply_sorting, has_3d_extension, + extract_workflow_id, ) +class TestExtractWorkflowId: + """Unit tests for extract_workflow_id().""" + + def test_returns_id_from_extra_pnginfo(self): + assert extract_workflow_id({'extra_pnginfo': {'workflow': {'id': 'wf-123'}}}) == 'wf-123' + + def test_missing_extra_data_returns_none(self): + assert extract_workflow_id(None) is None + + def test_non_dict_extra_data_returns_none(self): + assert extract_workflow_id('not-a-dict') is None + + def test_missing_extra_pnginfo_returns_none(self): + assert extract_workflow_id({}) is None + + def test_missing_workflow_returns_none(self): + assert extract_workflow_id({'extra_pnginfo': {}}) is None + + def test_missing_id_returns_none(self): + assert extract_workflow_id({'extra_pnginfo': {'workflow': {}}}) is None + + def test_empty_string_id_returns_none(self): + assert extract_workflow_id({'extra_pnginfo': {'workflow': {'id': ''}}}) is None + + def test_non_string_id_returns_none(self): + assert extract_workflow_id({'extra_pnginfo': {'workflow': {'id': 42}}}) is None + + def test_non_dict_workflow_returns_none(self): + assert extract_workflow_id({'extra_pnginfo': {'workflow': 'not-a-dict'}}) is None + + def test_non_dict_extra_pnginfo_returns_none(self): + assert extract_workflow_id({'extra_pnginfo': 'not-a-dict'}) is None + + class TestJobStatus: """Test JobStatus constants.""" From 616cab4f979381d60d40400c9b3c07da0fa8eae0 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 May 2026 15:35:42 -0700 Subject: [PATCH 5/9] =?UTF-8?q?Revert=20"Include=20workflow=5Fid=20in=20al?= =?UTF-8?q?l=20execution=20WebSocket=20messages=20(CORE-198)=20(#=E2=80=A6?= =?UTF-8?q?"=20(#13901)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 4f6018982dcd27258da3e1c50b9b50d464fcaed2. --- comfy_execution/jobs.py | 24 +- comfy_execution/progress.py | 13 +- execution.py | 33 +- main.py | 11 +- server.py | 6 +- .../test_workflow_id_in_ws_messages.py | 297 ------------------ tests/execution/test_jobs.py | 35 --- 7 files changed, 21 insertions(+), 398 deletions(-) delete mode 100644 tests-unit/execution_test/test_workflow_id_in_ws_messages.py diff --git a/comfy_execution/jobs.py b/comfy_execution/jobs.py index 24dd1ffd0..fcd7ef735 100644 --- a/comfy_execution/jobs.py +++ b/comfy_execution/jobs.py @@ -93,27 +93,6 @@ def _create_text_preview(value: str) -> dict: } -def extract_workflow_id(extra_data: Optional[dict]) -> Optional[str]: - """Extract the workflow id from a prompt's ``extra_data``. - - The frontend stores the id at ``extra_data["extra_pnginfo"]["workflow"]["id"]`` - when a prompt is queued. Any value that is not a non-empty string is treated as - missing so callers can rely on the return being either ``None`` or a string. - """ - if not isinstance(extra_data, dict): - return None - extra_pnginfo = extra_data.get('extra_pnginfo') - if not isinstance(extra_pnginfo, dict): - return None - workflow = extra_pnginfo.get('workflow') - if not isinstance(workflow, dict): - return None - workflow_id = workflow.get('id') - if isinstance(workflow_id, str) and workflow_id: - return workflow_id - return None - - def _extract_job_metadata(extra_data: dict) -> tuple[Optional[int], Optional[str]]: """Extract create_time and workflow_id from extra_data. @@ -121,7 +100,8 @@ def _extract_job_metadata(extra_data: dict) -> tuple[Optional[int], Optional[str tuple: (create_time, workflow_id) """ create_time = extra_data.get('create_time') - workflow_id = extract_workflow_id(extra_data) + extra_pnginfo = extra_data.get('extra_pnginfo', {}) + workflow_id = extra_pnginfo.get('workflow', {}).get('id') return create_time, workflow_id diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index b6d3bd3e4..f951a3350 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -164,8 +164,6 @@ class WebUIProgressHandler(ProgressHandler): if self.server_instance is None: return - workflow_id = self.registry.workflow_id if self.registry else None - # Only send info for non-pending nodes active_nodes = { node_id: { @@ -174,7 +172,6 @@ class WebUIProgressHandler(ProgressHandler): "state": state["state"].value, "node_id": node_id, "prompt_id": prompt_id, - "workflow_id": workflow_id, "display_node_id": self.registry.dynprompt.get_display_node_id(node_id), "parent_node_id": self.registry.dynprompt.get_parent_node_id(node_id), "real_node_id": self.registry.dynprompt.get_real_node_id(node_id), @@ -186,7 +183,7 @@ class WebUIProgressHandler(ProgressHandler): # Send a combined progress_state message with all node states # Include client_id to ensure message is only sent to the initiating client self.server_instance.send_sync( - "progress_state", {"prompt_id": prompt_id, "workflow_id": workflow_id, "nodes": active_nodes}, self.server_instance.client_id + "progress_state", {"prompt_id": prompt_id, "nodes": active_nodes}, self.server_instance.client_id ) @override @@ -218,7 +215,6 @@ class WebUIProgressHandler(ProgressHandler): metadata = { "node_id": node_id, "prompt_id": prompt_id, - "workflow_id": self.registry.workflow_id if self.registry else None, "display_node_id": self.registry.dynprompt.get_display_node_id( node_id ), @@ -244,10 +240,9 @@ class ProgressRegistry: Registry that maintains node progress state and notifies registered handlers. """ - def __init__(self, prompt_id: str, dynprompt: "DynamicPrompt", workflow_id: Optional[str] = None): + def __init__(self, prompt_id: str, dynprompt: "DynamicPrompt"): self.prompt_id = prompt_id self.dynprompt = dynprompt - self.workflow_id = workflow_id self.nodes: Dict[str, NodeProgressState] = {} self.handlers: Dict[str, ProgressHandler] = {} @@ -327,7 +322,7 @@ class ProgressRegistry: # Global registry instance global_progress_registry: ProgressRegistry | None = None -def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt", workflow_id: Optional[str] = None) -> None: +def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt") -> None: global global_progress_registry # Reset existing handlers if registry exists @@ -335,7 +330,7 @@ def reset_progress_state(prompt_id: str, dynprompt: "DynamicPrompt", workflow_id global_progress_registry.reset_handlers() # Create new registry - global_progress_registry = ProgressRegistry(prompt_id, dynprompt, workflow_id) + global_progress_registry = ProgressRegistry(prompt_id, dynprompt) def add_progress_handler(handler: ProgressHandler) -> None: diff --git a/execution.py b/execution.py index ff8240588..f37d0360d 100644 --- a/execution.py +++ b/execution.py @@ -38,7 +38,6 @@ from comfy_execution.graph import ( from comfy_execution.graph_utils import GraphBuilder, is_link from comfy_execution.validation import validate_node_input from comfy_execution.progress import get_progress_state, reset_progress_state, add_progress_handler, WebUIProgressHandler -from comfy_execution.jobs import extract_workflow_id from comfy_execution.utils import CurrentNodeContext from comfy_api.internal import _ComfyNodeInternal, _NodeOutputInternal, first_real_override, is_class, make_locked_method_func from comfy_api.latest import io, _io @@ -418,15 +417,15 @@ def _is_intermediate_output(dynprompt, node_id): class_def = nodes.NODE_CLASS_MAPPINGS[class_type] return getattr(class_def, 'HAS_INTERMEDIATE_OUTPUT', False) -def _send_cached_ui(server, node_id, display_node_id, cached, prompt_id, workflow_id, ui_outputs): +def _send_cached_ui(server, node_id, display_node_id, cached, prompt_id, ui_outputs): if server.client_id is None: return cached_ui = cached.ui or {} - server.send_sync("executed", { "node": node_id, "display_node": display_node_id, "output": cached_ui.get("output", None), "prompt_id": prompt_id, "workflow_id": workflow_id }, server.client_id) + server.send_sync("executed", { "node": node_id, "display_node": display_node_id, "output": cached_ui.get("output", None), "prompt_id": prompt_id }, server.client_id) if cached.ui is not None: ui_outputs[node_id] = cached.ui -async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, workflow_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs): +async def execute(server, dynprompt, caches, current_item, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_outputs): unique_id = current_item real_node_id = dynprompt.get_real_node_id(unique_id) display_node_id = dynprompt.get_display_node_id(unique_id) @@ -436,7 +435,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, class_def = nodes.NODE_CLASS_MAPPINGS[class_type] cached = await caches.outputs.get(unique_id) if cached is not None: - _send_cached_ui(server, unique_id, display_node_id, cached, prompt_id, workflow_id, ui_outputs) + _send_cached_ui(server, unique_id, display_node_id, cached, prompt_id, ui_outputs) get_progress_state().finish_progress(unique_id) execution_list.cache_update(unique_id, cached) return (ExecutionResult.SUCCESS, None, None) @@ -484,7 +483,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, input_data_all, missing_keys, v3_data = get_input_data(inputs, class_def, unique_id, execution_list, dynprompt, extra_data) if server.client_id is not None: server.last_node_id = display_node_id - server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id, "workflow_id": workflow_id }, server.client_id) + server.send_sync("executing", { "node": unique_id, "display_node": display_node_id, "prompt_id": prompt_id }, server.client_id) obj = await caches.objects.get(unique_id) if obj is None: @@ -514,7 +513,6 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, if block.message is not None: mes = { "prompt_id": prompt_id, - "workflow_id": workflow_id, "node_id": unique_id, "node_type": class_type, "executed": list(executed), @@ -563,7 +561,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed, "output": output_ui } if server.client_id is not None: - server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": output_ui, "prompt_id": prompt_id, "workflow_id": workflow_id }, server.client_id) + server.send_sync("executed", { "node": unique_id, "display_node": display_node_id, "output": output_ui, "prompt_id": prompt_id }, server.client_id) if has_subgraph: cached_outputs = [] new_node_ids = [] @@ -660,7 +658,6 @@ class PromptExecutor: self.caches = CacheSet(cache_type=self.cache_type, cache_args=self.cache_args) self.status_messages = [] self.success = True - self.workflow_id = None def add_message(self, event, data: dict, broadcast: bool): data = { @@ -680,7 +677,6 @@ class PromptExecutor: if isinstance(ex, comfy.model_management.InterruptProcessingException): mes = { "prompt_id": prompt_id, - "workflow_id": self.workflow_id, "node_id": node_id, "node_type": class_type, "executed": list(executed), @@ -689,7 +685,6 @@ class PromptExecutor: else: mes = { "prompt_id": prompt_id, - "workflow_id": self.workflow_id, "node_id": node_id, "node_type": class_type, "executed": list(executed), @@ -728,9 +723,7 @@ class PromptExecutor: self.server.client_id = None self.status_messages = [] - self.workflow_id = extract_workflow_id(extra_data) - self.server.last_workflow_id = self.workflow_id - self.add_message("execution_start", { "prompt_id": prompt_id, "workflow_id": self.workflow_id }, broadcast=False) + self.add_message("execution_start", { "prompt_id": prompt_id}, broadcast=False) self._notify_prompt_lifecycle("start", prompt_id) ram_headroom = int(self.cache_args["ram"] * (1024 ** 3)) @@ -740,7 +733,7 @@ class PromptExecutor: try: with torch.inference_mode(): dynamic_prompt = DynamicPrompt(prompt) - reset_progress_state(prompt_id, dynamic_prompt, self.workflow_id) + reset_progress_state(prompt_id, dynamic_prompt) add_progress_handler(WebUIProgressHandler(self.server)) is_changed_cache = IsChangedCache(prompt_id, dynamic_prompt, self.caches.outputs) for cache in self.caches.all: @@ -758,7 +751,7 @@ class PromptExecutor: comfy.model_management.cleanup_models_gc() self.add_message("execution_cached", - { "nodes": cached_nodes, "prompt_id": prompt_id, "workflow_id": self.workflow_id }, + { "nodes": cached_nodes, "prompt_id": prompt_id}, broadcast=False) pending_subgraph_results = {} pending_async_nodes = {} # TODO - Unify this with pending_subgraph_results @@ -776,7 +769,7 @@ class PromptExecutor: break assert node_id is not None, "Node ID should not be None at this point" - result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, self.workflow_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_node_outputs) + result, error, ex = await execute(self.server, dynamic_prompt, self.caches, node_id, extra_data, executed, prompt_id, execution_list, pending_subgraph_results, pending_async_nodes, ui_node_outputs) self.success = result != ExecutionResult.FAILURE if result == ExecutionResult.FAILURE: self.handle_execution_error(prompt_id, dynamic_prompt.original_prompt, current_outputs, executed, error, ex) @@ -800,8 +793,8 @@ class PromptExecutor: cached = await self.caches.outputs.get(node_id) if cached is not None: display_node_id = dynamic_prompt.get_display_node_id(node_id) - _send_cached_ui(self.server, node_id, display_node_id, cached, prompt_id, self.workflow_id, ui_node_outputs) - self.add_message("execution_success", { "prompt_id": prompt_id, "workflow_id": self.workflow_id }, broadcast=False) + _send_cached_ui(self.server, node_id, display_node_id, cached, prompt_id, ui_node_outputs) + self.add_message("execution_success", { "prompt_id": prompt_id }, broadcast=False) ui_outputs = {} meta_outputs = {} @@ -818,8 +811,6 @@ class PromptExecutor: finally: comfy.memory_management.set_ram_cache_release_state(None, 0) self._notify_prompt_lifecycle("end", prompt_id) - self.server.last_workflow_id = None - self.workflow_id = None async def validate_inputs(prompt_id, prompt, item, validated, visiting=None): diff --git a/main.py b/main.py index 3ac8395b1..a6fdaf43c 100644 --- a/main.py +++ b/main.py @@ -29,7 +29,6 @@ import logging import sys from comfy_execution.progress import get_progress_state from comfy_execution.utils import get_executing_context -from comfy_execution.jobs import extract_workflow_id from comfy_api import feature_flags from app.database.db import init_db, dependencies_available @@ -318,12 +317,6 @@ def prompt_worker(q, server_instance): for k in sensitive: extra_data[k] = sensitive[k] - # Capture the workflow id for this prompt before execution: the - # executor clears server.last_workflow_id in its finally block, so - # reading it after e.execute() returns would emit workflow_id=None - # on the terminal "executing" reset below. - workflow_id = extract_workflow_id(extra_data) - asset_seeder.pause() e.execute(item[2], prompt_id, extra_data, item[4]) @@ -337,7 +330,7 @@ def prompt_worker(q, server_instance): completed=e.success, messages=e.status_messages), process_item=remove_sensitive) if server_instance.client_id is not None: - server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id, "workflow_id": workflow_id}, server_instance.client_id) + server_instance.send_sync("executing", {"node": None, "prompt_id": prompt_id}, server_instance.client_id) current_time = time.perf_counter() execution_time = current_time - execution_start_time @@ -400,7 +393,7 @@ def hijack_progress(server_instance): prompt_id = server_instance.last_prompt_id if node_id is None: node_id = server_instance.last_node_id - progress = {"value": value, "max": total, "prompt_id": prompt_id, "workflow_id": getattr(server_instance, 'last_workflow_id', None), "node": node_id} + progress = {"value": value, "max": total, "prompt_id": prompt_id, "node": node_id} get_progress_state().update_progress(node_id, value, total, preview_image) server_instance.send_sync("progress", progress, server_instance.client_id) diff --git a/server.py b/server.py index 08eea1160..2f3b438bb 100644 --- a/server.py +++ b/server.py @@ -275,11 +275,7 @@ class PromptServer(): await self.send("status", {"status": self.get_queue_info(), "sid": sid}, sid) # On reconnect if we are the currently executing client send the current node if self.client_id == sid and self.last_node_id is not None: - await self.send("executing", { - "node": self.last_node_id, - "prompt_id": getattr(self, "last_prompt_id", None), - "workflow_id": getattr(self, "last_workflow_id", None), - }, sid) + await self.send("executing", { "node": self.last_node_id }, sid) # Flag to track if we've received the first message first_message = True diff --git a/tests-unit/execution_test/test_workflow_id_in_ws_messages.py b/tests-unit/execution_test/test_workflow_id_in_ws_messages.py deleted file mode 100644 index cf1ff71e9..000000000 --- a/tests-unit/execution_test/test_workflow_id_in_ws_messages.py +++ /dev/null @@ -1,297 +0,0 @@ -"""Tests that workflow_id is included alongside prompt_id in WebSocket payloads -emitted by the progress handler and the prompt executor. - -Frontend stores extra_data["extra_pnginfo"]["workflow"]["id"] when queueing a -prompt; we propagate that as `workflow_id` on every execution event so a -multi-tab UI can scope progress state by workflow even when terminal -WebSocket frames are dropped. -""" - -from unittest.mock import MagicMock - -import pytest - -from comfy_execution.progress import ( - NodeState, - ProgressRegistry, - WebUIProgressHandler, - reset_progress_state, - get_progress_state, -) - - -class _DummyDynPrompt: - def get_display_node_id(self, node_id): - return node_id - - def get_parent_node_id(self, node_id): - return None - - def get_real_node_id(self, node_id): - return node_id - - -@pytest.fixture -def server(): - s = MagicMock() - s.client_id = "client-1" - return s - - -def _registry(workflow_id): - return ProgressRegistry( - prompt_id="prompt-1", - dynprompt=_DummyDynPrompt(), - workflow_id=workflow_id, - ) - - -class TestProgressStatePayload: - def test_progress_state_includes_workflow_id(self, server): - registry = _registry("wf-abc") - registry.nodes["n1"] = { - "state": NodeState.Running, - "value": 1.0, - "max": 5.0, - } - - handler = WebUIProgressHandler(server) - handler.set_registry(registry) - handler._send_progress_state("prompt-1", registry.nodes) - - server.send_sync.assert_called_once() - event, payload, sid = server.send_sync.call_args.args - assert event == "progress_state" - assert payload["prompt_id"] == "prompt-1" - assert payload["workflow_id"] == "wf-abc" - assert payload["nodes"]["n1"]["workflow_id"] == "wf-abc" - assert payload["nodes"]["n1"]["prompt_id"] == "prompt-1" - assert sid == "client-1" - - def test_progress_state_workflow_id_none_when_missing(self, server): - registry = _registry(None) - registry.nodes["n1"] = { - "state": NodeState.Running, - "value": 0.5, - "max": 1.0, - } - - handler = WebUIProgressHandler(server) - handler.set_registry(registry) - handler._send_progress_state("prompt-1", registry.nodes) - - _, payload, _ = server.send_sync.call_args.args - assert payload["workflow_id"] is None - assert payload["nodes"]["n1"]["workflow_id"] is None - - -class TestProgressRegistryConstruction: - def test_workflow_id_default_is_none(self): - registry = ProgressRegistry( - prompt_id="prompt-1", dynprompt=_DummyDynPrompt() - ) - assert registry.workflow_id is None - - def test_workflow_id_stored_on_registry(self): - registry = ProgressRegistry( - prompt_id="prompt-1", - dynprompt=_DummyDynPrompt(), - workflow_id="wf-xyz", - ) - assert registry.workflow_id == "wf-xyz" - - -class TestResetProgressState: - def test_reset_threads_workflow_id(self): - reset_progress_state("prompt-1", _DummyDynPrompt(), "wf-456") - assert get_progress_state().workflow_id == "wf-456" - - def test_reset_default_workflow_id_none(self): - reset_progress_state("prompt-2", _DummyDynPrompt()) - assert get_progress_state().workflow_id is None - - -class TestExecutionMessagePayloadsContainWorkflowId: - """Static-analysis guard ensuring every WebSocket message payload that - carries `prompt_id` also carries `workflow_id`. This is a regression net - for future refactors of execution.py / main.py / progress.py and avoids - the GPU/torch dependency of importing `execution.py` directly. - """ - - @staticmethod - def _emitting_dicts(source: str): - """Yield every dict literal in `source` that contains a 'prompt_id' key.""" - import ast - - tree = ast.parse(source) - for node in ast.walk(tree): - if not isinstance(node, ast.Dict): - continue - keys = [ - k.value - for k in node.keys - if isinstance(k, ast.Constant) and isinstance(k.value, str) - ] - if "prompt_id" in keys: - yield node, keys - - def _assert_workflow_id_in_every_prompt_id_dict(self, file_path: str): - from pathlib import Path - - repo_root = Path(__file__).resolve().parents[2] - source = (repo_root / file_path).read_text() - offenders = [] - for node, keys in self._emitting_dicts(source): - if "workflow_id" not in keys: - offenders.append((node.lineno, keys)) - assert not offenders, ( - f"{file_path}: dict literals with 'prompt_id' but no 'workflow_id': {offenders}" - ) - - def test_execution_py_payloads_include_workflow_id(self): - self._assert_workflow_id_in_every_prompt_id_dict("execution.py") - - def test_main_py_payloads_include_workflow_id(self): - self._assert_workflow_id_in_every_prompt_id_dict("main.py") - - def test_progress_py_payloads_include_workflow_id(self): - self._assert_workflow_id_in_every_prompt_id_dict("comfy_execution/progress.py") - - -class TestPreviewImageMetadataPayload: - """Verify PREVIEW_IMAGE_WITH_METADATA metadata carries workflow_id.""" - - def test_preview_metadata_includes_workflow_id(self): - from unittest.mock import MagicMock, patch - from PIL import Image - - from comfy_execution.progress import ( - NodeState, - ProgressRegistry, - WebUIProgressHandler, - ) - - class _DynPrompt: - def get_display_node_id(self, n): - return n - - def get_parent_node_id(self, n): - return None - - def get_real_node_id(self, n): - return n - - server = MagicMock() - server.client_id = "cid" - server.sockets_metadata = {} - - registry = ProgressRegistry( - prompt_id="p1", dynprompt=_DynPrompt(), workflow_id="wf-1" - ) - handler = WebUIProgressHandler(server) - handler.set_registry(registry) - - image = ("PNG", Image.new("RGB", (1, 1)), None) - - with patch( - "comfy_execution.progress.feature_flags.supports_feature", - return_value=True, - ): - handler.update_handler( - node_id="n1", - value=1.0, - max_value=1.0, - state={ - "state": NodeState.Running, - "value": 1.0, - "max": 1.0, - }, - prompt_id="p1", - image=image, - ) - - preview_calls = [ - c - for c in server.send_sync.call_args_list - if c.args[0] != "progress_state" - ] - assert len(preview_calls) == 1 - _, payload, _ = preview_calls[0].args - _, metadata = payload - assert metadata["prompt_id"] == "p1" - assert metadata["workflow_id"] == "wf-1" - - - -class TestTerminalExecutingResetInMainPy: - """Regression test for the main.py prompt_worker terminal 'executing' reset. - - The executor clears server.last_workflow_id in its finally block, so - main.py must capture the workflow id *before* calling e.execute() and use - that local value, not read server.last_workflow_id afterwards. - - Rather than importing main.py (which triggers torch CUDA init in this - environment), we statically assert the contract via AST: somewhere - between the `extra_data = item[3].copy()` line and the - `e.execute(item[2], ...)` call, the function must extract workflow_id - from extra_data into a local, and the subsequent send_sync("executing", - ...) must reference that local rather than server.last_workflow_id. - """ - - def test_terminal_executing_uses_locally_captured_workflow_id(self): - import ast - from pathlib import Path - - repo_root = Path(__file__).resolve().parents[2] - source = (repo_root / "main.py").read_text() - tree = ast.parse(source) - - worker = next( - ( - n - for n in ast.walk(tree) - if isinstance(n, ast.FunctionDef) and n.name == "prompt_worker" - ), - None, - ) - assert worker is not None, "prompt_worker function not found in main.py" - - worker_src = ast.get_source_segment(source, worker) or "" - - assert "extract_workflow_id(extra_data)" in worker_src, ( - "main.py:prompt_worker must capture workflow_id locally from extra_data " - "before calling e.execute() (the executor clears server.last_workflow_id " - "in finally)." - ) - - matched_terminal_executing_send = False - for node in ast.walk(worker): - if not isinstance(node, ast.Call): - continue - func = node.func - if not ( - isinstance(func, ast.Attribute) - and func.attr == "send_sync" - and node.args - and isinstance(node.args[0], ast.Constant) - and node.args[0].value == "executing" - and len(node.args) >= 2 - and isinstance(node.args[1], ast.Dict) - ): - continue - matched_terminal_executing_send = True - payload = node.args[1] - for key, value in zip(payload.keys, payload.values): - if isinstance(key, ast.Constant) and key.value == "workflow_id": - rendered = ast.unparse(value) - assert "last_workflow_id" not in rendered, ( - "main.py terminal 'executing' must not read " - "server.last_workflow_id; the executor clears it in its " - "finally block. Use a locally captured workflow_id instead." - ) - - assert matched_terminal_executing_send, ( - "main.py:prompt_worker no longer has an inline " - 'send_sync("executing", {...}) payload; update this regression test ' - "so it still verifies the terminal workflow_id source." - ) diff --git a/tests/execution/test_jobs.py b/tests/execution/test_jobs.py index 6afa6cd9c..814af5c13 100644 --- a/tests/execution/test_jobs.py +++ b/tests/execution/test_jobs.py @@ -10,44 +10,9 @@ from comfy_execution.jobs import ( get_outputs_summary, apply_sorting, has_3d_extension, - extract_workflow_id, ) -class TestExtractWorkflowId: - """Unit tests for extract_workflow_id().""" - - def test_returns_id_from_extra_pnginfo(self): - assert extract_workflow_id({'extra_pnginfo': {'workflow': {'id': 'wf-123'}}}) == 'wf-123' - - def test_missing_extra_data_returns_none(self): - assert extract_workflow_id(None) is None - - def test_non_dict_extra_data_returns_none(self): - assert extract_workflow_id('not-a-dict') is None - - def test_missing_extra_pnginfo_returns_none(self): - assert extract_workflow_id({}) is None - - def test_missing_workflow_returns_none(self): - assert extract_workflow_id({'extra_pnginfo': {}}) is None - - def test_missing_id_returns_none(self): - assert extract_workflow_id({'extra_pnginfo': {'workflow': {}}}) is None - - def test_empty_string_id_returns_none(self): - assert extract_workflow_id({'extra_pnginfo': {'workflow': {'id': ''}}}) is None - - def test_non_string_id_returns_none(self): - assert extract_workflow_id({'extra_pnginfo': {'workflow': {'id': 42}}}) is None - - def test_non_dict_workflow_returns_none(self): - assert extract_workflow_id({'extra_pnginfo': {'workflow': 'not-a-dict'}}) is None - - def test_non_dict_extra_pnginfo_returns_none(self): - assert extract_workflow_id({'extra_pnginfo': 'not-a-dict'}) is None - - class TestJobStatus: """Test JobStatus constants.""" From ed78da062c35b9c540247e7c71897a0ecdbb25e4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Thu, 14 May 2026 16:02:22 -0700 Subject: [PATCH 6/9] Create SECURITY.md. (#13902) --- SECURITY.md | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..299b0067b --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,44 @@ +# Security Policy + +## Scope + +ComfyUI is designed to run locally. By default, the server binds to `127.0.0.1`, meaning only the user's own machine can reach it. Our threat model assumes: + +- The user installed ComfyUI through a supported channel: the desktop application, the portable build, or a manual install following the README. +- The user has not installed untrusted custom nodes. Custom nodes are arbitrary Python code and are trusted as much as any other software the user chooses to install. +- Anyone with access to the ComfyUI URL is trusted (a direct consequence of the localhost-only default). +- PyTorch and other dependencies are at the versions we ship or recommend in the README. + +A report is in scope only if it affects a user operating within this threat model. + +## What We Consider a Vulnerability + +We want to hear about issues where a **reasonable user** — someone who does not install random untrusted nodes and who reads UI prompts and warnings before clicking through them — can be harmed by ComfyUI itself. + +The clearest example: a workflow file that such a user might plausibly load and run, using only built-in nodes, that results in **untrusted code execution, arbitrary file read/write outside expected directories, or credential/data exfiltration**. + +When submitting a report, please include a clear description of *why this is a problem for a typical local ComfyUI user*. Reports without this context are difficult to act on. + +## What We Do Not Consider a Security Vulnerability + +Please report the following through our regular [GitHub issues](https://github.com/comfyanonymous/ComfyUI/issues) instead. Filing them as security reports will likely cause them to be deprioritized or closed. + +- **Issues requiring `--listen` or any non-default network exposure.** ComfyUI binds to localhost by default. If a remote attacker needs to reach the server for the attack to work, the user has chosen to expose it and is responsible for securing that deployment (firewall, reverse proxy, authentication, etc.). These are bugs, not vulnerabilities. +- **`torch.load` and related deserialization issues in old PyTorch versions.** These are upstream PyTorch issues. Our distributions ship with — and our documentation recommends — recent PyTorch versions where these are addressed. +- **Vulnerabilities that depend on outdated library versions** that we neither ship nor recommend (e.g., requiring PyTorch 2.6 or older). +- **Issues that require a specific custom node to be installed.** Custom nodes are third-party code. Report these to the maintainer of that node. +- **Crashes, hangs, or resource exhaustion from a loaded workflow.** Annoying, but not a security issue in our model. File a regular bug. +- **Social-engineering scenarios** where the user is expected to ignore an explicit UI warning or prompt. + +## Reporting + +If you believe you have found an issue that falls within the scope above, please report it privately via GitHub's [Report a vulnerability](https://github.com/comfyanonymous/ComfyUI/security/advisories/new) feature rather than opening a public issue. + +Please include: + +1. A description of the vulnerability and the affected component. +2. Reproduction steps, ideally with a minimal workflow file or proof-of-concept. +3. The ComfyUI version, install method (desktop / portable / manual), and OS. +4. An explanation of how this affects a typical local user as described in the threat model. + +We will acknowledge valid reports and coordinate a fix and disclosure timeline with you. From b112f68681e56e508eee6fe27e96bf525dc2c137 Mon Sep 17 00:00:00 2001 From: Christian Byrne Date: Thu, 14 May 2026 16:13:30 -0700 Subject: [PATCH 7/9] Generalize frontend version warning to all comfy* requirements.txt entries (#13875) --- app/frontend_management.py | 61 +++++++++++++------- openapi.yaml | 18 ++++++ server.py | 2 + tests-unit/app_test/frontend_manager_test.py | 11 +++- 4 files changed, 68 insertions(+), 24 deletions(-) diff --git a/app/frontend_management.py b/app/frontend_management.py index 7108bd35a..d0596b276 100644 --- a/app/frontend_management.py +++ b/app/frontend_management.py @@ -38,40 +38,54 @@ def is_valid_version(version: str) -> bool: pattern = r"^(\d+)\.(\d+)\.(\d+)$" return bool(re.match(pattern, version)) -def get_installed_frontend_version(): - """Get the currently installed frontend package version.""" - frontend_version_str = version("comfyui-frontend-package") - return frontend_version_str - - def get_required_frontend_version(): return get_required_packages_versions().get("comfyui-frontend-package", None) -def check_frontend_version(): - """Check if the frontend version is up to date.""" +COMFY_PACKAGE_VERSIONS = [] +def get_comfy_package_versions(): + """List installed/required versions for every comfy* package in requirements.txt.""" + if COMFY_PACKAGE_VERSIONS: + return COMFY_PACKAGE_VERSIONS.copy() + out = COMFY_PACKAGE_VERSIONS + for name, required in (get_required_packages_versions() or {}).items(): + if not name.startswith("comfy"): + continue + try: + installed = version(name) + except Exception: + installed = None + out.append({"name": name, "installed": installed, "required": required}) + return out.copy() - try: - frontend_version_str = get_installed_frontend_version() - frontend_version = parse_version(frontend_version_str) - required_frontend_str = get_required_frontend_version() - required_frontend = parse_version(required_frontend_str) - if frontend_version < required_frontend: + +def check_comfy_packages_versions(): + """Warn for every comfy* package whose installed version is below requirements.txt.""" + from packaging.version import InvalidVersion, parse as parse_pep440 + for pkg in get_comfy_package_versions(): + installed_str = pkg["installed"] + required_str = pkg["required"] + if not installed_str or not required_str: + continue + try: + outdated = parse_pep440(installed_str) < parse_pep440(required_str) + except InvalidVersion as e: + logging.error(f"Failed to check {pkg['name']} version: {e}") + continue + if outdated: app.logger.log_startup_warning( f""" ________________________________________________________________________ WARNING WARNING WARNING WARNING WARNING -Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}. +Installed {pkg["name"]} version {installed_str} is lower than the recommended version {required_str}. -{frontend_install_warning_message()} +{get_missing_requirements_message()} ________________________________________________________________________ """.strip() ) else: - logging.info("ComfyUI frontend version: {}".format(frontend_version_str)) - except Exception as e: - logging.error(f"Failed to check frontend version: {e}") + logging.info("{} version: {}".format(pkg["name"], installed_str)) REQUEST_TIMEOUT = 10 # seconds @@ -201,6 +215,11 @@ class FrontendManager: def get_required_templates_version(cls) -> str: return get_required_packages_versions().get("comfyui-workflow-templates", None) + @classmethod + def get_comfy_package_versions(cls): + """List installed/required versions for every comfy* package in requirements.txt.""" + return get_comfy_package_versions() + @classmethod def default_frontend_path(cls) -> str: try: @@ -341,7 +360,7 @@ comfyui-workflow-templates is not installed. main error source might be request timeout or invalid URL. """ if version_string == DEFAULT_VERSION_STRING: - check_frontend_version() + check_comfy_packages_versions() return cls.default_frontend_path() repo_owner, repo_name, version = cls.parse_version_string(version_string) @@ -403,7 +422,7 @@ comfyui-workflow-templates is not installed. except Exception as e: logging.error("Failed to initialize frontend: %s", e) logging.info("Falling back to the default frontend.") - check_frontend_version() + check_comfy_packages_versions() return cls.default_frontend_path() @classmethod def template_asset_handler(cls): diff --git a/openapi.yaml b/openapi.yaml index 96be4c1d5..214962c5c 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -6030,6 +6030,24 @@ components: type: string nullable: true description: Minimum required workflow templates version for this ComfyUI build + comfy_package_versions: + type: array + description: Installed and required versions for every comfy* package pinned in requirements.txt + items: + type: object + required: + - name + - installed + - required + properties: + name: + type: string + installed: + type: string + nullable: true + required: + type: string + nullable: true devices: type: array items: diff --git a/server.py b/server.py index 2f3b438bb..44470b904 100644 --- a/server.py +++ b/server.py @@ -656,6 +656,7 @@ class PromptServer(): required_frontend_version = FrontendManager.get_required_frontend_version() installed_templates_version = FrontendManager.get_installed_templates_version() required_templates_version = FrontendManager.get_required_templates_version() + comfy_package_versions = FrontendManager.get_comfy_package_versions() system_stats = { "system": { @@ -666,6 +667,7 @@ class PromptServer(): "required_frontend_version": required_frontend_version, "installed_templates_version": installed_templates_version, "required_templates_version": required_templates_version, + "comfy_package_versions": comfy_package_versions, "python_version": sys.version, "pytorch_version": comfy.model_management.torch_version, "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded", diff --git a/tests-unit/app_test/frontend_manager_test.py b/tests-unit/app_test/frontend_manager_test.py index 1d5a84b47..8c8a2eb48 100644 --- a/tests-unit/app_test/frontend_manager_test.py +++ b/tests-unit/app_test/frontend_manager_test.py @@ -52,7 +52,10 @@ def mock_provider(mock_releases): @pytest.fixture(autouse=True) def clear_cache(): import utils.install_util + import app.frontend_management + utils.install_util.PACKAGE_VERSIONS = {} + app.frontend_management.COMFY_PACKAGE_VERSIONS = [] def test_get_release(mock_provider, mock_releases): @@ -147,7 +150,7 @@ def test_init_frontend_default_with_mocks(): # Act with ( - patch("app.frontend_management.check_frontend_version") as mock_check, + patch("app.frontend_management.check_comfy_packages_versions") as mock_check, patch.object( FrontendManager, "default_frontend_path", return_value="/mocked/path" ), @@ -168,7 +171,7 @@ def test_init_frontend_fallback_on_error(): patch.object( FrontendManager, "init_frontend_unsafe", side_effect=Exception("Test error") ), - patch("app.frontend_management.check_frontend_version") as mock_check, + patch("app.frontend_management.check_comfy_packages_versions") as mock_check, patch.object( FrontendManager, "default_frontend_path", return_value="/default/path" ), @@ -277,7 +280,9 @@ def test_get_installed_templates_version(): def test_get_installed_templates_version_not_installed(): # Act - with patch("app.frontend_management.version", side_effect=Exception("Package not found")): + with patch( + "app.frontend_management.version", side_effect=Exception("Package not found") + ): version = FrontendManager.get_installed_templates_version() # Assert From b2000029c8290207720a95c0aff5b71b2c80d91f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Fri, 15 May 2026 04:36:17 +0300 Subject: [PATCH 8/9] Persists ModelNoiseScale when also patching shift (#13892) --- comfy_extras/nodes_model_advanced.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/comfy_extras/nodes_model_advanced.py b/comfy_extras/nodes_model_advanced.py index 33b940a0f..b27ac1296 100644 --- a/comfy_extras/nodes_model_advanced.py +++ b/comfy_extras/nodes_model_advanced.py @@ -134,8 +134,11 @@ class ModelSamplingSD3: class ModelSamplingAdvanced(sampling_base, sampling_type): pass + original = m.get_model_object("model_sampling") model_sampling = ModelSamplingAdvanced(model.model.model_config) model_sampling.set_parameters(shift=shift, multiplier=multiplier) + if hasattr(original, "noise_scale"): + model_sampling.set_noise_scale(original.noise_scale) m.add_object_patch("model_sampling", model_sampling) return (m, ) @@ -315,7 +318,7 @@ class ModelNoiseScale: def patch(self, model, noise_scale): m = model.clone() - original = m.model.model_sampling + original = m.get_model_object("model_sampling") ms = type(original)(m.model.model_config) ms.set_parameters(shift=original.shift, multiplier=original.multiplier) ms.set_noise_scale(noise_scale) From 77e2ed5e01bcb5eb82e05760f4091d67f7d85a71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Fri, 15 May 2026 05:34:56 +0300 Subject: [PATCH 9/9] feat: Support MoGe (CORE-168) (#13878) --- comfy/image_encoders/dino2.py | 45 +- comfy/ldm/moge/geometry.py | 189 ++++++++ comfy/ldm/moge/model.py | 347 +++++++++++++++ comfy/ldm/moge/modules.py | 204 +++++++++ comfy/ldm/moge/panorama.py | 313 ++++++++++++++ comfy_extras/nodes_moge.py | 406 ++++++++++++++++++ folder_paths.py | 2 + .../put_geometry_estimation_models_here | 0 nodes.py | 1 + 9 files changed, 1504 insertions(+), 3 deletions(-) create mode 100644 comfy/ldm/moge/geometry.py create mode 100644 comfy/ldm/moge/model.py create mode 100644 comfy/ldm/moge/modules.py create mode 100644 comfy/ldm/moge/panorama.py create mode 100644 comfy_extras/nodes_moge.py create mode 100644 models/geometry_estimation/put_geometry_estimation_models_here diff --git a/comfy/image_encoders/dino2.py b/comfy/image_encoders/dino2.py index 9b6dace9d..ee86f8309 100644 --- a/comfy/image_encoders/dino2.py +++ b/comfy/image_encoders/dino2.py @@ -106,6 +106,7 @@ class Dino2Encoder(torch.nn.Module): class Dino2PatchEmbeddings(torch.nn.Module): def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None): super().__init__() + self.patch_size = patch_size self.projection = operations.Conv2d( in_channels=num_channels, out_channels=dim, @@ -125,17 +126,37 @@ class Dino2Embeddings(torch.nn.Module): super().__init__() patch_size = 14 image_size = 518 + self.patch_size = patch_size self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations) self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device)) - self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device)) + self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device)) # mask_token is a pre-training param, kept only so strict loading accepts the key. self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device)) + def interpolate_pos_encoding(self, x, h_pixels, w_pixels): + pos_embed = comfy.model_management.cast_to_device(self.position_embeddings, x.device, torch.float32) + + class_pos = pos_embed[:, 0:1] + patch_pos = pos_embed[:, 1:] + N = patch_pos.shape[1] + M = int(N ** 0.5) + h0 = h_pixels // self.patch_size + w0 = w_pixels // self.patch_size + scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M) # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0). + + patch_pos = patch_pos.reshape(1, M, M, -1).permute(0, 3, 1, 2) + patch_pos = torch.nn.functional.interpolate(patch_pos, scale_factor=scale_factor, mode="bicubic", antialias=False) + patch_pos = patch_pos.permute(0, 2, 3, 1).flatten(1, 2) + return torch.cat((class_pos, patch_pos), dim=1).to(x.dtype) + def forward(self, pixel_values): x = self.patch_embeddings(pixel_values) - # TODO: mask_token? x = torch.cat((self.cls_token.to(device=x.device, dtype=x.dtype).expand(x.shape[0], -1, -1), x), dim=1) - x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype) + if x.shape[1] - 1 == self.position_embeddings.shape[1] - 1: + x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype) + else: + h, w = pixel_values.shape[-2:] + x = x + self.interpolate_pos_encoding(x, h, w) return x @@ -158,3 +179,21 @@ class Dinov2Model(torch.nn.Module): x = self.layernorm(x) pooled_output = x[:, 0, :] return x, i, pooled_output, None + + def get_intermediate_layers(self, pixel_values, indices, apply_norm=True): + x = self.embeddings(pixel_values) + optimized_attention = optimized_attention_for_device(x.device, False, small_input=True) + n_layers = len(self.encoder.layer) + resolved = [(i if i >= 0 else n_layers + i) for i in indices] + target = set(resolved) + max_idx = max(resolved) + n_skip = 1 # skip cls token + cache = {} + for i, layer in enumerate(self.encoder.layer): + x = layer(x, optimized_attention) + if i in target: + normed = self.layernorm(x) if apply_norm else x + cache[i] = (normed[:, n_skip:], normed[:, 0]) + if i >= max_idx: + break + return [cache[i] for i in resolved] diff --git a/comfy/ldm/moge/geometry.py b/comfy/ldm/moge/geometry.py new file mode 100644 index 000000000..7fdc97871 --- /dev/null +++ b/comfy/ldm/moge/geometry.py @@ -0,0 +1,189 @@ +"""Pure-torch + scipy geometry helpers for MoGe inference and mesh export.""" + +from __future__ import annotations + +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F + +from scipy.optimize import least_squares + +def normalized_view_plane_uv(width: int, height: int, aspect_ratio: Optional[float] = None, + dtype: Optional[torch.dtype] = None, device: Optional[torch.device] = None) -> torch.Tensor: + """Normalized view-plane UV coordinates with corners at +/-(W, H)/diagonal.""" + if aspect_ratio is None: + aspect_ratio = width / height + span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5 + span_y = 1.0 / (1 + aspect_ratio ** 2) ** 0.5 + u = torch.linspace(-span_x * (width - 1) / width, span_x * (width - 1) / width, width, dtype=dtype, device=device) + v = torch.linspace(-span_y * (height - 1) / height, span_y * (height - 1) / height, height, dtype=dtype, device=device) + u, v = torch.meshgrid(u, v, indexing="xy") + return torch.stack([u, v], dim=-1) + + +def intrinsics_from_focal_center(fx: torch.Tensor, fy: torch.Tensor, cx: torch.Tensor, cy: torch.Tensor) -> torch.Tensor: + """Assemble (..., 3, 3) intrinsics from broadcastable fx, fy, cx, cy.""" + fx, fy, cx, cy = [torch.as_tensor(v) for v in (fx, fy, cx, cy)] + fx, fy, cx, cy = torch.broadcast_tensors(fx, fy, cx, cy) + zero = torch.zeros_like(fx) + one = torch.ones_like(fx) + return torch.stack([ + torch.stack([fx, zero, cx], dim=-1), + torch.stack([zero, fy, cy], dim=-1), + torch.stack([zero, zero, one], dim=-1), + ], dim=-2) + + +def depth_map_to_point_map(depth: torch.Tensor, intrinsics: torch.Tensor) -> torch.Tensor: + """Back-project a (..., H, W) depth map through K^-1 to (..., H, W, 3) camera-space points. + + Intrinsics use normalized image coords (x in [0, 1] left->right, y in [0, 1] top->bottom). + """ + H, W = depth.shape[-2:] + device, dtype = depth.device, depth.dtype + u = (torch.arange(W, dtype=dtype, device=device) + 0.5) / W + v = (torch.arange(H, dtype=dtype, device=device) + 0.5) / H + grid_v, grid_u = torch.meshgrid(v, u, indexing="ij") + pix = torch.stack([grid_u, grid_v, torch.ones_like(grid_u)], dim=-1) + K_inv = torch.linalg.inv(intrinsics) + rays = torch.einsum("...ij,hwj->...hwi", K_inv, pix) + return rays * depth.unsqueeze(-1) + + +def _solve_optimal_shift(uv: np.ndarray, xyz: np.ndarray, + focal: Optional[float] = None) -> Tuple[float, float]: + """LM-solve for z-shift; when focal is None, also recovers the optimal focal.""" + uv = uv.reshape(-1, 2) + xy = xyz[..., :2].reshape(-1, 2) + z = xyz[..., 2].reshape(-1) + + def fn(shift): + xy_proj = xy / (z + shift)[:, None] + f = focal if focal is not None else (xy_proj * uv).sum() / np.square(xy_proj).sum() + return (f * xy_proj - uv).ravel() + + sol = least_squares(fn, x0=0.0, ftol=1e-3, method="lm") + shift = float(np.asarray(sol["x"]).squeeze()) + if focal is None: + xy_proj = xy / (z + shift)[:, None] + focal = float((xy_proj * uv).sum() / np.square(xy_proj).sum()) + return shift, focal + + +def recover_focal_shift(points: torch.Tensor, mask: Optional[torch.Tensor] = None, + focal: Optional[torch.Tensor] = None, downsample_size: Tuple[int, int] = (64, 64) + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Recover the focal length and z-shift that turn points into a metric point map. + + Optical center is at the image center; returned focal is relative to half the image diagonal. + Returns (focal, shift) on the same device/dtype as points. + """ + shape = points.shape + H, W = shape[-3], shape[-2] + points_b = points.reshape(-1, H, W, 3) + mask_b = None if mask is None else mask.reshape(-1, H, W) + focal_b = None if focal is None else focal.reshape(-1) + + uv = normalized_view_plane_uv(W, H, dtype=points.dtype, device=points.device) + + points_lr = F.interpolate(points_b.permute(0, 3, 1, 2), downsample_size, mode="nearest").permute(0, 2, 3, 1) + uv_lr = F.interpolate(uv.unsqueeze(0).permute(0, 3, 1, 2), downsample_size, mode="nearest").squeeze(0).permute(1, 2, 0) + mask_lr = None + if mask_b is not None: + mask_lr = F.interpolate(mask_b.to(torch.float32).unsqueeze(1), downsample_size, mode="nearest").squeeze(1) > 0 + + uv_np = uv_lr.detach().cpu().numpy() + points_np = points_lr.detach().cpu().numpy() + mask_np = None if mask_lr is None else mask_lr.detach().cpu().numpy() + focal_np = None if focal_b is None else focal_b.detach().cpu().numpy() + + out_focal: list = [] + out_shift: list = [] + for i in range(points_b.shape[0]): + if mask_np is None: + xyz_i = points_np[i].reshape(-1, 3) + uv_i = uv_np.reshape(-1, 2) + else: + sel = mask_np[i] + if sel.sum() < 2: + out_focal.append(1.0) + out_shift.append(0.0) + continue + xyz_i = points_np[i][sel] + uv_i = uv_np[sel] + if focal_np is None: + shift_i, focal_i = _solve_optimal_shift(uv_i, xyz_i) + out_focal.append(focal_i) + else: + shift_i, _ = _solve_optimal_shift(uv_i, xyz_i, focal=float(focal_np[i])) + out_shift.append(shift_i) + + shift_t = torch.tensor(out_shift, device=points.device, dtype=points.dtype).reshape(shape[:-3]) + if focal is None: + focal_t = torch.tensor(out_focal, device=points.device, dtype=points.dtype).reshape(shape[:-3]) + else: + focal_t = focal.reshape(shape[:-3]) + return focal_t, shift_t + + +def depth_map_edge(depth: torch.Tensor, atol: Optional[float] = None, rtol: Optional[float] = None, kernel_size: int = 3) -> torch.Tensor: + """Per-pixel boolean: True where the local depth window's max-min span exceeds atol or rtol*depth.""" + shape = depth.shape + d = depth.reshape(-1, 1, *shape[-2:]) + pad = kernel_size // 2 + diff = F.max_pool2d(d, kernel_size, stride=1, padding=pad) + F.max_pool2d(-d, kernel_size, stride=1, padding=pad) + edge = torch.zeros_like(d, dtype=torch.bool) + if atol is not None: + edge |= diff > atol + if rtol is not None: + edge |= (diff / d.clamp_min(1e-6)).nan_to_num_() > rtol + return edge.reshape(*shape) + + +def triangulate_grid_mesh(points: torch.Tensor, mask: Optional[torch.Tensor] = None, decimation: int = 1, discontinuity_threshold: float = 0.04, + depth: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Triangulate a (H, W, 3) point map into (vertices, faces, uvs) on CPU. + + Vertices: pixels with finite coords (passing optional mask). Quads with four valid corners + become two triangles. depth overrides the scalar used for the rtol edge check; pass radial + depth for panoramas (the default points[..., 2] goes negative below the equator). + """ + points = points.detach().cpu() + finite = torch.isfinite(points).all(dim=-1) + if mask is None: + mask = finite + else: + mask = mask.detach().cpu().to(torch.bool) & finite + + if discontinuity_threshold > 0: + d = depth.detach().cpu() if depth is not None else points[..., 2] + # Replace inf with 0 so max-pool doesn't poison neighbourhoods (mask above already excludes those pixels). + d_finite = torch.where(finite, d, torch.zeros_like(d)) + edge = depth_map_edge(d_finite, rtol=discontinuity_threshold) + mask = mask & ~edge + + if decimation > 1: + points = points[::decimation, ::decimation].contiguous() + mask = mask[::decimation, ::decimation].contiguous() + H, W = points.shape[:2] + + flat_mask = mask.reshape(-1) + idx = torch.full((H * W,), -1, dtype=torch.long) + n_valid = int(flat_mask.sum().item()) + idx[flat_mask] = torch.arange(n_valid, dtype=torch.long) + idx = idx.reshape(H, W) + + vertices = points.reshape(-1, 3)[flat_mask].contiguous() + + yy, xx = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij") + u = xx.float() / max(W - 1, 1) + v = yy.float() / max(H - 1, 1) + uvs = torch.stack([u, v], dim=-1).reshape(-1, 2)[flat_mask].contiguous() + + a, b, c, d = idx[:-1, :-1], idx[:-1, 1:], idx[1:, 1:], idx[1:, :-1] + quad_ok = (a >= 0) & (b >= 0) & (c >= 0) & (d >= 0) + a, b, c, d = a[quad_ok], b[quad_ok], c[quad_ok], d[quad_ok] + faces = torch.cat([torch.stack([a, b, c], dim=-1), torch.stack([a, c, d], dim=-1)], dim=0).contiguous() + return vertices, faces, uvs diff --git a/comfy/ldm/moge/model.py b/comfy/ldm/moge/model.py new file mode 100644 index 000000000..6876c4af2 --- /dev/null +++ b/comfy/ldm/moge/model.py @@ -0,0 +1,347 @@ +"""MoGe v1 / v2 inference modules and a state-dict-driven builder. + +V1: DINOv2 backbone + multi-output head (points, mask). +V2: DINOv2 encoder + neck + per-output heads (points, mask, normal, optional metric-scale MLP). +""" + +from __future__ import annotations + +from numbers import Number +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ops +import comfy.model_management +import comfy.model_patcher + +from comfy.image_encoders.dino2 import Dinov2Model + +from .geometry import depth_map_to_point_map, intrinsics_from_focal_center, recover_focal_shift +from .modules import ConvStack, DINOv2Encoder, HeadV1, MLP, _view_plane_uv_grid + + +def _remap_points(points: torch.Tensor) -> torch.Tensor: + """Apply the exp remap: z -> exp(z), xy stays linear and gets scaled by the new z.""" + xy, z = points.split([2, 1], dim=-1) + z = torch.exp(z) + return torch.cat([xy * z, z], dim=-1) + + +def _detect_dinov2(sd: dict, prefix: str) -> Dict[str, Any]: + # All shipped MoGe checkpoints use plain DINOv2 + hidden = sd[prefix + "embeddings.cls_token"].shape[-1] + layer_prefix = prefix + "encoder.layer." + depth = 1 + max(int(k[len(layer_prefix):].split(".")[0]) for k in sd if k.startswith(layer_prefix)) + return { + "hidden_size": hidden, + "num_attention_heads": hidden // 64, + "num_hidden_layers": depth, + "layer_norm_eps": 1e-6, + "use_swiglu_ffn": False, + } + + +class MoGeModelV1(nn.Module): + """MoGe v1: DINOv2 backbone + HeadV1 (points, mask).""" + + image_mean: torch.Tensor + image_std: torch.Tensor + + intermediate_layers = 4 + num_tokens_range: Tuple[Number, Number] = (1200, 2500) + mask_threshold = 0.5 + + def __init__(self, backbone: Dict[str, Any], dim_upsample: List[int] = (256, 128, 128), + num_res_blocks: int = 1, dim_times_res_block_hidden: int = 1, + dtype=None, device=None, operations=comfy.ops.manual_cast): + super().__init__() + self.backbone = Dinov2Model(backbone, dtype, device, operations) + self.head = HeadV1(dim_in=backbone["hidden_size"], dim_upsample=list(dim_upsample), + num_res_blocks=num_res_blocks, dim_times_res_block_hidden=dim_times_res_block_hidden, + dtype=dtype, device=device, operations=operations) + self.register_buffer("image_mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)) + self.register_buffer("image_std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)) + + def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]: + H, W = image.shape[-2:] + resize = ((num_tokens * 14 ** 2) / (H * W)) ** 0.5 + rh, rw = int(H * resize), int(W * resize) + x = F.interpolate(image, (rh, rw), mode="bicubic", align_corners=False, antialias=True) + x = (x - self.image_mean) / self.image_std + x14 = F.interpolate(x, (rh // 14 * 14, rw // 14 * 14), mode="bilinear", align_corners=False, antialias=True) + + n_layers = len(self.backbone.encoder.layer) + indices = list(range(n_layers - self.intermediate_layers, n_layers)) + feats = self.backbone.get_intermediate_layers(x14, indices, apply_norm=True) + + points, mask = self.head(feats, x) + points = F.interpolate(points.float(), (H, W), mode="bilinear", align_corners=False) + points = _remap_points(points.permute(0, 2, 3, 1)) + + mask = F.interpolate(mask.float(), (H, W), mode="bilinear", align_corners=False).squeeze(1) + + return {"points": points, "mask": mask} + + @classmethod + def from_state_dict(cls, sd, dtype=None, device=None, operations=comfy.ops.manual_cast): + """Detect the v1 head config from sd, build a model, and load weights.""" + n_up = 1 + max(int(k.split(".")[2]) for k in sd if k.startswith("head.upsample_blocks.")) + dim_upsample = [sd[f"head.upsample_blocks.{i}.0.0.weight"].shape[1] for i in range(n_up)] + # Each upsample stage is Sequential[upsampler, *res_blocks]; count res blocks at level 0. + num_res_blocks = max({int(k.split(".")[3]) for k in sd if k.startswith("head.upsample_blocks.0.")}) + hidden_out = sd["head.upsample_blocks.0.1.layers.2.weight"].shape[0] + dim_times = max(hidden_out // dim_upsample[0], 1) + model = cls(backbone=_detect_dinov2(sd, prefix="backbone."), + dim_upsample=dim_upsample, num_res_blocks=num_res_blocks, dim_times_res_block_hidden=dim_times, + dtype=dtype, device=device, operations=operations) + model.load_state_dict(sd, strict=True) + return model + + +class MoGeModelV2(nn.Module): + """MoGe v2: DINOv2 encoder + neck + per-output heads (points/mask/normal/metric-scale).""" + + intermediate_layers = 4 + num_tokens_range: Tuple[Number, Number] = (1200, 3600) + + def __init__(self, + encoder: Dict[str, Any], + neck: Dict[str, Any], + points_head: Dict[str, Any], + mask_head: Dict[str, Any], + scale_head: Dict[str, Any], + normal_head: Optional[Dict[str, Any]] = None, + dtype=None, device=None, operations=comfy.ops.manual_cast): + super().__init__() + self.encoder = DINOv2Encoder(**encoder, dtype=dtype, device=device, operations=operations) + self.neck = ConvStack(**neck, dtype=dtype, device=device, operations=operations) + self.points_head = ConvStack(**points_head, dtype=dtype, device=device, operations=operations) + self.mask_head = ConvStack(**mask_head, dtype=dtype, device=device, operations=operations) + self.scale_head = MLP(**scale_head, dtype=dtype, device=device, operations=operations) + if normal_head is not None: + self.normal_head = ConvStack(**normal_head, dtype=dtype, device=device, operations=operations) + + def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]: + B, _, H, W = image.shape + device, dtype = image.device, image.dtype + aspect_ratio = W / H + base_h = round((num_tokens / aspect_ratio) ** 0.5) + base_w = round((num_tokens * aspect_ratio) ** 0.5) + + feat_top, cls_token = self.encoder(image, base_h, base_w, return_class_token=True) + + # 5-level pyramid: feat at level 0 concatenated with UV, other levels UV-only. + levels = [_view_plane_uv_grid(B, base_h * (2 ** L), base_w * (2 ** L), aspect_ratio, dtype, device) + for L in range(5)] + levels[0] = torch.cat([feat_top, levels[0]], dim=1) + + feats = self.neck(levels) + + def _resize(v): + return F.interpolate(v, (H, W), mode="bilinear", align_corners=False) + + points = _remap_points(_resize(self.points_head(feats)[-1]).permute(0, 2, 3, 1)) + mask = _resize(self.mask_head(feats)[-1]).squeeze(1).sigmoid() + metric_scale = self.scale_head(cls_token).squeeze(1).exp() + + result = {"points": points, "mask": mask, "metric_scale": metric_scale} + if hasattr(self, "normal_head"): + normal = _resize(self.normal_head(feats)[-1]) + result["normal"] = F.normalize(normal.permute(0, 2, 3, 1), dim=-1) + return result + + @classmethod + def from_state_dict(cls, sd, dtype=None, device=None, operations=comfy.ops.manual_cast): + """Detect the v2 encoder/neck/heads config from sd, build a model, and load weights.""" + backbone = _detect_dinov2(sd, prefix="encoder.backbone.") + depth = backbone["num_hidden_layers"] + n = cls.intermediate_layers + encoder = { + "backbone": backbone, + "intermediate_layers": [(depth // n) * (i + 1) - 1 for i in range(n)], + "dim_out": sd["encoder.output_projections.0.weight"].shape[0], + } + # scale_head is an MLP: Sequential of [Linear, ReLU, ..., Linear]; Linear weight is (out, in). + scale_idxs = sorted({int(k.split(".")[1]) for k in sd if k.startswith("scale_head.")}) + scale_first = sd[f"scale_head.{scale_idxs[0]}.weight"] + cfg: Dict[str, Any] = { + "encoder": encoder, + "neck": cls._detect_convstack(sd, "neck."), + "points_head": cls._detect_convstack(sd, "points_head."), + "mask_head": cls._detect_convstack(sd, "mask_head."), + "scale_head": {"dims": [scale_first.shape[1]] + [sd[f"scale_head.{i}.weight"].shape[0] for i in scale_idxs]}, + } + if any(k.startswith("normal_head.") for k in sd): + cfg["normal_head"] = cls._detect_convstack(sd, "normal_head.") + model = cls(**cfg, dtype=dtype, device=device, operations=operations) + model.load_state_dict(sd, strict=True) + return model + + @staticmethod + def _detect_convstack(sd: dict, prefix: str) -> Dict[str, Any]: + """Reconstruct a ConvStack config from the keys under prefix""" + in_keys = [k for k in sd if k.startswith(f"{prefix}input_blocks.") and k.endswith(".weight")] + n = 1 + max(int(k[len(f"{prefix}input_blocks."):].split(".")[0]) for k in in_keys) + + in_shapes = [sd[f"{prefix}input_blocks.{i}.weight"].shape for i in range(n)] + has_out = lambda i: f"{prefix}output_blocks.{i}.weight" in sd + has_norm = f"{prefix}res_blocks.0.0.layers.0.weight" in sd + + def num_res_at(i): + rb_prefix = f"{prefix}res_blocks.{i}." + return len({int(k[len(rb_prefix):].split(".")[0]) for k in sd if k.startswith(rb_prefix)}) + + return { + "dim_in": [s[1] for s in in_shapes], + "dim_res_blocks": [s[0] for s in in_shapes], + "dim_out": [sd[f"{prefix}output_blocks.{i}.weight"].shape[0] if has_out(i) else None for i in range(n)], + "num_res_blocks": [num_res_at(i) for i in range(n)], + "resamplers": ["conv_transpose" if f"{prefix}resamplers.{i}.0.weight" in sd else "bilinear" + for i in range(n - 1)], + "res_block_in_norm": "layer_norm" if has_norm else "none", + "res_block_hidden_norm": "group_norm" if has_norm else "none", + } + + +# Translate the Meta-style DINOv2 keys MoGe ships to the naming ComfyUI DINOv2 port expects, +# and split each fused qkv tensor into Q/K/V. +_DINOV2_TOPLEVEL_RENAMES = { + "patch_embed.proj.weight": "embeddings.patch_embeddings.projection.weight", + "patch_embed.proj.bias": "embeddings.patch_embeddings.projection.bias", + "cls_token": "embeddings.cls_token", + "pos_embed": "embeddings.position_embeddings", + "register_tokens": "embeddings.register_tokens", + "mask_token": "embeddings.mask_token", + "norm.weight": "layernorm.weight", + "norm.bias": "layernorm.bias", +} +_DINOV2_BLOCK_RENAMES = [ + ("ls1.gamma", "layer_scale1.lambda1"), + ("ls2.gamma", "layer_scale2.lambda1"), + ("attn.proj.", "attention.output.dense."), + ("mlp.w12.", "mlp.weights_in."), + ("mlp.w3.", "mlp.weights_out."), +] + + +def _remap_state_dict(sd: dict) -> dict: + if "model" in sd and "model_config" in sd: + sd = sd["model"] + prefix = "encoder.backbone." if any(k.startswith("encoder.backbone.") for k in sd) else "backbone." + out: dict = {} + for k, v in sd.items(): + if not k.startswith(prefix): + out[k] = v + continue + rel = k[len(prefix):] + if rel in _DINOV2_TOPLEVEL_RENAMES: + out[prefix + _DINOV2_TOPLEVEL_RENAMES[rel]] = v + continue + if not rel.startswith("blocks."): + out[k] = v + continue + _, idx, sub = rel.split(".", 2) + if sub in ("attn.qkv.weight", "attn.qkv.bias"): + tail = sub.rsplit(".", 1)[1] + q, kw, vw = v.chunk(3, dim=0) + base = f"{prefix}encoder.layer.{idx}.attention.attention" + out[f"{base}.query.{tail}"] = q + out[f"{base}.key.{tail}"] = kw + out[f"{base}.value.{tail}"] = vw + continue + for old, new in _DINOV2_BLOCK_RENAMES: + sub = sub.replace(old, new) + out[f"{prefix}encoder.layer.{idx}.{sub}"] = v + return out + + +def build_from_state_dict(sd: dict, dtype=None, device=None, operations=comfy.ops.manual_cast) -> nn.Module: + """Dispatch to v1 or v2 based on the DINOv2 backbone prefix.""" + sd = _remap_state_dict(sd) + cls = MoGeModelV2 if any(k.startswith("encoder.backbone.") for k in sd) else MoGeModelV1 + return cls.from_state_dict(sd, dtype=dtype, device=device, operations=operations) + + +class MoGeModel: + """Loaded MoGe model + ComfyUI memory management.""" + + def __init__(self, state_dict: dict): + # text encoder dtype closest match + self.load_device = comfy.model_management.text_encoder_device() + offload_device = comfy.model_management.text_encoder_offload_device() + self.dtype = comfy.model_management.text_encoder_dtype(self.load_device) + + self.model = build_from_state_dict(state_dict, dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast).eval() + self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device) + self.version = "v2" if hasattr(self.model, "encoder") else "v1" + self.mask_threshold = float(getattr(self.model, "mask_threshold", 0.5)) + nt = getattr(self.model, "num_tokens_range", (1200, 2500 if self.version == "v1" else 3600)) + self.num_tokens_range = (int(nt[0]), int(nt[1])) + + def infer(self, image: torch.Tensor, num_tokens: Optional[int] = None, + resolution_level: int = 9, fov_x: Optional[Union[Number, torch.Tensor]] = None, + force_projection: bool = True, apply_mask: bool = True, + apply_metric_scale: bool = True + ) -> Dict[str, torch.Tensor]: + """Run a single MoGe forward + post-process pass. image is (B, 3, H, W) in [0, 1].""" + comfy.model_management.load_model_gpu(self.patcher) + image = image.to(device=self.load_device, dtype=self.dtype) + H, W = image.shape[-2:] + aspect_ratio = W / H + + if num_tokens is None: + lo, hi = self.num_tokens_range + num_tokens = int(lo + (resolution_level / 9) * (hi - lo)) + + out = self.model.forward(image, num_tokens=num_tokens) + points = out["points"].float() # recover_focal_shift goes through scipy on CPU; needs fp32. + mask_binary = out["mask"] > self.mask_threshold + normal = out.get("normal") + metric_scale = out.get("metric_scale") + + diag = (1 + aspect_ratio ** 2) ** 0.5 + + def focal_from_fov_deg(deg): + fov = torch.as_tensor(deg, device=points.device, dtype=points.dtype) + return aspect_ratio / diag / torch.tan(torch.deg2rad(fov / 2)) + + if fov_x is None: + focal, shift = recover_focal_shift(points, mask_binary) + # Fall back to 60 deg FoV when the least-squares solver flips the focal sign. + bad = ~torch.isfinite(focal) | (focal <= 0) + if bool(bad.any()): + focal = torch.where(bad, focal_from_fov_deg(60.0), focal) + _, shift = recover_focal_shift(points, mask_binary, focal=focal) + else: + focal = focal_from_fov_deg(fov_x).expand(points.shape[0]) + _, shift = recover_focal_shift(points, mask_binary, focal=focal) + + f_diag = focal / 2 * diag + half = torch.tensor(0.5, device=points.device, dtype=points.dtype) + intrinsics = intrinsics_from_focal_center(f_diag / aspect_ratio, f_diag, half, half) + points[..., 2] = points[..., 2] + shift[..., None, None] + # v2 only: filter mask by depth>0 to drop metric-scale negative-depth artifacts. + if self.version == "v2": + mask_binary = mask_binary & (points[..., 2] > 0) + depth = points[..., 2].clone() + + if force_projection: + points = depth_map_to_point_map(depth, intrinsics=intrinsics) + + if apply_metric_scale and metric_scale is not None: + points = points * metric_scale[:, None, None, None] + depth = depth * metric_scale[:, None, None] + + if apply_mask: + points = torch.where(mask_binary[..., None], points, torch.full_like(points, float("inf"))) + depth = torch.where(mask_binary, depth, torch.full_like(depth, float("inf"))) + if normal is not None: + normal = torch.where(mask_binary[..., None], normal, torch.zeros_like(normal)) + + result = {"points": points, "depth": depth, "intrinsics": intrinsics, "mask": mask_binary} + if normal is not None: + result["normal"] = normal + return result diff --git a/comfy/ldm/moge/modules.py b/comfy/ldm/moge/modules.py new file mode 100644 index 000000000..235a59212 --- /dev/null +++ b/comfy/ldm/moge/modules.py @@ -0,0 +1,204 @@ +"""Building blocks for MoGe: residual conv stack, resamplers, MLP, DINOv2 encoder, v1 head.""" + +from __future__ import annotations + +from typing import List, Optional, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ops +from comfy.image_encoders.dino2 import Dinov2Model + +from .geometry import normalized_view_plane_uv + + +def _conv2d(operations, c_in: int, c_out: int, k: int = 3, *, dtype=None, device=None): + return operations.Conv2d(c_in, c_out, kernel_size=k, padding=k // 2, padding_mode="replicate", dtype=dtype, device=device) + + +def _view_plane_uv_grid(batch: int, height: int, width: int, aspect_ratio: float, dtype, device) -> torch.Tensor: + """Batched normalized view-plane UV grid as a (B, 2, H, W) tensor.""" + uv = normalized_view_plane_uv(width, height, aspect_ratio=aspect_ratio, dtype=dtype, device=device) + return uv.permute(2, 0, 1).unsqueeze(0).expand(batch, -1, -1, -1) + + +def _concat_view_plane_uv(x: torch.Tensor, aspect_ratio: float) -> torch.Tensor: + """Append a 2-channel normalized view-plane UV grid to x along the channel dim.""" + uv = _view_plane_uv_grid(x.shape[0], x.shape[-2], x.shape[-1], aspect_ratio, x.dtype, x.device) + return torch.cat([x, uv], dim=1) + + +class ResidualConvBlock(nn.Module): + def __init__(self, channels: int, hidden_channels: Optional[int] = None, in_norm: str = "layer_norm", hidden_norm: str = "group_norm", + dtype=None, device=None, operations=comfy.ops.manual_cast): + super().__init__() + hidden_channels = hidden_channels if hidden_channels is not None else channels + + in_norm_layer = operations.GroupNorm(1, channels, dtype=dtype, device=device) if in_norm == "layer_norm" else nn.Identity() + hidden_norm_layer = (operations.GroupNorm(max(hidden_channels // 32, 1), hidden_channels, dtype=dtype, device=device) + if hidden_norm == "group_norm" else nn.Identity()) + + self.layers = nn.Sequential( + in_norm_layer, nn.ReLU(), _conv2d(operations, channels, hidden_channels, dtype=dtype, device=device), + hidden_norm_layer, nn.ReLU(), _conv2d(operations, hidden_channels, channels, dtype=dtype, device=device), + ) + + def forward(self, x): + return self.layers(x) + x + + +class Resampler(nn.Sequential): + """2x upsampler: ConvTranspose2d(2x2) or bilinear upsample, followed by a 3x3 conv.""" + + def __init__(self, in_channels: int, out_channels: int, type_: str, dtype=None, device=None, operations=comfy.ops.manual_cast): + if type_ == "conv_transpose": + up = operations.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2, dtype=dtype, device=device) + conv_in = out_channels + else: # "bilinear" + up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) + conv_in = in_channels + super().__init__(up, _conv2d(operations, conv_in, out_channels, dtype=dtype, device=device)) + + +class MLP(nn.Sequential): + def __init__(self, dims: Sequence[int], dtype=None, device=None, operations=comfy.ops.manual_cast): + layers = [] + for d_in, d_out in zip(dims[:-2], dims[1:-1]): + layers.append(operations.Linear(d_in, d_out, dtype=dtype, device=device)) + layers.append(nn.ReLU(inplace=True)) + layers.append(operations.Linear(dims[-2], dims[-1], dtype=dtype, device=device)) + super().__init__(*layers) + + +class ConvStack(nn.Module): + def __init__(self, dim_in: List[Optional[int]], dim_res_blocks: List[int], dim_out: List[Optional[int]], resamplers: List[str], + num_res_blocks: List[int], dim_times_res_block_hidden: int = 1, res_block_in_norm: str = "layer_norm", res_block_hidden_norm: str = "group_norm", + dtype=None, device=None, operations=comfy.ops.manual_cast): + super().__init__() + + self.input_blocks = nn.ModuleList([ + (_conv2d(operations, d_in, d_res, k=1, dtype=dtype, device=device) + if d_in is not None else nn.Identity()) + for d_in, d_res in zip(dim_in, dim_res_blocks) + ]) + + self.resamplers = nn.ModuleList([ + Resampler(prev, succ, type_=r, dtype=dtype, device=device, operations=operations) + for prev, succ, r in zip(dim_res_blocks[:-1], dim_res_blocks[1:], resamplers) + ]) + + self.res_blocks = nn.ModuleList([ + nn.Sequential(*[ + ResidualConvBlock(d_res, dim_times_res_block_hidden * d_res, in_norm=res_block_in_norm, hidden_norm=res_block_hidden_norm, dtype=dtype, device=device, operations=operations) + for _ in range(num_res_blocks[i]) + ]) + for i, d_res in enumerate(dim_res_blocks) + ]) + + self.output_blocks = nn.ModuleList([ + (_conv2d(operations, d_res, d_out, k=1, dtype=dtype, device=device) + if d_out is not None else nn.Identity()) + for d_out, d_res in zip(dim_out, dim_res_blocks) + ]) + + def forward(self, in_features: List[Optional[torch.Tensor]]): + out_features = [] + x = None + for i in range(len(self.res_blocks)): + feat = self.input_blocks[i](in_features[i]) if in_features[i] is not None else None + if i == 0: + x = feat + elif feat is not None: + x = x + feat + x = self.res_blocks[i](x) + out_features.append(self.output_blocks[i](x)) + if i < len(self.res_blocks) - 1: + x = self.resamplers[i](x) + return out_features + + +class DINOv2Encoder(nn.Module): + """Comfy DINOv2 backbone with per-layer 1x1 projection heads.""" + + def __init__(self, backbone: dict, intermediate_layers: List[int], dim_out: int, dtype=None, device=None, operations=comfy.ops.manual_cast): + super().__init__() + self.intermediate_layers = list(intermediate_layers) + dim_features = backbone["hidden_size"] + self.backbone = Dinov2Model(backbone, dtype, device, operations) + self.output_projections = nn.ModuleList([ + _conv2d(operations, dim_features, dim_out, k=1, dtype=dtype, device=device) + for _ in range(len(self.intermediate_layers)) + ]) + self.register_buffer("image_mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)) + self.register_buffer("image_std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)) + + def forward(self, image: torch.Tensor, token_rows: int, token_cols: int, + return_class_token: bool = False) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + image_14 = F.interpolate(image, (token_rows * 14, token_cols * 14), mode="bilinear", align_corners=False, antialias=True) + image_14 = (image_14 - self.image_mean) / self.image_std + feats = self.backbone.get_intermediate_layers(image_14, self.intermediate_layers, apply_norm=True) + x = torch.stack([ + proj(feat.permute(0, 2, 1).unflatten(2, (token_rows, token_cols)).contiguous()) + for proj, (feat, _cls) in zip(self.output_projections, feats) + ], dim=1).sum(dim=1) + if return_class_token: + return x, feats[-1][1] + return x + + +class HeadV1(nn.Module): + """v1 head: 4 backbone-feature projections -> shared upsample stack -> per-target output convs (points, mask).""" + + NUM_FEATURES = 4 + DIM_PROJ = 512 + DIM_OUT = (3, 1) # 3 channels for points, 1 for mask + LAST_CONV_CHANNELS = 32 + + def __init__(self, dim_in: int, dim_upsample: List[int] = (256, 128, 128), num_res_blocks: int = 1, dim_times_res_block_hidden: int = 1, + dtype=None, device=None, operations=comfy.ops.manual_cast): + super().__init__() + self.projects = nn.ModuleList([ + _conv2d(operations, dim_in, self.DIM_PROJ, k=1, dtype=dtype, device=device) + for _ in range(self.NUM_FEATURES) + ]) + def upsampler(in_ch, out_ch): + return nn.Sequential( + operations.ConvTranspose2d(in_ch, out_ch, kernel_size=2, stride=2, dtype=dtype, device=device), + _conv2d(operations, out_ch, out_ch, dtype=dtype, device=device), + ) + + in_chs = [self.DIM_PROJ] + list(dim_upsample[:-1]) + self.upsample_blocks = nn.ModuleList([ + nn.Sequential( + upsampler(in_ch + 2, out_ch), + *(ResidualConvBlock(out_ch, dim_times_res_block_hidden * out_ch, dtype=dtype, device=device, operations=operations) + for _ in range(num_res_blocks)) + ) + for in_ch, out_ch in zip(in_chs, dim_upsample) + ]) + self.output_block = nn.ModuleList([ + nn.Sequential( + _conv2d(operations, dim_upsample[-1] + 2, self.LAST_CONV_CHANNELS, dtype=dtype, device=device), + nn.ReLU(inplace=True), + _conv2d(operations, self.LAST_CONV_CHANNELS, d_out, k=1, dtype=dtype, device=device), + ) + for d_out in self.DIM_OUT + ]) + + def forward(self, hidden_states, image: torch.Tensor): + img_h, img_w = image.shape[-2:] + patch_h, patch_w = img_h // 14, img_w // 14 + aspect = img_w / img_h + x = torch.stack([ + proj(feat.permute(0, 2, 1).unflatten(2, (patch_h, patch_w)).contiguous()) + for proj, (feat, _cls) in zip(self.projects, hidden_states) + ], dim=1).sum(dim=1) + + for block in self.upsample_blocks: + x = block(_concat_view_plane_uv(x, aspect)) + + x = F.interpolate(x, (img_h, img_w), mode="bilinear", align_corners=False) + x = _concat_view_plane_uv(x, aspect) + return [block(x) for block in self.output_block] diff --git a/comfy/ldm/moge/panorama.py b/comfy/ldm/moge/panorama.py new file mode 100644 index 000000000..de53ebe68 --- /dev/null +++ b/comfy/ldm/moge/panorama.py @@ -0,0 +1,313 @@ +"""Panorama (equirectangular) inference helpers for MoGe. + +Splits an equirect into 12 perspective views via an icosahedron camera rig, runs +the model per view, and stitches per-view distance maps back into a single +equirect distance map via a multi-scale Poisson + gradient sparse solve. +Image sampling uses F.grid_sample (GPU); the sparse solve uses lsmr (CPU). +""" + +from __future__ import annotations + +from typing import Callable, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F + +from scipy.ndimage import convolve, map_coordinates +from scipy.sparse import vstack, csr_array +from scipy.sparse.linalg import lsmr + + +def _icosahedron_directions() -> np.ndarray: + """12 icosahedron-vertex directions (non-normalised, matching upstream's vertex order).""" + A = (1.0 + np.sqrt(5.0)) / 2.0 + return np.array([ + [0, 1, A], [0, -1, A], [0, 1, -A], [0, -1, -A], + [1, A, 0], [-1, A, 0], [1, -A, 0], [-1, -A, 0], + [A, 0, 1], [A, 0, -1], [-A, 0, 1], [-A, 0, -1], + ], dtype=np.float32) + + +def _intrinsics_from_fov(fov_x_rad: float, fov_y_rad: float) -> np.ndarray: + """Normalised-image (unit-square) K matrix.""" + fx = 0.5 / np.tan(fov_x_rad / 2) + fy = 0.5 / np.tan(fov_y_rad / 2) + return np.array([[fx, 0, 0.5], [0, fy, 0.5], [0, 0, 1]], dtype=np.float32) + + +def _extrinsics_look_at(eye: np.ndarray, target: np.ndarray, up: np.ndarray) -> np.ndarray: + """OpenCV-convention world->camera extrinsics for an array of look-at targets (N, 4, 4).""" + eye = np.asarray(eye, dtype=np.float32) + target = np.asarray(target, dtype=np.float32) + up = np.asarray(up, dtype=np.float32) + if target.ndim == 1: + target = target[None] + + fwd = target - eye + fwd = fwd / np.linalg.norm(fwd, axis=-1, keepdims=True).clip(1e-12) + right = np.cross(fwd, up) + right_norm = np.linalg.norm(right, axis=-1, keepdims=True) + # Fall back to an arbitrary perpendicular if forward is parallel to up. + parallel = right_norm.squeeze(-1) < 1e-6 + if parallel.any(): + alt_up = np.array([1, 0, 0], dtype=np.float32) + right = np.where(parallel[:, None], np.cross(fwd, alt_up), right) + right_norm = np.linalg.norm(right, axis=-1, keepdims=True) + right = right / right_norm.clip(1e-12) + new_up = np.cross(fwd, right) + + R = np.stack([right, new_up, fwd], axis=-2) + t = -np.einsum("nij,j->ni", R, eye) + E = np.zeros((R.shape[0], 4, 4), dtype=np.float32) + E[:, :3, :3] = R + E[:, :3, 3] = t + E[:, 3, 3] = 1.0 + return E + + +def get_panorama_cameras() -> Tuple[np.ndarray, List[np.ndarray]]: + """Returns (extrinsics (12, 4, 4), [intrinsics] * 12) for icosahedron views at 90 deg FoV.""" + targets = _icosahedron_directions() + eye = np.zeros(3, dtype=np.float32) + up = np.array([0, 0, 1], dtype=np.float32) + extrinsics = _extrinsics_look_at(eye, targets, up) + K = _intrinsics_from_fov(np.deg2rad(90.0), np.deg2rad(90.0)) + return extrinsics, [K] * len(targets) + + +def spherical_uv_to_directions(uv: np.ndarray) -> np.ndarray: + """Equirect UV in [0, 1] -> 3D unit-direction (Z up).""" + theta = (1 - uv[..., 0]) * (2 * np.pi) + phi = uv[..., 1] * np.pi + return np.stack([ + np.sin(phi) * np.cos(theta), + np.sin(phi) * np.sin(theta), + np.cos(phi), + ], axis=-1).astype(np.float32) + + +def directions_to_spherical_uv(directions: np.ndarray) -> np.ndarray: + """3D direction -> equirect UV in [0, 1].""" + n = np.linalg.norm(directions, axis=-1, keepdims=True).clip(1e-12) + d = directions / n + u = 1 - np.arctan2(d[..., 1], d[..., 0]) / (2 * np.pi) % 1.0 + v = np.arccos(d[..., 2].clip(-1, 1)) / np.pi + return np.stack([u, v], axis=-1).astype(np.float32) + + +def _uv_grid(H: int, W: int) -> np.ndarray: + """Pixel-center UV grid in [0, 1]; (H, W, 2).""" + u = (np.arange(W, dtype=np.float32) + 0.5) / W + v = (np.arange(H, dtype=np.float32) + 0.5) / H + return np.stack(np.meshgrid(u, v, indexing="xy"), axis=-1) + + +def _unproject_cv(uv: np.ndarray, depth: np.ndarray, + extrinsics: np.ndarray, intrinsics: np.ndarray) -> np.ndarray: + """Back-project pixels into world coords (OpenCV convention).""" + pix = np.concatenate([uv, np.ones_like(uv[..., :1])], axis=-1) + K_inv = np.linalg.inv(intrinsics) + cam = pix @ K_inv.T * depth[..., None] + cam_h = np.concatenate([cam, np.ones_like(cam[..., :1])], axis=-1) + E_inv = np.linalg.inv(extrinsics) + return (cam_h @ E_inv.T)[..., :3] + + +def _project_cv(points: np.ndarray, extrinsics: np.ndarray, intrinsics: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + """World coords -> (uv, depth) in the camera (OpenCV convention).""" + pts_h = np.concatenate([points, np.ones_like(points[..., :1])], axis=-1) + cam = pts_h @ extrinsics.T + cam_xyz = cam[..., :3] + depth = cam_xyz[..., 2] + proj = cam_xyz @ intrinsics.T + uv = proj[..., :2] / proj[..., 2:3].clip(1e-12) + return uv.astype(np.float32), depth.astype(np.float32) + + +def _grid_sample_uv(img_bchw: torch.Tensor, uv: torch.Tensor, mode: str = "bilinear") -> torch.Tensor: + """Sample img_bchw at UV-in-[0,1] coords uv of shape (B, H, W, 2); replicate-border.""" + grid = uv * 2.0 - 1.0 + return F.grid_sample(img_bchw, grid, mode=mode, padding_mode="border", align_corners=False) + + +def split_panorama_image(image: torch.Tensor, extrinsics: np.ndarray, intrinsics: List[np.ndarray], resolution: int) -> torch.Tensor: + """(3, Hp, Wp) equirect on any device -> (N, 3, R, R) perspective crops on the same device.""" + device = image.device + N = len(extrinsics) + uv = _uv_grid(resolution, resolution) + sample_uvs = [] + for i in range(N): + world = _unproject_cv(uv, np.ones(uv.shape[:-1], dtype=np.float32), extrinsics[i], intrinsics[i]) + sample_uvs.append(directions_to_spherical_uv(world)) + sample_uvs = np.stack(sample_uvs, axis=0) + + img_bchw = image.unsqueeze(0).expand(N, -1, -1, -1).contiguous() + sample_uvs_t = torch.from_numpy(sample_uvs).to(device=device, dtype=image.dtype) + return _grid_sample_uv(img_bchw, sample_uvs_t, mode="bilinear") + + +def _poisson_equation(W: int, H: int, wrap_x: bool = False, wrap_y: bool = False): + """Sparse Laplacian operator over the H x W grid.""" + grid_index = np.arange(H * W).reshape(H, W) + grid_index = np.pad(grid_index, ((0, 0), (1, 1)), mode="wrap" if wrap_x else "edge") + grid_index = np.pad(grid_index, ((1, 1), (0, 0)), mode="wrap" if wrap_y else "edge") + + data = np.array([[-4, 1, 1, 1, 1]], dtype=np.float32).repeat(H * W, axis=0).reshape(-1) + indices = np.stack([ + grid_index[1:-1, 1:-1], + grid_index[:-2, 1:-1], grid_index[2:, 1:-1], + grid_index[1:-1, :-2], grid_index[1:-1, 2:], + ], axis=-1).reshape(-1) + indptr = np.arange(0, H * W * 5 + 1, 5) + return csr_array((data, indices, indptr), shape=(H * W, H * W)) + + +def _grad_equation(W: int, H: int, wrap_x: bool = False, wrap_y: bool = False): + """Sparse forward-difference operator over the H x W grid.""" + grid_index = np.arange(W * H).reshape(H, W) + if wrap_x: + grid_index = np.pad(grid_index, ((0, 0), (0, 1)), mode="wrap") + if wrap_y: + grid_index = np.pad(grid_index, ((0, 1), (0, 0)), mode="wrap") + + data = np.concatenate([ + np.concatenate([ + np.ones((grid_index.shape[0], grid_index.shape[1] - 1), dtype=np.float32).reshape(-1, 1), + -np.ones((grid_index.shape[0], grid_index.shape[1] - 1), dtype=np.float32).reshape(-1, 1), + ], axis=1).reshape(-1), + np.concatenate([ + np.ones((grid_index.shape[0] - 1, grid_index.shape[1]), dtype=np.float32).reshape(-1, 1), + -np.ones((grid_index.shape[0] - 1, grid_index.shape[1]), dtype=np.float32).reshape(-1, 1), + ], axis=1).reshape(-1), + ]) + indices = np.concatenate([ + np.concatenate([grid_index[:, :-1].reshape(-1, 1), grid_index[:, 1:].reshape(-1, 1)], axis=1).reshape(-1), + np.concatenate([grid_index[:-1, :].reshape(-1, 1), grid_index[1:, :].reshape(-1, 1)], axis=1).reshape(-1), + ]) + nx = grid_index.shape[0] * (grid_index.shape[1] - 1) + ny = (grid_index.shape[0] - 1) * grid_index.shape[1] + indptr = np.arange(0, nx * 2 + ny * 2 + 1, 2) + return csr_array((data, indices, indptr), shape=(nx + ny, H * W)) + + +def _scipy_remap_bilinear(img: np.ndarray, sample_pixels: np.ndarray, mode: str = "bilinear") -> np.ndarray: + """Bilinear/nearest sampling at fractional pixel coords; out-of-range clamps to nearest border.""" + H, W = img.shape[:2] + yy = np.clip(sample_pixels[..., 1], 0, H - 1) + xx = np.clip(sample_pixels[..., 0], 0, W - 1) + order = 1 if mode == "bilinear" else 0 + if img.ndim == 2: + return map_coordinates(img, [yy, xx], order=order, mode="nearest").astype(img.dtype) + out = np.stack([ + map_coordinates(img[..., c], [yy, xx], order=order, mode="nearest") + for c in range(img.shape[-1]) + ], axis=-1) + return out.astype(img.dtype) + + +def merge_panorama_depth(width: int, height: int, + distance_maps: List[np.ndarray], pred_masks: List[np.ndarray], + extrinsics: List[np.ndarray], intrinsics: List[np.ndarray], + on_view: Optional[Callable[[], None]] = None, + on_solve_start: Optional[Callable[[int, int], None]] = None, + on_solve_end: Optional[Callable[[int, int], None]] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + """Stitch per-view distance maps into a single equirect distance map. + + Recursive multi-scale solve: solves at half resolution first and uses that as the lsmr init + for the full-resolution solve. Optional callbacks fire per view processed and around each + lsmr solve so callers can drive a progress bar. + """ + + if max(width, height) > 256: + coarse_depth, _ = merge_panorama_depth(width // 2, height // 2, + distance_maps, pred_masks, extrinsics, intrinsics, + on_view=on_view, + on_solve_start=on_solve_start, + on_solve_end=on_solve_end) + t = torch.from_numpy(coarse_depth).unsqueeze(0).unsqueeze(0) + t = F.interpolate(t, size=(height, width), mode="bilinear", align_corners=False) + depth_init = t.squeeze().numpy().astype(np.float32) + else: + depth_init = None + + spherical_directions = spherical_uv_to_directions(_uv_grid(height, width)) + + pano_log_grad_maps, pano_grad_masks = [], [] + pano_log_lap_maps, pano_lap_masks = [], [] + pano_pred_masks: List[np.ndarray] = [] + + for i in range(len(distance_maps)): + proj_uv, proj_depth = _project_cv(spherical_directions, extrinsics[i], intrinsics[i]) + proj_valid = (proj_depth > 0) & (proj_uv > 0).all(axis=-1) & (proj_uv < 1).all(axis=-1) + + Hd, Wd = distance_maps[i].shape[:2] + proj_pixels = np.clip(proj_uv, 0, 1) * np.array([Wd - 1, Hd - 1], dtype=np.float32) + + log_dist = np.log(np.clip(distance_maps[i], 1e-6, None)) + sampled = _scipy_remap_bilinear(log_dist, proj_pixels, mode="bilinear") + pano_log = np.where(proj_valid, sampled, 0.0).astype(np.float32) + + sampled_mask = _scipy_remap_bilinear(pred_masks[i].astype(np.uint8), proj_pixels, mode="nearest") + pano_pred = proj_valid & (sampled_mask > 0) + + # Equirect wraps horizontally but not vertically: wrap pad along x, edge pad along y. + padded = np.pad(pano_log, ((0, 0), (0, 1)), mode="wrap") + gx, gy = padded[:, :-1] - padded[:, 1:], padded[:-1, :] - padded[1:, :] + padded_m = np.pad(pano_pred, ((0, 0), (0, 1)), mode="wrap") + mx, my = padded_m[:, :-1] & padded_m[:, 1:], padded_m[:-1, :] & padded_m[1:, :] + pano_log_grad_maps.append((gx, gy)) + pano_grad_masks.append((mx, my)) + + padded = np.pad(pano_log, ((1, 1), (0, 0)), mode="edge") + padded = np.pad(padded, ((0, 0), (1, 1)), mode="wrap") + lap_kernel = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]], dtype=np.float32) + lap = convolve(padded, lap_kernel)[1:-1, 1:-1] + padded_m = np.pad(pano_pred, ((1, 1), (0, 0)), mode="edge") + padded_m = np.pad(padded_m, ((0, 0), (1, 1)), mode="wrap") + m_kernel = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8) + lap_mask = convolve(padded_m.astype(np.uint8), m_kernel)[1:-1, 1:-1] == 5 + pano_log_lap_maps.append(lap) + pano_lap_masks.append(lap_mask) + pano_pred_masks.append(pano_pred) + + if on_view is not None: + on_view() + + gx = np.stack([m[0] for m in pano_log_grad_maps], axis=0) + gy = np.stack([m[1] for m in pano_log_grad_maps], axis=0) + mx = np.stack([m[0] for m in pano_grad_masks], axis=0) + my = np.stack([m[1] for m in pano_grad_masks], axis=0) + gx_avg = (gx * mx).sum(axis=0) / mx.sum(axis=0).clip(1e-3) + gy_avg = (gy * my).sum(axis=0) / my.sum(axis=0).clip(1e-3) + + laps = np.stack(pano_log_lap_maps, axis=0) + lap_masks = np.stack(pano_lap_masks, axis=0) + lap_avg = (laps * lap_masks).sum(axis=0) / lap_masks.sum(axis=0).clip(1e-3) + + grad_x_mask = mx.any(axis=0).reshape(-1) + grad_y_mask = my.any(axis=0).reshape(-1) + grad_mask = np.concatenate([grad_x_mask, grad_y_mask]) + lap_mask_flat = lap_masks.any(axis=0).reshape(-1) + + A = vstack([ + _grad_equation(width, height, wrap_x=True, wrap_y=False)[grad_mask], + _poisson_equation(width, height, wrap_x=True, wrap_y=False)[lap_mask_flat], + ]) + b = np.concatenate([ + gx_avg.reshape(-1)[grad_x_mask], + gy_avg.reshape(-1)[grad_y_mask], + lap_avg.reshape(-1)[lap_mask_flat], + ]) + x0 = np.log(np.clip(depth_init, 1e-6, None)).reshape(-1) if depth_init is not None else None + + if on_solve_start is not None: + on_solve_start(width, height) + x, *_ = lsmr(A, b, atol=1e-5, btol=1e-5, x0=x0, show=False) + if on_solve_end is not None: + on_solve_end(width, height) + + pano_depth = np.exp(x).reshape(height, width).astype(np.float32) + pano_mask = np.any(pano_pred_masks, axis=0) + return pano_depth, pano_mask diff --git a/comfy_extras/nodes_moge.py b/comfy_extras/nodes_moge.py new file mode 100644 index 000000000..d9a08ebc7 --- /dev/null +++ b/comfy_extras/nodes_moge.py @@ -0,0 +1,406 @@ +"""ComfyUI nodes for the native MoGe (Monocular Geometry Estimation) integration.""" + +from __future__ import annotations + +import torch + +import comfy.utils +import folder_paths +from comfy_api.latest import ComfyExtension, Types, io +from typing_extensions import override + +from comfy.ldm.moge.model import MoGeModel +from comfy.ldm.moge.geometry import triangulate_grid_mesh +from comfy.ldm.moge.panorama import get_panorama_cameras, split_panorama_image, merge_panorama_depth, spherical_uv_to_directions, _uv_grid +import comfy.model_management +from tqdm.auto import tqdm + +MoGeModelType = io.Custom("MOGE_MODEL") +MoGeGeometry = io.Custom("MOGE_GEOMETRY") + + +# MOGE_GEOMETRY is a dict with these optional keys (absent when the upstream model didn't produce them): +# "points": torch.Tensor (B, H, W, 3) +# "depth": torch.Tensor (B, H, W) +# "intrinsics": torch.Tensor (B, 3, 3) -- perspective only +# "mask": torch.Tensor (B, H, W) bool +# "normal": torch.Tensor (B, H, W, 3) -- v2 only +# "image": torch.Tensor (B, H, W, 3) in [0, 1], CPU (always present) + + +def _turbo(x: torch.Tensor) -> torch.Tensor: + """Anton Mikhailov polynomial approximation of the turbo colormap.""" + x = x.clamp(0.0, 1.0) + x2 = x * x + x3 = x2 * x + x4 = x2 * x2 + x5 = x4 * x + r = 0.13572138 + 4.61539260*x - 42.66032258*x2 + 132.13108234*x3 - 152.94239396*x4 + 59.28637943*x5 + g = 0.09140261 + 2.19418839*x + 4.84296658*x2 - 14.18503333*x3 + 4.27729857*x4 + 2.82956604*x5 + b = 0.10667330 + 12.64194608*x - 60.58204836*x2 + 110.36276771*x3 - 89.90310912*x4 + 27.34824973*x5 + return torch.stack([r, g, b], dim=-1).clamp(0.0, 1.0) + + +def _normals_from_points(points: torch.Tensor) -> torch.Tensor: + """Camera-space surface normals from a (B, H, W, 3) point map (v1 fallback).""" + finite = torch.isfinite(points).all(dim=-1) + pts = torch.where(finite.unsqueeze(-1), points, torch.zeros_like(points)) + dx = pts[..., :, 2:, :] - pts[..., :, :-2, :] + dy = pts[..., 2:, :, :] - pts[..., :-2, :, :] + dx = torch.nn.functional.pad(dx.permute(0, 3, 1, 2), (1, 1, 0, 0)).permute(0, 2, 3, 1) + dy = torch.nn.functional.pad(dy.permute(0, 3, 1, 2), (0, 0, 1, 1)).permute(0, 2, 3, 1) + # dy x dx (not dx x dy) so the result is outward-facing in OpenCV (Y-down flips the right-hand rule), matching v2's predicted normals. + n = torch.cross(dy, dx, dim=-1) + n = torch.nn.functional.normalize(n, dim=-1) + return torch.where(finite.unsqueeze(-1), n, torch.zeros_like(n)) + + +def _normalize_disparity(depth: torch.Tensor) -> torch.Tensor: + """Per-batch normalize 1/depth to [0, 1] using 0.1/99.9 percentile clipping.""" + out = torch.zeros_like(depth) + for i in range(depth.shape[0]): + d = depth[i] + valid = torch.isfinite(d) & (d > 0) + if not valid.any(): + continue + disp = torch.where(valid, 1.0 / d.clamp_min(1e-6), torch.zeros_like(d)) + disp_valid = disp[valid] + lo = torch.quantile(disp_valid, 0.001) + hi = torch.quantile(disp_valid, 0.999) + scale = (hi - lo).clamp_min(1e-6) + norm = ((disp - lo) / scale).clamp(0.0, 1.0) + out[i] = torch.where(valid, norm, torch.zeros_like(norm)) + return out + + +class LoadMoGeModel(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="LoadMoGeModel", + display_name="Load MoGe Model", + category="loaders", + inputs=[ + io.Combo.Input("model_name", options=folder_paths.get_filename_list("geometry_estimation")), + ], + outputs=[MoGeModelType.Output()], + ) + + @classmethod + def execute(cls, model_name) -> io.NodeOutput: + path = folder_paths.get_full_path_or_raise("geometry_estimation", model_name) + sd = comfy.utils.load_torch_file(path, safe_load=True) + return io.NodeOutput(MoGeModel(sd)) + + +class MoGePanoramaInference(io.ComfyNode): + """Equirectangular panorama inference: split into 12 perspective views, run + MoGe at fov_x=90 on each, merge via multi-scale Poisson + gradient solve. + v2's predicted normals and metric scale are ignored (per-view scales would not align across seams). + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MoGePanoramaInference", + display_name="MoGe Panorama Inference", + category="image/geometry_estimation", + inputs=[ + MoGeModelType.Input("moge_model"), + io.Image.Input("image", tooltip="Equirectangular panorama (any aspect)."), + io.Int.Input("resolution_level", default=9, min=0, max=9, + tooltip="Per-view detail (0 = fastest, 9 = most detailed)."), + io.Int.Input("split_resolution", default=512, min=256, max=1024, + tooltip="Resolution of each perspective split."), + io.Int.Input("merge_resolution", default=1920, min=256, max=8192, + tooltip="Long-side resolution of the merged equirect distance map."), + io.Int.Input("batch_size", default=4, min=1, max=12, + tooltip="Views per inference batch (12 splits total)."), + ], + outputs=[MoGeGeometry.Output(display_name="moge_geometry")], + ) + + @classmethod + def execute(cls, moge_model, image, resolution_level, split_resolution, merge_resolution, batch_size) -> io.NodeOutput: + + if image.shape[0] != 1: + raise ValueError(f"MoGePanoramaInference takes a single image (got batch of {image.shape[0]})") + + image = image[..., :3] + H, W = int(image.shape[1]), int(image.shape[2]) + scale = min(merge_resolution / max(H, W), 1.0) + merge_h, merge_w = max(int(H * scale), 32), max(int(W * scale), 32) + + extrinsics, intrinsics = get_panorama_cameras() + + comfy.model_management.load_model_gpu(moge_model.patcher) + device = moge_model.load_device + img_chw = image[0].movedim(-1, -3).to(device=device, dtype=moge_model.dtype) + splits = split_panorama_image(img_chw, extrinsics, intrinsics, split_resolution) + + n_views = splits.shape[0] + + # Weight each lsmr solve by 4^level so the final-resolution solve doesn't leave the bar idle. + merge_levels: list[tuple[int, int]] = [] + w_, h_ = merge_w, merge_h + while True: + merge_levels.append((w_, h_)) + if max(w_, h_) <= 256: + break + w_, h_ = w_ // 2, h_ // 2 + merge_levels.reverse() + + solve_weight = {wh: 4 ** i for i, wh in enumerate(merge_levels)} + n_merge_view_units = n_views * len(merge_levels) + n_merge_solve_units = sum(solve_weight.values()) + + pbar = comfy.utils.ProgressBar(n_views + n_merge_view_units + n_merge_solve_units) + done = 0 + + distance_maps: list = [] + masks: list = [] + with tqdm(total=n_views, desc="MoGe panorama inference") as tq: + for i in range(0, n_views, batch_size): + batch = splits[i:i + batch_size] + # apply_metric_scale=False: per-view scales would not align across overlap seams. + result = moge_model.infer(batch, resolution_level=resolution_level, + fov_x=90.0, force_projection=True, + apply_mask=False, apply_metric_scale=False) + distance_maps.extend(list(result["points"].float().norm(dim=-1).cpu().numpy())) + masks.extend(list(result["mask"].cpu().numpy())) + n = batch.shape[0] + done += n + pbar.update_absolute(done) + tq.update(n) + + with tqdm(total=n_merge_view_units + n_merge_solve_units, desc="MoGe panorama merge: views") as tq: + def _on_merge_view(): + nonlocal done + done += 1 + pbar.update_absolute(done) + tq.update(1) + + def _on_solve_start(w, h): + tq.set_description(f"MoGe panorama merge: solving {w}x{h}") + + def _on_solve_end(w, h): + nonlocal done + weight = solve_weight[(w, h)] + done += weight + pbar.update_absolute(done) + tq.update(weight) + tq.set_description("MoGe panorama merge: views") + + pano_depth, pano_mask = merge_panorama_depth( + merge_w, merge_h, distance_maps, masks, list(extrinsics), intrinsics, + on_view=_on_merge_view, on_solve_start=_on_solve_start, on_solve_end=_on_solve_end) + + pano_depth = torch.from_numpy(pano_depth) + pano_mask = torch.from_numpy(pano_mask) + + if (merge_h, merge_w) != (H, W): + pano_depth = torch.nn.functional.interpolate(pano_depth[None, None], size=(H, W), mode="bilinear", align_corners=False).squeeze() + pano_mask = torch.nn.functional.interpolate(pano_mask[None, None].float(), size=(H, W), mode="nearest").squeeze() > 0 + + # Pixels uncovered by any view's predicted foreground are unconstrained in the lsmr solve and stay at log_depth=0 (depth=1) + if pano_mask.any() and not pano_mask.all(): + far = torch.quantile(pano_depth[pano_mask], 0.95) * 5.0 + pano_depth = torch.where(pano_mask, pano_depth, far) + + directions = torch.from_numpy(spherical_uv_to_directions(_uv_grid(H, W))) + points = (directions * pano_depth[..., None]).unsqueeze(0) + depth = pano_depth.unsqueeze(0) + mask = pano_mask.unsqueeze(0) + + # Points stay in MoGe spherical coords; MoGePointMapToMesh applies the spherical->glTF rotation after triangulation + moge_geometry = {"points": points, "depth": depth, "mask": mask, "image": image.cpu()} + return io.NodeOutput(moge_geometry) + + +class MoGeInference(io.ComfyNode): + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MoGeInference", + display_name="MoGe Inference", + category="image/geometry_estimation", + inputs=[ + MoGeModelType.Input("moge_model"), + io.Image.Input("image"), + io.Int.Input("resolution_level", default=9, min=0, max=9, + tooltip="0 = fastest, 9 = most detail."), + io.Float.Input("fov_x_degrees", default=0.0, min=0.0, max=170.0, step=0.1, advanced=True, + tooltip="Horizontal field of view of the source camera. Sets the focal length used to unproject the depth map into 3D. 0 = auto-recover from the predicted points."), + io.Int.Input("batch_size", default=4, min=1, max=64, + tooltip="Images per inference call. Lower if you OOM on a long video / image set."), + io.Boolean.Input("force_projection", default=True, advanced=True), + io.Boolean.Input("apply_mask", default=True, advanced=True, + tooltip="Set masked-out (sky / invalid) pixels to inf in points and depth so meshing culls them. Disable to keep the raw predicted geometry everywhere; the mask is still returned separately."), + ], + outputs=[MoGeGeometry.Output(display_name="moge_geometry")], + ) + + @classmethod + def execute(cls, moge_model, image, resolution_level, fov_x_degrees, batch_size, force_projection, apply_mask) -> io.NodeOutput: + + image = image[..., :3] + bchw = image.movedim(-1, -3).contiguous() + B = bchw.shape[0] + fov = None if fov_x_degrees <= 0 else float(fov_x_degrees) + + pbar = comfy.utils.ProgressBar(B) + chunks: list[dict] = [] + with tqdm(total=B, desc="MoGe inference") as tq: + for i in range(0, B, batch_size): + chunk = bchw[i:i + batch_size] + chunks.append(moge_model.infer(chunk, resolution_level=resolution_level, fov_x=fov, + force_projection=force_projection, apply_mask=apply_mask)) + pbar.update_absolute(min(i + batch_size, B)) + tq.update(chunk.shape[0]) + + def stack(field): + vals = [c[field] for c in chunks if field in c] + return torch.cat(vals, dim=0) if vals else None + + moge_geometry = {"image": image.cpu()} + for field in ("points", "depth", "intrinsics", "mask", "normal"): + v = stack(field) + if v is not None: + moge_geometry[field] = v + return io.NodeOutput(moge_geometry) + + +class MoGeRender(io.ComfyNode): + """Render a visualization or mask from a MOGE_GEOMETRY packet.""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MoGeRender", + display_name="MoGe Render", + category="image/geometry_estimation", + inputs=[ + MoGeGeometry.Input("moge_geometry"), + io.Combo.Input("output", options=["depth", "depth_colored", "normal_opengl", "normal_directx", "mask"], default="depth", + tooltip="DirectX vs OpenGL controls the normal-map green-channel convention. DirectX: green = -Y down (Unreal). OpenGL: green = +Y up (Blender, Substance, Unity, glTF)."), + ], + outputs=[io.Image.Output()], + ) + + @classmethod + def execute(cls, moge_geometry, output) -> io.NodeOutput: + is_normal = output in ("normal_directx", "normal_opengl") + opengl = output.endswith("_opengl") + + # Pick the input tensor for the chosen mode and validate availability. + if output in ("depth", "depth_colored"): + if "depth" not in moge_geometry: + raise ValueError("moge_geometry has no depth output.") + src = moge_geometry["depth"] + elif is_normal: + if "normal" in moge_geometry: + src = moge_geometry["normal"] + elif "points" in moge_geometry: + src = moge_geometry["points"] + else: + raise ValueError("moge_geometry has neither normals nor points to derive normals from.") + elif output == "mask": + if "mask" not in moge_geometry: + raise ValueError("moge_geometry has no mask output.") + src = moge_geometry["mask"] + else: + raise ValueError(f"Unknown output mode: {output}") + + B = src.shape[0] + pbar = comfy.utils.ProgressBar(B) + out: list[torch.Tensor] = [] + with tqdm(total=B, desc=f"MoGe render: {output}") as tq: + for i in range(B): + slc = src[i:i + 1].float() + if output in ("depth", "depth_colored"): + d = _normalize_disparity(slc) + out.append(_turbo(d) if output == "depth_colored" + else d.unsqueeze(-1).expand(*d.shape, 3).contiguous()) + elif is_normal: + n = slc if "normal" in moge_geometry else _normals_from_points(slc) + # MoGe is OpenCV (Z+ into scene); normal-map convention is Z+ out of surface, so flip Z. + y_sign = -1.0 if opengl else 1.0 + n = n * n.new_tensor([1.0, y_sign, -1.0]) + out.append((n * 0.5 + 0.5).clamp(0.0, 1.0)) + elif output == "mask": + out.append(slc.unsqueeze(-1).expand(*slc.shape, 3).contiguous()) + pbar.update_absolute(i + 1) + tq.update(1) + result = torch.cat(out, dim=0).to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype()) + return io.NodeOutput(result) + + +class MoGePointMapToMesh(io.ComfyNode): + """Triangulate one image of a MoGe point map into a Types.MESH (UVs + texture).""" + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MoGePointMapToMesh", + display_name="MoGe Point Map to Mesh", + category="image/geometry_estimation", + inputs=[ + MoGeGeometry.Input("moge_geometry"), + io.Int.Input("batch_index", default=0, min=0, max=4096, + tooltip="Which image of a batched MoGe geometry to mesh. Per-image vertex counts " + "differ, so batches can't be stacked into a single MESH."), + io.Int.Input("decimation", default=1, min=1, max=8, + tooltip="Vertex stride; 1 = full resolution."), + io.Float.Input("discontinuity_threshold", default=0.04, min=0.0, max=1.0, step=0.01, + tooltip="Drop pixels whose 3x3 depth span exceeds this fraction. 0 = off."), + io.Boolean.Input("texture", default=True, + tooltip="Carry the source image through as the baseColor texture."), + ], + outputs=[io.Mesh.Output()], + ) + + @classmethod + def execute(cls, moge_geometry, batch_index, decimation, discontinuity_threshold, texture) -> io.NodeOutput: + if "points" not in moge_geometry: + raise ValueError("moge_geometry has no points output.") + points = moge_geometry["points"] + B = points.shape[0] + if batch_index >= B: + raise ValueError(f"batch_index {batch_index} out of range; moge_geometry has batch size {B}.") + + # Pass depth so the rtol edge check sees radial depth -- for panoramas + # points[..., 2] = cos(phi)*r goes negative below the equator and the rtol clamp would drop the bottom half. + edge_depth = moge_geometry["depth"][batch_index] if "depth" in moge_geometry else None + verts, faces, uvs = triangulate_grid_mesh( + points[batch_index], decimation=decimation, + discontinuity_threshold=discontinuity_threshold, depth=edge_depth, + ) + if verts.shape[0] == 0 or faces.shape[0] == 0: + raise ValueError("MoGe produced an empty mesh; try discontinuity_threshold=0 or apply_mask=False.") + + if "intrinsics" not in moge_geometry: + # Panorama: rotate MoGe spherical (Z up) -> glTF (Y up, Z back), correct for inside-the-sphere viewing) + verts = verts[:, [1, 2, 0]].contiguous() + else: + # Perspective MoGe (X right, Y down, Z forward) -> glTF; face flip keeps winding CCW after the Y/Z flip. + verts = verts * torch.tensor([1.0, -1.0, -1.0], dtype=verts.dtype) + faces = faces[:, [0, 2, 1]].contiguous() + + tex = moge_geometry["image"][batch_index:batch_index + 1] if texture else None + mesh = Types.MESH( + vertices=verts.unsqueeze(0), + faces=faces.unsqueeze(0), + uvs=uvs.unsqueeze(0), + texture=tex, + ) + return io.NodeOutput(mesh) + + +class MoGeExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [LoadMoGeModel, MoGeInference, MoGePanoramaInference, MoGeRender, MoGePointMapToMesh] + + +async def comfy_entrypoint() -> MoGeExtension: + return MoGeExtension() diff --git a/folder_paths.py b/folder_paths.py index 92e8df3cf..ad7f0f4fc 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -56,6 +56,8 @@ folder_names_and_paths["background_removal"] = ([os.path.join(models_dir, "backg folder_names_and_paths["frame_interpolation"] = ([os.path.join(models_dir, "frame_interpolation")], supported_pt_extensions) +folder_names_and_paths["geometry_estimation"] = ([os.path.join(models_dir, "geometry_estimation")], supported_pt_extensions) + folder_names_and_paths["optical_flow"] = ([os.path.join(models_dir, "optical_flow")], supported_pt_extensions) output_directory = os.path.join(base_path, "output") diff --git a/models/geometry_estimation/put_geometry_estimation_models_here b/models/geometry_estimation/put_geometry_estimation_models_here new file mode 100644 index 000000000..e69de29bb diff --git a/nodes.py b/nodes.py index 2b63f9fbb..991238fb8 100644 --- a/nodes.py +++ b/nodes.py @@ -2437,6 +2437,7 @@ async def init_builtin_extra_nodes(): "nodes_wandancer.py", "nodes_hidream_o1.py", "nodes_save_3d.py", + "nodes_moge.py", ] import_failed = []