From 6de7fc063b668a3d8216f26e0e0672c47da45fd5 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Mon, 25 May 2026 11:21:35 -0700 Subject: [PATCH 01/13] Emit `hash` alongside `asset_hash` on all Asset responses (#13739) * Emit `hash` alongside `asset_hash` on all Asset responses Add a `hash` field to the Asset response schema that carries the same value as the existing `asset_hash` field. Both fields are now populated in _build_asset_response, so every Asset-returning endpoint (GET, POST, PUT) includes both. No existing fields are removed. Tests updated to assert both fields. Co-authored-by: Matt Miller * Tighten hash field tests and DRY response builder - Extract assert_hash_fields_consistent() helper that verifies presence parity and value equality, replacing body.get()-based assertions that treated missing keys and explicit nulls identically. - Conftest seeded_asset fixture and seed-asset list assertions now check key absence directly, so a regression that surfaces null fields would be caught (validates exclude_none behavior). - DRY duplicate hash expression in _build_asset_response. - Add list-endpoint coverage asserting hash is present and consistent on populated assets. - Add schema-level test asserting AssetCreated inherits the hash field from Asset, guarding against future inheritance drift. 
--------- Co-authored-by: Matt Miller Co-authored-by: guill --- app/assets/api/routes.py | 4 +++- app/assets/api/schemas_out.py | 1 + tests-unit/assets_test/conftest.py | 2 ++ tests-unit/assets_test/helpers.py | 23 +++++++++++++++++++ .../assets_test/test_assets_missing_sync.py | 4 +++- tests-unit/assets_test/test_crud.py | 9 +++++++- tests-unit/assets_test/test_list_filter.py | 5 ++++ tests-unit/assets_test/test_uploads.py | 22 ++++++++++++++++++ 8 files changed, 67 insertions(+), 3 deletions(-) diff --git a/app/assets/api/routes.py b/app/assets/api/routes.py index 68126b6a5..6555974e9 100644 --- a/app/assets/api/routes.py +++ b/app/assets/api/routes.py @@ -160,10 +160,12 @@ def _build_asset_response(result: schemas.AssetDetailResult | schemas.UploadResu preview_url = None else: preview_url = _build_preview_url_from_view(result.tags, result.ref.user_metadata) + asset_content_hash = result.asset.hash if result.asset else None return schemas_out.Asset( id=result.ref.id, name=result.ref.name, - asset_hash=result.asset.hash if result.asset else None, + hash=asset_content_hash, + asset_hash=asset_content_hash, size=int(result.asset.size_bytes) if result.asset else None, mime_type=result.asset.mime_type if result.asset else None, tags=result.tags, diff --git a/app/assets/api/schemas_out.py b/app/assets/api/schemas_out.py index d99b1098d..0e748b907 100644 --- a/app/assets/api/schemas_out.py +++ b/app/assets/api/schemas_out.py @@ -10,6 +10,7 @@ class Asset(BaseModel): id: str name: str + hash: str | None = None asset_hash: str | None = None size: int | None = None mime_type: str | None = None diff --git a/tests-unit/assets_test/conftest.py b/tests-unit/assets_test/conftest.py index 6c5c56113..9867b4e14 100644 --- a/tests-unit/assets_test/conftest.py +++ b/tests-unit/assets_test/conftest.py @@ -236,6 +236,8 @@ def seeded_asset(request: pytest.FixtureRequest, http: requests.Session, api_bas r = http.post(api_base + "/api/assets", files=files, data=form_data, timeout=120) 
body = r.json() assert r.status_code == 201, body + from helpers import assert_hash_fields_consistent + assert_hash_fields_consistent(body) return body diff --git a/tests-unit/assets_test/helpers.py b/tests-unit/assets_test/helpers.py index 770e011f4..ae3de6dc3 100644 --- a/tests-unit/assets_test/helpers.py +++ b/tests-unit/assets_test/helpers.py @@ -26,3 +26,26 @@ def trigger_sync_seed_assets(session: requests.Session, base_url: str) -> None: def get_asset_filename(asset_hash: str, extension: str) -> str: return asset_hash.removeprefix("blake3:") + extension + + +def assert_hash_fields_consistent(body: dict, expected_hash: str | None = None) -> None: + """Assert hash and asset_hash invariants on an Asset response. + + Both must be present or both absent (so a regression that drops only one + is caught). When present, they must equal each other and, if expected_hash + is provided, must equal that value. + """ + hash_present = "hash" in body + asset_hash_present = "asset_hash" in body + assert hash_present == asset_hash_present, ( + f"hash and asset_hash must both be present or both absent: " + f"hash present={hash_present}, asset_hash present={asset_hash_present}" + ) + if hash_present: + h = body["hash"] + ah = body["asset_hash"] + assert h == ah, f"hash and asset_hash must match: hash={h!r}, asset_hash={ah!r}" + if expected_hash is not None: + assert h == expected_hash, ( + f"hash must equal expected: got {h!r}, expected {expected_hash!r}" + ) diff --git a/tests-unit/assets_test/test_assets_missing_sync.py b/tests-unit/assets_test/test_assets_missing_sync.py index 47dc130cb..29ec1d09d 100644 --- a/tests-unit/assets_test/test_assets_missing_sync.py +++ b/tests-unit/assets_test/test_assets_missing_sync.py @@ -40,7 +40,9 @@ def test_seed_asset_removed_when_file_is_deleted( # there should be exactly one with that name matches = [a for a in body1.get("assets", []) if a.get("name") == name] assert matches - assert matches[0].get("asset_hash") is None + # Seed assets 
have no hash; exclude_none drops both keys from the response + assert "asset_hash" not in matches[0] + assert "hash" not in matches[0] asset_info_id = matches[0]["id"] # Remove the underlying file and sync again diff --git a/tests-unit/assets_test/test_crud.py b/tests-unit/assets_test/test_crud.py index 07310223e..fd2e9a098 100644 --- a/tests-unit/assets_test/test_crud.py +++ b/tests-unit/assets_test/test_crud.py @@ -21,6 +21,8 @@ def test_create_from_hash_success( b1 = r1.json() assert r1.status_code == 201, b1 assert b1["asset_hash"] == h + assert b1["hash"] == h + assert b1["hash"] == b1["asset_hash"] assert b1["created_new"] is False aid = b1["id"] @@ -39,6 +41,7 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse detail = rg.json() assert rg.status_code == 200, detail assert detail["id"] == aid + assert detail["hash"] == detail["asset_hash"] assert "user_metadata" in detail assert "filename" in detail["user_metadata"] @@ -97,6 +100,7 @@ def test_delete_upon_reference_count( copy = r2.json() assert r2.status_code == 201, copy assert copy["asset_hash"] == src_hash + assert copy["hash"] == src_hash assert copy["created_new"] is False # Soft-delete original reference (default) -> asset identity must remain @@ -139,6 +143,7 @@ def test_update_asset_fields(http: requests.Session, api_base: str, seeded_asset body = ru.json() assert ru.status_code == 200, body assert body["name"] == payload["name"] + assert body["hash"] == body["asset_hash"] assert body["tags"] == original_tags # tags unchanged assert body["user_metadata"]["purpose"] == "updated" # filename should still be present and normalized by server @@ -289,7 +294,9 @@ def test_metadata_filename_is_set_for_seed_asset_without_hash( assert r1.status_code == 200, body matches = [a for a in body.get("assets", []) if a.get("name") == name] assert matches, "Seed asset should be visible after sync" - assert matches[0].get("asset_hash") is None # still a seed + # Seed assets have no hash; 
exclude_none drops both keys from the response + assert "asset_hash" not in matches[0] + assert "hash" not in matches[0] aid = matches[0]["id"] r2 = http.get(f"{api_base}/api/assets/{aid}", timeout=120) diff --git a/tests-unit/assets_test/test_list_filter.py b/tests-unit/assets_test/test_list_filter.py index dcb7a73ca..17bbea5c6 100644 --- a/tests-unit/assets_test/test_list_filter.py +++ b/tests-unit/assets_test/test_list_filter.py @@ -3,6 +3,7 @@ import uuid import pytest import requests +from helpers import assert_hash_fields_consistent def test_list_assets_paging_and_sort(http: requests.Session, api_base: str, asset_factory, make_asset_bytes): @@ -26,6 +27,10 @@ def test_list_assets_paging_and_sort(http: requests.Session, api_base: str, asse got1 = [a["name"] for a in b1["assets"]] assert got1 == sorted(names)[:2] assert b1["has_more"] is True + # Populated assets in list responses must carry both `hash` and `asset_hash` consistently + for asset in b1["assets"]: + assert_hash_fields_consistent(asset) + assert "hash" in asset, "populated asset must emit hash on list endpoint" r2 = http.get( api_base + "/api/assets", diff --git a/tests-unit/assets_test/test_uploads.py b/tests-unit/assets_test/test_uploads.py index 0f2b124a3..427a417cc 100644 --- a/tests-unit/assets_test/test_uploads.py +++ b/tests-unit/assets_test/test_uploads.py @@ -5,6 +5,20 @@ from concurrent.futures import ThreadPoolExecutor import requests import pytest +from app.assets.api.schemas_out import Asset, AssetCreated + + +def test_asset_created_inherits_hash_field(): + """AssetCreated must inherit `hash` from Asset so POST /api/assets responses emit it. + + Schema-level guard: integration tests cover the wire shape, but inheritance + drift (e.g. AssetCreated ever being redefined to no longer extend Asset) + would silently drop `hash` from a major endpoint without this check. 
+ """ + assert "hash" in Asset.model_fields + assert "hash" in AssetCreated.model_fields + assert AssetCreated.model_fields["hash"].annotation == Asset.model_fields["hash"].annotation + def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, make_asset_bytes): name = "dup_a.safetensors" @@ -17,6 +31,7 @@ def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, ma a1 = r1.json() assert r1.status_code == 201, a1 assert a1["created_new"] is True + assert a1["hash"] == a1["asset_hash"] # Second upload with the same data and name creates a new AssetReference (duplicates allowed) # Returns 200 because Asset already exists, but a new AssetReference is created @@ -26,6 +41,7 @@ def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, ma a2 = r2.json() assert r2.status_code in (200, 201), a2 assert a2["asset_hash"] == a1["asset_hash"] + assert a2["hash"] == a1["hash"] assert a2["id"] != a1["id"] # new reference with same content # Third upload with the same data but different name also creates new AssetReference @@ -50,6 +66,7 @@ def test_upload_fastpath_from_existing_hash_no_file(http: requests.Session, api_ b1 = r1.json() assert r1.status_code == 201, b1 h = b1["asset_hash"] + assert b1["hash"] == h # Now POST /api/assets with only hash and no file files = [ @@ -63,6 +80,7 @@ def test_upload_fastpath_from_existing_hash_no_file(http: requests.Session, api_ assert r2.status_code == 200, b2 # fast path returns 200 with created_new == False assert b2["created_new"] is False assert b2["asset_hash"] == h + assert b2["hash"] == h def test_upload_fastpath_with_known_hash_and_file( @@ -75,6 +93,7 @@ def test_upload_fastpath_with_known_hash_and_file( b1 = r1.json() assert r1.status_code == 201, b1 h = b1["asset_hash"] + assert b1["hash"] == h # Send both file and hash of existing content -> server must drain file and create from hash (200) files = {"file": ("ignored.bin", b"ignored" * 10, "application/octet-stream")} 
@@ -84,6 +103,7 @@ def test_upload_fastpath_with_known_hash_and_file( assert r2.status_code == 200, b2 assert b2["created_new"] is False assert b2["asset_hash"] == h + assert b2["hash"] == h def test_upload_multiple_tags_fields_are_merged(http: requests.Session, api_base: str): @@ -142,6 +162,8 @@ def test_concurrent_upload_identical_bytes_different_names( assert r1.status_code in (200, 201), b1 assert r2.status_code in (200, 201), b2 assert b1["asset_hash"] == b2["asset_hash"] + assert b1["hash"] == b2["hash"] + assert b1["hash"] == b1["asset_hash"] assert b1["id"] != b2["id"] created_flags = sorted([bool(b1.get("created_new")), bool(b2.get("created_new"))]) From 04879a8113961cbc4e2ff20e9feeb737ba703f51 Mon Sep 17 00:00:00 2001 From: "Daxiong (Lin)" Date: Tue, 26 May 2026 03:25:16 +0800 Subject: [PATCH 02/13] Add new open-source model and built-in tool blueprints (#13980) --- ...neration (Stable Audio 3 Medium Base).json | 2091 ++++++++ ...io Generation (Stable Audio 3 Medium).json | 2091 ++++++++ .../Canny to Image (Z-Image-Turbo).json | 2 +- blueprints/Canny to Video (LTX 2.0).json | 2 +- blueprints/ControlNet (Z-Image-Turbo).json | 2 +- .../Depth to Image (Z-Image-Turbo).json | 2 +- blueprints/Depth to Video (ltx 2.0).json | 2 +- .../First-Last-Frame to Video (LTX-2.3).json | 2 +- blueprints/First-Last-Frame to Video.json | 2 +- blueprints/Geometry Estimation (MoGe).json | 1266 +++++ blueprints/Image Captioning (gemini).json | 4 +- ...Image Depth Estimation (Lotus Depth).json} | 150 +- blueprints/Image Depth Estimation (MoGe).json | 1154 +++++ .../Image Face Detection (Mediapipe).json | 779 +++ blueprints/Image Segmentation (SAM3).json | 2 +- blueprints/Image Upscale(Z-image-Turbo).json | 4 +- ...age to Pose Map (SDPose Multi-Person).json | 1206 +++++ .../Image to Pose Map (SDPose-OOD).json | 888 ++++ blueprints/Merge Videos.json | 1219 +++++ blueprints/Pose to Image (Z-Image-Turbo).json | 2 +- blueprints/Pose to Video (LTX 2.0).json | 2 +- blueprints/Prompt 
Enhance.json | 2 +- blueprints/Remove Background (BiRefNet).json | 2 +- blueprints/Select Per-Line Text by Index.json | 485 ++ blueprints/Split Image Grid to Tiles.json | 714 +++ blueprints/Text to Image (Anima).json | 1085 +++++ blueprints/Video Captioning (Gemini).json | 4 +- blueprints/Video Depth Estimation (MoGe).json | 1226 +++++ .../Video Face Detection (Mediapipe).json | 1109 +++++ blueprints/Video Inpaint (VOID).json | 4340 +++++++++++++++++ blueprints/Video Inpaint(Wan2.1 VACE).json | 2388 --------- .../Video Inpainting (Wan2.1 VACE).json | 4196 ++++++++++++++++ blueprints/Video Segmentation (SAM3).json | 2 +- blueprints/Video Upscale(GAN x4).json | 2 +- ...deo to Pose Map (SDPose Multi-Person).json | 1323 +++++ 35 files changed, 25260 insertions(+), 2490 deletions(-) create mode 100644 blueprints/Audio Generation (Stable Audio 3 Medium Base).json create mode 100644 blueprints/Audio Generation (Stable Audio 3 Medium).json create mode 100644 blueprints/Geometry Estimation (MoGe).json rename blueprints/{Image to Depth Map (Lotus).json => Image Depth Estimation (Lotus Depth).json} (92%) create mode 100644 blueprints/Image Depth Estimation (MoGe).json create mode 100644 blueprints/Image Face Detection (Mediapipe).json create mode 100644 blueprints/Image to Pose Map (SDPose Multi-Person).json create mode 100644 blueprints/Image to Pose Map (SDPose-OOD).json create mode 100644 blueprints/Merge Videos.json create mode 100644 blueprints/Select Per-Line Text by Index.json create mode 100644 blueprints/Split Image Grid to Tiles.json create mode 100644 blueprints/Text to Image (Anima).json create mode 100644 blueprints/Video Depth Estimation (MoGe).json create mode 100644 blueprints/Video Face Detection (Mediapipe).json create mode 100644 blueprints/Video Inpaint (VOID).json delete mode 100644 blueprints/Video Inpaint(Wan2.1 VACE).json create mode 100644 blueprints/Video Inpainting (Wan2.1 VACE).json create mode 100644 blueprints/Video to Pose Map (SDPose 
Multi-Person).json diff --git a/blueprints/Audio Generation (Stable Audio 3 Medium Base).json b/blueprints/Audio Generation (Stable Audio 3 Medium Base).json new file mode 100644 index 000000000..e561fe634 --- /dev/null +++ b/blueprints/Audio Generation (Stable Audio 3 Medium Base).json @@ -0,0 +1,2091 @@ +{ + "revision": 0, + "last_node_id": 52, + "last_link_id": 0, + "nodes": [ + { + "id": 52, + "type": "8b66c757-fe2f-4184-91f3-479a19deb565", + "pos": [ + 370, + 1120 + ], + "size": [ + 420, + 450 + ], + "flags": { + "collapsed": false + }, + "order": 0, + "mode": 0, + "inputs": [ + { + "label": "user_input", + "name": "user_input", + "type": "STRING", + "widget": { + "name": "user_input" + }, + "link": null + }, + { + "label": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": null + }, + { + "label": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + }, + { + "label": "use_reprompt", + "name": "use_reprompt", + "type": "BOOLEAN", + "widget": { + "name": "use_reprompt" + }, + "link": null + }, + { + "label": "reprompt_category", + "name": "category", + "type": "COMBO", + "widget": { + "name": "category" + }, + "link": null + }, + { + "label": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "label": "sa_clip", + "name": "sa_clip", + "type": "COMBO", + "widget": { + "name": "sa_clip" + }, + "link": null + }, + { + "label": "qwen_clip", + "name": "qwen_clip", + "type": "COMBO", + "widget": { + "name": "qwen_clip" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "31", + "value" + ], + [ + "36", + "value" + ], + [ + "3", + "seed" + ], + [ + "35", + "value" + ], + [ + "43", + "choice" + ], + [ + "25", + "ckpt_name" + ], + [ + "26", + "clip_name" + ], + [ + "29", + 
"clip_name" + ] + ] + }, + "widgets_values": [], + "title": "Audio Generation (Stable Audio 3 Medium Base)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "8b66c757-fe2f-4184-91f3-479a19deb565", + "version": 1, + "state": { + "lastGroupId": 8, + "lastNodeId": 56, + "lastLinkId": 84, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Audio Generation (Stable Audio 3 Medium Base)", + "inputNode": { + "id": -10, + "bounding": [ + -810, + 400, + 155.953125, + 208 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 1750, + 1041, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "78ae2515-114b-494a-becc-43c7b6c2dc2f", + "name": "user_input", + "type": "STRING", + "linkIds": [ + 68 + ], + "label": "user_input", + "pos": [ + -678.046875, + 424 + ] + }, + { + "id": "5ca95030-aff4-4544-b545-f0d814e0e49a", + "name": "duration", + "type": "FLOAT", + "linkIds": [ + 82 + ], + "label": "duration", + "pos": [ + -678.046875, + 444 + ] + }, + { + "id": "718eb10f-da1a-4cea-a9c7-3040f98fe960", + "name": "seed", + "type": "INT", + "linkIds": [ + 76 + ], + "label": "seed", + "pos": [ + -678.046875, + 464 + ] + }, + { + "id": "dc020099-39e6-4009-9937-408409d71736", + "name": "use_reprompt", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "use_reprompt", + "pos": [ + -678.046875, + 484 + ] + }, + { + "id": "edae394c-6324-44d6-8ac5-d8caa5ae2169", + "name": "category", + "type": "COMBO", + "linkIds": [ + 78 + ], + "label": "reprompt_category", + "pos": [ + -678.046875, + 504 + ] + }, + { + "id": "be19b747-6a47-4028-9c30-d52f54a712ea", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 79 + ], + "label": "ckpt_name", + "pos": [ + -678.046875, + 524 + ] + }, + { + "id": "bc9241a2-bc20-4c5d-8cb1-f2958f598642", + "name": "sa_clip", + "type": "COMBO", + "linkIds": [ + 80 + ], + "label": "sa_clip", + "pos": [ + -678.046875, + 544 + ] + }, + { + "id": "a33a2468-6d6d-4cb6-937c-3510bf16ebac", + "name": "qwen_clip", + 
"type": "COMBO", + "linkIds": [ + 81 + ], + "label": "qwen_clip", + "pos": [ + -678.046875, + 564 + ] + } + ], + "outputs": [ + { + "id": "bbe988dd-5c03-44fd-a965-c712f9204988", + "name": "AUDIO", + "type": "AUDIO", + "linkIds": [ + 27 + ], + "localized_name": "AUDIO", + "pos": [ + 1774, + 1065 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 7, + "type": "CLIPTextEncode", + "pos": [ + 620, + 420 + ], + "size": [ + 440, + 140 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 35 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 6 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 12, + "type": "VAEDecodeAudio", + "pos": [ + 1450, + 110 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 13 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 39 + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "slot_index": 0, + "links": [ + 27 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecodeAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 11, + 
"type": "EmptyLatentAudio", + "pos": [ + 630, + 610 + ], + "size": [ + 430, + 140 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "seconds", + "name": "seconds", + "type": "FLOAT", + "widget": { + "name": "seconds" + }, + "link": 50 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "links": [ + 12 + ] + } + ], + "properties": { + "Node name for S&R": "EmptyLatentAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 60, + 1 + ] + }, + { + "id": 3, + "type": "KSampler", + "pos": [ + 1100, + 100 + ], + "size": [ + 320, + 350 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 30 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 4 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 6 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 12 + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": 76 + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + }, + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { + "name": "sampler_name" + }, + "link": null + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": 
"COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [ + 13 + ] + } + ], + "properties": { + "Node name for S&R": "KSampler", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + "randomize", + 50, + 7, + "lcm", + "simple", + 1 + ] + }, + { + "id": 29, + "type": "CLIPLoader", + "pos": [ + 690, + 1580 + ], + "size": [ + 430, + 170 + ], + "flags": {}, + "order": 8, + "mode": 0, + "showAdvanced": false, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 81 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 40 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "models": [ + { + "name": "qwen3.5_2b_bf16.safetensors", + "url": "https://huggingface.co/Comfy-Org/Qwen3.5/resolve/main/text_encoders/qwen3.5_2b_bf16.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "qwen3.5_2b_bf16.safetensors", + "stable_diffusion", + "default" + ] + }, + { + "id": 
6, + "type": "CLIPTextEncode", + "pos": [ + 610, + 130 + ], + "size": [ + 450, + 240 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 34 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 49 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 4 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 34, + "type": "ComfySwitchNode", + "pos": [ + 210, + 610 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 47 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 46 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 49 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode" + }, + "widgets_values": [ + false + ] + }, + { + "id": 41, + "type": "ComfyMathExpression", + "pos": [ + 1370, + 1360 + ], + "size": [ + 230, + 80 + ], + "flags": { + "collapsed": true + }, + "order": 16, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 56 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + 
"link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 57 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression" + }, + "widgets_values": [ + "a" + ] + }, + { + "id": 42, + "type": "PreviewAny", + "pos": [ + 1370, + 1310 + ], + "size": [ + 230, + 40 + ], + "flags": { + "collapsed": true + }, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 57 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "PreviewAny" + }, + "widgets_values": [ + null, + null, + null + ] + }, + { + "id": 39, + "type": "StringReplace", + "pos": [ + 1040, + 900 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 52 + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 59 + ] + } + ], + "title": "Text Replace (USER INPUT)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "", + "USER_INPUT", + "" + ] + }, + { + "id": 28, + "type": "TextGenerate", + "pos": [ + 1200, + 1580 + ], + "size": [ + 430, + 
420 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 40 + }, + { + "localized_name": "image", + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "video", + "name": "video", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": null + }, + { + "localized_name": "prompt", + "name": "prompt", + "type": "STRING", + "widget": { + "name": "prompt" + }, + "link": 60 + }, + { + "localized_name": "max_length", + "name": "max_length", + "type": "INT", + "widget": { + "name": "max_length" + }, + "link": null + }, + { + "localized_name": "sampling_mode", + "name": "sampling_mode", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "sampling_mode" + }, + "link": null + }, + { + "localized_name": "temperature", + "name": "sampling_mode.temperature", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.temperature" + }, + "link": null + }, + { + "localized_name": "top_k", + "name": "sampling_mode.top_k", + "type": "INT", + "widget": { + "name": "sampling_mode.top_k" + }, + "link": null + }, + { + "localized_name": "top_p", + "name": "sampling_mode.top_p", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.top_p" + }, + "link": null + }, + { + "localized_name": "min_p", + "name": "sampling_mode.min_p", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.min_p" + }, + "link": null + }, + { + "localized_name": "repetition_penalty", + "name": "sampling_mode.repetition_penalty", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.repetition_penalty" + }, + "link": null + }, + { + "localized_name": "seed", + "name": "sampling_mode.seed", + "type": "INT", + "widget": { + "name": "sampling_mode.seed" + }, + "link": null + }, + { + "localized_name": "presence_penalty", + "name": "sampling_mode.presence_penalty", + "shape": 7, + 
"type": "FLOAT", + "widget": { + "name": "sampling_mode.presence_penalty" + }, + "link": null + }, + { + "localized_name": "thinking", + "name": "thinking", + "shape": 7, + "type": "BOOLEAN", + "widget": { + "name": "thinking" + }, + "link": null + }, + { + "localized_name": "use_default_template", + "name": "use_default_template", + "shape": 7, + "type": "BOOLEAN", + "widget": { + "name": "use_default_template" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "generated_text", + "name": "generated_text", + "type": "STRING", + "links": [ + 46, + 84 + ] + } + ], + "properties": { + "Node name for S&R": "TextGenerate", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "", + 256, + "on", + 0.7, + 64, + 0.95, + 0.05, + 1.05, + 0, + 0, + false, + true + ] + }, + { + "id": 31, + "type": "PrimitiveStringMultiline", + "pos": [ + -390, + 160 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "STRING", + "widget": { + "name": "value" + }, + "link": 68 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 47, + 53 + ] + } + ], + "title": "User: short description (USER_INPUT in template)", + "properties": { + "Node name for S&R": "PrimitiveStringMultiline" + }, + "widgets_values": [ + "" + ] + }, + { + "id": 43, + "type": "CustomCombo", + "pos": [ + 140, + 910 + ], + "size": [ + 550, + 320 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "localized_name": "choice", + "name": "choice", + "type": "COMBO", + "widget": { + "name": "choice" + }, + "link": 78 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 65 + ] + }, + { + 
"localized_name": "INDEX", + "name": "INDEX", + "type": "INT", + "links": null + } + ], + "title": "Custom Combo (Category index)", + "properties": { + "Node name for S&R": "CustomCombo" + }, + "widgets_values": [ + "Music", + 0, + "Music", + "Instrument", + "SFX", + "One-shot", + "" + ] + }, + { + "id": 49, + "type": "JsonExtractString", + "pos": [ + 720, + 1200 + ], + "size": [ + 300, + 180 + ], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { + "localized_name": "json_string", + "name": "json_string", + "type": "STRING", + "widget": { + "name": "json_string" + }, + "link": null + }, + { + "localized_name": "key", + "name": "key", + "type": "STRING", + "widget": { + "name": "key" + }, + "link": 65 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 66 + ] + } + ], + "properties": { + "Node name for S&R": "JsonExtractString" + }, + "widgets_values": [ + "{\n \"Music\": \"You are an expert musician and musicologist and prompt engineer. Transform the user's input into a detailed, vivid music prompt for a full instrumental track.\\n\\n1. Start with the genre or style and optional adjectives (e.g., upbeat, dreamy, aggressive).\\n2. List the main instruments that define the track.\\n3. Add supporting elements or layers such as pads, harmonics, effects, or field recordings.\\n4. Include rhythm or percussion elements like drums, hi-hats, congas, brushes, or polyrhythms.\\n5. Integrate mood and energy naturally in the sentence (e.g., \\\"creating suspenseful tension\\\" or \\\"bright and uplifting\\\").\\n6. Specify the BPM.\\n7. Specify the track length as an integer in seconds. Use ranges: energetic/dance 120-180s, pop/rock 180-210s, cinematic/ambient 240-300s.\\n8. Combine all elements into one natural, fluid sentence. Avoid semicolons.\\n\\nTemplate:\\nGenre/Style with main instruments, supporting instruments/layers, and rhythm/percussion creating mood/energy. BPM: X. 
Length: Y seconds\\n\\nExamples:\\n- Jazz ballad with smooth saxophone lead, piano chords, upright bass, brushed drums, and soft strings that swing gently for a warm and cozy evening. BPM: 85. Length: 180 seconds\\n- EDM festival track with pulsing synth leads, plucked arpeggios, layered pads, side-chained bass, punchy kick and snare, and hi-hat rolls creating bright, energetic, and uplifting dance energy. BPM: 128. Length: 150 seconds\\n- Lo-fi hip-hop chill track with mellow electric piano, soft vinyl crackle, subtle synth pads, low-pass filtered drums, percussion loops, and soft plucked bass for a relaxed, dreamy vibe. BPM: 75. Length: 150 seconds\\n- Heavy metal anthem with distorted electric guitars, bass guitar, double bass drums, and cymbal crashes with fast palm-muted riffs creating intense, aggressive energy. BPM: 160. Length: 180 seconds\\n- Melancholic piano piece with soft piano lead, string pads, subtle atmospheric synths, and minimal brush percussion evoking a reflective rainy-day feeling. BPM: 60. Length: 240 seconds\\n- Suspenseful electronic thriller with pulsing bass synth, arpeggiated lead synth, cinematic pads, glitchy percussion, and high string stabs creating dark and tense energy. BPM: 100. Length: 200 seconds\\n- Dreamy ambient soundscape with layered pads, soft bell textures, gentle drones, and wind and water field recordings for ethereal and spacious meditation. BPM: 40. Length: 300 seconds\\n- Fingerpicking acoustic guitar solo with harmonics, subtle reverb, occasional shaker and soft stomp percussion, and soft pad layers for warm intimate storytelling. BPM: 70. Length: 120 seconds\\n- Synthwave 80s retro track with arpeggiated synth leads, analog pads, electric bass, punchy electronic drums, gated reverb snares, and atmospheric FX for nostalgic and vibrant energy. BPM: 110. Length: 180 seconds\\n- Tribal percussion ensemble with congas, djembes, bongos, shakers, and frame drums layered with deep synthetic sub-bass in complex polyrhythms. 
BPM: 100. Length: 140 seconds\\n- 1920s swing jazz with brass section, upright bass, piano, brushed drums, banjo, clarinet, and soft strings that swing lively for energetic dance vibes. BPM: 110. Length: 180 seconds\\n- Futuristic electronic sci-fi track with pulsing bass synth, evolving lead synths, layered pads, glitch percussion, robotic FX, and sub-bass for tense cinematic energy. BPM: 125. Length: 200 seconds\\n- Ambient underwater soundscape with flowing water textures, soft piano motifs, synth drones, distant bells, and underwater reverb for spacious meditative immersion. BPM: 45. Length: 300 seconds\\n- Horror cinematic track with dissonant strings, eerie piano stabs, cinematic percussion including taiko and low toms, and synth FX producing suspenseful creepy tension. BPM: 90. Length: 240 seconds\\n- Reggae track with offbeat guitar, warm basslines, snare, kick, congas, and horn stabs giving laid-back groovy energy. BPM: 85. Length: 150 seconds\\n- Blues track with soulful electric guitar solos, walking bass, piano, and shuffle drums creating expressive and emotive storytelling. BPM: 90. Length: 180 seconds\\n- Latin salsa with congas, timbales, horns, piano montunos, bass, and layered percussion for vibrant danceable energy. BPM: 120. Length: 210 seconds\\n- Afrobeat track with electric guitar stabs, horns, layered percussion, congas, shakers, bass groove, and synth pads for vibrant rhythmic energy. BPM: 105. Length: 200 seconds\\n- Indie rock track with electric guitar riffs, bass, live drum kit, layered synths, and subtle strings for energetic yet emotional feel. BPM: 110. Length: 180 seconds\\n- Funk groove with slap bass, electric guitar chords, brass stabs, drums, congas, and rhythmic keyboards creating high-energy danceable rhythm. BPM: 105. Length: 180 seconds\\n- Drum and bass track with fast breakbeat drums, deep sub-bass, sharp synth leads, pads, and atmospheric FX for high-energy club motion. BPM: 175. 
Length: 150 seconds\\n- Dark ambient track with drones, distant bells, low rumbles, soft wind textures, and synth pads producing eerie immersive tension. BPM: 50. Length: 300 seconds\\n- Tropical house track with marimba, steel drums, soft synths, smooth bass, layered percussion, and light piano riffs for sunny chill dance vibes. BPM: 110. Length: 180 seconds\\n- Progressive rock track with electric guitar leads, organ, bass, drum kit, synth layers, and occasional strings for epic layered energy. BPM: 100. Length: 220 seconds\\n- Music box melody with delicate metallic tones and soft resonance, lullaby style, with gentle ambient reverb. BPM: 60. Length: 20 seconds\\n- Soft piano arpeggio with warm felted tone and slow attack, lullaby style, with intimate room ambience. BPM: 60. Length: 30 seconds\\n- Harp gentle plucked pattern with airy resonance, lullaby style, with dreamy reverb tail. BPM: 65. Length: 25 seconds\\n- Acoustic guitar fingerstyle pattern with warm nylon strings and soft dynamics, lullaby style, with subtle room resonance. BPM: 60. Length: 30 seconds\\n- Ambient synth pad with smooth evolving texture and soft harmonics, lullaby style, with wide stereo ambience. BPM: 50. Length: 40 seconds\\n- Early rock piano with walking left-hand bass line, shuffle rhythms, and blues scale improvisations in energetic 1950s boogie-woogie style. BPM: 160. Length: 180 seconds\\n- Trip Hop track with jazzy sampled vibraphone, mid-tempo breakbeat drums, harp, Latin ethnic percussion, and sweeping cinematic strings creating airy, relaxing, soulful lounge vibes. BPM: 90. Length: 180 seconds\\n- Country outlaw cinematic instrumental with blues pedal steel guitar, rustic mandolin, fiddle call-and-response, tape-driven rattly drum kit, autoharp, and soaring accordion solo for raw, emotional southern blues expression. BPM: 85. 
Length: 200 seconds\\n- Neo Classical track with sweeping string section, elegant horns, and delicate piano creating soothing, hypnotic, modern, soft, and classic mood. BPM: 70. Length: 180 seconds\\n- Art Rock desert track with desolate piano chords, western-themed rhythm guitars, unique lead guitars, rattly vintage drum kit, and supporting bass creating lonely, expansive, beautiful, and strange atmospheres. BPM: 95. Length: 180 seconds\\n- Cinematic Sci-Fi score with dramatic horn section, building marcato strings, gliding bassoon, thunderous cymbals, subdued timpani, and subtle synth drones producing awe-inspiring, uplifting, epic intergalactic energy. BPM: 100. Length: 220 seconds\\n- West Coast Hip Hop instrumental with cascading harp melodies, smooth Rhodes piano chops, vintage boom bap drums, and walking double bass producing raw, street, and soulful block-party vibes. BPM: 92. Length: 180 seconds\\n- Synthwave futuristic track with pulsating synth bass, exciting chords, soaring leads, and reverberating drum machine patterns creating gritty, pounding, and cool energy. BPM: 110. Length: 180 seconds\\n- Breakbeat track with complex percussion, intricate breakbeats, gritty synths, lush pads, and 808 bassline producing fresh, modern, futuristic, and rave-ready energy. BPM: 140. Length: 160 seconds\\n- Lounge Jazz 1960s smooth track with laid-back drums, piano chords, double bass, soft electric piano, subtle flute, and unique percussion creating beautiful, atmospheric, eclectic, retro, and chill vibes. BPM: 85. Length: 180 seconds\\n- Latin Jazz 1950s blissful track with laid-back Latin drums, euphoric piano chords, double bass, orchestral accompaniment, acoustic guitar, and vibraphone producing nostalgic, beautiful, atmospheric, cinematic, and chill mood. BPM: 95. 
Length: 180 seconds\\n- Acid Jazz 1970s summertime track with smooth electric piano, trippy synth leads, laid-back vintage drum kit, fuzzy electric bass, and uplifting violin producing retro, psychedelic, jazzy, relaxing energy. BPM: 100. Length: 180 seconds\\n- Progressive Soul 1970s track with feel-good piano, psychedelic organ, groovy vintage drum kit with percussion, fuzzy electric bass, and synth strings producing retro, raw, soulful, joyous atmosphere. BPM: 90. Length: 180 seconds\\n- Discotheque 1970s French-inspired track with sultry piano, psychedelic guitars, groovy drum kit, fuzzy electric bass, and melancholic organ producing retro, raw, laid-back, and relaxing mood. BPM: 105. Length: 180 seconds\\n- Soul Jazz 1970s track with expressive saxophone, smooth piano, groovy drum kit, rhythmic upright bass, sweeping strings, and minimal vibraphone producing retro, raw, laid-back, and epic energy. BPM: 95. Length: 180 seconds\\n- Vintage R&B 1970s live studio track with subtle brass, smooth piano, sweeping strings, and minimal drums producing retro, beautiful, uplifting, nostalgic mood. BPM: 85. Length: 180 seconds\\n- 50s Pop track with Latin influence, string section, bold brass, vibraphone, acoustic guitar, flute, ethnic percussion, and brushed drums creating sexy, epic, vintage, retro, melancholic, jazzy, dramatic energy. BPM: 100. Length: 180 seconds\\n- A piece of calm, quiet, mellow, serene music perfect for a peaceful film score, featuring soft modulating piano, ambient sfx and foley, beautiful vibraphone, and subtle synthesizer drones. The mood is cinematic, thoughtful, serene and nostalgic. BPM: 55. Length: 300 seconds\",\n \"Instrument\": \"You are a music metadata expert. Given an instrument, generate a descriptive prompt for a generative audio model.\\n\\n1. Identify the instrument.\\n2. Add playing style or technique.\\n3. Include details about material, timbre, or texture.\\n4. Add musical style or mood. 
Specify the genre, context, or emotional character.\\n5. Add spatial or production qualities.\\n6. Specify BPM: Always include a BPM appropriate to the style and context.\\n7. Specify length: Provide an integer in seconds (6–20 s for loops, 20–180 s for stems).\\n\\nExamples:\\n- Synth arpeggio loop with bright detuned oscillators. BPM: 120. Length: 8 seconds\\n- Chord stab loop with sharp percussive attack. BPM: 90. Length: 6 seconds\\n- Guitar muted strum loop with tight rhythmic feel. BPM: 100. Length: 8 seconds\\n- Pluck sequence loop with bright resonant tone. BPM: 128. Length: 10 seconds\\n- Marimba and vibraphone percussive loop with resonant wooden and metallic tones. BPM: 110. Length: 12 seconds\\n- Drum loop with deep muffled kick on beat one, snappy rimshot snare on beats two and four with rolling ghost note fills, and tight closed hi-hats with subtle open accents. BPM: 85. Length: 10 seconds\\n- Drum groove loop with brushed snare swinging on the ride, soft feathered kick on downbeats, and light closed hi-hat taps on the upbeats. BPM: 130. Length: 12 seconds\\n- Kick and hi-hat loop with four-on-the-floor punchy kick, tight closed hi-hats on every eighth note, and a sharp dry snare on beats two and four. BPM: 130. Length: 15 seconds\\n- Vinyl crackle drum loop with warm low-pass filtered kick, dusty snare with tape saturation, and shuffled closed hi-hats with subtle vinyl crackle ambiance. BPM: 80. Length: 10 seconds\\n- Ambient pad loop with evolving texture. BPM: 80. Length: 12 seconds\\n- Melodic synth bass groove loop with pumping sidechain feel. BPM: 122. Length: 10 seconds\\n- Melodic Bass slap and pop rhythm loop. BPM: 100. Length: 8 seconds\\n- Acoustic bass walking line loop with natural wooden resonance. BPM: 120. Length: 12 seconds\\n- String pizzicato motif loop, suspenseful, with tight string texture. BPM: 90. Length: 8 seconds\\n- Brass staccato riff loop with sharp bright attack. BPM: 130. 
Length: 10 seconds\\n- Flute airy melodic loop with wooden headjoint resonance. BPM: 100. Length: 6 seconds\\n- Pan flute ambient loop with breathy timbre. BPM: 75. Length: 8 seconds\\n- Clarinet riff loop with warm smooth reed tone. BPM: 120. Length: 10 seconds\\n- Oboe motif loop, orchestral, with rich double reed resonance. BPM: 80. Length: 8 seconds\\n- Recorder Renaissance motif loop with soft wooden timbre. BPM: 100. Length: 6 seconds\\n- Electric sitar riff loop with buzzing resonant tone. BPM: 90. Length: 10 seconds\\n- Koto plucked motif loop with resonant wooden strings. BPM: 90. Length: 8 seconds\\n- Shamisen folk melody loop with percussive twang. BPM: 100. Length: 8 seconds\\n- Banjo fingerpicking loop with metallic string resonance. BPM: 110. Length: 10 seconds\\n- Mandolin tremolo loop with crisp wooden body tone. BPM: 120. Length: 10 seconds\\n- Acoustic guitar chord vamp loop with natural room resonance. BPM: 110. Length: 12 seconds\\n- Nylon string guitar arpeggio loop with warm, soft timbre. BPM: 90. Length: 15 seconds\\n- Electric guitar riff loop with driven distorted tone. BPM: 130. Length: 10 seconds\\n- Slide guitar melody loop with warm resonant glide. BPM: 100. Length: 12 seconds\\n- Steel guitar slide loop with bright pedal steel tone. BPM: 95. Length: 12 seconds\\n- Harpsichord arpeggio loop with crisp plucked attack. BPM: 120. Length: 10 seconds\\n- Rhodes chord vamp loop with warm electric piano tone. BPM: 100. Length: 12 seconds\\n- Clavinet funky rhythm loop. BPM: 105. Length: 10 seconds\\n- Organ chord vamp loop with full drawbar warmth. BPM: 90. Length: 12 seconds\\n- Drum loop with booming 808 kick on beat one, crisp snare on beat three, and rapid triplet hi-hat rolls with open hat accents for aggressive high-energy feel. BPM: 140. Length: 8 seconds\\n- Breakbeat drum loop with chopped Amen-style snare flurries, driving kick on the one, fast sixteenth-note closed hi-hats, and syncopated open hat accents. BPM: 170. 
Length: 10 seconds\\n- Glitch percussion loop with stuttered kick transients, randomised snare hits processed with bit-crushing, and erratic hi-hat patterns with pitch-shifted metallic ticks. BPM: 120. Length: 12 seconds\\n- Metallic hits loop with distorted kick impacts, processed metal-plate snare slams, and grinding hi-hat noise bursts for aggressive mechanical texture. BPM: 120. Length: 10 seconds\\n- Timpani hits loop, cinematic, with deep resonant kick-like timpani strikes on beat one, rolling snare-style timpani fills, and no hi-hats for a grand orchestral feel. BPM: 70. Length: 8 seconds\\n- Snare roll loop, dramatic, with accelerating snare drum rolls building from soft to crashing, deep supporting kick pulses, and no hi-hats for maximum impact. BPM: 100. Length: 8 seconds\\n- Accordion motif loop with bright reedy bellows tone. BPM: 100. Length: 10 seconds\\n- Harmonica blues riff loop with expressive reed timbre. BPM: 90. Length: 10 seconds\\n- Trombone riff loop with warm sliding brass tone. BPM: 120. Length: 10 seconds\\n- French horn melodic loop, cinematic. BPM: 80. Length: 12 seconds\\n- Soprano sax ballad loop. BPM: 70. Length: 12 seconds\\n- Alto sax bebop riff loop. BPM: 200. Length: 10 seconds\\n- Electric violin melodic loop with reverb. BPM: 90. Length: 10 seconds\\n- String pad loop with cinematic texture. BPM: 70. Length: 15 seconds\\n- Granular synth evolving texture loop. BPM: 90. Length: 15 seconds\\n- Piano motif loop with soft felt hammer tone. BPM: 80. Length: 10 seconds\\n- Pad and synth loop with lush detuned shimmer. BPM: 85. Length: 12 seconds\\n- Synth lead loop with sidechain pumping compression. BPM: 128. Length: 10 seconds\\n- Analog synth bassline loop with deep warm low-end. BPM: 122. Length: 12 seconds\\n- FM synth lead motif loop with bright metallic shimmer. BPM: 110. Length: 10 seconds\\n- Bass groove loop with tight rhythmic two-bar pattern. BPM: 100. 
Length: 16 seconds\\n- Acoustic guitar fingerstyle motif loop with warm wood resonance. BPM: 90. Length: 45 seconds\\n- Sombre acoustic guitar motif loop with cavernous reverb, delicate fingerpicking, and expressive melancholic tone. BPM: 70. Length: 45 seconds\\n- Electric guitar rock riff motif loop. BPM: 130. Length: 40 seconds\\n- Vintage electric guitar motif loop, live-recorded in a vintage studio, with expressive and dynamic solo performance. BPM: 90. Length: 40 seconds\\n- Piano chord progression motif loop with rich harmonic movement. BPM: 120. Length: 60 seconds\\n- String ensemble cinematic motif loop with rich wooden resonance. BPM: 80. Length: 120 seconds\\n- Brass ensemble cinematic motif loop with bright metallic timbre. BPM: 90. Length: 90 seconds\\n- Ethnic percussion ensemble motif loop with deep resonant djembe kick tones, slapped snare-like rim hits on congas, and layered shakers and bells providing hi-hat-like rhythmic texture with polyrhythmic patterns. BPM: 100. Length: 90 seconds\\n- Synth ambient motif loop with evolving textures. BPM: 80. Length: 180 seconds\\n- Motif loop with warm dusty vinyl crackle and tape saturation. BPM: 80. Length: 60 seconds\\n- Synth lead and bass motif loop with bright punchy energy. BPM: 128. Length: 90 seconds\\n- Funk band motif loop: bass, drums, guitar. BPM: 100. Length: 90 seconds\\n- Ethnic flute motif for cinematic use. BPM: 80. Length: 30 seconds\\n- Steel drum melodic motif loop with bright metallic resonance. BPM: 110. Length: 20 seconds\\n- Marimba percussive motif loop with resonant wooden tone. BPM: 100. Length: 20 seconds\\n- Vibraphone melodic motif loop with metallic shimmer. BPM: 90. Length: 25 seconds\\n- Piano cinematic motif loop with resonant wooden tone. BPM: 80. Length: 30 seconds\\n- Violin expressive cinematic motif loop with rich wooden resonance. BPM: 75. Length: 25 seconds\\n- Cello expressive motif loop with deep wooden resonance. BPM: 70. 
Length: 30 seconds\\n- Trumpet expressive motif loop with brassy overtones. BPM: 100. Length: 25 seconds\\n- Sax expressive motif loop with warm reed timbre. BPM: 95. Length: 25 seconds\\n- Ethnic drum ensemble motif loop with booming natural-skin bass drum kicks, sharp hand-slap snare accents on djembes and talking drums, and layered wooden and metal percussion providing rhythmic hi-hat-like patterns. BPM: 95. Length: 30 seconds\\n- Ambient drone motif loop. BPM: 60. Length: 180 seconds\\n- Orchestral tension motif loop. BPM: 90. Length: 150 seconds\\n- Electronic track motif loop with drums, bass, synth. BPM: 128. Length: 180 seconds\",\n \"SFX\": \"You are a professional sound design expert. Convert the user's input into a precise, vivid sound effects description suitable for generative audio models.\\n\\nDescribe clearly:\\n- Sound source\\n- Physical character (texture, timbre, material: metal, wood, glass, concrete, etc.)\\n- Spatial qualities (indoor/outdoor, cave/open field/underwater, dry/reverberant, close-up/distant, echoing/muffled)\\n- Temporal evolution (attack, decay, movement, transitions over time)\\n- Include motion or spatial movement if applicable (passing, approaching, stereo movement)\\n\\nAudio length rules:\\n- Very short sounds (impacts, clicks, gunshots): 1–3 seconds\\n- Medium actions (footsteps, object movement, transitions): 3–6 seconds\\n- Ambience / environments: 6–15 seconds\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nOutput constraints:\\n- Length: 1–2 dense sentences maximum\\n- Output ONLY the final rewritten prompt\\n- No explanations, no formatting, no quotes\\n- Use concise but dense technical language\\n- Focus strictly on sound effects or ambience\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nQuality guidelines:\\n- Be specific and avoid vague terms\\n- Prioritize clarity and realism\\n- Combine elements into one coherent scene\\n- Avoid redundancy\\n\\nExamples:\\n- Heavy 
rain hitting a metal roof during a thunderstorm, distant thunder rumbles, stereo, realistic ambience. Length: 45 seconds\\n- Quiet forest at dawn with birds chirping, soft wind through leaves, distant stream flowing. Length: 60 seconds\\n- Busy city street at night, cars passing, muffled conversations, occasional horn, urban ambience. Length: 50 seconds\\n- Ocean waves crashing against rocky cliffs, strong wind, dramatic and cinematic. Length: 70 seconds\\n- Wooden door creaking open slowly in an old house, echoing interior, eerie tone. Length: 3 seconds\\n- Glass bottle shattering on concrete, sharp impact, scattered fragments. Length: 2 seconds\\n- Footsteps on gravel, steady walking pace, close perspective. Length: 8 seconds\\n- Typing rapidly on a mechanical keyboard, crisp tactile clicks. Length: 5 seconds\\n- Punch impact with deep bass hit, cinematic trailer style. Length: 2 seconds\\n- Car speeding past at high velocity, doppler effect, realistic whoosh. Length: 3 seconds\\n- Object falling from height and hitting ground with a heavy thud. Length: 2 seconds\\n- Sword swing whooshing through air, fast motion, clean metallic tone. Length: 2 seconds\\n- Futuristic laser blast, clean energy pulse, high-tech sound design. Length: 1 seconds\\n- Spaceship engine humming, low frequency rumble, interior perspective. Length: 90 seconds\\n- Magical spell casting, shimmering particles, rising tonal energy. Length: 8 seconds\\n- Teleportation effect, glitchy digital distortion with a soft whoosh. Length: 5 seconds\\n- Dark eerie drone with distant whispers, creepy, slow build tension. Length: 120 seconds\\n- Sudden horror jump scare sting, sharp violin hit, cinematic. Length: 1 second\\n- Metal scraping slowly in a dark tunnel, echoing and ominous. Length: 20 seconds\\n- Explosion with debris scattering, deep bass, cinematic realism. Length: 4 seconds\\n- Building collapsing, rumbling concrete, dust and debris falling. 
Length: 25 seconds\\n- Fire crackling intensely, wood burning, close-up detail. Length: 80 seconds\\n- Gunshot in a large empty warehouse, loud echo decay. Length: 2 seconds\\n- Retro arcade coin insert sound, 8-bit style. Length: 1 second\\n- Level up chime, bright, rewarding, fantasy RPG style. Length: 2 seconds\\n- Error buzzer, short, digital, UI feedback. Length: 1 second\\n- Menu navigation clicks, soft futuristic interface sounds. Length: 3 seconds\\n- Layered soundscape: rain, thunder, footsteps, and distant sirens all blending naturally. Length: 90 seconds\\n- Rapid sequence of three impacts: metal hit, glass break, wood crack, spaced evenly. Length: 4 seconds\\n- Sound moving from left to right stereo field: passing motorcycle. Length: 5 seconds\\n- Close vs far perspective transition: footsteps approaching then fading away. Length: 6 seconds\\n- Tape stop sub drop, a massive sub-bass note that mimics a vinyl record or tape machine being turned off, the pitch and speed drop simultaneously, causing the high-end harmonics to smear and thicken as the sound grinds to a halt at a sub-sonic frequency. Length: 11 seconds\\n- Gravel and leaves footsteps, the sound of a hard boot stepping onto dry leaves or gravel, crisp and natural with detailed texture. Length: 11 seconds\\n- Ghostship moan, a massive, deep wooden groan with a low-frequency moan, like heavy timber under immense structural tension, swaying slowly, processed with long, dark wooden room reverb for a sense of scale. Length: 11 seconds\\n- Bicycle chain, a continuous metallic whirring sound of a chain moving over sprockets, with individual teeth catching the links, processed with resonant band-pass filter to emphasize metallic singing. Length: 11 seconds\\n- Warp drive, a sound that starts with a massive suck-back of ambient noise, followed by a supersonic crack and high-pitched zing that disappears into the distance, giving the sense of stretching space-time. 
Length: 11 seconds\\n- Ice cubes, high-pitched musical clinking of hard ice hitting a thin glass, bright resonant ring with subtle liquid sloshing around the edges. Length: 11 seconds\\n- Paper shuffle, the sound of a thick stack of heavy bond paper being squared up on a desk, dry papery thud with a quick fanning sound as air moves between the pages. Length: 11 seconds\\n- Drawer slam, a blunt, powerful thud made by slamming a wooden desk drawer shut, pronounced low-mid body, slightly distorted for aggressive character. Length: 3 seconds\",\n \"One-shot\": \"You are a music metadata expert. Given an instrument or sound, generate a descriptive prompt for a short, isolated one-shot audio sample for music production.\\n\\n1. Identify the instrument or sound source.\\n2. Describe the playing technique or hit type (e.g., pluck, slam, tap, stab).\\n3. Include details about material, timbre, or texture.\\n4. Add spatial or production qualities (dry/wet, room, close-mic).\\n5. Specify length: short integer in seconds (1–11 s).\\n\\nExamples:\\n- Piano key hit with bright percussive attack and resonant wooden body. Length: 2 seconds\\n- Kick drum punchy low-end hit with warm skin resonance. Length: 2 seconds\\n- Snare drum rimshot accent with crisp snare wires. Length: 2 seconds\\n- Acoustic guitar fingerstyle note with warm spruce tone. Length: 3 seconds\\n- Bass pluck with jazzy tone and resonant wooden body. Length: 3 seconds\\n- Electric guitar power chord with distortion. Length: 3 seconds\\n- Metallic glitch percussion hit with sharp metallic texture. Length: 2 seconds\\n- Tabla resonant tone hit with natural skin timbre. Length: 2 seconds\\n- Djembe slap accent with dry wooden resonance. Length: 2 seconds\\n- Synth stab with reverb tail. Length: 3 seconds\\n- Violin expressive note with vibrato and rich wooden resonance. Length: 3 seconds\\n- Cello legato note, cinematic, with warm resonant body. 
Length: 3 seconds\\n- Trumpet bright accent with slightly brassy overtones. Length: 2 seconds\\n- Melodic saxophone jazz riff with smooth reed timbre and a slight vibrato bend. Length: 3 seconds\\n- Harp pluck with airy tone and resonant strings. Length: 2 seconds\\n- Glockenspiel bell-like note with bright metallic clarity. Length: 2 seconds\\n- Metallic clang sound design hit. Length: 2 seconds\\n- Granular texture hit. Length: 3 seconds\\n- Reversed piano hit. Length: 2 seconds\\n- Synth riser effect. Length: 6 seconds\\n- Percussion impact hit. Length: 2 seconds\\n- Cinematic hit. Length: 2 seconds\\n- Dry clap, a crisp, natural single hand clap recorded in a dead room with an extremely sharp transient and no room reflections. Length: 1 second\\n- Studio hat, a classic, natural recording of 14-inch hi-hats played tightly closed, zero ring, very fast decay. Length: 1 second\\n- Disco open hat, bright 14-inch open hi-hat with long, shimmering decay, perfect for disco or dance grooves. Length: 1 second\\n- Pillow kick, acoustic kick drum muffled with a heavy blanket, producing a short, dry \\\"thump\\\" with almost zero resonance. Length: 1 second\\n- Short 808, punchy 808 kick with sharp, distorted transient and fast-decaying sub-tail. Length: 1 second\\n- Egg shaker, classic plastic egg shaker recorded with a small-diaphragm condenser mic, producing a light, consistent \\\"tick\\\" with very short sustain. Length: 1 second\\n- African drums, dynamic African drums and percussion ensemble with natural acoustic textures. Length: 3 seconds\\n- Latin drums, dynamic Latin drums and percussion ensemble featuring authentic rhythmic patterns. Length: 3 seconds\\n- String quartet, euphoric string quartet with dynamic and emotional playing, full of expressive harmonies and movement. Length: 3 seconds\\n- Piano, nostalgic, atmospheric piano piece with dynamic and emotional performance, intimate and resonant. 
Length: 3 seconds\\n- Analogue drift pad, warm polyphonic pad with three detuned oscillators (saw + triangle), subtle pitch drift, and lush bucket-brigade chorus for wide, nostalgic stereo image. Length: 11 seconds\\n- Phase distortion bass, Casio CZ-style phase-distorted sine wave warped into a jagged sawtooth for retro synth bass tone. Length: 11 seconds\\n- Vibrato saxophone, bright lyrical alto sax with fast fluttery vibrato, reedy vintage tone, captured with ribbon mic for warm nostalgic sound. Length: 11 seconds\\n- Lofi upright bass, upright bass recorded with ribbon mic in a wooden room, natural air with slightly boxy resonance, tape-saturated for dusty 1950s jazz feel. Length: 2 seconds\"\n}", + "Music" + ] + }, + { + "id": 40, + "type": "StringReplace", + "pos": [ + 1350, + 900 + ], + "size": [ + 260, + 280 + ], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 59 + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 58 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 60 + ] + } + ], + "title": "Text Replace (AUDIO LENGTH)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "", + "AUDIO_LENGTH", + "" + ] + }, + { + "id": 38, + "type": "StringReplace", + "pos": [ + 720, + 900 + ], + "size": [ + 290, + 280 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": null + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + 
"localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 66 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 52 + ] + } + ], + "title": "Text Replace (PROMPT TEMPLATE)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "SYSTEM_PROMPTS\n\nInput: USER_INPUT\nTarget audio length: AUDIO_LENGTH seconds.\nOutput:", + "SYSTEM_PROMPTS", + "" + ] + }, + { + "id": 35, + "type": "PrimitiveBoolean", + "pos": [ + -390, + 570 + ], + "size": [ + 400, + 100 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 48 + ] + } + ], + "title": "Boolean (Enable_Reprompt)", + "properties": { + "Node name for S&R": "PrimitiveBoolean" + }, + "widgets_values": [ + true + ] + }, + { + "id": 36, + "type": "PrimitiveFloat", + "pos": [ + -390, + 410 + ], + "size": [ + 400, + 110 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { + "name": "value" + }, + "link": 82 + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 50, + 56 + ] + } + ], + "title": "Float (Duration)", + "properties": { + "Node name for S&R": "PrimitiveFloat" + }, + "widgets_values": [ + 150 + ] + }, + { + "id": 25, + "type": "CheckpointLoaderSimple", + "pos": [ + 100, + 130 + ], + "size": [ + 440, + 190 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": 
"MODEL", + "links": [ + 30 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 39 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "stable_audio_3_medium_base.safetensors", + "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/checkpoints/stable_audio_3_medium_base.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "stable_audio_3_medium_base.safetensors" + ] + }, + { + "id": 26, + "type": "CLIPLoader", + "pos": [ + 100, + 390 + ], + "size": [ + 440, + 170 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 80 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 34, + 35 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "t5gemma_b_b_ul2.safetensors", + "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/text_encoders/t5gemma_b_b_ul2.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": 
[ + "t5gemma_b_b_ul2.safetensors", + "stable_audio", + "default" + ] + }, + { + "id": 54, + "type": "PreviewAny", + "pos": [ + 1720, + 1580 + ], + "size": [ + 420, + 550 + ], + "flags": {}, + "order": 20, + "mode": 4, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 84 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": null + } + ], + "properties": { + "Node name for S&R": "PreviewAny" + }, + "widgets_values": [ + null, + null, + null + ] + } + ], + "groups": [ + { + "id": 1, + "title": "Loaders: checkpoint & CLIP", + "bounding": [ + 80, + 50, + 485.721654232725, + 527.2848777754299 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "CLIP encode: conditioning", + "bounding": [ + 600, + 60, + 470, + 510 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "User inputs: prompt & duration", + "bounding": [ + -400, + 10, + 430, + 740 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 7, + "title": "Reprompt: full branch (template + LLM)", + "bounding": [ + 60, + 780, + 1630, + 1360 + ], + "color": "#444", + "flags": {} + }, + { + "id": 4, + "title": "Reprompt: JSON extract & template fills", + "bounding": [ + 120, + 820, + 1520, + 650 + ], + "color": "#444", + "flags": {} + }, + { + "id": 5, + "title": "Helpers: duration to string", + "bounding": [ + 1340, + 1180, + 280, + 250 + ], + "color": "#444", + "flags": {} + }, + { + "id": 6, + "title": "Reprompt: Qwen TextGenerate", + "bounding": [ + 680, + 1510, + 960, + 614.65625 + ], + "color": "#444", + "flags": {} + }, + { + "id": 8, + "title": "Audio generation: Stable Audio", + "bounding": [ + 60, + 10, + 1627.3616782294932, + 737.0545987464304 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 35, + "origin_id": 26, + "origin_slot": 0, + "target_id": 7, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 13, + "origin_id": 3, + "origin_slot": 0, + 
"target_id": 12, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 39, + "origin_id": 25, + "origin_slot": 2, + "target_id": 12, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 50, + "origin_id": 36, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 30, + "origin_id": 25, + "origin_slot": 0, + "target_id": 3, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 4, + "origin_id": 6, + "origin_slot": 0, + "target_id": 3, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 6, + "origin_id": 7, + "origin_slot": 0, + "target_id": 3, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 12, + "origin_id": 11, + "origin_slot": 0, + "target_id": 3, + "target_slot": 3, + "type": "LATENT" + }, + { + "id": 34, + "origin_id": 26, + "origin_slot": 0, + "target_id": 6, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 49, + "origin_id": 34, + "origin_slot": 0, + "target_id": 6, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 47, + "origin_id": 31, + "origin_slot": 0, + "target_id": 34, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 46, + "origin_id": 28, + "origin_slot": 0, + "target_id": 34, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 48, + "origin_id": 35, + "origin_slot": 0, + "target_id": 34, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 56, + "origin_id": 36, + "origin_slot": 0, + "target_id": 41, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 57, + "origin_id": 41, + "origin_slot": 1, + "target_id": 42, + "target_slot": 0, + "type": "INT" + }, + { + "id": 52, + "origin_id": 38, + "origin_slot": 0, + "target_id": 39, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 53, + "origin_id": 31, + "origin_slot": 0, + "target_id": 39, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 40, + "origin_id": 29, + "origin_slot": 0, + "target_id": 28, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 60, + "origin_id": 40, + "origin_slot": 0, + 
"target_id": 28, + "target_slot": 4, + "type": "STRING" + }, + { + "id": 65, + "origin_id": 43, + "origin_slot": 0, + "target_id": 49, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 59, + "origin_id": 39, + "origin_slot": 0, + "target_id": 40, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 58, + "origin_id": 42, + "origin_slot": 0, + "target_id": 40, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 66, + "origin_id": 49, + "origin_slot": 0, + "target_id": 38, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 27, + "origin_id": 12, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 68, + "origin_id": -10, + "origin_slot": 0, + "target_id": 31, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 76, + "origin_id": -10, + "origin_slot": 2, + "target_id": 3, + "target_slot": 4, + "type": "INT" + }, + { + "id": 78, + "origin_id": -10, + "origin_slot": 4, + "target_id": 43, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 5, + "target_id": 25, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": -10, + "origin_slot": 6, + "target_id": 26, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 81, + "origin_id": -10, + "origin_slot": 7, + "target_id": 29, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 82, + "origin_id": -10, + "origin_slot": 1, + "target_id": 36, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 3, + "target_id": 35, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 84, + "origin_id": 28, + "origin_slot": 0, + "target_id": 54, + "target_slot": 0, + "type": "STRING" + } + ], + "extra": {}, + "category": "Audio/Music generation", + "description": "Generates music, instrument loops, sound effects, and one-shots from text using the Stable Audio 3 Medium base checkpoint, with optional Qwen 3.5 category-based prompt expansion (Music, Instrument, SFX, 
One-shot)." + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Audio Generation (Stable Audio 3 Medium).json b/blueprints/Audio Generation (Stable Audio 3 Medium).json new file mode 100644 index 000000000..30add5b05 --- /dev/null +++ b/blueprints/Audio Generation (Stable Audio 3 Medium).json @@ -0,0 +1,2091 @@ +{ + "revision": 0, + "last_node_id": 52, + "last_link_id": 0, + "nodes": [ + { + "id": 52, + "type": "8b66c757-fe2f-4184-91f3-479a19deb565", + "pos": [ + 370, + 1120 + ], + "size": [ + 420, + 450 + ], + "flags": { + "collapsed": false + }, + "order": 0, + "mode": 0, + "inputs": [ + { + "label": "user_input", + "name": "user_input", + "type": "STRING", + "widget": { + "name": "user_input" + }, + "link": null + }, + { + "label": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": null + }, + { + "label": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + }, + { + "label": "use_reprompt", + "name": "use_reprompt", + "type": "BOOLEAN", + "widget": { + "name": "use_reprompt" + }, + "link": null + }, + { + "label": "reprompt_category", + "name": "category", + "type": "COMBO", + "widget": { + "name": "category" + }, + "link": null + }, + { + "label": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "label": "sa_clip", + "name": "sa_clip", + "type": "COMBO", + "widget": { + "name": "sa_clip" + }, + "link": null + }, + { + "label": "qwen_clip", + "name": "qwen_clip", + "type": "COMBO", + "widget": { + "name": "qwen_clip" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [] + } + ], + "title": "Audio Generation (Stable Audio 3 Medium)", + "properties": { + "proxyWidgets": [ + [ + "31", + "value" + ], + [ + "36", + "value" + ], + [ + "3", + "seed" + ], + [ + "35", + "value" + ], + [ + "43", + 
"choice" + ], + [ + "25", + "ckpt_name" + ], + [ + "26", + "clip_name" + ], + [ + "29", + "clip_name" + ] + ] + }, + "widgets_values": [] + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "8b66c757-fe2f-4184-91f3-479a19deb565", + "version": 1, + "state": { + "lastGroupId": 8, + "lastNodeId": 56, + "lastLinkId": 84, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Audio Generation (Stable Audio 3 Medium)", + "inputNode": { + "id": -10, + "bounding": [ + -810, + 400, + 155.953125, + 208 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 1750, + 1041, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "78ae2515-114b-494a-becc-43c7b6c2dc2f", + "name": "user_input", + "type": "STRING", + "linkIds": [ + 68 + ], + "label": "user_input", + "pos": [ + -678.046875, + 424 + ] + }, + { + "id": "5ca95030-aff4-4544-b545-f0d814e0e49a", + "name": "duration", + "type": "FLOAT", + "linkIds": [ + 82 + ], + "label": "duration", + "pos": [ + -678.046875, + 444 + ] + }, + { + "id": "718eb10f-da1a-4cea-a9c7-3040f98fe960", + "name": "seed", + "type": "INT", + "linkIds": [ + 76 + ], + "label": "seed", + "pos": [ + -678.046875, + 464 + ] + }, + { + "id": "dc020099-39e6-4009-9937-408409d71736", + "name": "use_reprompt", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "use_reprompt", + "pos": [ + -678.046875, + 484 + ] + }, + { + "id": "edae394c-6324-44d6-8ac5-d8caa5ae2169", + "name": "category", + "type": "COMBO", + "linkIds": [ + 78 + ], + "label": "reprompt_category", + "pos": [ + -678.046875, + 504 + ] + }, + { + "id": "be19b747-6a47-4028-9c30-d52f54a712ea", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 79 + ], + "label": "ckpt_name", + "pos": [ + -678.046875, + 524 + ] + }, + { + "id": "bc9241a2-bc20-4c5d-8cb1-f2958f598642", + "name": "sa_clip", + "type": "COMBO", + "linkIds": [ + 80 + ], + "label": "sa_clip", + "pos": [ + -678.046875, + 544 + ] + }, + { + "id": "a33a2468-6d6d-4cb6-937c-3510bf16ebac", + 
"name": "qwen_clip", + "type": "COMBO", + "linkIds": [ + 81 + ], + "label": "qwen_clip", + "pos": [ + -678.046875, + 564 + ] + } + ], + "outputs": [ + { + "id": "bbe988dd-5c03-44fd-a965-c712f9204988", + "name": "AUDIO", + "type": "AUDIO", + "linkIds": [ + 27 + ], + "localized_name": "AUDIO", + "pos": [ + 1774, + 1065 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 7, + "type": "CLIPTextEncode", + "pos": [ + 620, + 420 + ], + "size": [ + 440, + 140 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 35 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 6 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 12, + "type": "VAEDecodeAudio", + "pos": [ + 1450, + 110 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 13 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 39 + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "slot_index": 0, + "links": [ + 27 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecodeAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, 
+ { + "id": 11, + "type": "EmptyLatentAudio", + "pos": [ + 630, + 610 + ], + "size": [ + 430, + 140 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "seconds", + "name": "seconds", + "type": "FLOAT", + "widget": { + "name": "seconds" + }, + "link": 50 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "links": [ + 12 + ] + } + ], + "properties": { + "Node name for S&R": "EmptyLatentAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 60, + 1 + ] + }, + { + "id": 3, + "type": "KSampler", + "pos": [ + 1100, + 100 + ], + "size": [ + 320, + 350 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 30 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 4 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 6 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 12 + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": 76 + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + }, + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { + "name": "sampler_name" + }, + "link": null + }, + { + "localized_name": "scheduler", + "name": 
"scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [ + 13 + ] + } + ], + "properties": { + "Node name for S&R": "KSampler", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + "randomize", + 8, + 1, + "lcm", + "simple", + 1 + ] + }, + { + "id": 29, + "type": "CLIPLoader", + "pos": [ + 690, + 1580 + ], + "size": [ + 430, + 170 + ], + "flags": {}, + "order": 8, + "mode": 0, + "showAdvanced": false, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 81 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 40 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "models": [ + { + "name": "qwen3.5_2b_bf16.safetensors", + "url": "https://huggingface.co/Comfy-Org/Qwen3.5/resolve/main/text_encoders/qwen3.5_2b_bf16.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "qwen3.5_2b_bf16.safetensors", + "stable_diffusion", + "default" 
+ ] + }, + { + "id": 6, + "type": "CLIPTextEncode", + "pos": [ + 610, + 130 + ], + "size": [ + 450, + 240 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 34 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 49 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 4 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 34, + "type": "ComfySwitchNode", + "pos": [ + 210, + 610 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 47 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 46 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 49 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode" + }, + "widgets_values": [ + false + ] + }, + { + "id": 41, + "type": "ComfyMathExpression", + "pos": [ + 1370, + 1360 + ], + "size": [ + 230, + 80 + ], + "flags": { + "collapsed": true + }, + "order": 16, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 56 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": 
"FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 57 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression" + }, + "widgets_values": [ + "a" + ] + }, + { + "id": 42, + "type": "PreviewAny", + "pos": [ + 1370, + 1310 + ], + "size": [ + 230, + 40 + ], + "flags": { + "collapsed": true + }, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 57 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "PreviewAny" + }, + "widgets_values": [ + null, + null, + null + ] + }, + { + "id": 39, + "type": "StringReplace", + "pos": [ + 1040, + 900 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 52 + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 59 + ] + } + ], + "title": "Text Replace (USER INPUT)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "", + "USER_INPUT", + "" + ] + }, + { + "id": 28, + "type": "TextGenerate", + "pos": [ + 1200, + 1580 + ], 
+ "size": [ + 430, + 420 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 40 + }, + { + "localized_name": "image", + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "video", + "name": "video", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": null + }, + { + "localized_name": "prompt", + "name": "prompt", + "type": "STRING", + "widget": { + "name": "prompt" + }, + "link": 60 + }, + { + "localized_name": "max_length", + "name": "max_length", + "type": "INT", + "widget": { + "name": "max_length" + }, + "link": null + }, + { + "localized_name": "sampling_mode", + "name": "sampling_mode", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "sampling_mode" + }, + "link": null + }, + { + "localized_name": "temperature", + "name": "sampling_mode.temperature", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.temperature" + }, + "link": null + }, + { + "localized_name": "top_k", + "name": "sampling_mode.top_k", + "type": "INT", + "widget": { + "name": "sampling_mode.top_k" + }, + "link": null + }, + { + "localized_name": "top_p", + "name": "sampling_mode.top_p", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.top_p" + }, + "link": null + }, + { + "localized_name": "min_p", + "name": "sampling_mode.min_p", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.min_p" + }, + "link": null + }, + { + "localized_name": "repetition_penalty", + "name": "sampling_mode.repetition_penalty", + "type": "FLOAT", + "widget": { + "name": "sampling_mode.repetition_penalty" + }, + "link": null + }, + { + "localized_name": "seed", + "name": "sampling_mode.seed", + "type": "INT", + "widget": { + "name": "sampling_mode.seed" + }, + "link": null + }, + { + "localized_name": "presence_penalty", + "name": "sampling_mode.presence_penalty", 
+ "shape": 7, + "type": "FLOAT", + "widget": { + "name": "sampling_mode.presence_penalty" + }, + "link": null + }, + { + "localized_name": "thinking", + "name": "thinking", + "shape": 7, + "type": "BOOLEAN", + "widget": { + "name": "thinking" + }, + "link": null + }, + { + "localized_name": "use_default_template", + "name": "use_default_template", + "shape": 7, + "type": "BOOLEAN", + "widget": { + "name": "use_default_template" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "generated_text", + "name": "generated_text", + "type": "STRING", + "links": [ + 46, + 84 + ] + } + ], + "properties": { + "Node name for S&R": "TextGenerate", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "", + 256, + "on", + 0.7, + 64, + 0.95, + 0.05, + 1.05, + 0, + 0, + false, + true + ] + }, + { + "id": 31, + "type": "PrimitiveStringMultiline", + "pos": [ + -390, + 160 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "STRING", + "widget": { + "name": "value" + }, + "link": 68 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 47, + 53 + ] + } + ], + "title": "User: short description (USER_INPUT in template)", + "properties": { + "Node name for S&R": "PrimitiveStringMultiline" + }, + "widgets_values": [ + "" + ] + }, + { + "id": 43, + "type": "CustomCombo", + "pos": [ + 140, + 910 + ], + "size": [ + 550, + 320 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "localized_name": "choice", + "name": "choice", + "type": "COMBO", + "widget": { + "name": "choice" + }, + "link": 78 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 65 + ] + }, + { 
+ "localized_name": "INDEX", + "name": "INDEX", + "type": "INT", + "links": null + } + ], + "title": "Custom Combo (Category index)", + "properties": { + "Node name for S&R": "CustomCombo" + }, + "widgets_values": [ + "Music", + 0, + "Music", + "Instrument", + "SFX", + "One-shot", + "" + ] + }, + { + "id": 49, + "type": "JsonExtractString", + "pos": [ + 720, + 1200 + ], + "size": [ + 300, + 180 + ], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { + "localized_name": "json_string", + "name": "json_string", + "type": "STRING", + "widget": { + "name": "json_string" + }, + "link": null + }, + { + "localized_name": "key", + "name": "key", + "type": "STRING", + "widget": { + "name": "key" + }, + "link": 65 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 66 + ] + } + ], + "properties": { + "Node name for S&R": "JsonExtractString" + }, + "widgets_values": [ + "{\n \"Music\": \"You are an expert musician and musicologist and prompt engineer. Transform the user's input into a detailed, vivid music prompt for a full instrumental track.\\n\\n1. Start with the genre or style and optional adjectives (e.g., upbeat, dreamy, aggressive).\\n2. List the main instruments that define the track.\\n3. Add supporting elements or layers such as pads, harmonics, effects, or field recordings.\\n4. Include rhythm or percussion elements like drums, hi-hats, congas, brushes, or polyrhythms.\\n5. Integrate mood and energy naturally in the sentence (e.g., \\\"creating suspenseful tension\\\" or \\\"bright and uplifting\\\").\\n6. Specify the BPM.\\n7. Specify the track length as an integer in seconds. Use ranges: energetic/dance 120-180s, pop/rock 180-210s, cinematic/ambient 240-300s.\\n8. Combine all elements into one natural, fluid sentence. Avoid semicolons.\\n\\nTemplate:\\nGenre/Style with main instruments, supporting instruments/layers, and rhythm/percussion creating mood/energy. BPM: X. 
Length: Y seconds\\n\\nExamples:\\n- Jazz ballad with smooth saxophone lead, piano chords, upright bass, brushed drums, and soft strings that swing gently for a warm and cozy evening. BPM: 85. Length: 180 seconds\\n- EDM festival track with pulsing synth leads, plucked arpeggios, layered pads, side-chained bass, punchy kick and snare, and hi-hat rolls creating bright, energetic, and uplifting dance energy. BPM: 128. Length: 150 seconds\\n- Lo-fi hip-hop chill track with mellow electric piano, soft vinyl crackle, subtle synth pads, low-pass filtered drums, percussion loops, and soft plucked bass for a relaxed, dreamy vibe. BPM: 75. Length: 150 seconds\\n- Heavy metal anthem with distorted electric guitars, bass guitar, double bass drums, and cymbal crashes with fast palm-muted riffs creating intense, aggressive energy. BPM: 160. Length: 180 seconds\\n- Melancholic piano piece with soft piano lead, string pads, subtle atmospheric synths, and minimal brush percussion evoking a reflective rainy-day feeling. BPM: 60. Length: 240 seconds\\n- Suspenseful electronic thriller with pulsing bass synth, arpeggiated lead synth, cinematic pads, glitchy percussion, and high string stabs creating dark and tense energy. BPM: 100. Length: 200 seconds\\n- Dreamy ambient soundscape with layered pads, soft bell textures, gentle drones, and wind and water field recordings for ethereal and spacious meditation. BPM: 40. Length: 300 seconds\\n- Fingerpicking acoustic guitar solo with harmonics, subtle reverb, occasional shaker and soft stomp percussion, and soft pad layers for warm intimate storytelling. BPM: 70. Length: 120 seconds\\n- Synthwave 80s retro track with arpeggiated synth leads, analog pads, electric bass, punchy electronic drums, gated reverb snares, and atmospheric FX for nostalgic and vibrant energy. BPM: 110. Length: 180 seconds\\n- Tribal percussion ensemble with congas, djembes, bongos, shakers, and frame drums layered with deep synthetic sub-bass in complex polyrhythms. 
BPM: 100. Length: 140 seconds\\n- 1920s swing jazz with brass section, upright bass, piano, brushed drums, banjo, clarinet, and soft strings that swing lively for energetic dance vibes. BPM: 110. Length: 180 seconds\\n- Futuristic electronic sci-fi track with pulsing bass synth, evolving lead synths, layered pads, glitch percussion, robotic FX, and sub-bass for tense cinematic energy. BPM: 125. Length: 200 seconds\\n- Ambient underwater soundscape with flowing water textures, soft piano motifs, synth drones, distant bells, and underwater reverb for spacious meditative immersion. BPM: 45. Length: 300 seconds\\n- Horror cinematic track with dissonant strings, eerie piano stabs, cinematic percussion including taiko and low toms, and synth FX producing suspenseful creepy tension. BPM: 90. Length: 240 seconds\\n- Reggae track with offbeat guitar, warm basslines, snare, kick, congas, and horn stabs giving laid-back groovy energy. BPM: 85. Length: 150 seconds\\n- Blues track with soulful electric guitar solos, walking bass, piano, and shuffle drums creating expressive and emotive storytelling. BPM: 90. Length: 180 seconds\\n- Latin salsa with congas, timbales, horns, piano montunos, bass, and layered percussion for vibrant danceable energy. BPM: 120. Length: 210 seconds\\n- Afrobeat track with electric guitar stabs, horns, layered percussion, congas, shakers, bass groove, and synth pads for vibrant rhythmic energy. BPM: 105. Length: 200 seconds\\n- Indie rock track with electric guitar riffs, bass, live drum kit, layered synths, and subtle strings for energetic yet emotional feel. BPM: 110. Length: 180 seconds\\n- Funk groove with slap bass, electric guitar chords, brass stabs, drums, congas, and rhythmic keyboards creating high-energy danceable rhythm. BPM: 105. Length: 180 seconds\\n- Drum and bass track with fast breakbeat drums, deep sub-bass, sharp synth leads, pads, and atmospheric FX for high-energy club motion. BPM: 175. 
Length: 150 seconds\\n- Dark ambient track with drones, distant bells, low rumbles, soft wind textures, and synth pads producing eerie immersive tension. BPM: 50. Length: 300 seconds\\n- Tropical house track with marimba, steel drums, soft synths, smooth bass, layered percussion, and light piano riffs for sunny chill dance vibes. BPM: 110. Length: 180 seconds\\n- Progressive rock track with electric guitar leads, organ, bass, drum kit, synth layers, and occasional strings for epic layered energy. BPM: 100. Length: 220 seconds\\n- Music box melody with delicate metallic tones and soft resonance, lullaby style, with gentle ambient reverb. BPM: 60. Length: 20 seconds\\n- Soft piano arpeggio with warm felted tone and slow attack, lullaby style, with intimate room ambience. BPM: 60. Length: 30 seconds\\n- Harp gentle plucked pattern with airy resonance, lullaby style, with dreamy reverb tail. BPM: 65. Length: 25 seconds\\n- Acoustic guitar fingerstyle pattern with warm nylon strings and soft dynamics, lullaby style, with subtle room resonance. BPM: 60. Length: 30 seconds\\n- Ambient synth pad with smooth evolving texture and soft harmonics, lullaby style, with wide stereo ambience. BPM: 50. Length: 40 seconds\\n- Early rock piano with walking left-hand bass line, shuffle rhythms, and blues scale improvisations in energetic 1950s boogie-woogie style. BPM: 160. Length: 180 seconds\\n- Trip Hop track with jazzy sampled vibraphone, mid-tempo breakbeat drums, harp, Latin ethnic percussion, and sweeping cinematic strings creating airy, relaxing, soulful lounge vibes. BPM: 90. Length: 180 seconds\\n- Country outlaw cinematic instrumental with blues pedal steel guitar, rustic mandolin, fiddle call-and-response, tape-driven rattly drum kit, autoharp, and soaring accordion solo for raw, emotional southern blues expression. BPM: 85. 
Length: 200 seconds\\n- Neo Classical track with sweeping string section, elegant horns, and delicate piano creating soothing, hypnotic, modern, soft, and classic mood. BPM: 70. Length: 180 seconds\\n- Art Rock desert track with desolate piano chords, western-themed rhythm guitars, unique lead guitars, rattly vintage drum kit, and supporting bass creating lonely, expansive, beautiful, and strange atmospheres. BPM: 95. Length: 180 seconds\\n- Cinematic Sci-Fi score with dramatic horn section, building marcato strings, gliding bassoon, thunderous cymbals, subdued timpani, and subtle synth drones producing awe-inspiring, uplifting, epic intergalactic energy. BPM: 100. Length: 220 seconds\\n- West Coast Hip Hop instrumental with cascading harp melodies, smooth Rhodes piano chops, vintage boom bap drums, and walking double bass producing raw, street, and soulful block-party vibes. BPM: 92. Length: 180 seconds\\n- Synthwave futuristic track with pulsating synth bass, exciting chords, soaring leads, and reverberating drum machine patterns creating gritty, pounding, and cool energy. BPM: 110. Length: 180 seconds\\n- Breakbeat track with complex percussion, intricate breakbeats, gritty synths, lush pads, and 808 bassline producing fresh, modern, futuristic, and rave-ready energy. BPM: 140. Length: 160 seconds\\n- Lounge Jazz 1960s smooth track with laid-back drums, piano chords, double bass, soft electric piano, subtle flute, and unique percussion creating beautiful, atmospheric, eclectic, retro, and chill vibes. BPM: 85. Length: 180 seconds\\n- Latin Jazz 1950s blissful track with laid-back Latin drums, euphoric piano chords, double bass, orchestral accompaniment, acoustic guitar, and vibraphone producing nostalgic, beautiful, atmospheric, cinematic, and chill mood. BPM: 95. 
Length: 180 seconds\\n- Acid Jazz 1970s summertime track with smooth electric piano, trippy synth leads, laid-back vintage drum kit, fuzzy electric bass, and uplifting violin producing retro, psychedelic, jazzy, relaxing energy. BPM: 100. Length: 180 seconds\\n- Progressive Soul 1970s track with feel-good piano, psychedelic organ, groovy vintage drum kit with percussion, fuzzy electric bass, and synth strings producing retro, raw, soulful, joyous atmosphere. BPM: 90. Length: 180 seconds\\n- Discotheque 1970s French-inspired track with sultry piano, psychedelic guitars, groovy drum kit, fuzzy electric bass, and melancholic organ producing retro, raw, laid-back, and relaxing mood. BPM: 105. Length: 180 seconds\\n- Soul Jazz 1970s track with expressive saxophone, smooth piano, groovy drum kit, rhythmic upright bass, sweeping strings, and minimal vibraphone producing retro, raw, laid-back, and epic energy. BPM: 95. Length: 180 seconds\\n- Vintage R&B 1970s live studio track with subtle brass, smooth piano, sweeping strings, and minimal drums producing retro, beautiful, uplifting, nostalgic mood. BPM: 85. Length: 180 seconds\\n- 50s Pop track with Latin influence, string section, bold brass, vibraphone, acoustic guitar, flute, ethnic percussion, and brushed drums creating sexy, epic, vintage, retro, melancholic, jazzy, dramatic energy. BPM: 100. Length: 180 seconds\\n- A piece of calm, quiet, mellow, serene music perfect for a peaceful film score, featuring soft modulating piano, ambient sfx and foley, beautiful vibraphone, and subtle synthesizer drones. The mood is cinematic, thoughtful, serene and nostalgic. BPM: 55. Length: 300 seconds\",\n \"Instrument\": \"You are a music metadata expert. Given an instrument, generate a descriptive prompt for a generative audio model.\\n\\n1. Identify the instrument.\\n2. Add playing style or technique.\\n3. Include details about material, timbre, or texture.\\n4. Add musical style or mood. 
Specify the genre, context, or emotional character.\\n5. Add spatial or production qualities.\\n6. Specify BPM: Always include a BPM appropriate to the style and context.\\n7. Specify length: Provide an integer in seconds (6–20 s for loops, 20–180 s for stems).\\n\\nExamples:\\n- Synth arpeggio loop with bright detuned oscillators. BPM: 120. Length: 8 seconds\\n- Chord stab loop with sharp percussive attack. BPM: 90. Length: 6 seconds\\n- Guitar muted strum loop with tight rhythmic feel. BPM: 100. Length: 8 seconds\\n- Pluck sequence loop with bright resonant tone. BPM: 128. Length: 10 seconds\\n- Marimba and vibraphone percussive loop with resonant wooden and metallic tones. BPM: 110. Length: 12 seconds\\n- Drum loop with deep muffled kick on beat one, snappy rimshot snare on beats two and four with rolling ghost note fills, and tight closed hi-hats with subtle open accents. BPM: 85. Length: 10 seconds\\n- Drum groove loop with brushed snare swinging on the ride, soft feathered kick on downbeats, and light closed hi-hat taps on the upbeats. BPM: 130. Length: 12 seconds\\n- Kick and hi-hat loop with four-on-the-floor punchy kick, tight closed hi-hats on every eighth note, and a sharp dry snare on beats two and four. BPM: 130. Length: 15 seconds\\n- Vinyl crackle drum loop with warm low-pass filtered kick, dusty snare with tape saturation, and shuffled closed hi-hats with subtle vinyl crackle ambiance. BPM: 80. Length: 10 seconds\\n- Ambient pad loop with evolving texture. BPM: 80. Length: 12 seconds\\n- Melodic synth bass groove loop with pumping sidechain feel. BPM: 122. Length: 10 seconds\\n- Melodic Bass slap and pop rhythm loop. BPM: 100. Length: 8 seconds\\n- Acoustic bass walking line loop with natural wooden resonance. BPM: 120. Length: 12 seconds\\n- String pizzicato motif loop, suspenseful, with tight string texture. BPM: 90. Length: 8 seconds\\n- Brass staccato riff loop with sharp bright attack. BPM: 130. 
Length: 10 seconds\\n- Flute airy melodic loop with wooden headjoint resonance. BPM: 100. Length: 6 seconds\\n- Pan flute ambient loop with breathy timbre. BPM: 75. Length: 8 seconds\\n- Clarinet riff loop with warm smooth reed tone. BPM: 120. Length: 10 seconds\\n- Oboe motif loop, orchestral, with rich double reed resonance. BPM: 80. Length: 8 seconds\\n- Recorder Renaissance motif loop with soft wooden timbre. BPM: 100. Length: 6 seconds\\n- Electric sitar riff loop with buzzing resonant tone. BPM: 90. Length: 10 seconds\\n- Koto plucked motif loop with resonant wooden strings. BPM: 90. Length: 8 seconds\\n- Shamisen folk melody loop with percussive twang. BPM: 100. Length: 8 seconds\\n- Banjo fingerpicking loop with metallic string resonance. BPM: 110. Length: 10 seconds\\n- Mandolin tremolo loop with crisp wooden body tone. BPM: 120. Length: 10 seconds\\n- Acoustic guitar chord vamp loop with natural room resonance. BPM: 110. Length: 12 seconds\\n- Nylon string guitar arpeggio loop with warm, soft timbre. BPM: 90. Length: 15 seconds\\n- Electric guitar riff loop with driven distorted tone. BPM: 130. Length: 10 seconds\\n- Slide guitar melody loop with warm resonant glide. BPM: 100. Length: 12 seconds\\n- Steel guitar slide loop with bright pedal steel tone. BPM: 95. Length: 12 seconds\\n- Harpsichord arpeggio loop with crisp plucked attack. BPM: 120. Length: 10 seconds\\n- Rhodes chord vamp loop with warm electric piano tone. BPM: 100. Length: 12 seconds\\n- Clavinet funky rhythm loop. BPM: 105. Length: 10 seconds\\n- Organ chord vamp loop with full drawbar warmth. BPM: 90. Length: 12 seconds\\n- Drum loop with booming 808 kick on beat one, crisp snare on beat three, and rapid triplet hi-hat rolls with open hat accents for aggressive high-energy feel. BPM: 140. Length: 8 seconds\\n- Breakbeat drum loop with chopped Amen-style snare flurries, driving kick on the one, fast sixteenth-note closed hi-hats, and syncopated open hat accents. BPM: 170. 
Length: 10 seconds\\n- Glitch percussion loop with stuttered kick transients, randomised snare hits processed with bit-crushing, and erratic hi-hat patterns with pitch-shifted metallic ticks. BPM: 120. Length: 12 seconds\\n- Metallic hits loop with distorted kick impacts, processed metal-plate snare slams, and grinding hi-hat noise bursts for aggressive mechanical texture. BPM: 120. Length: 10 seconds\\n- Timpani hits loop, cinematic, with deep resonant kick-like timpani strikes on beat one, rolling snare-style timpani fills, and no hi-hats for a grand orchestral feel. BPM: 70. Length: 8 seconds\\n- Snare roll loop, dramatic, with accelerating snare drum rolls building from soft to crashing, deep supporting kick pulses, and no hi-hats for maximum impact. BPM: 100. Length: 8 seconds\\n- Accordion motif loop with bright reedy bellows tone. BPM: 100. Length: 10 seconds\\n- Harmonica blues riff loop with expressive reed timbre. BPM: 90. Length: 10 seconds\\n- Trombone riff loop with warm sliding brass tone. BPM: 120. Length: 10 seconds\\n- French horn melodic loop, cinematic. BPM: 80. Length: 12 seconds\\n- Soprano sax ballad loop. BPM: 70. Length: 12 seconds\\n- Alto sax bebop riff loop. BPM: 200. Length: 10 seconds\\n- Electric violin melodic loop with reverb. BPM: 90. Length: 10 seconds\\n- String pad loop with cinematic texture. BPM: 70. Length: 15 seconds\\n- Granular synth evolving texture loop. BPM: 90. Length: 15 seconds\\n- Piano motif loop with soft felt hammer tone. BPM: 80. Length: 10 seconds\\n- Pad and synth loop with lush detuned shimmer. BPM: 85. Length: 12 seconds\\n- Synth lead loop with sidechain pumping compression. BPM: 128. Length: 10 seconds\\n- Analog synth bassline loop with deep warm low-end. BPM: 122. Length: 12 seconds\\n- FM synth lead motif loop with bright metallic shimmer. BPM: 110. Length: 10 seconds\\n- Bass groove loop with tight rhythmic two-bar pattern. BPM: 100. 
Length: 16 seconds\\n- Acoustic guitar fingerstyle motif loop with warm wood resonance. BPM: 90. Length: 45 seconds\\n- Sombre acoustic guitar motif loop with cavernous reverb, delicate fingerpicking, and expressive melancholic tone. BPM: 70. Length: 45 seconds\\n- Electric guitar rock riff motif loop. BPM: 130. Length: 40 seconds\\n- Vintage electric guitar motif loop, live-recorded in a vintage studio, with expressive and dynamic solo performance. BPM: 90. Length: 40 seconds\\n- Piano chord progression motif loop with rich harmonic movement. BPM: 120. Length: 60 seconds\\n- String ensemble cinematic motif loop with rich wooden resonance. BPM: 80. Length: 120 seconds\\n- Brass ensemble cinematic motif loop with bright metallic timbre. BPM: 90. Length: 90 seconds\\n- Ethnic percussion ensemble motif loop with deep resonant djembe kick tones, slapped snare-like rim hits on congas, and layered shakers and bells providing hi-hat-like rhythmic texture with polyrhythmic patterns. BPM: 100. Length: 90 seconds\\n- Synth ambient motif loop with evolving textures. BPM: 80. Length: 180 seconds\\n- Motif loop with warm dusty vinyl crackle and tape saturation. BPM: 80. Length: 60 seconds\\n- Synth lead and bass motif loop with bright punchy energy. BPM: 128. Length: 90 seconds\\n- Funk band motif loop: bass, drums, guitar. BPM: 100. Length: 90 seconds\\n- Ethnic flute motif for cinematic use. BPM: 80. Length: 30 seconds\\n- Steel drum melodic motif loop with bright metallic resonance. BPM: 110. Length: 20 seconds\\n- Marimba percussive motif loop with resonant wooden tone. BPM: 100. Length: 20 seconds\\n- Vibraphone melodic motif loop with metallic shimmer. BPM: 90. Length: 25 seconds\\n- Piano cinematic motif loop with resonant wooden tone. BPM: 80. Length: 30 seconds\\n- Violin expressive cinematic motif loop with rich wooden resonance. BPM: 75. Length: 25 seconds\\n- Cello expressive motif loop with deep wooden resonance. BPM: 70. 
Length: 30 seconds\\n- Trumpet expressive motif loop with brassy overtones. BPM: 100. Length: 25 seconds\\n- Sax expressive motif loop with warm reed timbre. BPM: 95. Length: 25 seconds\\n- Ethnic drum ensemble motif loop with booming natural-skin bass drum kicks, sharp hand-slap snare accents on djembes and talking drums, and layered wooden and metal percussion providing rhythmic hi-hat-like patterns. BPM: 95. Length: 30 seconds\\n- Ambient drone motif loop. BPM: 60. Length: 180 seconds\\n- Orchestral tension motif loop. BPM: 90. Length: 150 seconds\\n- Electronic track motif loop with drums, bass, synth. BPM: 128. Length: 180 seconds\",\n \"SFX\": \"You are a professional sound design expert. Convert the user's input into a precise, vivid sound effects description suitable for generative audio models.\\n\\nDescribe clearly:\\n- Sound source\\n- Physical character (texture, timbre, material: metal, wood, glass, concrete, etc.)\\n- Spatial qualities (indoor/outdoor, cave/open field/underwater, dry/reverberant, close-up/distant, echoing/muffled)\\n- Temporal evolution (attack, decay, movement, transitions over time)\\n- Include motion or spatial movement if applicable (passing, approaching, stereo movement)\\n\\nAudio length rules:\\n- Very short sounds (impacts, clicks, gunshots): 1–3 seconds\\n- Medium actions (footsteps, object movement, transitions): 3–6 seconds\\n- Ambience / environments: 6–15 seconds\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nOutput constraints:\\n- Length: 1–2 dense sentences maximum\\n- Output ONLY the final rewritten prompt\\n- No explanations, no formatting, no quotes\\n- Use concise but dense technical language\\n- Focus strictly on sound effects or ambience\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nQuality guidelines:\\n- Be specific and avoid vague terms\\n- Prioritize clarity and realism\\n- Combine elements into one coherent scene\\n- Avoid redundancy\\n\\nExamples:\\n- Heavy 
rain hitting a metal roof during a thunderstorm, distant thunder rumbles, stereo, realistic ambience. Length: 45 seconds\\n- Quiet forest at dawn with birds chirping, soft wind through leaves, distant stream flowing. Length: 60 seconds\\n- Busy city street at night, cars passing, muffled conversations, occasional horn, urban ambience. Length: 50 seconds\\n- Ocean waves crashing against rocky cliffs, strong wind, dramatic and cinematic. Length: 70 seconds\\n- Wooden door creaking open slowly in an old house, echoing interior, eerie tone. Length: 3 seconds\\n- Glass bottle shattering on concrete, sharp impact, scattered fragments. Length: 2 seconds\\n- Footsteps on gravel, steady walking pace, close perspective. Length: 8 seconds\\n- Typing rapidly on a mechanical keyboard, crisp tactile clicks. Length: 5 seconds\\n- Punch impact with deep bass hit, cinematic trailer style. Length: 2 seconds\\n- Car speeding past at high velocity, doppler effect, realistic whoosh. Length: 3 seconds\\n- Object falling from height and hitting ground with a heavy thud. Length: 2 seconds\\n- Sword swing whooshing through air, fast motion, clean metallic tone. Length: 2 seconds\\n- Futuristic laser blast, clean energy pulse, high-tech sound design. Length: 1 second\\n- Spaceship engine humming, low frequency rumble, interior perspective. Length: 90 seconds\\n- Magical spell casting, shimmering particles, rising tonal energy. Length: 8 seconds\\n- Teleportation effect, glitchy digital distortion with a soft whoosh. Length: 5 seconds\\n- Dark eerie drone with distant whispers, creepy, slow build tension. Length: 120 seconds\\n- Sudden horror jump scare sting, sharp violin hit, cinematic. Length: 1 second\\n- Metal scraping slowly in a dark tunnel, echoing and ominous. Length: 20 seconds\\n- Explosion with debris scattering, deep bass, cinematic realism. Length: 4 seconds\\n- Building collapsing, rumbling concrete, dust and debris falling. 
Length: 25 seconds\\n- Fire crackling intensely, wood burning, close-up detail. Length: 80 seconds\\n- Gunshot in a large empty warehouse, loud echo decay. Length: 2 seconds\\n- Retro arcade coin insert sound, 8-bit style. Length: 1 second\\n- Level up chime, bright, rewarding, fantasy RPG style. Length: 2 seconds\\n- Error buzzer, short, digital, UI feedback. Length: 1 second\\n- Menu navigation clicks, soft futuristic interface sounds. Length: 3 seconds\\n- Layered soundscape: rain, thunder, footsteps, and distant sirens all blending naturally. Length: 90 seconds\\n- Rapid sequence of three impacts: metal hit, glass break, wood crack, spaced evenly. Length: 4 seconds\\n- Sound moving from left to right stereo field: passing motorcycle. Length: 5 seconds\\n- Close vs far perspective transition: footsteps approaching then fading away. Length: 6 seconds\\n- Tape stop sub drop, a massive sub-bass note that mimics a vinyl record or tape machine being turned off, the pitch and speed drop simultaneously, causing the high-end harmonics to smear and thicken as the sound grinds to a halt at a sub-sonic frequency. Length: 11 seconds\\n- Gravel and leaves footsteps, the sound of a hard boot stepping onto dry leaves or gravel, crisp and natural with detailed texture. Length: 11 seconds\\n- Ghostship moan, a massive, deep wooden groan with a low-frequency moan, like heavy timber under immense structural tension, swaying slowly, processed with long, dark wooden room reverb for a sense of scale. Length: 11 seconds\\n- Bicycle chain, a continuous metallic whirring sound of a chain moving over sprockets, with individual teeth catching the links, processed with resonant band-pass filter to emphasize metallic singing. Length: 11 seconds\\n- Warp drive, a sound that starts with a massive suck-back of ambient noise, followed by a supersonic crack and high-pitched zing that disappears into the distance, giving the sense of stretching space-time. 
Length: 11 seconds\\n- Ice cubes, high-pitched musical clinking of hard ice hitting a thin glass, bright resonant ring with subtle liquid sloshing around the edges. Length: 11 seconds\\n- Paper shuffle, the sound of a thick stack of heavy bond paper being squared up on a desk, dry papery thud with a quick fanning sound as air moves between the pages. Length: 11 seconds\\n- Drawer slam, a blunt, powerful thud made by slamming a wooden desk drawer shut, pronounced low-mid body, slightly distorted for aggressive character. Length: 3 seconds\",\n \"One-shot\": \"You are a music metadata expert. Given an instrument or sound, generate a descriptive prompt for a short, isolated one-shot audio sample for music production.\\n\\n1. Identify the instrument or sound source.\\n2. Describe the playing technique or hit type (e.g., pluck, slam, tap, stab).\\n3. Include details about material, timbre, or texture.\\n4. Add spatial or production qualities (dry/wet, room, close-mic).\\n5. Specify length: short integer in seconds (1–11 s).\\n\\nExamples:\\n- Piano key hit with bright percussive attack and resonant wooden body. Length: 2 seconds\\n- Kick drum punchy low-end hit with warm skin resonance. Length: 2 seconds\\n- Snare drum rimshot accent with crisp snare wires. Length: 2 seconds\\n- Acoustic guitar fingerstyle note with warm spruce tone. Length: 3 seconds\\n- Bass pluck with jazzy tone and resonant wooden body. Length: 3 seconds\\n- Electric guitar power chord with distortion. Length: 3 seconds\\n- Metallic glitch percussion hit with sharp metallic texture. Length: 2 seconds\\n- Tabla resonant tone hit with natural skin timbre. Length: 2 seconds\\n- Djembe slap accent with dry wooden resonance. Length: 2 seconds\\n- Synth stab with reverb tail. Length: 3 seconds\\n- Violin expressive note with vibrato and rich wooden resonance. Length: 3 seconds\\n- Cello legato note, cinematic, with warm resonant body. 
Length: 3 seconds\\n- Trumpet bright accent with slightly brassy overtones. Length: 2 seconds\\n- Melodic saxophone jazz riff with smooth reed timbre and a slight vibrato bend. Length: 3 seconds\\n- Harp pluck with airy tone and resonant strings. Length: 2 seconds\\n- Glockenspiel bell-like note with bright metallic clarity. Length: 2 seconds\\n- Metallic clang sound design hit. Length: 2 seconds\\n- Granular texture hit. Length: 3 seconds\\n- Reversed piano hit. Length: 2 seconds\\n- Synth riser effect. Length: 6 seconds\\n- Percussion impact hit. Length: 2 seconds\\n- Cinematic hit. Length: 2 seconds\\n- Dry clap, a crisp, natural single hand clap recorded in a dead room with an extremely sharp transient and no room reflections. Length: 1 second\\n- Studio hat, a classic, natural recording of 14-inch hi-hats played tightly closed, zero ring, very fast decay. Length: 1 second\\n- Disco open hat, bright 14-inch open hi-hat with long, shimmering decay, perfect for disco or dance grooves. Length: 1 second\\n- Pillow kick, acoustic kick drum muffled with a heavy blanket, producing a short, dry \\\"thump\\\" with almost zero resonance. Length: 1 second\\n- Short 808, punchy 808 kick with sharp, distorted transient and fast-decaying sub-tail. Length: 1 second\\n- Egg shaker, classic plastic egg shaker recorded with a small-diaphragm condenser mic, producing a light, consistent \\\"tick\\\" with very short sustain. Length: 1 second\\n- African drums, dynamic African drums and percussion ensemble with natural acoustic textures. Length: 3 seconds\\n- Latin drums, dynamic Latin drums and percussion ensemble featuring authentic rhythmic patterns. Length: 3 seconds\\n- String quartet, euphoric string quartet with dynamic and emotional playing, full of expressive harmonies and movement. Length: 3 seconds\\n- Piano, nostalgic, atmospheric piano piece with dynamic and emotional performance, intimate and resonant. 
Length: 3 seconds\\n- Analogue drift pad, warm polyphonic pad with three detuned oscillators (saw + triangle), subtle pitch drift, and lush bucket-brigade chorus for wide, nostalgic stereo image. Length: 11 seconds\\n- Phase distortion bass, Casio CZ-style phase-distorted sine wave warped into a jagged sawtooth for retro synth bass tone. Length: 11 seconds\\n- Vibrato saxophone, bright lyrical alto sax with fast fluttery vibrato, reedy vintage tone, captured with ribbon mic for warm nostalgic sound. Length: 11 seconds\\n- Lofi upright bass, upright bass recorded with ribbon mic in a wooden room, natural air with slightly boxy resonance, tape-saturated for dusty 1950s jazz feel. Length: 2 seconds\"\n}", + "Music" + ] + }, + { + "id": 40, + "type": "StringReplace", + "pos": [ + 1350, + 900 + ], + "size": [ + 260, + 280 + ], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 59 + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 58 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 60 + ] + } + ], + "title": "Text Replace (AUDIO LENGTH)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "", + "AUDIO_LENGTH", + "" + ] + }, + { + "id": 38, + "type": "StringReplace", + "pos": [ + 720, + 900 + ], + "size": [ + 290, + 280 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": null + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" + }, + "link": null + }, + { + 
"localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 66 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 52 + ] + } + ], + "title": "Text Replace (PROMPT TEMPLATE)", + "properties": { + "Node name for S&R": "StringReplace" + }, + "widgets_values": [ + "SYSTEM_PROMPTS\n\nInput: USER_INPUT\nTarget audio length: AUDIO_LENGTH seconds.\nOutput:", + "SYSTEM_PROMPTS", + "" + ] + }, + { + "id": 35, + "type": "PrimitiveBoolean", + "pos": [ + -390, + 570 + ], + "size": [ + 400, + 100 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 48 + ] + } + ], + "title": "Boolean (Enable_Reprompt)", + "properties": { + "Node name for S&R": "PrimitiveBoolean" + }, + "widgets_values": [ + true + ] + }, + { + "id": 36, + "type": "PrimitiveFloat", + "pos": [ + -390, + 410 + ], + "size": [ + 400, + 110 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { + "name": "value" + }, + "link": 82 + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 50, + 56 + ] + } + ], + "title": "Float (Duration)", + "properties": { + "Node name for S&R": "PrimitiveFloat" + }, + "widgets_values": [ + 150 + ] + }, + { + "id": 25, + "type": "CheckpointLoaderSimple", + "pos": [ + 100, + 130 + ], + "size": [ + 440, + 190 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": 
"MODEL", + "links": [ + 30 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 39 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "stable_audio_3_medium.safetensors", + "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/checkpoints/stable_audio_3_medium.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "stable_audio_3_medium.safetensors" + ] + }, + { + "id": 26, + "type": "CLIPLoader", + "pos": [ + 100, + 390 + ], + "size": [ + 440, + 170 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 80 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 34, + 35 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "t5gemma_b_b_ul2.safetensors", + "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/text_encoders/t5gemma_b_b_ul2.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 
"t5gemma_b_b_ul2.safetensors", + "stable_audio", + "default" + ] + }, + { + "id": 54, + "type": "PreviewAny", + "pos": [ + 1720, + 1580 + ], + "size": [ + 420, + 550 + ], + "flags": {}, + "order": 20, + "mode": 4, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 84 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": null + } + ], + "properties": { + "Node name for S&R": "PreviewAny" + }, + "widgets_values": [ + null, + null, + null + ] + } + ], + "groups": [ + { + "id": 1, + "title": "Loaders: checkpoint & CLIP", + "bounding": [ + 80, + 50, + 485.721654232725, + 527.2848777754299 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "CLIP encode: conditioning", + "bounding": [ + 600, + 60, + 470, + 510 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "User inputs: prompt & duration", + "bounding": [ + -400, + 10, + 430, + 740 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 7, + "title": "Reprompt: full branch (template + LLM)", + "bounding": [ + 60, + 780, + 1630, + 1360 + ], + "color": "#444", + "flags": {} + }, + { + "id": 4, + "title": "Reprompt: JSON extract & template fills", + "bounding": [ + 120, + 820, + 1520, + 650 + ], + "color": "#444", + "flags": {} + }, + { + "id": 5, + "title": "Helpers: duration to string", + "bounding": [ + 1340, + 1180, + 280, + 250 + ], + "color": "#444", + "flags": {} + }, + { + "id": 6, + "title": "Reprompt: Qwen TextGenerate", + "bounding": [ + 680, + 1510, + 960, + 614.65625 + ], + "color": "#444", + "flags": {} + }, + { + "id": 8, + "title": "Audio generation: Stable Audio", + "bounding": [ + 60, + 10, + 1627.3616782294932, + 737.0545987464304 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 35, + "origin_id": 26, + "origin_slot": 0, + "target_id": 7, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 13, + "origin_id": 3, + "origin_slot": 0, + 
"target_id": 12, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 39, + "origin_id": 25, + "origin_slot": 2, + "target_id": 12, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 50, + "origin_id": 36, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 30, + "origin_id": 25, + "origin_slot": 0, + "target_id": 3, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 4, + "origin_id": 6, + "origin_slot": 0, + "target_id": 3, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 6, + "origin_id": 7, + "origin_slot": 0, + "target_id": 3, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 12, + "origin_id": 11, + "origin_slot": 0, + "target_id": 3, + "target_slot": 3, + "type": "LATENT" + }, + { + "id": 34, + "origin_id": 26, + "origin_slot": 0, + "target_id": 6, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 49, + "origin_id": 34, + "origin_slot": 0, + "target_id": 6, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 47, + "origin_id": 31, + "origin_slot": 0, + "target_id": 34, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 46, + "origin_id": 28, + "origin_slot": 0, + "target_id": 34, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 48, + "origin_id": 35, + "origin_slot": 0, + "target_id": 34, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 56, + "origin_id": 36, + "origin_slot": 0, + "target_id": 41, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 57, + "origin_id": 41, + "origin_slot": 1, + "target_id": 42, + "target_slot": 0, + "type": "INT" + }, + { + "id": 52, + "origin_id": 38, + "origin_slot": 0, + "target_id": 39, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 53, + "origin_id": 31, + "origin_slot": 0, + "target_id": 39, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 40, + "origin_id": 29, + "origin_slot": 0, + "target_id": 28, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 60, + "origin_id": 40, + "origin_slot": 0, + 
"target_id": 28, + "target_slot": 4, + "type": "STRING" + }, + { + "id": 65, + "origin_id": 43, + "origin_slot": 0, + "target_id": 49, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 59, + "origin_id": 39, + "origin_slot": 0, + "target_id": 40, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 58, + "origin_id": 42, + "origin_slot": 0, + "target_id": 40, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 66, + "origin_id": 49, + "origin_slot": 0, + "target_id": 38, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 27, + "origin_id": 12, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 68, + "origin_id": -10, + "origin_slot": 0, + "target_id": 31, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 76, + "origin_id": -10, + "origin_slot": 2, + "target_id": 3, + "target_slot": 4, + "type": "INT" + }, + { + "id": 78, + "origin_id": -10, + "origin_slot": 4, + "target_id": 43, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 5, + "target_id": 25, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": -10, + "origin_slot": 6, + "target_id": 26, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 81, + "origin_id": -10, + "origin_slot": 7, + "target_id": 29, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 82, + "origin_id": -10, + "origin_slot": 1, + "target_id": 36, + "target_slot": 0, + "type": "FLOAT" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 3, + "target_id": 35, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 84, + "origin_id": 28, + "origin_slot": 0, + "target_id": 54, + "target_slot": 0, + "type": "STRING" + } + ], + "extra": {}, + "category": "Audio/Music generation", + "description": "Generates music, instrument loops, sound effects, and one-shots from text using Stable Audio 3 Medium, with optional Qwen 3.5 category-based prompt expansion (Music, Instrument, SFX, One-shot)." 
+ } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Canny to Image (Z-Image-Turbo).json b/blueprints/Canny to Image (Z-Image-Turbo).json index 14deb64cc..903d372b1 100644 --- a/blueprints/Canny to Image (Z-Image-Turbo).json +++ b/blueprints/Canny to Image (Z-Image-Turbo).json @@ -1553,7 +1553,7 @@ "VHS_MetadataImage": true, "VHS_KeepIntermediate": true }, - "category": "Image generation and editing/Canny to image", + "category": "Image generation and editing/Conditioned", "description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning." } ] diff --git a/blueprints/Canny to Video (LTX 2.0).json b/blueprints/Canny to Video (LTX 2.0).json index a9682c8a4..ed602b521 100644 --- a/blueprints/Canny to Video (LTX 2.0).json +++ b/blueprints/Canny to Video (LTX 2.0).json @@ -3600,7 +3600,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Video generation and editing/Canny to video", + "category": "Video generation and editing/Conditioned", "description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio." } ] diff --git a/blueprints/ControlNet (Z-Image-Turbo).json b/blueprints/ControlNet (Z-Image-Turbo).json index fbec95a97..160ee11e2 100644 --- a/blueprints/ControlNet (Z-Image-Turbo).json +++ b/blueprints/ControlNet (Z-Image-Turbo).json @@ -1401,7 +1401,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Image generation and editing/ControlNet", + "category": "Image generation and editing/Conditioned", "description": "Generates images from a text prompt and ControlNet conditioning (e.g. depth, canny) using Z-Image-Turbo." 
} ] diff --git a/blueprints/Depth to Image (Z-Image-Turbo).json b/blueprints/Depth to Image (Z-Image-Turbo).json index fe9ef0f72..2790827a3 100644 --- a/blueprints/Depth to Image (Z-Image-Turbo).json +++ b/blueprints/Depth to Image (Z-Image-Turbo).json @@ -1579,7 +1579,7 @@ "VHS_MetadataImage": true, "VHS_KeepIntermediate": true }, - "category": "Image generation and editing/Depth to image", + "category": "Image generation and editing/Conditioned", "description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning." }, { diff --git a/blueprints/Depth to Video (ltx 2.0).json b/blueprints/Depth to Video (ltx 2.0).json index bd51e4476..56912de51 100644 --- a/blueprints/Depth to Video (ltx 2.0).json +++ b/blueprints/Depth to Video (ltx 2.0).json @@ -4233,7 +4233,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Video generation and editing/Depth to video", + "category": "Video generation and editing/Conditioned", "description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio." }, { diff --git a/blueprints/First-Last-Frame to Video (LTX-2.3).json b/blueprints/First-Last-Frame to Video (LTX-2.3).json index f509aefe0..4cae2dc24 100644 --- a/blueprints/First-Last-Frame to Video (LTX-2.3).json +++ b/blueprints/First-Last-Frame to Video (LTX-2.3).json @@ -3350,7 +3350,7 @@ } ], "extra": {}, - "category": "Video generation and editing/First-Last-Frame to Video", + "category": "Video generation and editing/Conditioned", "description": "Generates a video interpolating between first and last keyframes using LTX-2.3." 
} ] diff --git a/blueprints/First-Last-Frame to Video.json b/blueprints/First-Last-Frame to Video.json index 84dfafbcd..d76e1e045 100644 --- a/blueprints/First-Last-Frame to Video.json +++ b/blueprints/First-Last-Frame to Video.json @@ -3350,7 +3350,7 @@ } ], "extra": {}, - "category": "Video generation and editing/First-Last-Frame to Video", + "category": "Video generation and editing/FLF2V", "description": "Generates a video that interpolates between the first and last keyframes using LTX-2.3, including optional audio." } ] diff --git a/blueprints/Geometry Estimation (MoGe).json b/blueprints/Geometry Estimation (MoGe).json new file mode 100644 index 000000000..e6f08bf71 --- /dev/null +++ b/blueprints/Geometry Estimation (MoGe).json @@ -0,0 +1,1266 @@ +{ + "revision": 0, + "last_node_id": 67, + "last_link_id": 0, + "nodes": [ + { + "id": 67, + "type": "936dfaf2-575a-48b5-9e0c-df391319d11f", + "pos": [ + -3950, + 5000 + ], + "size": [ + 430, + 480 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "source_image", + "name": "source_image", + "type": "IMAGE", + "link": null + }, + { + "localized_name": "inference_resolution", + "name": "inference_resolution", + "type": "INT", + "widget": { + "name": "inference_resolution" + }, + "link": null + }, + { + "localized_name": "inference_batch_size", + "name": "inference_batch_size", + "type": "INT", + "widget": { + "name": "inference_batch_size" + }, + "link": null + }, + { + "localized_name": "mesh_frame_index", + "name": "mesh_frame_index", + "type": "INT", + "widget": { + "name": "mesh_frame_index" + }, + "link": null + }, + { + "localized_name": "mesh_decimation", + "name": "mesh_decimation", + "type": "INT", + "widget": { + "name": "mesh_decimation" + }, + "link": null + }, + { + "localized_name": "mesh_gap_threshold", + "name": "mesh_gap_threshold", + "type": "FLOAT", + "widget": { + "name": "mesh_gap_threshold" + }, + "link": null + }, + { + "localized_name": "mesh_texture", + 
"name": "mesh_texture", + "type": "BOOLEAN", + "widget": { + "name": "mesh_texture" + }, + "link": null + }, + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "COMBO", + "widget": { + "name": "moge_model" + }, + "link": null + }, + { + "label": "auto_resize_input", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "mesh", + "name": "mesh", + "type": "MESH", + "links": [] + }, + { + "localized_name": "normal_opengl", + "name": "normal_opengl", + "type": "IMAGE", + "links": [] + }, + { + "localized_name": "normal_directx", + "name": "normal_directx", + "type": "IMAGE", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "55", + "resolution_level" + ], + [ + "55", + "batch_size" + ], + [ + "54", + "batch_index" + ], + [ + "54", + "decimation" + ], + [ + "54", + "discontinuity_threshold" + ], + [ + "54", + "texture" + ], + [ + "58", + "model_name" + ], + [ + "66", + "switch" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Geometry Estimation (MoGe)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "936dfaf2-575a-48b5-9e0c-df391319d11f", + "version": 1, + "state": { + "lastGroupId": 1, + "lastNodeId": 69, + "lastLinkId": 91, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Geometry Estimation (MoGe)", + "inputNode": { + "id": -10, + "bounding": [ + -5130, + 5320, + 167.337890625, + 228 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -3090, + 4966, + 131.51953125, + 108 + ] + }, + "inputs": [ + { + "id": "cc8ce79d-ba20-4a25-a51c-c2afcd35e520", + "name": "source_image", + "type": "IMAGE", + "linkIds": [ + 48, + 55, + 56, + 82 + ], + "localized_name": "source_image", 
+ "pos": [ + -4986.662109375, + 5344 + ] + }, + { + "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52", + "name": "inference_resolution", + "type": "INT", + "linkIds": [ + 73 + ], + "localized_name": "inference_resolution", + "pos": [ + -4986.662109375, + 5364 + ] + }, + { + "id": "616638fe-f603-4d10-bae9-fc87c134380f", + "name": "inference_batch_size", + "type": "INT", + "linkIds": [ + 74 + ], + "localized_name": "inference_batch_size", + "pos": [ + -4986.662109375, + 5384 + ] + }, + { + "id": "fcacfca9-7927-4c38-94da-8ab22256325f", + "name": "mesh_frame_index", + "type": "INT", + "linkIds": [ + 75 + ], + "localized_name": "mesh_frame_index", + "pos": [ + -4986.662109375, + 5404 + ] + }, + { + "id": "acbfe7f9-1b69-42c1-8614-4ccf54b28d4e", + "name": "mesh_decimation", + "type": "INT", + "linkIds": [ + 76 + ], + "localized_name": "mesh_decimation", + "pos": [ + -4986.662109375, + 5424 + ] + }, + { + "id": "cd20f9a7-3a0a-4c4c-98d7-96f423867b87", + "name": "mesh_gap_threshold", + "type": "FLOAT", + "linkIds": [ + 77 + ], + "localized_name": "mesh_gap_threshold", + "pos": [ + -4986.662109375, + 5444 + ] + }, + { + "id": "6f5c15f7-7f77-4fc9-b47b-3514467b06b6", + "name": "mesh_texture", + "type": "BOOLEAN", + "linkIds": [ + 78 + ], + "localized_name": "mesh_texture", + "pos": [ + -4986.662109375, + 5464 + ] + }, + { + "id": "65694805-186e-4181-a721-df8b5af49d31", + "name": "moge_model", + "type": "COMBO", + "linkIds": [ + 79 + ], + "localized_name": "moge_model", + "pos": [ + -4986.662109375, + 5484 + ] + }, + { + "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674", + "name": "switch", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "auto_resize_input", + "pos": [ + -4986.662109375, + 5504 + ] + } + ], + "outputs": [ + { + "id": "3c616ea0-9a4c-4cff-a405-662320229df0", + "name": "mesh", + "type": "MESH", + "linkIds": [ + 34 + ], + "localized_name": "mesh", + "pos": [ + -3066, + 4990 + ] + }, + { + "id": "ff85a763-b7f7-4bcc-9b1d-a4eaf55ad2f9", + "name": "normal_opengl", + 
"type": "IMAGE", + "linkIds": [ + 62 + ], + "localized_name": "normal_opengl", + "pos": [ + -3066, + 5010 + ] + }, + { + "id": "26b3f88a-0ba0-4d4d-9c7d-0ad76106c844", + "name": "normal_directx", + "type": "IMAGE", + "linkIds": [ + 63 + ], + "localized_name": "normal_directx", + "pos": [ + -3066, + 5030 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 54, + "type": "MoGePointMapToMesh", + "pos": [ + -3440, + 5220 + ], + "size": [ + 290, + 200 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 33 + }, + { + "localized_name": "batch_index", + "name": "batch_index", + "type": "INT", + "widget": { + "name": "batch_index" + }, + "link": 75 + }, + { + "localized_name": "decimation", + "name": "decimation", + "type": "INT", + "widget": { + "name": "decimation" + }, + "link": 76 + }, + { + "localized_name": "discontinuity_threshold", + "name": "discontinuity_threshold", + "type": "FLOAT", + "widget": { + "name": "discontinuity_threshold" + }, + "link": 77 + }, + { + "localized_name": "texture", + "name": "texture", + "type": "BOOLEAN", + "widget": { + "name": "texture" + }, + "link": 78 + } + ], + "outputs": [ + { + "localized_name": "MESH", + "name": "MESH", + "type": "MESH", + "links": [ + 34 + ] + } + ], + "properties": { + "Node name for S&R": "MoGePointMapToMesh", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 1, + 0.04, + true + ] + }, + { + "id": 55, + "type": "MoGeInference", + "pos": [ + -3790, + 5180 + ], + "size": [ + 270, + 230 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "MOGE_MODEL", + "link": 58 + }, + { + "localized_name": "image", + "name": "image", + 
"type": "IMAGE", + "link": 81 + }, + { + "localized_name": "resolution_level", + "name": "resolution_level", + "type": "INT", + "widget": { + "name": "resolution_level" + }, + "link": 73 + }, + { + "localized_name": "fov_x_degrees", + "name": "fov_x_degrees", + "type": "FLOAT", + "widget": { + "name": "fov_x_degrees" + }, + "link": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": 74 + }, + { + "localized_name": "force_projection", + "name": "force_projection", + "type": "BOOLEAN", + "widget": { + "name": "force_projection" + }, + "link": null + }, + { + "localized_name": "apply_mask", + "name": "apply_mask", + "type": "BOOLEAN", + "widget": { + "name": "apply_mask" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "links": [ + 33, + 59, + 60 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeInference", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 9, + 0, + 4, + true, + true + ] + }, + { + "id": 58, + "type": "LoadMoGeModel", + "pos": [ + -4180, + 4910 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MOGE_MODEL", + "name": "MOGE_MODEL", + "type": "MOGE_MODEL", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "LoadMoGeModel", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "moge_2_vitl_normal_fp16.safetensors", + "url": 
"https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors", + "directory": "geometry_estimation" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "moge_2_vitl_normal_fp16.safetensors" + ] + }, + { + "id": 59, + "type": "ComfyMathExpression", + "pos": [ + -4720, + 4910 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 49 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": null + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [ + 53 + ] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "a > 2048" + ] + }, + { + "id": 60, + "type": "GetImageSize", + "pos": [ + -4980, + 4910 + ], + "size": [ + 230, + 160 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 49 + ] + }, + { + "localized_name": "height", + 
"name": "height", + "type": "INT", + "links": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 61, + "type": "ResizeImagesByLongerEdge", + "pos": [ + -4650, + 5210 + ], + "size": [ + 310, + 110 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 55 + }, + { + "localized_name": "longer_edge", + "name": "longer_edge", + "type": "INT", + "widget": { + "name": "longer_edge" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 54 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImagesByLongerEdge", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 2048 + ] + }, + { + "id": 62, + "type": "ComfySwitchNode", + "pos": [ + -4180, + 5120 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 56 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 54 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 80 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 63, + "type": "MoGeRender", + "pos": [ + -3430, + 4890 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 59 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 62 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "normal_opengl" + ] + }, + { + "id": 64, + "type": "MoGeRender", + "pos": [ + -3430, + 5050 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 60 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 63 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "normal_directx" + ] + }, + { + "id": 66, + "type": "ComfySwitchNode", + "pos": [ + -4160, + 5340 + ], + "size": [ + 270, + 130 + 
], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 82 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 80 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 81 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true + ] + } + ], + "groups": [ + { + "id": 1, + "title": "auto_resize_if_width_gt_2048", + "bounding": [ + -5000, + 4840, + 690, + 280 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 33, + "origin_id": 55, + "origin_slot": 0, + "target_id": 54, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 58, + "origin_id": 58, + "origin_slot": 0, + "target_id": 55, + "target_slot": 0, + "type": "MOGE_MODEL" + }, + { + "id": 49, + "origin_id": 60, + "origin_slot": 0, + "target_id": 59, + "target_slot": 0, + "type": "INT" + }, + { + "id": 54, + "origin_id": 61, + "origin_slot": 0, + "target_id": 62, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 53, + "origin_id": 59, + "origin_slot": 2, + "target_id": 62, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 59, + "origin_id": 55, + "origin_slot": 0, + "target_id": 63, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 60, + "origin_id": 55, + "origin_slot": 0, + "target_id": 64, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 48, + "origin_id": -10, + "origin_slot": 0, + "target_id": 60, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 55, + "origin_id": -10, + 
"origin_slot": 0, + "target_id": 61, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 56, + "origin_id": -10, + "origin_slot": 0, + "target_id": 62, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 34, + "origin_id": 54, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "MESH" + }, + { + "id": 62, + "origin_id": 63, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 63, + "origin_id": 64, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "IMAGE" + }, + { + "id": 73, + "origin_id": -10, + "origin_slot": 1, + "target_id": 55, + "target_slot": 2, + "type": "INT" + }, + { + "id": 74, + "origin_id": -10, + "origin_slot": 2, + "target_id": 55, + "target_slot": 4, + "type": "INT" + }, + { + "id": 75, + "origin_id": -10, + "origin_slot": 3, + "target_id": 54, + "target_slot": 1, + "type": "INT" + }, + { + "id": 76, + "origin_id": -10, + "origin_slot": 4, + "target_id": 54, + "target_slot": 2, + "type": "INT" + }, + { + "id": 77, + "origin_id": -10, + "origin_slot": 5, + "target_id": 54, + "target_slot": 3, + "type": "FLOAT" + }, + { + "id": 78, + "origin_id": -10, + "origin_slot": 6, + "target_id": 54, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 7, + "target_id": 58, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": 62, + "origin_slot": 0, + "target_id": 66, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 81, + "origin_id": 66, + "origin_slot": 0, + "target_id": 55, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 82, + "origin_id": -10, + "origin_slot": 0, + "target_id": 66, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 8, + "target_id": 66, + "target_slot": 2, + "type": "BOOLEAN" + } + ], + "category": "3D/Geometry Estimation", + "description": "Estimates 3D scene geometry from an input image using MoGe, outputting a mesh plus OpenGL and 
DirectX normal maps.", + "extra": {} + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Image Captioning (gemini).json b/blueprints/Image Captioning (gemini).json index 2fc5d6746..9005e5191 100644 --- a/blueprints/Image Captioning (gemini).json +++ b/blueprints/Image Captioning (gemini).json @@ -310,9 +310,9 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Text generation/Image Captioning", + "category": "Image Tools", "description": "Generates descriptive captions for images using Google's Gemini multimodal LLM." } ] } -} +} \ No newline at end of file diff --git a/blueprints/Image to Depth Map (Lotus).json b/blueprints/Image Depth Estimation (Lotus Depth).json similarity index 92% rename from blueprints/Image to Depth Map (Lotus).json rename to blueprints/Image Depth Estimation (Lotus Depth).json index 12f10ba5b..8aa338d0d 100644 --- a/blueprints/Image to Depth Map (Lotus).json +++ b/blueprints/Image Depth Estimation (Lotus Depth).json @@ -1,19 +1,18 @@ { - "id": "6af0a6c1-0161-4528-8685-65776e838d44", "revision": 0, - "last_node_id": 75, - "last_link_id": 245, + "last_node_id": 76, + "last_link_id": 0, "nodes": [ { - "id": 75, - "type": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf", + "id": 76, + "type": "96338968-1242-4f02-b6a1-d496af4bcffe", "pos": [ - 600, - 830 + 670, + 1280 ], "size": [ 400, - 110 + 201.3125 ], "flags": {}, "order": 0, @@ -59,47 +58,44 @@ "links": [] } ], + "title": "Image Depth Estimation (Lotus Depth)", "properties": { "proxyWidgets": [ [ - "-1", + "28", "sigma" ], [ - "-1", + "10", "unet_name" ], [ - "-1", + "14", "vae_name" ] ], "cnr_id": "comfy-core", "ver": "0.14.1" }, - "widgets_values": [ - 999.0000000000002, - "lotus-depth-d-v1-1.safetensors", - "vae-ft-mse-840000-ema-pruned.safetensors" - ] + "widgets_values": [] } ], "links": [], - "groups": [], + "version": 0.4, "definitions": { "subgraphs": [ { - "id": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf", + "id": "96338968-1242-4f02-b6a1-d496af4bcffe", 
"version": 1, "state": { "lastGroupId": 1, - "lastNodeId": 75, + "lastNodeId": 76, "lastLinkId": 245, "lastRerouteId": 0 }, "revision": 0, "config": {}, - "name": "Image to Depth Map (Lotus)", + "name": "Image Depth Estimation (Lotus Depth)", "inputNode": { "id": -10, "bounding": [ @@ -191,12 +187,12 @@ "id": 10, "type": "UNETLoader", "pos": [ - 108.05555555555557, - -253.05555555555557 + 110, + -250 ], "size": [ - 254.93706597222226, - 82 + 260, + 90 ], "flags": {}, "order": 4, @@ -234,9 +230,9 @@ } ], "properties": { + "Node name for S&R": "UNETLoader", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "UNETLoader", "models": [ { "name": "lotus-depth-d-v1-1.safetensors", @@ -255,12 +251,12 @@ "id": 18, "type": "DisableNoise", "pos": [ - 607.0641494069639, - -268.33337840371513 + 610, + -270 ], "size": [ - 175, - 33.333333333333336 + 180, + 40 ], "flags": {}, "order": 0, @@ -278,26 +274,25 @@ } ], "properties": { + "Node name for S&R": "DisableNoise", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "DisableNoise", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { - "id": 23, + "id": 74, "type": "VAEEncode", "pos": [ 620, 160 ], "size": [ - 175, + 180, 50 ], "flags": {}, - "order": 10, + "order": 11, "mode": 0, "inputs": [ { @@ -325,12 +320,11 @@ } ], "properties": { + "Node name for S&R": "VAEEncode", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "VAEEncode", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 21, @@ -341,7 +335,7 @@ ], "size": [ 210, - 58 + 60 ], "flags": {}, "order": 1, @@ -369,9 +363,9 @@ } ], "properties": { + "Node name for S&R": "KSamplerSelect", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "KSamplerSelect", "widget_ue_connectable": {} }, "widgets_values": [ @@ -386,7 +380,7 @@ -170 ], "size": [ - 175, + 180, 50 ], "flags": {}, @@ -418,12 +412,11 @@ } ], "properties": { + "Node name for S&R": "BasicGuider", "cnr_id": "comfy-core", "ver": 
"0.3.34", - "Node name for S&R": "BasicGuider", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 16, @@ -433,8 +426,8 @@ -130 ], "size": [ - 295.99609375, - 271.65798611111114 + 300, + 280 ], "flags": {}, "order": 6, @@ -490,12 +483,11 @@ } ], "properties": { + "Node name for S&R": "SamplerCustomAdvanced", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "SamplerCustomAdvanced", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 28, @@ -506,10 +498,10 @@ ], "size": [ 210, - 58 + 60 ], "flags": {}, - "order": 11, + "order": 10, "mode": 0, "inputs": [ { @@ -540,9 +532,9 @@ } ], "properties": { + "Node name for S&R": "SetFirstSigma", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "SetFirstSigma", "widget_ue_connectable": {} }, "widgets_values": [ @@ -557,7 +549,7 @@ -120 ], "size": [ - 175, + 180, 50 ], "flags": {}, @@ -589,12 +581,11 @@ } ], "properties": { + "Node name for S&R": "VAEDecode", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "VAEDecode", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 22, @@ -604,8 +595,8 @@ -220 ], "size": [ - 175, - 33.333333333333336 + 180, + 40 ], "flags": {}, "order": 9, @@ -630,12 +621,11 @@ } ], "properties": { + "Node name for S&R": "ImageInvert", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "ImageInvert", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 14, @@ -645,8 +635,8 @@ -90 ], "size": [ - 254.93706597222226, - 58 + 260, + 60 ], "flags": {}, "order": 5, @@ -675,9 +665,9 @@ } ], "properties": { + "Node name for S&R": "VAELoader", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "VAELoader", "models": [ { "name": "vae-ft-mse-840000-ema-pruned.safetensors", @@ -692,15 +682,15 @@ ] }, { - "id": 68, + "id": 75, "type": "LotusConditioning", "pos": [ 400, -150 ], "size": [ - 175, - 33.333333333333336 + 180, + 40 ], "flags": {}, "order": 2, @@ -718,12 +708,11 @@ 
} ], "properties": { + "Node name for S&R": "LotusConditioning", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "LotusConditioning", "widget_ue_connectable": {} - }, - "widgets_values": [] + } }, { "id": 20, @@ -734,7 +723,7 @@ ], "size": [ 210, - 106 + 110 ], "flags": {}, "order": 8, @@ -786,9 +775,9 @@ } ], "properties": { + "Node name for S&R": "BasicScheduler", "cnr_id": "comfy-core", "ver": "0.3.34", - "Node name for S&R": "BasicScheduler", "widget_ue_connectable": {} }, "widgets_values": [ @@ -850,7 +839,7 @@ }, { "id": 201, - "origin_id": 23, + "origin_id": 74, "origin_slot": 0, "target_id": 16, "target_slot": 4, @@ -866,7 +855,7 @@ }, { "id": 238, - "origin_id": 68, + "origin_id": 75, "origin_slot": 0, "target_id": 19, "target_slot": 1, @@ -892,7 +881,7 @@ "id": 38, "origin_id": 14, "origin_slot": 0, - "target_id": 23, + "target_id": 74, "target_slot": 1, "type": "VAE" }, @@ -908,7 +897,7 @@ "id": 37, "origin_id": -10, "origin_slot": 0, - "target_id": 23, + "target_id": 74, "target_slot": 0, "type": "IMAGE" }, @@ -948,12 +937,11 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Image generation and editing/Depth to image", + "category": "Conditioning & Preprocessors/Depth", "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model." 
} ] }, - "config": {}, "extra": { "ds": { "scale": 1.3589709866044692, @@ -961,8 +949,6 @@ -138.53613935617864, -786.0629126022195 ] - }, - "workflowRendererVersion": "LG" - }, - "version": 0.4 + } + } } \ No newline at end of file diff --git a/blueprints/Image Depth Estimation (MoGe).json b/blueprints/Image Depth Estimation (MoGe).json new file mode 100644 index 000000000..e2d5d1298 --- /dev/null +++ b/blueprints/Image Depth Estimation (MoGe).json @@ -0,0 +1,1154 @@ +{ + "revision": 0, + "last_node_id": 49, + "last_link_id": 0, + "nodes": [ + { + "id": 49, + "type": "ca1fac5f-abe5-4729-b7fe-2299f6630a65", + "pos": [ + -3970, + 5000 + ], + "size": [ + 430, + 330 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "source_image", + "name": "source_image", + "type": "IMAGE", + "link": null + }, + { + "localized_name": "inference_resolution", + "name": "inference_resolution", + "type": "INT", + "widget": { + "name": "inference_resolution" + }, + "link": null + }, + { + "localized_name": "inference_batch_size", + "name": "inference_batch_size", + "type": "INT", + "widget": { + "name": "inference_batch_size" + }, + "link": null + }, + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "COMBO", + "widget": { + "name": "moge_model" + }, + "link": null + }, + { + "label": "auto_resize_input", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "depth_colored", + "name": "depth_colored", + "type": "IMAGE", + "links": [] + }, + { + "localized_name": "depth", + "name": "depth", + "type": "IMAGE", + "links": [] + }, + { + "name": "MASK", + "type": "MASK", + "links": [] + } + ], + "title": "Image Depth Estimation (MoGe)", + "properties": { + "proxyWidgets": [ + [ + "13", + "resolution_level" + ], + [ + "13", + "batch_size" + ], + [ + "32", + "model_name" + ], + [ + "53", + "switch" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.21.1", + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [] + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "ca1fac5f-abe5-4729-b7fe-2299f6630a65", + "version": 1, + "state": { + "lastGroupId": 1, + "lastNodeId": 69, + "lastLinkId": 90, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image Depth Estimation (MoGe)", + "description": "Estimates monocular depth from an input image using MoGe, outputting both raw and colorized depth maps plus a mask.", + "inputNode": { + "id": -10, + "bounding": [ + -5130, + 5320, + 167.337890625, + 148 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -3090, + 4966, + 129, + 108 + ] + }, + "inputs": [ + { + "id": "cc8ce79d-ba20-4a25-a51c-c2afcd35e520", + "name": "source_image", + "type": "IMAGE", + "linkIds": [ + 48, + 55, + 56, + 82 + ], + "localized_name": "source_image", + "pos": [ + -4986.662109375, + 5344 + ] + }, + { + "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52", + "name": "inference_resolution", + "type": "INT", + "linkIds": [ + 73 + ], + "localized_name": "inference_resolution", + "pos": [ + -4986.662109375, + 5364 + ] + }, + { + "id": "616638fe-f603-4d10-bae9-fc87c134380f", + "name": "inference_batch_size", + "type": "INT", + "linkIds": [ + 74 + ], + "localized_name": "inference_batch_size", + "pos": [ + -4986.662109375, + 5384 + ] + }, + { + "id": "65694805-186e-4181-a721-df8b5af49d31", + "name": "moge_model", + "type": "COMBO", + "linkIds": [ + 79 + ], + "localized_name": "moge_model", + "pos": [ + -4986.662109375, + 5404 + ] + }, + { + "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674", + "name": "switch", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "auto_resize_input", + "pos": [ + -4986.662109375, + 5424 + ] + } + ], + "outputs": [ + { + "id": "59c37b52-074f-49fc-9731-483f899c12c4", + "name": "depth_colored", + 
"type": "IMAGE", + "linkIds": [ + 36 + ], + "localized_name": "depth_colored", + "pos": [ + -3066, + 4990 + ] + }, + { + "id": "f583e936-da5c-4630-9901-391fa605c1f8", + "name": "depth", + "type": "IMAGE", + "linkIds": [ + 40 + ], + "localized_name": "depth", + "pos": [ + -3066, + 5010 + ] + }, + { + "id": "6845b6a1-1980-454a-9451-314f24495c1d", + "name": "MASK", + "type": "MASK", + "linkIds": [ + 86 + ], + "pos": [ + -3066, + 5030 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 13, + "type": "MoGeInference", + "pos": [ + -3790, + 5180 + ], + "size": [ + 270, + 230 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "MOGE_MODEL", + "link": 58 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 81 + }, + { + "localized_name": "resolution_level", + "name": "resolution_level", + "type": "INT", + "widget": { + "name": "resolution_level" + }, + "link": 73 + }, + { + "localized_name": "fov_x_degrees", + "name": "fov_x_degrees", + "type": "FLOAT", + "widget": { + "name": "fov_x_degrees" + }, + "link": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": 74 + }, + { + "localized_name": "force_projection", + "name": "force_projection", + "type": "BOOLEAN", + "widget": { + "name": "force_projection" + }, + "link": null + }, + { + "localized_name": "apply_mask", + "name": "apply_mask", + "type": "BOOLEAN", + "widget": { + "name": "apply_mask" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "links": [ + 35, + 39, + 61 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeInference", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + 
"secondTabWidth": 65 + }, + "widgets_values": [ + 3, + 0, + 4, + true, + true + ] + }, + { + "id": 23, + "type": "MoGeRender", + "pos": [ + -3430, + 4870 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 35 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 36 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "depth_colored" + ] + }, + { + "id": 25, + "type": "MoGeRender", + "pos": [ + -3430, + 5030 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 39 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 40 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "depth" + ] + }, + { + "id": 32, + "type": "LoadMoGeModel", + "pos": [ + -4180, + 4880 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": 
"COMBO", + "widget": { + "name": "model_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MOGE_MODEL", + "name": "MOGE_MODEL", + "type": "MOGE_MODEL", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "LoadMoGeModel", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "moge_2_vitl_normal_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors", + "directory": "geometry_estimation" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "moge_2_vitl_normal_fp16.safetensors" + ] + }, + { + "id": 36, + "type": "ComfyMathExpression", + "pos": [ + -4720, + 4910 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 49 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": null + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [ + 53 + ] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "a > 2048" + ] + }, + { + "id": 37, + 
"type": "GetImageSize", + "pos": [ + -4980, + 4910 + ], + "size": [ + 230, + 160 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 49 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 40, + "type": "ResizeImagesByLongerEdge", + "pos": [ + -4650, + 5210 + ], + "size": [ + 310, + 110 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 55 + }, + { + "localized_name": "longer_edge", + "name": "longer_edge", + "type": "INT", + "widget": { + "name": "longer_edge" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 54 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImagesByLongerEdge", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 2048 + ] + }, + { + "id": 42, + "type": "ComfySwitchNode", + "pos": [ + -4180, + 5060 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 56 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + 
"link": 54 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 80 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 45, + "type": "MoGeRender", + "pos": [ + -3430, + 5200 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 61 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 85 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "mask" + ] + }, + { + "id": 53, + "type": "ComfySwitchNode", + "pos": [ + -4160, + 5340 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 82 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 80 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 81 + ] 
+ } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true + ] + }, + { + "id": 68, + "type": "ImageToMask", + "pos": [ + -3420, + 5360 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 85 + }, + { + "localized_name": "channel", + "name": "channel", + "type": "COMBO", + "widget": { + "name": "channel" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 86 + ] + } + ], + "properties": { + "Node name for S&R": "ImageToMask" + }, + "widgets_values": [ + "red" + ] + } + ], + "groups": [ + { + "id": 1, + "title": "auto_resize_if_width_gt_2048", + "bounding": [ + -5000, + 4840, + 690, + 280 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 58, + "origin_id": 32, + "origin_slot": 0, + "target_id": 13, + "target_slot": 0, + "type": "MOGE_MODEL" + }, + { + "id": 35, + "origin_id": 13, + "origin_slot": 0, + "target_id": 23, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 39, + "origin_id": 13, + "origin_slot": 0, + "target_id": 25, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 49, + "origin_id": 37, + "origin_slot": 0, + "target_id": 36, + "target_slot": 0, + "type": "INT" + }, + { + "id": 54, + "origin_id": 40, + "origin_slot": 0, + "target_id": 42, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 53, + "origin_id": 36, + "origin_slot": 2, + "target_id": 42, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 61, + "origin_id": 13, + "origin_slot": 0, + "target_id": 45, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 48, + "origin_id": -10, + 
"origin_slot": 0, + "target_id": 37, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 55, + "origin_id": -10, + "origin_slot": 0, + "target_id": 40, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 56, + "origin_id": -10, + "origin_slot": 0, + "target_id": 42, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 36, + "origin_id": 23, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 40, + "origin_id": 25, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 73, + "origin_id": -10, + "origin_slot": 1, + "target_id": 13, + "target_slot": 2, + "type": "INT" + }, + { + "id": 74, + "origin_id": -10, + "origin_slot": 2, + "target_id": 13, + "target_slot": 4, + "type": "INT" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 3, + "target_id": 32, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": 42, + "origin_slot": 0, + "target_id": 53, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 81, + "origin_id": 53, + "origin_slot": 0, + "target_id": 13, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 82, + "origin_id": -10, + "origin_slot": 0, + "target_id": 53, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 4, + "target_id": 53, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 85, + "origin_id": 45, + "origin_slot": 0, + "target_id": 68, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 86, + "origin_id": 68, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "MASK" + } + ], + "extra": {}, + "category": "Conditioning & Preprocessors/Depth" + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Image Face Detection (Mediapipe).json b/blueprints/Image Face Detection (Mediapipe).json new file mode 100644 index 000000000..e2548d485 --- /dev/null +++ b/blueprints/Image Face Detection (Mediapipe).json @@ -0,0 +1,779 @@ +{ + 
"revision": 0, + "last_node_id": 33, + "last_link_id": 0, + "nodes": [ + { + "id": 33, + "type": "6062babb-b649-4a71-be9e-20ebce567744", + "pos": [ + -450, + 4240 + ], + "size": [ + 420, + 400 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": null + }, + { + "name": "face_landmarker", + "type": "FACE_LANDMARKER", + "link": null + }, + { + "name": "detector_variant", + "type": "COMBO", + "widget": { + "name": "detector_variant" + }, + "link": null + }, + { + "name": "num_faces", + "type": "INT", + "widget": { + "name": "num_faces" + }, + "link": null + }, + { + "label": "custom_face_oval", + "name": "regions.face_oval", + "type": "BOOLEAN", + "widget": { + "name": "regions.face_oval" + }, + "link": null + }, + { + "label": "custom_lips", + "name": "regions.lips", + "type": "BOOLEAN", + "widget": { + "name": "regions.lips" + }, + "link": null + }, + { + "label": "custom_left_eye", + "name": "regions.left_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.left_eye" + }, + "link": null + }, + { + "label": "custom_right_eye", + "name": "regions.right_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.right_eye" + }, + "link": null + }, + { + "label": "custom_irises", + "name": "regions.irises", + "type": "BOOLEAN", + "widget": { + "name": "regions.irises" + }, + "link": null + }, + { + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "links": [] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + }, + { + "label": "mask", + "name": "MASK_1", + "type": "MASK", + "links": [] + } + ], + "title": "Image Face Detection (Mediapipe)", + "properties": { + "proxyWidgets": [ + [ + "11", + "detector_variant" + ], + [ + "11", + "num_faces" + ], + [ + "20", 
+ "regions.face_oval" + ], + [ + "20", + "regions.lips" + ], + [ + "20", + "regions.left_eye" + ], + [ + "20", + "regions.right_eye" + ], + [ + "20", + "regions.irises" + ], + [ + "2", + "model_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.22.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [] + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "6062babb-b649-4a71-be9e-20ebce567744", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 158, + "lastLinkId": 140, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image Face Detection (Mediapipe)", + "description": "Detects facial landmarks from an image using MediaPipe, outputting landmark data, face bounding boxes, and an optional face-region mask.", + "inputNode": { + "id": -10, + "bounding": [ + -710, + 4300, + 148.880859375, + 248 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 140, + 4480, + 137.677734375, + 108 + ] + }, + "inputs": [ + { + "id": "705dc1ae-6dc9-4155-92df-52f816ad451e", + "name": "image", + "type": "IMAGE", + "linkIds": [ + 60 + ], + "localized_name": "image", + "pos": [ + -585.119140625, + 4324 + ] + }, + { + "id": "d6277190-732c-4604-b7cd-d3a9588bf761", + "name": "face_landmarker", + "type": "FACE_LANDMARKER", + "linkIds": [ + 74 + ], + "pos": [ + -585.119140625, + 4344 + ] + }, + { + "id": "ac473a08-6a86-42a7-b460-e70c6c5e1e2b", + "name": "detector_variant", + "type": "COMBO", + "linkIds": [ + 75 + ], + "pos": [ + -585.119140625, + 4364 + ] + }, + { + "id": "1bec2252-ca2d-496e-8a33-33a61d21f897", + "name": "num_faces", + "type": "INT", + "linkIds": [ + 76 + ], + "pos": [ + -585.119140625, + 4384 + ] + }, + { + "id": "17994fa2-0ea0-4c9b-a70a-19789c459c80", + "name": "regions.face_oval", + "type": "BOOLEAN", + "linkIds": [ + 77 + ], + "label": "custom_face_oval", + 
"pos": [ + -585.119140625, + 4404 + ] + }, + { + "id": "1c6c5893-2aee-4c37-b702-15ef2e20d863", + "name": "regions.lips", + "type": "BOOLEAN", + "linkIds": [ + 78 + ], + "label": "custom_lips", + "pos": [ + -585.119140625, + 4424 + ] + }, + { + "id": "f353fcea-4b6f-42a1-8fdd-32b3aa1e1f09", + "name": "regions.left_eye", + "type": "BOOLEAN", + "linkIds": [ + 79 + ], + "label": "custom_left_eye", + "pos": [ + -585.119140625, + 4444 + ] + }, + { + "id": "1387e121-c1fb-4522-8f0d-43459e11dd86", + "name": "regions.right_eye", + "type": "BOOLEAN", + "linkIds": [ + 80 + ], + "label": "custom_right_eye", + "pos": [ + -585.119140625, + 4464 + ] + }, + { + "id": "14acb0a0-d1f4-48f3-ba31-811b26236ef9", + "name": "regions.irises", + "type": "BOOLEAN", + "linkIds": [ + 81 + ], + "label": "custom_irises", + "pos": [ + -585.119140625, + 4484 + ] + }, + { + "id": "25a82859-87de-42c8-8431-09948665546e", + "name": "model_name", + "type": "COMBO", + "linkIds": [ + 86 + ], + "pos": [ + -585.119140625, + 4504 + ] + } + ], + "outputs": [ + { + "id": "d2ba3f92-e8b1-49c3-9590-cfad56c54cf4", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "linkIds": [ + 44 + ], + "localized_name": "face_landmarks", + "pos": [ + 164, + 4504 + ] + }, + { + "id": "4f356bb0-d4c4-4f93-b4cf-0845a65c4e6d", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 25 + ], + "localized_name": "bboxes", + "pos": [ + 164, + 4524 + ] + }, + { + "id": "f6309e1d-6397-4363-b38f-778a122abc51", + "name": "MASK_1", + "type": "MASK", + "linkIds": [ + 83 + ], + "label": "mask", + "pos": [ + 164, + 4544 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 11, + "type": "MediaPipeFaceLandmarker", + "pos": [ + -280, + 4280 + ], + "size": [ + 350, + 220 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "face_detection_model", + "name": "face_detection_model", + "type": "FACE_DETECTION_MODEL", + "link": 66 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + 
"link": 60 + }, + { + "localized_name": "detector_variant", + "name": "detector_variant", + "type": "COMBO", + "widget": { + "name": "detector_variant" + }, + "link": 75 + }, + { + "localized_name": "num_faces", + "name": "num_faces", + "type": "INT", + "widget": { + "name": "num_faces" + }, + "link": 76 + }, + { + "localized_name": "min_confidence", + "name": "min_confidence", + "type": "FLOAT", + "widget": { + "name": "min_confidence" + }, + "link": null + }, + { + "localized_name": "missing_frame_fallback", + "name": "missing_frame_fallback", + "type": "COMBO", + "widget": { + "name": "missing_frame_fallback" + }, + "link": null + }, + { + "name": "face_landmarker", + "type": "FACE_LANDMARKER", + "link": 74 + } + ], + "outputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "links": [ + 44, + 46 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 25 + ] + } + ], + "properties": { + "Node name for S&R": "MediaPipeFaceLandmarker", + "cnr_id": "comfy-core", + "ver": "0.22.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "full", + 0, + 0.5, + "empty" + ] + }, + { + "id": 2, + "type": "LoadMediaPipeFaceLandmarker", + "pos": [ + -290, + 4060 + ], + "size": [ + 350, + 140 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 86 + } + ], + "outputs": [ + { + "localized_name": "FACE_DETECTION_MODEL", + "name": "FACE_DETECTION_MODEL", + "type": "FACE_DETECTION_MODEL", + "links": [ + 66 + ] + } + ], + "properties": { + "Node name for S&R": "LoadMediaPipeFaceLandmarker", + "cnr_id": "comfy-core", + "ver": "0.22.0", + "models": [ + { + "name": "mediapipe_face_fp32.safetensors", + 
"url": "https://huggingface.co/Comfy-Org/mediapipe/resolve/main/detection/mediapipe_face_fp32.safetensors", + "directory": "detection" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "mediapipe_face_fp32.safetensors" + ] + }, + { + "id": 20, + "type": "MediaPipeFaceMask", + "pos": [ + -290, + 4560 + ], + "size": [ + 360, + 180 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "link": 46 + }, + { + "localized_name": "regions", + "name": "regions", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "regions" + }, + "link": null + }, + { + "localized_name": "regions.face_oval", + "name": "regions.face_oval", + "type": "BOOLEAN", + "widget": { + "name": "regions.face_oval" + }, + "link": 77 + }, + { + "localized_name": "regions.lips", + "name": "regions.lips", + "type": "BOOLEAN", + "widget": { + "name": "regions.lips" + }, + "link": 78 + }, + { + "localized_name": "regions.left_eye", + "name": "regions.left_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.left_eye" + }, + "link": 79 + }, + { + "localized_name": "regions.right_eye", + "name": "regions.right_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.right_eye" + }, + "link": 80 + }, + { + "localized_name": "regions.irises", + "name": "regions.irises", + "type": "BOOLEAN", + "widget": { + "name": "regions.irises" + }, + "link": 81 + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 83 + ] + } + ], + "properties": { + "Node name for S&R": "MediaPipeFaceMask", + "cnr_id": "comfy-core", + "ver": "0.22.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + 
}, + "widgets_values": [ + "custom", + true, + false, + false, + false, + false + ] + } + ], + "groups": [], + "links": [ + { + "id": 66, + "origin_id": 2, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "FACE_DETECTION_MODEL" + }, + { + "id": 46, + "origin_id": 11, + "origin_slot": 0, + "target_id": 20, + "target_slot": 0, + "type": "FACE_LANDMARKS" + }, + { + "id": 60, + "origin_id": -10, + "origin_slot": 0, + "target_id": 11, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 44, + "origin_id": 11, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "FACE_LANDMARKS" + }, + { + "id": 25, + "origin_id": 11, + "origin_slot": 1, + "target_id": -20, + "target_slot": 1, + "type": "BOUNDING_BOX" + }, + { + "id": 74, + "origin_id": -10, + "origin_slot": 1, + "target_id": 11, + "target_slot": 6, + "type": "FACE_LANDMARKER" + }, + { + "id": 75, + "origin_id": -10, + "origin_slot": 2, + "target_id": 11, + "target_slot": 2, + "type": "COMBO" + }, + { + "id": 76, + "origin_id": -10, + "origin_slot": 3, + "target_id": 11, + "target_slot": 3, + "type": "INT" + }, + { + "id": 77, + "origin_id": -10, + "origin_slot": 4, + "target_id": 20, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 78, + "origin_id": -10, + "origin_slot": 5, + "target_id": 20, + "target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 6, + "target_id": 20, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 80, + "origin_id": -10, + "origin_slot": 7, + "target_id": 20, + "target_slot": 5, + "type": "BOOLEAN" + }, + { + "id": 81, + "origin_id": -10, + "origin_slot": 8, + "target_id": 20, + "target_slot": 6, + "type": "BOOLEAN" + }, + { + "id": 83, + "origin_id": 20, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "MASK" + }, + { + "id": 86, + "origin_id": -10, + "origin_slot": 9, + "target_id": 2, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": {}, + "category": "Conditioning & 
Preprocessors/Face Detection" + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Image Segmentation (SAM3).json b/blueprints/Image Segmentation (SAM3).json index b405bf623..a2ef40ac8 100644 --- a/blueprints/Image Segmentation (SAM3).json +++ b/blueprints/Image Segmentation (SAM3).json @@ -703,7 +703,7 @@ } ], "extra": {}, - "category": "Image Tools/Image Segmentation", + "category": "Conditioning & Preprocessors/Segmentation & Mask", "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes." } ] diff --git a/blueprints/Image Upscale(Z-image-Turbo).json b/blueprints/Image Upscale(Z-image-Turbo).json index bd803a0b1..25d2838a8 100644 --- a/blueprints/Image Upscale(Z-image-Turbo).json +++ b/blueprints/Image Upscale(Z-image-Turbo).json @@ -1302,7 +1302,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Image generation and editing/Enhance", + "category": "Image generation and editing/Upscale", "description": "Upscales images to higher resolution using Z-Image-Turbo." 
} ] @@ -1312,4 +1312,4 @@ "workflowRendererVersion": "LG" }, "version": 0.4 -} +} \ No newline at end of file diff --git a/blueprints/Image to Pose Map (SDPose Multi-Person).json b/blueprints/Image to Pose Map (SDPose Multi-Person).json new file mode 100644 index 000000000..38df20775 --- /dev/null +++ b/blueprints/Image to Pose Map (SDPose Multi-Person).json @@ -0,0 +1,1206 @@ +{ + "revision": 0, + "last_node_id": 675, + "last_link_id": 0, + "nodes": [ + { + "id": 675, + "type": "01b6a731-fb78-4070-9a38-c87146da9604", + "pos": [ + -2480, + 3400 + ], + "size": [ + 370, + 590.625 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": null + }, + { + "label": "resize_target_longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": null + }, + { + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": null + }, + { + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": null + }, + { + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": null + }, + { + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": null + }, + { + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": null + }, + { + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": null + }, + { + "name": "face_point_size", + "type": "INT", + "widget": { + "name": "face_point_size" + }, + "link": null + }, + { + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": null + }, + { + "label": "detect_threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": null + }, + { + "label": "detect_class", + "name": 
"class_name", + "type": "COMBO", + "widget": { + "name": "class_name" + }, + "link": null + }, + { + "name": "max_detections", + "type": "INT", + "widget": { + "name": "max_detections" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [] + }, + { + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": null + }, + { + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "674", + "resize_type.longer_size" + ], + [ + "674", + "scale_method" + ], + [ + "672", + "draw_body" + ], + [ + "672", + "draw_hands" + ], + [ + "672", + "draw_face" + ], + [ + "672", + "draw_feet" + ], + [ + "672", + "stick_width" + ], + [ + "672", + "face_point_size" + ], + [ + "672", + "score_threshold" + ], + [ + "678", + "threshold" + ], + [ + "678", + "class_name" + ], + [ + "678", + "max_detections" + ], + [ + "673", + "ckpt_name" + ], + [ + "677", + "unet_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.15.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Image to Pose Map (SDPose Multi-Person)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "01b6a731-fb78-4070-9a38-c87146da9604", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 691, + "lastLinkId": 1740, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image to Pose Map (SDPose Multi-Person)", + "inputNode": { + "id": -10, + "bounding": [ + -3350, + 3410, + 190.8984375, + 348 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1840, + 3570, + 128, 
+ 108 + ] + }, + "inputs": [ + { + "id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0", + "name": "input", + "type": "IMAGE,MASK", + "linkIds": [ + 1700 + ], + "localized_name": "input", + "pos": [ + -3183.1015625, + 3434 + ] + }, + { + "id": "088eefc1-cd8a-4573-993f-9e4da008a12d", + "name": "resize_type.longer_size", + "type": "INT", + "linkIds": [ + 1704 + ], + "label": "resize_target_longer_size", + "pos": [ + -3183.1015625, + 3454 + ] + }, + { + "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e", + "name": "scale_method", + "type": "COMBO", + "linkIds": [ + 1705 + ], + "pos": [ + -3183.1015625, + 3474 + ] + }, + { + "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0", + "name": "draw_body", + "type": "BOOLEAN", + "linkIds": [ + 1706 + ], + "pos": [ + -3183.1015625, + 3494 + ] + }, + { + "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c", + "name": "draw_hands", + "type": "BOOLEAN", + "linkIds": [ + 1707 + ], + "pos": [ + -3183.1015625, + 3514 + ] + }, + { + "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e", + "name": "draw_face", + "type": "BOOLEAN", + "linkIds": [ + 1708 + ], + "pos": [ + -3183.1015625, + 3534 + ] + }, + { + "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f", + "name": "draw_feet", + "type": "BOOLEAN", + "linkIds": [ + 1709 + ], + "pos": [ + -3183.1015625, + 3554 + ] + }, + { + "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb", + "name": "stick_width", + "type": "INT", + "linkIds": [ + 1710 + ], + "pos": [ + -3183.1015625, + 3574 + ] + }, + { + "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133", + "name": "face_point_size", + "type": "INT", + "linkIds": [ + 1711 + ], + "pos": [ + -3183.1015625, + 3594 + ] + }, + { + "id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3", + "name": "score_threshold", + "type": "FLOAT", + "linkIds": [ + 1712 + ], + "pos": [ + -3183.1015625, + 3614 + ] + }, + { + "id": "4eb3e4ea-7a36-4511-8483-0d12aadd32f7", + "name": "threshold", + "type": "FLOAT", + "linkIds": [ + 1718 + ], + "label": "detect_threshold", + "pos": [ + -3183.1015625, + 3634 + ] + }, + { + "id": 
"c76a7a05-81e6-4b17-a9e0-85f47a5844f2", + "name": "class_name", + "type": "COMBO", + "linkIds": [ + 1719 + ], + "label": "detect_class", + "pos": [ + -3183.1015625, + 3654 + ] + }, + { + "id": "4417e988-6e80-4236-be31-4c179037f5a2", + "name": "max_detections", + "type": "INT", + "linkIds": [ + 1720 + ], + "pos": [ + -3183.1015625, + 3674 + ] + }, + { + "id": "7d7c4a0b-0d1b-4c98-942b-f90548d2a492", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 1721 + ], + "pos": [ + -3183.1015625, + 3694 + ] + }, + { + "id": "4d75122c-2c14-452a-98fe-d1545d3e012a", + "name": "unet_name", + "type": "COMBO", + "linkIds": [ + 1722 + ], + "pos": [ + -3183.1015625, + 3714 + ] + } + ], + "outputs": [ + { + "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48", + "name": "IMAGE", + "type": "IMAGE", + "linkIds": [ + 1701 + ], + "localized_name": "IMAGE", + "pos": [ + -1816, + 3594 + ] + }, + { + "id": "4b64118e-3cef-4eeb-9dad-4cd09cfd63a2", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "linkIds": [ + 1725 + ], + "pos": [ + -1816, + 3614 + ] + }, + { + "id": "a27f7e34-dcbc-4fb0-a4e1-2c5fc423ca5f", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 1726 + ], + "pos": [ + -1816, + 3634 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 671, + "type": "SDPoseKeypointExtractor", + "pos": [ + -2550, + 3080 + ], + "size": [ + 270, + 180 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 1696 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 1697 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1698 + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 1717 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "keypoints", + "name": 
"keypoints", + "type": "POSE_KEYPOINT", + "links": [ + 1699, + 1725 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseKeypointExtractor", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 16 + ] + }, + { + "id": 674, + "type": "ResizeImageMaskNode", + "pos": [ + -2970, + 3580 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 1700 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "resize_type" + }, + "link": null + }, + { + "localized_name": "resize_type.longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": 1704 + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": 1705 + } + ], + "outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "*", + "links": [ + 1698, + 1716 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImageMaskNode", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "scale longer dimension", + 1024, + "lanczos" + ] + }, + { + "id": 672, + "type": "SDPoseDrawKeypoints", + "pos": [ + -2540, + 3590 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "link": 1699 + }, + { + "localized_name": "draw_body", + "name": "draw_body", + "type": 
"BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": 1706 + }, + { + "localized_name": "draw_hands", + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": 1707 + }, + { + "localized_name": "draw_face", + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": 1708 + }, + { + "localized_name": "draw_feet", + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": 1709 + }, + { + "localized_name": "stick_width", + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": 1710 + }, + { + "localized_name": "face_point_size", + "name": "face_point_size", + "type": "INT", + "widget": { + "name": "face_point_size" + }, + "link": 1711 + }, + { + "localized_name": "score_threshold", + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": 1712 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 1701 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseDrawKeypoints", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true, + true, + true, + true, + 4, + 2, + 0.5 + ] + }, + { + "id": 673, + "type": "CheckpointLoaderSimple", + "pos": [ + -3040, + 3080 + ], + "size": [ + 390, + 190 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 1721 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 1696 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": 
"VAE", + "type": "VAE", + "links": [ + 1697 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "models": [ + { + "name": "sdpose_wholebody_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "sdpose_wholebody_fp16.safetensors" + ] + }, + { + "id": 677, + "type": "UNETLoader", + "pos": [ + -3030, + 3330 + ], + "size": [ + 370, + 140 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 1722 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 1715 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.14.1", + "models": [ + { + "name": "rt_detr_v4-x-hgnet_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/diffusion_models/rt_detr_v4-x-hgnet_fp16.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "rt_detr_v4-x-hgnet_fp16.safetensors", + "default" + ] + }, + { + "id": 678, + "type": "RTDETR_detect", + "pos": [ + -2540, + 3320 + ], + "size": [ + 270, + 200 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "label": "model", + "localized_name": "model", + 
"name": "model", + "type": "MODEL", + "link": 1715 + }, + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1716 + }, + { + "localized_name": "threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": 1718 + }, + { + "localized_name": "class_name", + "name": "class_name", + "type": "COMBO", + "widget": { + "name": "class_name" + }, + "link": 1719 + }, + { + "localized_name": "max_detections", + "name": "max_detections", + "type": "INT", + "widget": { + "name": "max_detections" + }, + "link": 1720 + } + ], + "outputs": [ + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 1717, + 1726 + ] + } + ], + "properties": { + "Node name for S&R": "RTDETR_detect", + "cnr_id": "comfy-core", + "ver": "0.15.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0.5, + "person", + 1 + ] + } + ], + "groups": [], + "links": [ + { + "id": 1696, + "origin_id": 673, + "origin_slot": 0, + "target_id": 671, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1697, + "origin_id": 673, + "origin_slot": 2, + "target_id": 671, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 1698, + "origin_id": 674, + "origin_slot": 0, + "target_id": 671, + "target_slot": 2, + "type": "IMAGE" + }, + { + "id": 1699, + "origin_id": 671, + "origin_slot": 0, + "target_id": 672, + "target_slot": 0, + "type": "POSE_KEYPOINT" + }, + { + "id": 1700, + "origin_id": -10, + "origin_slot": 0, + "target_id": 674, + "target_slot": 0, + "type": "IMAGE,MASK" + }, + { + "id": 1701, + "origin_id": 672, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 1704, + "origin_id": -10, + "origin_slot": 1, + "target_id": 674, + "target_slot": 2, + "type": "INT" + }, + { + "id": 1705, + "origin_id": -10, + 
"origin_slot": 2, + "target_id": 674, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1706, + "origin_id": -10, + "origin_slot": 3, + "target_id": 672, + "target_slot": 1, + "type": "BOOLEAN" + }, + { + "id": 1707, + "origin_id": -10, + "origin_slot": 4, + "target_id": 672, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 1708, + "origin_id": -10, + "origin_slot": 5, + "target_id": 672, + "target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 1709, + "origin_id": -10, + "origin_slot": 6, + "target_id": 672, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 1710, + "origin_id": -10, + "origin_slot": 7, + "target_id": 672, + "target_slot": 5, + "type": "INT" + }, + { + "id": 1711, + "origin_id": -10, + "origin_slot": 8, + "target_id": 672, + "target_slot": 6, + "type": "INT" + }, + { + "id": 1712, + "origin_id": -10, + "origin_slot": 9, + "target_id": 672, + "target_slot": 7, + "type": "FLOAT" + }, + { + "id": 1715, + "origin_id": 677, + "origin_slot": 0, + "target_id": 678, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1716, + "origin_id": 674, + "origin_slot": 0, + "target_id": 678, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 1717, + "origin_id": 678, + "origin_slot": 0, + "target_id": 671, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 1718, + "origin_id": -10, + "origin_slot": 10, + "target_id": 678, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 1719, + "origin_id": -10, + "origin_slot": 11, + "target_id": 678, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1720, + "origin_id": -10, + "origin_slot": 12, + "target_id": 678, + "target_slot": 4, + "type": "INT" + }, + { + "id": 1721, + "origin_id": -10, + "origin_slot": 13, + "target_id": 673, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1722, + "origin_id": -10, + "origin_slot": 14, + "target_id": 677, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1725, + "origin_id": 671, + "origin_slot": 0, + "target_id": -20, + 
"target_slot": 1, + "type": "POSE_KEYPOINT" + }, + { + "id": 1726, + "origin_id": 678, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "BOUNDING_BOX" + } + ], + "extra": { + "workflowRendererVersion": "LG" + }, + "category": "Conditioning & Preprocessors/Pose", + "description": "Detects multiple people in an image and outputs per-person pose keypoints, skeleton renders, and bounding boxes using SDPose." + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Image to Pose Map (SDPose-OOD).json b/blueprints/Image to Pose Map (SDPose-OOD).json new file mode 100644 index 000000000..76ee9ff4e --- /dev/null +++ b/blueprints/Image to Pose Map (SDPose-OOD).json @@ -0,0 +1,888 @@ +{ + "revision": 0, + "last_node_id": 675, + "last_link_id": 0, + "nodes": [ + { + "id": 675, + "type": "01b6a731-fb78-4070-9a38-c87146da9604", + "pos": [ + -2480, + 3400 + ], + "size": [ + 360, + 433.3125 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": null + }, + { + "label": "resize_target_longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": null + }, + { + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": null + }, + { + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": null + }, + { + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": null + }, + { + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": null + }, + { + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": null + }, + { + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": null + }, + { + "name": "face_point_size", + "type": "INT", + "widget": { + "name": 
"face_point_size" + }, + "link": null + }, + { + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [] + }, + { + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": null + } + ], + "properties": { + "proxyWidgets": [ + [ + "674", + "resize_type.longer_size" + ], + [ + "674", + "scale_method" + ], + [ + "672", + "draw_body" + ], + [ + "672", + "draw_hands" + ], + [ + "672", + "draw_face" + ], + [ + "672", + "draw_feet" + ], + [ + "672", + "stick_width" + ], + [ + "672", + "face_point_size" + ], + [ + "672", + "score_threshold" + ], + [ + "673", + "ckpt_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.15.1", + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [], + "title": "Image to Pose Map (SDPose-OOD)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "01b6a731-fb78-4070-9a38-c87146da9604", + "version": 1, + "state": { + "lastGroupId": 0, + "lastNodeId": 676, + "lastLinkId": 1715, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image to Pose Map (SDPose-OOD)", + "inputNode": { + "id": -10, + "bounding": [ + -3290, + 3590, + 190.8984375, + 288 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1756.2451602089645, + 3366, + 128, + 88 + ] + }, + "inputs": [ + { + "id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0", + "name": "input", + "type": "IMAGE,MASK", + "linkIds": [ + 1700 + ], + "localized_name": "input", + "pos": [ + -3123.1015625, + 3614 + ] + }, + { + "id": "088eefc1-cd8a-4573-993f-9e4da008a12d", + "name": "resize_type.longer_size", + "type": "INT", + 
"linkIds": [ + 1704 + ], + "label": "resize_target_longer_size", + "pos": [ + -3123.1015625, + 3634 + ] + }, + { + "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e", + "name": "scale_method", + "type": "COMBO", + "linkIds": [ + 1705 + ], + "pos": [ + -3123.1015625, + 3654 + ] + }, + { + "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0", + "name": "draw_body", + "type": "BOOLEAN", + "linkIds": [ + 1706 + ], + "pos": [ + -3123.1015625, + 3674 + ] + }, + { + "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c", + "name": "draw_hands", + "type": "BOOLEAN", + "linkIds": [ + 1707 + ], + "pos": [ + -3123.1015625, + 3694 + ] + }, + { + "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e", + "name": "draw_face", + "type": "BOOLEAN", + "linkIds": [ + 1708 + ], + "pos": [ + -3123.1015625, + 3714 + ] + }, + { + "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f", + "name": "draw_feet", + "type": "BOOLEAN", + "linkIds": [ + 1709 + ], + "pos": [ + -3123.1015625, + 3734 + ] + }, + { + "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb", + "name": "stick_width", + "type": "INT", + "linkIds": [ + 1710 + ], + "pos": [ + -3123.1015625, + 3754 + ] + }, + { + "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133", + "name": "face_point_size", + "type": "INT", + "linkIds": [ + 1711 + ], + "pos": [ + -3123.1015625, + 3774 + ] + }, + { + "id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3", + "name": "score_threshold", + "type": "FLOAT", + "linkIds": [ + 1712 + ], + "pos": [ + -3123.1015625, + 3794 + ] + }, + { + "id": "ae46de61-2cc6-483e-8ee9-87e4144a2ffa", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 1713 + ], + "pos": [ + -3123.1015625, + 3814 + ] + }, + { + "id": "41bec0c6-dffa-4c78-9289-ee678715ae54", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 1714 + ], + "pos": [ + -3123.1015625, + 3834 + ] + } + ], + "outputs": [ + { + "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48", + "name": "IMAGE", + "type": "IMAGE", + "linkIds": [ + 1701 + ], + "localized_name": "IMAGE", + "pos": [ + -1732.2451602089645, + 3390 + ] + }, 
+ { + "id": "29a6584e-4685-4986-8ffd-e6d8539953fd", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "linkIds": [ + 1715 + ], + "pos": [ + -1732.2451602089645, + 3410 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 671, + "type": "SDPoseKeypointExtractor", + "pos": [ + -2470, + 3250 + ], + "size": [ + 270, + 180 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 1696 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 1697 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1698 + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 1714 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": [ + 1699, + 1715 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseKeypointExtractor", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 16 + ] + }, + { + "id": 674, + "type": "ResizeImageMaskNode", + "pos": [ + -2960, + 3490 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 1700 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "resize_type" + }, + "link": null + }, + { + "localized_name": "resize_type.longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": 1704 + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": 
"COMBO", + "widget": { + "name": "scale_method" + }, + "link": 1705 + } + ], + "outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "*", + "links": [ + 1698 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImageMaskNode", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "scale longer dimension", + 1024, + "area" + ] + }, + { + "id": 672, + "type": "SDPoseDrawKeypoints", + "pos": [ + -2120, + 3260 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "link": 1699 + }, + { + "localized_name": "draw_body", + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": 1706 + }, + { + "localized_name": "draw_hands", + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": 1707 + }, + { + "localized_name": "draw_face", + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": 1708 + }, + { + "localized_name": "draw_feet", + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": 1709 + }, + { + "localized_name": "stick_width", + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": 1710 + }, + { + "localized_name": "face_point_size", + "name": "face_point_size", + "type": "INT", + "widget": { + "name": "face_point_size" + }, + "link": 1711 + }, + { + "localized_name": "score_threshold", + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": 1712 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 1701 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseDrawKeypoints", + "cnr_id": 
"comfy-core", + "ver": "0.15.0", + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + true, + true, + true, + true, + 4, + 2, + 0.5 + ] + }, + { + "id": 673, + "type": "CheckpointLoaderSimple", + "pos": [ + -2960, + 3250 + ], + "size": [ + 390, + 190 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 1713 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 1696 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 1697 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "models": [ + { + "name": "sdpose_wholebody_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors", + "directory": "checkpoints" + } + ], + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "sdpose_wholebody_fp16.safetensors" + ] + } + ], + "groups": [], + "links": [ + { + "id": 1696, + "origin_id": 673, + "origin_slot": 0, + "target_id": 671, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1697, + "origin_id": 673, + "origin_slot": 2, + "target_id": 671, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 1698, + "origin_id": 674, + "origin_slot": 0, + "target_id": 671, + "target_slot": 2, + "type": "IMAGE" + }, + { + "id": 1699, + "origin_id": 671, + "origin_slot": 0, + "target_id": 672, + "target_slot": 0, + "type": "POSE_KEYPOINT" + }, + { + "id": 1700, + "origin_id": -10, + "origin_slot": 0, + "target_id": 674, + "target_slot": 0, + "type": "IMAGE,MASK" + }, + { 
+ "id": 1701, + "origin_id": 672, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 1704, + "origin_id": -10, + "origin_slot": 1, + "target_id": 674, + "target_slot": 2, + "type": "INT" + }, + { + "id": 1705, + "origin_id": -10, + "origin_slot": 2, + "target_id": 674, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1706, + "origin_id": -10, + "origin_slot": 3, + "target_id": 672, + "target_slot": 1, + "type": "BOOLEAN" + }, + { + "id": 1707, + "origin_id": -10, + "origin_slot": 4, + "target_id": 672, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 1708, + "origin_id": -10, + "origin_slot": 5, + "target_id": 672, + "target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 1709, + "origin_id": -10, + "origin_slot": 6, + "target_id": 672, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 1710, + "origin_id": -10, + "origin_slot": 7, + "target_id": 672, + "target_slot": 5, + "type": "INT" + }, + { + "id": 1711, + "origin_id": -10, + "origin_slot": 8, + "target_id": 672, + "target_slot": 6, + "type": "INT" + }, + { + "id": 1712, + "origin_id": -10, + "origin_slot": 9, + "target_id": 672, + "target_slot": 7, + "type": "FLOAT" + }, + { + "id": 1713, + "origin_id": -10, + "origin_slot": 10, + "target_id": 673, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1714, + "origin_id": -10, + "origin_slot": 11, + "target_id": 671, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 1715, + "origin_id": 671, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "POSE_KEYPOINT" + } + ], + "extra": { + "workflowRendererVersion": "LG" + }, + "category": "Conditioning & Preprocessors/Pose", + "description": "Extracts human pose keypoints and stick-figure visuals from an image using SDPose-OOD, with optional bounding-box input per subject." 
+ } + ] + }, + "extra": { + "ue_links": [] + } +} \ No newline at end of file diff --git a/blueprints/Merge Videos.json b/blueprints/Merge Videos.json new file mode 100644 index 000000000..689e6ec16 --- /dev/null +++ b/blueprints/Merge Videos.json @@ -0,0 +1,1219 @@ +{ + "revision": 0, + "last_node_id": 26, + "last_link_id": 0, + "nodes": [ + { + "id": 26, + "type": "32e6dbcc-e2d7-45c0-a245-fc74b8271dfb", + "pos": [ + -980, + 480 + ], + "size": [ + 290, + 190 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "base_video", + "localized_name": "clip_to_resize", + "name": "clip_to_resize", + "type": "VIDEO", + "link": null + }, + { + "label": "second_video", + "localized_name": "base_video", + "name": "base_video", + "type": "VIDEO", + "link": null + }, + { + "label": "pad_second_video", + "localized_name": "pad_second_video", + "name": "pad_second_video", + "type": "BOOLEAN", + "widget": { + "name": "pad_second_video" + }, + "link": null + }, + { + "name": "interpolation", + "type": "COMBO", + "widget": { + "name": "interpolation" + }, + "link": null + }, + { + "name": "padding_color", + "type": "COMBO", + "widget": { + "name": "padding_color" + }, + "link": null + }, + { + "label": "drop_audio", + "localized_name": "drop_audio", + "name": "drop_audio", + "type": "BOOLEAN", + "widget": { + "name": "drop_audio" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "merged_video", + "name": "merged_video", + "type": "VIDEO", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "28", + "value" + ], + [ + "6", + "interpolation" + ], + [ + "6", + "padding_color" + ], + [ + "11", + "value" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Merge Videos" + } + ], + "links": [], + "version": 0.4, + "definitions": { 
+ "subgraphs": [ + { + "id": "32e6dbcc-e2d7-45c0-a245-fc74b8271dfb", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 34, + "lastLinkId": 75, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Merge Videos", + "inputNode": { + "id": -10, + "bounding": [ + -1990, + 700, + 152.5546875, + 168 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 1210, + 614, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "2fb09e41-c5fa-4654-b9d2-569b59626ec4", + "name": "clip_to_resize", + "type": "VIDEO", + "linkIds": [ + 50 + ], + "localized_name": "clip_to_resize", + "label": "base_video", + "pos": [ + -1861.4453125, + 724 + ] + }, + { + "id": "017f8d09-7900-4dc9-b95c-0cab31bcde7d", + "name": "base_video", + "type": "VIDEO", + "linkIds": [ + 51 + ], + "localized_name": "base_video", + "label": "second_video", + "pos": [ + -1861.4453125, + 744 + ] + }, + { + "id": "a39894ce-1785-4037-b39c-b40d2e470c43", + "name": "pad_second_video", + "type": "BOOLEAN", + "linkIds": [ + 59 + ], + "localized_name": "pad_second_video", + "label": "pad_second_video", + "pos": [ + -1861.4453125, + 764 + ] + }, + { + "id": "b4fb86cb-8d87-4193-8533-88a57df50e18", + "name": "interpolation", + "type": "COMBO", + "linkIds": [ + 60 + ], + "pos": [ + -1861.4453125, + 784 + ] + }, + { + "id": "2413a2e2-cfdc-4d1d-9e2e-81e7acdf35e3", + "name": "padding_color", + "type": "COMBO", + "linkIds": [ + 62 + ], + "pos": [ + -1861.4453125, + 804 + ] + }, + { + "id": "338b1e09-0efb-424f-949b-e730a0aa8527", + "name": "drop_audio", + "type": "BOOLEAN", + "linkIds": [ + 63 + ], + "localized_name": "drop_audio", + "label": "drop_audio", + "pos": [ + -1861.4453125, + 824 + ] + } + ], + "outputs": [ + { + "id": "be99efc6-7fb3-4059-93d0-136dc8cc8faf", + "name": "merged_video", + "type": "VIDEO", + "linkIds": [ + 16 + ], + "localized_name": "merged_video", + "pos": [ + 1234, + 638 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 11, + "type": "PrimitiveBoolean", + "pos": [ + -990, + 
1230 + ], + "size": [ + 270, + 80 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 63 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 14 + ] + } + ], + "properties": { + "Node name for S&R": "PrimitiveBoolean", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 10, + "type": "EmptyAudio", + "pos": [ + -990, + 1060 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": null + }, + { + "localized_name": "sample_rate", + "name": "sample_rate", + "type": "INT", + "widget": { + "name": "sample_rate" + }, + "link": null + }, + { + "localized_name": "channels", + "name": "channels", + "type": "INT", + "widget": { + "name": "channels" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [ + 22 + ] + } + ], + "properties": { + "Node name for S&R": "EmptyAudio", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 60, + 44100, + 2 + ] + }, + { + "id": 3, + "type": "ComfySwitchNode", + "pos": [ + -370, + 1010 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 21 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": 
"*", + "link": 22 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 14 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 12 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 6, + "type": "ResizeAndPadImage", + "pos": [ + -400, + 440 + ], + "size": [ + 270, + 210 + ], + "flags": {}, + "order": 4, + "mode": 0, + "showAdvanced": true, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 39 + }, + { + "localized_name": "target_width", + "name": "target_width", + "type": "INT", + "widget": { + "name": "target_width" + }, + "link": 4 + }, + { + "localized_name": "target_height", + "name": "target_height", + "type": "INT", + "widget": { + "name": "target_height" + }, + "link": 5 + }, + { + "localized_name": "padding_color", + "name": "padding_color", + "type": "COMBO", + "widget": { + "name": "padding_color" + }, + "link": 62 + }, + { + "localized_name": "interpolation", + "name": "interpolation", + "type": "COMBO", + "widget": { + "name": "interpolation" + }, + "link": 60 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 75 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeAndPadImage", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 512, + 512, + "white", + "lanczos" + ] + }, + { + "id": 8, + "type": "CreateVideo", + "pos": [ + 880, + 280 + ], + "size": 
[ + 270, + 110 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 19 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 12 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { + "name": "fps" + }, + "link": 15 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 16 + ] + } + ], + "properties": { + "Node name for S&R": "CreateVideo", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 30 + ] + }, + { + "id": 9, + "type": "AudioMerge", + "pos": [ + -990, + 890 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "audio1", + "name": "audio1", + "type": "AUDIO", + "link": 9 + }, + { + "localized_name": "audio2", + "name": "audio2", + "type": "AUDIO", + "link": 10 + }, + { + "localized_name": "merge_method", + "name": "merge_method", + "type": "COMBO", + "widget": { + "name": "merge_method" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [ + 21 + ] + } + ], + "properties": { + "Node name for S&R": "AudioMerge", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "add" + ] + }, + { + "id": 2, + "type": "GetVideoComponents", + "pos": [ + -1590, + 460 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 51 + } + ], + 
"outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 39, + 54 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 9 + ] + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 27, + "type": "ComfySwitchNode", + "pos": [ + 60, + 70 + ], + "size": [ + 280, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 54 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 75 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 56 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 55 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 1, + "type": "GetVideoComponents", + "pos": [ + -1600, + 30 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 50 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 3, + 17 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 10 + ] + }, + { + "localized_name": "fps", 
+ "name": "fps", + "type": "FLOAT", + "links": [ + 15 + ] + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 7, + "type": "GetImageSize", + "pos": [ + -1000, + 480 + ], + "size": [ + 260, + 110 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 3 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 4 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": [ + 5 + ] + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 28, + "type": "PrimitiveBoolean", + "pos": [ + -1590, + 190 + ], + "size": [ + 270, + 80 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 59 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 56 + ] + } + ], + "properties": { + "Node name for S&R": "PrimitiveBoolean", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 13, + "type": "BatchImagesNode", + "pos": [ + 530, + 10 + ], + 
"size": [ + 230, + 120 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "label": "image0", + "localized_name": "images.image0", + "name": "images.image0", + "type": "IMAGE", + "link": 17 + }, + { + "label": "image1", + "localized_name": "images.image1", + "name": "images.image1", + "shape": 7, + "type": "IMAGE", + "link": 55 + }, + { + "label": "image2", + "localized_name": "images.image2", + "name": "images.image2", + "shape": 7, + "type": "IMAGE", + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 19 + ] + } + ], + "properties": { + "Node name for S&R": "BatchImagesNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + } + ], + "groups": [ + { + "id": 1, + "title": "Audio", + "bounding": [ + -1000, + 820, + 915, + 496 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 21, + "origin_id": 9, + "origin_slot": 0, + "target_id": 3, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 22, + "origin_id": 10, + "origin_slot": 0, + "target_id": 3, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 14, + "origin_id": 11, + "origin_slot": 0, + "target_id": 3, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 9, + "origin_id": 2, + "origin_slot": 1, + "target_id": 9, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 10, + "origin_id": 1, + "origin_slot": 1, + "target_id": 9, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 39, + "origin_id": 2, + "origin_slot": 0, + "target_id": 6, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 4, + "origin_id": 7, + "origin_slot": 0, + "target_id": 6, + "target_slot": 1, + "type": "INT" + }, + { + "id": 5, + "origin_id": 7, + "origin_slot": 1, + "target_id": 6, + "target_slot": 2, + "type": "INT" + }, + { + "id": 3, + 
"origin_id": 1, + "origin_slot": 0, + "target_id": 7, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 17, + "origin_id": 1, + "origin_slot": 0, + "target_id": 13, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 19, + "origin_id": 13, + "origin_slot": 0, + "target_id": 8, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 12, + "origin_id": 3, + "origin_slot": 0, + "target_id": 8, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 15, + "origin_id": 1, + "origin_slot": 2, + "target_id": 8, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 16, + "origin_id": 8, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 50, + "origin_id": -10, + "origin_slot": 0, + "target_id": 1, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 51, + "origin_id": -10, + "origin_slot": 1, + "target_id": 2, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 54, + "origin_id": 2, + "origin_slot": 0, + "target_id": 27, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 55, + "origin_id": 27, + "origin_slot": 0, + "target_id": 13, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 56, + "origin_id": 28, + "origin_slot": 0, + "target_id": 27, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 59, + "origin_id": -10, + "origin_slot": 2, + "target_id": 28, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 60, + "origin_id": -10, + "origin_slot": 3, + "target_id": 6, + "target_slot": 4, + "type": "COMBO" + }, + { + "id": 62, + "origin_id": -10, + "origin_slot": 4, + "target_id": 6, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 63, + "origin_id": -10, + "origin_slot": 5, + "target_id": 11, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 75, + "origin_id": 6, + "origin_slot": 0, + "target_id": 27, + "target_slot": 1, + "type": "IMAGE" + } + ], + "extra": {}, + "category": "Video Tools", + "description": "Concatenates two videos end-to-end with optional resize, letterbox padding, and 
audio merge or drop." + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Pose to Image (Z-Image-Turbo).json b/blueprints/Pose to Image (Z-Image-Turbo).json index 5c2749efe..92ee80907 100644 --- a/blueprints/Pose to Image (Z-Image-Turbo).json +++ b/blueprints/Pose to Image (Z-Image-Turbo).json @@ -1298,7 +1298,7 @@ "VHS_MetadataImage": true, "VHS_KeepIntermediate": true }, - "category": "Image generation and editing/Pose to image", + "category": "Image generation and editing/Conditioned", "description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning." } ] diff --git a/blueprints/Pose to Video (LTX 2.0).json b/blueprints/Pose to Video (LTX 2.0).json index 1ce49351a..04eb69972 100644 --- a/blueprints/Pose to Video (LTX 2.0).json +++ b/blueprints/Pose to Video (LTX 2.0).json @@ -3870,7 +3870,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Video generation and editing/Pose to video", + "category": "Video generation and editing/Conditioned", "description": "Generates video from pose reference frames using LTX-2, with optional synchronized audio." } ] diff --git a/blueprints/Prompt Enhance.json b/blueprints/Prompt Enhance.json index e260b1203..e3a77a73b 100644 --- a/blueprints/Prompt Enhance.json +++ b/blueprints/Prompt Enhance.json @@ -270,7 +270,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Text generation/Prompt enhance", + "category": "Text Tools", "description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality." 
} ] diff --git a/blueprints/Remove Background (BiRefNet).json b/blueprints/Remove Background (BiRefNet).json index 732a4adc4..9ec441e51 100644 --- a/blueprints/Remove Background (BiRefNet).json +++ b/blueprints/Remove Background (BiRefNet).json @@ -389,7 +389,7 @@ } ], "extra": {}, - "category": "Image generation and editing/Background Removal" + "category": "Image Tools/Background Removal" } ] }, diff --git a/blueprints/Select Per-Line Text by Index.json b/blueprints/Select Per-Line Text by Index.json new file mode 100644 index 000000000..8a4020d50 --- /dev/null +++ b/blueprints/Select Per-Line Text by Index.json @@ -0,0 +1,485 @@ +{ + "revision": 0, + "last_node_id": 10, + "last_link_id": 0, + "nodes": [ + { + "id": 10, + "type": "3fb7557a-470d-4983-9d8c-6d5caa9788f0", + "pos": [ + -250, + 8590 + ], + "size": [ + 280, + 360 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "text_per_line", + "name": "text_per_line", + "type": "STRING", + "widget": { + "name": "text_per_line" + }, + "link": null + }, + { + "localized_name": "index", + "name": "index", + "type": "INT", + "widget": { + "name": "index" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "selected_line", + "name": "selected_line", + "type": "STRING", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "2", + "string" + ], + [ + "3", + "value" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [], + "title": "Select Per-Line Text by Index" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "3fb7557a-470d-4983-9d8c-6d5caa9788f0", + "version": 1, + "state": { + "lastGroupId": 0, + "lastNodeId": 10, + "lastLinkId": 14, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Select Per-Line Text by Index", + "inputNode": { + "id": -10, + "bounding": [ + -990, + 8595, + 128, + 
88 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 710, + 8585, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "75417d82-a934-4ac9-b667-d8dcd5a3bfb3", + "name": "text_per_line", + "type": "STRING", + "linkIds": [ + 13 + ], + "localized_name": "text_per_line", + "pos": [ + -886, + 8619 + ] + }, + { + "id": "46e69a73-1804-4ca6-9175-31445bf0be96", + "name": "index", + "type": "INT", + "linkIds": [ + 14 + ], + "localized_name": "index", + "pos": [ + -886, + 8639 + ] + } + ], + "outputs": [ + { + "id": "e34e8ad1-84d2-4bd2-a460-eb7de6067c10", + "name": "selected_line", + "type": "STRING", + "linkIds": [ + 10 + ], + "localized_name": "selected_line", + "pos": [ + 734, + 8609 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 1, + "type": "PreviewAny", + "pos": [ + -500, + 8400 + ], + "size": [ + 230, + 180 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "source", + "name": "source", + "type": "*", + "link": 1 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 6 + ] + } + ], + "properties": { + "Node name for S&R": "PreviewAny", + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + null, + null, + null + ] + }, + { + "id": 2, + "type": "RegexExtract", + "pos": [ + -240, + 8740 + ], + "size": [ + 470, + 460 + ], + "flags": {}, + "order": 1, + "mode": 0, + "showAdvanced": false, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": 13 + }, + { + "localized_name": "regex_pattern", + "name": "regex_pattern", + "type": "STRING", + "widget": { + "name": "regex_pattern" + }, + "link": 9 + }, + { + "localized_name": "mode", + "name": "mode", + "type": "COMBO", + "widget": { + "name": "mode" + }, + "link": null + }, + { + "localized_name": "case_insensitive", + "name": 
"case_insensitive", + "type": "BOOLEAN", + "widget": { + "name": "case_insensitive" + }, + "link": null + }, + { + "localized_name": "multiline", + "name": "multiline", + "type": "BOOLEAN", + "widget": { + "name": "multiline" + }, + "link": null + }, + { + "localized_name": "dotall", + "name": "dotall", + "type": "BOOLEAN", + "widget": { + "name": "dotall" + }, + "link": null + }, + { + "localized_name": "group_index", + "name": "group_index", + "type": "INT", + "widget": { + "name": "group_index" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 10 + ] + } + ], + "properties": { + "Node name for S&R": "RegexExtract", + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "", + "", + "First Group", + false, + false, + false, + 1 + ] + }, + { + "id": 3, + "type": "PrimitiveInt", + "pos": [ + -810, + 8400 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 14 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 1 + ] + } + ], + "title": "Int (line index)", + "properties": { + "Node name for S&R": "Int (line index)", + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 0, + "fixed" + ] + }, + { + "id": 8, + "type": "StringReplace", + "pos": [ + -240, + 8400 + ], + "size": [ + 400, + 280 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "string", + "name": "string", + "type": "STRING", + "widget": { + "name": "string" + }, + "link": null + }, + { + "localized_name": "find", + "name": "find", + "type": "STRING", + "widget": { + "name": "find" 
+ }, + "link": null + }, + { + "localized_name": "replace", + "name": "replace", + "type": "STRING", + "widget": { + "name": "replace" + }, + "link": 6 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "type": "STRING", + "links": [ + 9 + ] + } + ], + "properties": { + "Node name for S&R": "StringReplace", + "cnr_id": "comfy-core", + "ver": "0.19.0", + "ue_properties": { + "widget_ue_connectable": {}, + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "^(?:[^\\n]*\\n){index}([^\\n]*)(?:\\n|$)", + "index", + "" + ] + } + ], + "groups": [], + "links": [ + { + "id": 1, + "origin_id": 3, + "origin_slot": 0, + "target_id": 1, + "target_slot": 0, + "type": "INT" + }, + { + "id": 9, + "origin_id": 8, + "origin_slot": 0, + "target_id": 2, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 6, + "origin_id": 1, + "origin_slot": 0, + "target_id": 8, + "target_slot": 2, + "type": "STRING" + }, + { + "id": 10, + "origin_id": 2, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 13, + "origin_id": -10, + "origin_slot": 0, + "target_id": 2, + "target_slot": 0, + "type": "STRING" + }, + { + "id": 14, + "origin_id": -10, + "origin_slot": 1, + "target_id": 3, + "target_slot": 0, + "type": "INT" + } + ], + "extra": {}, + "category": "Text Tools", + "description": "Selects one line from multiline text by zero-based index for batch or list-driven prompt workflows." 
+ } + ] + }, + "extra": { + "ue_links": [], + "links_added_by_ue": [] + } +} \ No newline at end of file diff --git a/blueprints/Split Image Grid to Tiles.json b/blueprints/Split Image Grid to Tiles.json new file mode 100644 index 000000000..d1f6e40ef --- /dev/null +++ b/blueprints/Split Image Grid to Tiles.json @@ -0,0 +1,714 @@ +{ + "revision": 0, + "last_node_id": 251, + "last_link_id": 0, + "nodes": [ + { + "id": 251, + "type": "609e1fd1-b731-4b78-89ac-d19b1156b025", + "pos": [ + -1490, + 130 + ], + "size": [ + 230, + 164 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "source_image", + "name": "source_image", + "type": "IMAGE", + "link": null + }, + { + "localized_name": "columns", + "name": "columns", + "type": "INT", + "widget": { + "name": "columns" + }, + "link": null + }, + { + "localized_name": "rows", + "name": "rows", + "type": "INT", + "widget": { + "name": "rows" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "tiles", + "name": "tiles", + "type": "IMAGE", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "228", + "value" + ], + [ + "252", + "value" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.20.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Split Image Grid to Tiles" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "609e1fd1-b731-4b78-89ac-d19b1156b025", + "version": 1, + "state": { + "lastGroupId": 9, + "lastNodeId": 252, + "lastLinkId": 429, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Split Image Grid to Tiles", + "inputNode": { + "id": -10, + "bounding": [ + -1690, + 260, + 128, + 108 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -510, + 590, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "866ac798-cfbc-450a-b755-e704f86404d9", + 
"name": "source_image", + "type": "IMAGE", + "linkIds": [ + 386, + 389 + ], + "localized_name": "source_image", + "pos": [ + -1586, + 284 + ] + }, + { + "id": "bc37b1f8-8ab2-4f19-bd00-75d4fbc4feb3", + "name": "columns", + "type": "INT", + "linkIds": [ + 427 + ], + "localized_name": "columns", + "pos": [ + -1586, + 304 + ] + }, + { + "id": "d45915da-e848-43dd-9ccc-e3161e9c99d9", + "name": "rows", + "type": "INT", + "linkIds": [ + 428 + ], + "localized_name": "rows", + "pos": [ + -1586, + 324 + ] + } + ], + "outputs": [ + { + "id": "18bc780f-064b-4038-87c6-67dba71deb08", + "name": "tiles", + "type": "IMAGE", + "linkIds": [ + 394 + ], + "localized_name": "tiles", + "shape": 6, + "pos": [ + -486, + 614 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 225, + "type": "SplitImageToTileList", + "pos": [ + -1010, + 620 + ], + "size": [ + 290, + 170 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 386 + }, + { + "localized_name": "tile_width", + "name": "tile_width", + "type": "INT", + "widget": { + "name": "tile_width" + }, + "link": 403 + }, + { + "localized_name": "tile_height", + "name": "tile_height", + "type": "INT", + "widget": { + "name": "tile_height" + }, + "link": 404 + }, + { + "localized_name": "overlap", + "name": "overlap", + "type": "INT", + "widget": { + "name": "overlap" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "shape": 6, + "type": "IMAGE", + "links": [ + 394 + ] + } + ], + "properties": { + "Node name for S&R": "SplitImageToTileList", + "cnr_id": "comfy-core", + "ver": "0.20.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 1024, + 1024, + 0 + ] + }, + { + "id": 231, + "type": "ComfyMathExpression", + "pos": [ + -1080, + 330 + ], + "size": [ + 370, + 190 + 
], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 390 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": 429 + }, + { + "label": "c", + "localized_name": "values.c", + "name": "values.c", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 404 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "title": "Math Expression (Height)", + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "max(1, (int(a) + int(b) - 1) // int(b))" + ] + }, + { + "id": 229, + "type": "ComfyMathExpression", + "pos": [ + -1090, + -30 + ], + "size": [ + 370, + 190 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 387 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": 388 + }, + { + "label": "c", + "localized_name": "values.c", + "name": "values.c", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": 
"expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 403 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "title": "Math Expression (Width)", + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "max(1, (int(a) + int(b) - 1) // int(b))" + ] + }, + { + "id": 228, + "type": "PrimitiveInt", + "pos": [ + -1380, + 90 + ], + "size": [ + 230, + 110 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 427 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 388 + ] + } + ], + "title": "Int (grid columns)", + "properties": { + "Node name for S&R": "Int (grid columns)", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 2, + "fixed" + ] + }, + { + "id": 230, + "type": "GetImageSize", + "pos": [ + -1380, + 290 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", 
+ "type": "IMAGE", + "link": 389 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 387 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": [ + 390 + ] + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + } + }, + { + "id": 252, + "type": "PrimitiveInt", + "pos": [ + -1380, + 470 + ], + "size": [ + 230, + 110 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 428 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 429 + ] + } + ], + "title": "Int (grid rows)", + "properties": { + "Node name for S&R": "Int (grid rows)", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 3, + "fixed" + ] + } + ], + "groups": [], + "links": [ + { + "id": 403, + "origin_id": 229, + "origin_slot": 1, + "target_id": 225, + "target_slot": 1, + "type": "INT" + }, + { + "id": 404, + "origin_id": 231, + "origin_slot": 1, + "target_id": 225, + "target_slot": 2, + "type": "INT" + }, + { + "id": 390, + "origin_id": 230, + "origin_slot": 1, + "target_id": 231, + "target_slot": 0, + "type": "INT" + }, + { + 
"id": 387, + "origin_id": 230, + "origin_slot": 0, + "target_id": 229, + "target_slot": 0, + "type": "INT" + }, + { + "id": 388, + "origin_id": 228, + "origin_slot": 0, + "target_id": 229, + "target_slot": 1, + "type": "INT" + }, + { + "id": 386, + "origin_id": -10, + "origin_slot": 0, + "target_id": 225, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 389, + "origin_id": -10, + "origin_slot": 0, + "target_id": 230, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 394, + "origin_id": 225, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 427, + "origin_id": -10, + "origin_slot": 1, + "target_id": 228, + "target_slot": 0, + "type": "INT" + }, + { + "id": 428, + "origin_id": -10, + "origin_slot": 2, + "target_id": 252, + "target_slot": 0, + "type": "INT" + }, + { + "id": 429, + "origin_id": 252, + "origin_slot": 0, + "target_id": 231, + "target_slot": 1, + "type": "INT" + } + ], + "extra": {}, + "category": "Image Tools/Crop", + "description": "Splits an image into a configurable columns×rows grid of equal tiles for tiled generation or processing." 
+ } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Text to Image (Anima).json b/blueprints/Text to Image (Anima).json new file mode 100644 index 000000000..787908ca9 --- /dev/null +++ b/blueprints/Text to Image (Anima).json @@ -0,0 +1,1085 @@ +{ + "revision": 0, + "last_node_id": 60, + "last_link_id": 0, + "nodes": [ + { + "id": 60, + "type": "a3c0dab6-b250-4585-a0f9-8fb8b074fb2f", + "pos": [ + -10, + 130 + ], + "size": [ + 500, + 640 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "label": "prompt", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + }, + { + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": null + }, + { + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": null + }, + { + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + }, + { + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + }, + { + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": null + }, + { + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": null + }, + { + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "11", + "text" + ], + [ + "28", + "width" + ], + [ + "28", + "height" + ], + [ + "19", + "steps" + ], + [ + "19", + "cfg" + ], + [ + "19", + "seed" + ], + [ + "44", + "unet_name" + ], + [ + "45", + "clip_name" + ], + [ + "15", + "vae_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + 
"secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Text to Image (Anima)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "a3c0dab6-b250-4585-a0f9-8fb8b074fb2f", + "version": 1, + "state": { + "lastGroupId": 3, + "lastNodeId": 70, + "lastLinkId": 104, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Text to Image (Anima)", + "inputNode": { + "id": -10, + "bounding": [ + -330, + 530, + 120, + 220 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 1229.9999873482075, + 505, + 120, + 60 + ] + }, + "inputs": [ + { + "id": "4693f350-6ba0-446d-80d4-3038c661d26c", + "name": "text", + "type": "STRING", + "linkIds": [ + 95 + ], + "label": "prompt", + "pos": [ + -230, + 550 + ] + }, + { + "id": "4a7886a9-4ed7-49bb-afc2-977bb78a303d", + "name": "width", + "type": "INT", + "linkIds": [ + 96 + ], + "pos": [ + -230, + 570 + ] + }, + { + "id": "f6c04461-d29e-49e3-8790-07bb662bbbfe", + "name": "height", + "type": "INT", + "linkIds": [ + 97 + ], + "pos": [ + -230, + 590 + ] + }, + { + "id": "7a24f998-3808-4837-8bff-52304ad09fb6", + "name": "steps", + "type": "INT", + "linkIds": [ + 98 + ], + "pos": [ + -230, + 610 + ] + }, + { + "id": "aaa99698-b222-40fe-b946-28067528a85c", + "name": "cfg", + "type": "FLOAT", + "linkIds": [ + 99 + ], + "pos": [ + -230, + 630 + ] + }, + { + "id": "053df9ae-7311-4816-aa23-7fa13c656ced", + "name": "seed", + "type": "INT", + "linkIds": [ + 100 + ], + "pos": [ + -230, + 650 + ] + }, + { + "id": "c59194ea-015c-41a7-8edd-ae7ffc220b63", + "name": "unet_name", + "type": "COMBO", + "linkIds": [ + 101 + ], + "pos": [ + -230, + 670 + ] + }, + { + "id": "e655aa3b-2db7-4e25-9ea2-61550fa7ae2d", + "name": "clip_name", + "type": "COMBO", + "linkIds": [ + 102 + ], + "pos": [ + -230, + 690 + ] + }, + { + "id": "94965a7a-74dd-4f5a-87e3-9f87995d554f", + "name": "vae_name", + "type": "COMBO", + "linkIds": [ + 103 + ], + "pos": [ + 
-230, + 710 + ] + } + ], + "outputs": [ + { + "id": "ef85ac0a-2152-4232-bfa1-929cfc913718", + "name": "IMAGE", + "type": "IMAGE", + "linkIds": [ + 82 + ], + "localized_name": "IMAGE", + "pos": [ + 1249.9999873482075, + 525 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 45, + "type": "CLIPLoader", + "pos": [ + -60, + 380 + ], + "size": [ + 310, + 150 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 102 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 80, + 81 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.11.0", + "models": [ + { + "name": "qwen_3_06b_base.safetensors", + "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/text_encoders/qwen_3_06b_base.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "qwen_3_06b_base.safetensors", + "stable_diffusion", + "default" + ] + }, + { + "id": 15, + "type": "VAELoader", + "pos": [ + -50, + 610 + ], + "size": [ + 310, + 100 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "vae_name", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": 103 + } + ], + "outputs": [ + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 11 + ] + } + ], + "properties": { + "Node name for S&R": 
"VAELoader", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "models": [ + { + "name": "qwen_image_vae.safetensors", + "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/vae/qwen_image_vae.safetensors", + "directory": "vae" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "qwen_image_vae.safetensors" + ] + }, + { + "id": 8, + "type": "VAEDecode", + "pos": [ + 880, + 840 + ], + "size": [ + 230, + 90 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 10 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 11 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 82 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecode", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 28, + "type": "EmptyLatentImage", + "pos": [ + -50, + 830 + ], + "size": [ + 310, + 150 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": 96 + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": 97 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "links": [ + 78 + ] + } + ], + "properties": { + "Node name for S&R": "EmptyLatentImage", + "cnr_id": "comfy-core", 
+ "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 1024, + 1024, + 1 + ] + }, + { + "id": 12, + "type": "CLIPTextEncode", + "pos": [ + 330, + 830 + ], + "size": [ + 490, + 140 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 81 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 40 + ] + } + ], + "title": "CLIP Text Encode (Negative Prompt)", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.3.65", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "worst quality, low quality, score_1, score_2, score_3, blurry, jpeg artifacts, sepia" + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 19, + "type": "KSampler", + "pos": [ + 870, + 120 + ], + "size": [ + 300, + 620 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 79 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 39 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 40 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 78 + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": 100 + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + 
"name": "steps" + }, + "link": 98 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": 99 + }, + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { + "name": "sampler_name" + }, + "link": null + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [ + 10 + ] + } + ], + "properties": { + "Node name for S&R": "KSampler", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + "fixed", + 30, + 4, + "er_sde", + "simple", + 1 + ] + }, + { + "id": 11, + "type": "CLIPTextEncode", + "pos": [ + 320, + 170 + ], + "size": [ + 490, + 610 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 80 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 95 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 39 + ] + } + ], + "title": "CLIP Text Encode (Positive Prompt)", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.3.65", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + 
"id": 44, + "type": "UNETLoader", + "pos": [ + -50, + 170 + ], + "size": [ + 310, + 130 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 101 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 79 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.11.0", + "models": [ + { + "name": "anima-base-v1.0.safetensors", + "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/diffusion_models/anima-base-v1.0.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "anima-base-v1.0.safetensors", + "default" + ] + } + ], + "groups": [ + { + "id": 1, + "title": "Model", + "bounding": [ + -80, + 80, + 360, + 640 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "Image Size(1MP)", + "bounding": [ + -80, + 750, + 360, + 240 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "Prompt", + "bounding": [ + 300, + 80, + 530, + 910 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 10, + "origin_id": 19, + "origin_slot": 0, + "target_id": 8, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 11, + "origin_id": 15, + "origin_slot": 0, + "target_id": 8, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 81, + "origin_id": 45, + "origin_slot": 0, + "target_id": 12, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 79, + "origin_id": 44, + "origin_slot": 0, + "target_id": 19, + "target_slot": 0, + 
"type": "MODEL" + }, + { + "id": 39, + "origin_id": 11, + "origin_slot": 0, + "target_id": 19, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 40, + "origin_id": 12, + "origin_slot": 0, + "target_id": 19, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 78, + "origin_id": 28, + "origin_slot": 0, + "target_id": 19, + "target_slot": 3, + "type": "LATENT" + }, + { + "id": 80, + "origin_id": 45, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 82, + "origin_id": 8, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 95, + "origin_id": -10, + "origin_slot": 0, + "target_id": 11, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 96, + "origin_id": -10, + "origin_slot": 1, + "target_id": 28, + "target_slot": 0, + "type": "INT" + }, + { + "id": 97, + "origin_id": -10, + "origin_slot": 2, + "target_id": 28, + "target_slot": 1, + "type": "INT" + }, + { + "id": 98, + "origin_id": -10, + "origin_slot": 3, + "target_id": 19, + "target_slot": 5, + "type": "INT" + }, + { + "id": 99, + "origin_id": -10, + "origin_slot": 4, + "target_id": 19, + "target_slot": 6, + "type": "FLOAT" + }, + { + "id": 100, + "origin_id": -10, + "origin_slot": 5, + "target_id": 19, + "target_slot": 4, + "type": "INT" + }, + { + "id": 101, + "origin_id": -10, + "origin_slot": 6, + "target_id": 44, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 102, + "origin_id": -10, + "origin_slot": 7, + "target_id": 45, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 103, + "origin_id": -10, + "origin_slot": 8, + "target_id": 15, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": {}, + "category": "Image generation and editing/Text to image" + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Video Captioning (Gemini).json b/blueprints/Video Captioning (Gemini).json index 7642b23c1..54a7d6e78 100644 --- a/blueprints/Video Captioning (Gemini).json +++ 
b/blueprints/Video Captioning (Gemini).json @@ -307,9 +307,9 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Text generation/Video Captioning", + "category": "Video Tools", "description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM." } ] } -} +} \ No newline at end of file diff --git a/blueprints/Video Depth Estimation (MoGe).json b/blueprints/Video Depth Estimation (MoGe).json new file mode 100644 index 000000000..025e20cda --- /dev/null +++ b/blueprints/Video Depth Estimation (MoGe).json @@ -0,0 +1,1226 @@ +{ + "revision": 0, + "last_node_id": 72, + "last_link_id": 0, + "nodes": [ + { + "id": 72, + "type": "7ff83f68-6848-47a8-aa43-9036ca6c46e8", + "pos": [ + -4440, + 4550 + ], + "size": [ + 430, + 330 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "inference_resolution", + "name": "inference_resolution", + "type": "INT", + "widget": { + "name": "inference_resolution" + }, + "link": null + }, + { + "localized_name": "inference_batch_size", + "name": "inference_batch_size", + "type": "INT", + "widget": { + "name": "inference_batch_size" + }, + "link": null + }, + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "COMBO", + "widget": { + "name": "moge_model" + }, + "link": null + }, + { + "label": "auto_resize_input", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": null + }, + { + "name": "video", + "type": "VIDEO", + "link": null + } + ], + "outputs": [ + { + "localized_name": "depth_colored", + "name": "depth_colored", + "type": "IMAGE", + "links": [] + }, + { + "localized_name": "depth", + "name": "depth", + "type": "IMAGE", + "links": [] + }, + { + "name": "MASK", + "type": "MASK", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "13", + "resolution_level" + ], + [ + "13", + "batch_size" + ], + [ + "32", + "model_name" + ], + [ + "53", + "switch" + ] + ], + "enableTabs": false, + "tabWidth": 
65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [], + "title": "Video Depth Estimation (MoGe)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "7ff83f68-6848-47a8-aa43-9036ca6c46e8", + "version": 1, + "state": { + "lastGroupId": 1, + "lastNodeId": 72, + "lastLinkId": 96, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Video Depth Estimation (MoGe)", + "inputNode": { + "id": -10, + "bounding": [ + -5320, + 5320, + 167.337890625, + 148 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -3090, + 4966, + 129, + 108 + ] + }, + "inputs": [ + { + "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52", + "name": "inference_resolution", + "type": "INT", + "linkIds": [ + 73 + ], + "localized_name": "inference_resolution", + "pos": [ + -5176.662109375, + 5344 + ] + }, + { + "id": "616638fe-f603-4d10-bae9-fc87c134380f", + "name": "inference_batch_size", + "type": "INT", + "linkIds": [ + 74 + ], + "localized_name": "inference_batch_size", + "pos": [ + -5176.662109375, + 5364 + ] + }, + { + "id": "65694805-186e-4181-a721-df8b5af49d31", + "name": "moge_model", + "type": "COMBO", + "linkIds": [ + 79 + ], + "localized_name": "moge_model", + "pos": [ + -5176.662109375, + 5384 + ] + }, + { + "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674", + "name": "switch", + "type": "BOOLEAN", + "linkIds": [ + 83 + ], + "label": "auto_resize_input", + "pos": [ + -5176.662109375, + 5404 + ] + }, + { + "id": "749bad18-d00a-4ec4-a5ff-e45b1d0cf089", + "name": "video", + "type": "VIDEO", + "linkIds": [ + 91 + ], + "pos": [ + -5176.662109375, + 5424 + ] + } + ], + "outputs": [ + { + "id": "59c37b52-074f-49fc-9731-483f899c12c4", + "name": "depth_colored", + "type": "IMAGE", + "linkIds": [ + 36 + ], + "localized_name": "depth_colored", + "pos": [ + -3066, + 4990 + ] + }, + { + "id": 
"f583e936-da5c-4630-9901-391fa605c1f8", + "name": "depth", + "type": "IMAGE", + "linkIds": [ + 40 + ], + "localized_name": "depth", + "pos": [ + -3066, + 5010 + ] + }, + { + "id": "6845b6a1-1980-454a-9451-314f24495c1d", + "name": "MASK", + "type": "MASK", + "linkIds": [ + 86 + ], + "pos": [ + -3066, + 5030 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 13, + "type": "MoGeInference", + "pos": [ + -3790, + 5180 + ], + "size": [ + 270, + 230 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_model", + "name": "moge_model", + "type": "MOGE_MODEL", + "link": 58 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 81 + }, + { + "localized_name": "resolution_level", + "name": "resolution_level", + "type": "INT", + "widget": { + "name": "resolution_level" + }, + "link": 73 + }, + { + "localized_name": "fov_x_degrees", + "name": "fov_x_degrees", + "type": "FLOAT", + "widget": { + "name": "fov_x_degrees" + }, + "link": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": 74 + }, + { + "localized_name": "force_projection", + "name": "force_projection", + "type": "BOOLEAN", + "widget": { + "name": "force_projection" + }, + "link": null + }, + { + "localized_name": "apply_mask", + "name": "apply_mask", + "type": "BOOLEAN", + "widget": { + "name": "apply_mask" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "links": [ + 35, + 39, + 61 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeInference", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + 9, + 0, + 4, + true, + true + ] + }, + { + "id": 23, + "type": "MoGeRender", + 
"pos": [ + -3430, + 4870 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 35 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 36 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + "depth_colored" + ] + }, + { + "id": 25, + "type": "MoGeRender", + "pos": [ + -3430, + 5030 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 39 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 40 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + "depth" + ] + }, + { + "id": 32, + "type": "LoadMoGeModel", + "pos": [ + -4180, + 4880 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 79 + } + ], + "outputs": [ + { + "localized_name": "MOGE_MODEL", + 
"name": "MOGE_MODEL", + "type": "MOGE_MODEL", + "links": [ + 58 + ] + } + ], + "properties": { + "Node name for S&R": "LoadMoGeModel", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1", + "models": [ + { + "name": "moge_2_vitl_normal_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors", + "directory": "geometry_estimation" + } + ] + }, + "widgets_values": [ + "moge_2_vitl_normal_fp16.safetensors" + ] + }, + { + "id": 36, + "type": "ComfyMathExpression", + "pos": [ + -4720, + 4910 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 49 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": null + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": null + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [ + 53 + ] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + "a > 2048" + ] + }, + { + "id": 37, + "type": "GetImageSize", + "pos": [ + -4980, + 4910 + ], + "size": [ + 230, + 160 + ], + "flags": {}, + "order": 5, + "mode": 0, + 
"inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 92 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 49 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + } + }, + { + "id": 40, + "type": "ResizeImagesByLongerEdge", + "pos": [ + -4650, + 5210 + ], + "size": [ + 310, + 110 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 93 + }, + { + "localized_name": "longer_edge", + "name": "longer_edge", + "type": "INT", + "widget": { + "name": "longer_edge" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 54 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImagesByLongerEdge", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + 2048 + ] + }, + { + "id": 42, + "type": "ComfySwitchNode", + "pos": [ + -4180, + 5060 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 94 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 54 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + 
"link": 53 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 80 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + false + ] + }, + { + "id": 45, + "type": "MoGeRender", + "pos": [ + -3430, + 5200 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "moge_geometry", + "name": "moge_geometry", + "type": "MOGE_GEOMETRY", + "link": 61 + }, + { + "localized_name": "output", + "name": "output", + "type": "COMBO", + "widget": { + "name": "output" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 85 + ] + } + ], + "properties": { + "Node name for S&R": "MoGeRender", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + "mask" + ] + }, + { + "id": 53, + "type": "ComfySwitchNode", + "pos": [ + -4160, + 5340 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 95 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 80 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 83 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 81 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + 
"hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.21.1" + }, + "widgets_values": [ + true + ] + }, + { + "id": 68, + "type": "ImageToMask", + "pos": [ + -3420, + 5360 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 85 + }, + { + "localized_name": "channel", + "name": "channel", + "type": "COMBO", + "widget": { + "name": "channel" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 86 + ] + } + ], + "properties": { + "Node name for S&R": "ImageToMask", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + "red" + ] + }, + { + "id": 70, + "type": "GetVideoComponents", + "pos": [ + -4920, + 5490 + ], + "size": [ + 230, + 120 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 91 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 92, + 93, + 94, + 95 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": null + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + } + } + ], + "groups": [ + { + "id": 1, + "title": "auto_resize_if_width_gt_2048", + "bounding": [ + -5000, + 4840, + 690, + 280 + ], + 
"color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 58, + "origin_id": 32, + "origin_slot": 0, + "target_id": 13, + "target_slot": 0, + "type": "MOGE_MODEL" + }, + { + "id": 35, + "origin_id": 13, + "origin_slot": 0, + "target_id": 23, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 39, + "origin_id": 13, + "origin_slot": 0, + "target_id": 25, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 49, + "origin_id": 37, + "origin_slot": 0, + "target_id": 36, + "target_slot": 0, + "type": "INT" + }, + { + "id": 54, + "origin_id": 40, + "origin_slot": 0, + "target_id": 42, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 53, + "origin_id": 36, + "origin_slot": 2, + "target_id": 42, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 61, + "origin_id": 13, + "origin_slot": 0, + "target_id": 45, + "target_slot": 0, + "type": "MOGE_GEOMETRY" + }, + { + "id": 36, + "origin_id": 23, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 40, + "origin_id": 25, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 73, + "origin_id": -10, + "origin_slot": 0, + "target_id": 13, + "target_slot": 2, + "type": "INT" + }, + { + "id": 74, + "origin_id": -10, + "origin_slot": 1, + "target_id": 13, + "target_slot": 4, + "type": "INT" + }, + { + "id": 79, + "origin_id": -10, + "origin_slot": 2, + "target_id": 32, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 80, + "origin_id": 42, + "origin_slot": 0, + "target_id": 53, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 81, + "origin_id": 53, + "origin_slot": 0, + "target_id": 13, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 83, + "origin_id": -10, + "origin_slot": 3, + "target_id": 53, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 85, + "origin_id": 45, + "origin_slot": 0, + "target_id": 68, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 86, + "origin_id": 68, + 
"origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "MASK" + }, + { + "id": 91, + "origin_id": -10, + "origin_slot": 4, + "target_id": 70, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 92, + "origin_id": 70, + "origin_slot": 0, + "target_id": 37, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 93, + "origin_id": 70, + "origin_slot": 0, + "target_id": 40, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 94, + "origin_id": 70, + "origin_slot": 0, + "target_id": 42, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 95, + "origin_id": 70, + "origin_slot": 0, + "target_id": 53, + "target_slot": 0, + "type": "IMAGE" + } + ], + "extra": {}, + "category": "Conditioning & Preprocessors/Depth", + "description": "Estimates monocular depth from an input video using MoGe, outputting both raw and colorized depth maps plus a mask." + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Video Face Detection (Mediapipe).json b/blueprints/Video Face Detection (Mediapipe).json new file mode 100644 index 000000000..c70352481 --- /dev/null +++ b/blueprints/Video Face Detection (Mediapipe).json @@ -0,0 +1,1109 @@ +{ + "revision": 0, + "last_node_id": 167, + "last_link_id": 0, + "nodes": [ + { + "id": 167, + "type": "ca14b151-8f5e-4386-aab7-d2ec84eaf43c", + "pos": [ + -3410, + 6100 + ], + "size": [ + 420, + 481.3125 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "name": "video", + "type": "VIDEO", + "link": null + }, + { + "label": "trim_audio", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": null + }, + { + "name": "start_time", + "type": "FLOAT", + "widget": { + "name": "start_time" + }, + "link": null + }, + { + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": null + }, + { + "label": "face_landmarker", + "name": "face_landmarker_1", + "type": "FACE_LANDMARKER", + "link": null + }, + { + "label": "detector_variant", 
+ "name": "detector_variant_1", + "type": "COMBO", + "widget": { + "name": "detector_variant_1" + }, + "link": null + }, + { + "label": "num_faces", + "name": "num_faces_1", + "type": "INT", + "widget": { + "name": "num_faces_1" + }, + "link": null + }, + { + "label": "face_oval", + "name": "regions.face_oval", + "type": "BOOLEAN", + "widget": { + "name": "regions.face_oval" + }, + "link": null + }, + { + "label": "face_lips", + "name": "regions.lips", + "type": "BOOLEAN", + "widget": { + "name": "regions.lips" + }, + "link": null + }, + { + "label": "left_eye", + "name": "regions.left_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.left_eye" + }, + "link": null + }, + { + "label": "right_eye", + "name": "regions.right_eye_1", + "type": "BOOLEAN", + "widget": { + "name": "regions.right_eye_1" + }, + "link": null + }, + { + "label": "irises", + "name": "regions.irises_1", + "type": "BOOLEAN", + "widget": { + "name": "regions.irises_1" + }, + "link": null + }, + { + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": null + } + ], + "outputs": [ + { + "label": "mask", + "name": "MASK_1", + "type": "MASK", + "links": [] + }, + { + "label": "bboxes", + "name": "bboxes_1", + "type": "BOUNDING_BOX", + "links": null + }, + { + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "links": null + } + ], + "title": "Video Face Detection (Mediapipe)", + "properties": { + "proxyWidgets": [ + [ + "165", + "switch" + ], + [ + "164", + "start_time" + ], + [ + "164", + "duration" + ], + [ + "11", + "detector_variant" + ], + [ + "11", + "num_faces" + ], + [ + "20", + "regions.face_oval" + ], + [ + "20", + "regions.lips" + ], + [ + "20", + "regions.left_eye" + ], + [ + "20", + "regions.right_eye" + ], + [ + "20", + "regions.irises" + ], + [ + "2", + "model_name" + ] + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + 
"secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [] + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "ca14b151-8f5e-4386-aab7-d2ec84eaf43c", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 167, + "lastLinkId": 168, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Video Face Detection (Mediapipe)", + "description": "Detects facial landmarks from a video using MediaPipe, outputting landmark data, face bounding boxes, and an optional face-region mask.", + "inputNode": { + "id": -10, + "bounding": [ + -1060, + 4350, + 142.587890625, + 308 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 470, + 4460, + 137.677734375, + 108 + ] + }, + "inputs": [ + { + "id": "16e5a20f-22bc-4960-a67b-e32c64409c49", + "name": "video", + "type": "VIDEO", + "linkIds": [ + 150, + 153 + ], + "pos": [ + -941.412109375, + 4374 + ] + }, + { + "id": "cc7fc7d4-24ec-4c00-878e-1af1b6809b4b", + "name": "switch", + "type": "BOOLEAN", + "linkIds": [ + 154 + ], + "label": "trim_audio", + "pos": [ + -941.412109375, + 4394 + ] + }, + { + "id": "efa9ab9f-ca70-449c-be43-5ca60c7f0d59", + "name": "start_time", + "type": "FLOAT", + "linkIds": [ + 155 + ], + "pos": [ + -941.412109375, + 4414 + ] + }, + { + "id": "45050127-4089-4b85-bf81-73b725196c2e", + "name": "duration", + "type": "FLOAT", + "linkIds": [ + 156 + ], + "pos": [ + -941.412109375, + 4434 + ] + }, + { + "id": "239fcd3b-6324-4824-8255-98199ae58914", + "name": "face_landmarker_1", + "type": "FACE_LANDMARKER", + "linkIds": [ + 157 + ], + "label": "face_landmarker", + "pos": [ + -941.412109375, + 4454 + ] + }, + { + "id": "f79f67b9-5bcb-4cab-9101-8b9dee461bca", + "name": "detector_variant_1", + "type": "COMBO", + "linkIds": [ + 158 + ], + "label": "detector_variant", + "pos": [ + -941.412109375, + 4474 + ] + }, + { + "id": "3369790b-e730-41bf-b5b2-dc1f5fafbe11", + "name": "num_faces_1", + "type": "INT", + "linkIds": [ + 159 
+ ], + "label": "num_faces", + "pos": [ + -941.412109375, + 4494 + ] + }, + { + "id": "964f6b5f-44ac-456e-ba3a-a3039dfe0729", + "name": "regions.face_oval", + "type": "BOOLEAN", + "linkIds": [ + 160 + ], + "label": "face_oval", + "pos": [ + -941.412109375, + 4514 + ] + }, + { + "id": "d6e89b51-65a2-4f37-a561-8cec3a5040fd", + "name": "regions.lips", + "type": "BOOLEAN", + "linkIds": [ + 161 + ], + "label": "face_lips", + "pos": [ + -941.412109375, + 4534 + ] + }, + { + "id": "49f02319-ea4a-4a69-88f8-589d2ef7c97a", + "name": "regions.left_eye", + "type": "BOOLEAN", + "linkIds": [ + 162 + ], + "label": "left_eye", + "pos": [ + -941.412109375, + 4554 + ] + }, + { + "id": "89179a19-aca6-4469-a0b9-2a4bd21bceea", + "name": "regions.right_eye_1", + "type": "BOOLEAN", + "linkIds": [ + 163 + ], + "label": "right_eye", + "pos": [ + -941.412109375, + 4574 + ] + }, + { + "id": "f5667690-24b5-4df9-9210-b8610c68ff5f", + "name": "regions.irises_1", + "type": "BOOLEAN", + "linkIds": [ + 164 + ], + "label": "irises", + "pos": [ + -941.412109375, + 4594 + ] + }, + { + "id": "66c805f6-6ccd-41f9-8a77-fc934b7f4713", + "name": "model_name", + "type": "COMBO", + "linkIds": [ + 165 + ], + "pos": [ + -941.412109375, + 4614 + ] + } + ], + "outputs": [ + { + "id": "f6309e1d-6397-4363-b38f-778a122abc51", + "name": "MASK_1", + "type": "MASK", + "linkIds": [ + 83 + ], + "label": "mask", + "pos": [ + 494, + 4484 + ] + }, + { + "id": "59669f0a-b4b2-49d1-85f8-fc2a88059b1a", + "name": "bboxes_1", + "type": "BOUNDING_BOX", + "linkIds": [ + 166 + ], + "label": "bboxes", + "pos": [ + 494, + 4504 + ] + }, + { + "id": "57f66731-e106-4f8b-a0a0-aed3c620b37b", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "linkIds": [ + 167 + ], + "pos": [ + 494, + 4524 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 11, + "type": "MediaPipeFaceLandmarker", + "pos": [ + -60, + 4380 + ], + "size": [ + 350, + 220 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": 
"face_detection_model", + "name": "face_detection_model", + "type": "FACE_DETECTION_MODEL", + "link": 66 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 149 + }, + { + "localized_name": "detector_variant", + "name": "detector_variant", + "type": "COMBO", + "widget": { + "name": "detector_variant" + }, + "link": 158 + }, + { + "localized_name": "num_faces", + "name": "num_faces", + "type": "INT", + "widget": { + "name": "num_faces" + }, + "link": 159 + }, + { + "localized_name": "min_confidence", + "name": "min_confidence", + "type": "FLOAT", + "widget": { + "name": "min_confidence" + }, + "link": null + }, + { + "localized_name": "missing_frame_fallback", + "name": "missing_frame_fallback", + "type": "COMBO", + "widget": { + "name": "missing_frame_fallback" + }, + "link": null + }, + { + "name": "face_landmarker", + "type": "FACE_LANDMARKER", + "link": 157 + } + ], + "outputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "links": [ + 46, + 167 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 166 + ] + } + ], + "properties": { + "Node name for S&R": "MediaPipeFaceLandmarker", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + "full", + 0, + 0.5, + "empty" + ] + }, + { + "id": 2, + "type": "LoadMediaPipeFaceLandmarker", + "pos": [ + -70, + 4160 + ], + "size": [ + 350, + 140 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 165 + } + ], + "outputs": [ + { + "localized_name": "FACE_DETECTION_MODEL", + "name": "FACE_DETECTION_MODEL", + "type": "FACE_DETECTION_MODEL", + "links": [ + 66 + ] + } + 
], + "properties": { + "Node name for S&R": "LoadMediaPipeFaceLandmarker", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0", + "models": [ + { + "name": "mediapipe_face_fp32.safetensors", + "url": "https://huggingface.co/Comfy-Org/mediapipe/resolve/main/detection/mediapipe_face_fp32.safetensors", + "directory": "detection" + } + ] + }, + "widgets_values": [ + "mediapipe_face_fp32.safetensors" + ] + }, + { + "id": 20, + "type": "MediaPipeFaceMask", + "pos": [ + -70, + 4660 + ], + "size": [ + 360, + 180 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "face_landmarks", + "name": "face_landmarks", + "type": "FACE_LANDMARKS", + "link": 46 + }, + { + "localized_name": "regions", + "name": "regions", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "regions" + }, + "link": null + }, + { + "localized_name": "regions.face_oval", + "name": "regions.face_oval", + "type": "BOOLEAN", + "widget": { + "name": "regions.face_oval" + }, + "link": 160 + }, + { + "localized_name": "regions.lips", + "name": "regions.lips", + "type": "BOOLEAN", + "widget": { + "name": "regions.lips" + }, + "link": 161 + }, + { + "localized_name": "regions.left_eye", + "name": "regions.left_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.left_eye" + }, + "link": 162 + }, + { + "localized_name": "regions.right_eye", + "name": "regions.right_eye", + "type": "BOOLEAN", + "widget": { + "name": "regions.right_eye" + }, + "link": 163 + }, + { + "localized_name": "regions.irises", + "name": "regions.irises", + "type": "BOOLEAN", + "widget": { + "name": "regions.irises" + }, + "link": 164 + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 83 + ] + } + ], + "properties": { + "Node name for S&R": "MediaPipeFaceMask", + "enableTabs": false, + 
"tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + "custom", + true, + false, + false, + false, + false + ] + }, + { + "id": 160, + "type": "GetVideoComponents", + "pos": [ + -420, + 4360 + ], + "size": [ + 230, + 120 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 152 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 149 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": null + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": null + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + } + }, + { + "id": 164, + "type": "Video Slice", + "pos": [ + -780, + 4330 + ], + "size": [ + 270, + 170 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 150 + }, + { + "localized_name": "start_time", + "name": "start_time", + "type": "FLOAT", + "widget": { + "name": "start_time" + }, + "link": 155 + }, + { + "localized_name": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": 156 + }, + { + "localized_name": "strict_duration", + "name": "strict_duration", + "type": "BOOLEAN", + "widget": { + "name": "strict_duration" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 151 + ] + } + ], + "properties": { + "Node name for S&R": "Video Slice", + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + 0, + 0, + false + ] + }, + { + "id": 165, + "type": "ComfySwitchNode", + "pos": [ + -420, + 4590 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 153 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 151 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 154 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 152 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "cnr_id": "comfy-core", + "ver": "0.22.0" + }, + "widgets_values": [ + false + ] + } + ], + "groups": [], + "links": [ + { + "id": 66, + "origin_id": 2, + "origin_slot": 0, + "target_id": 11, + "target_slot": 0, + "type": "FACE_DETECTION_MODEL" + }, + { + "id": 46, + "origin_id": 11, + "origin_slot": 0, + "target_id": 20, + "target_slot": 0, + "type": "FACE_LANDMARKS" + }, + { + "id": 83, + "origin_id": 20, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 149, + "origin_id": 160, + "origin_slot": 0, + "target_id": 11, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 150, + "origin_id": -10, + "origin_slot": 0, + "target_id": 164, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 151, + "origin_id": 164, + "origin_slot": 0, + "target_id": 165, + "target_slot": 1, + "type": "VIDEO" + }, + { + "id": 152, + "origin_id": 165, + "origin_slot": 0, + 
"target_id": 160, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 153, + "origin_id": -10, + "origin_slot": 0, + "target_id": 165, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 154, + "origin_id": -10, + "origin_slot": 1, + "target_id": 165, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 155, + "origin_id": -10, + "origin_slot": 2, + "target_id": 164, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 156, + "origin_id": -10, + "origin_slot": 3, + "target_id": 164, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 157, + "origin_id": -10, + "origin_slot": 4, + "target_id": 11, + "target_slot": 6, + "type": "FACE_LANDMARKER" + }, + { + "id": 158, + "origin_id": -10, + "origin_slot": 5, + "target_id": 11, + "target_slot": 2, + "type": "COMBO" + }, + { + "id": 159, + "origin_id": -10, + "origin_slot": 6, + "target_id": 11, + "target_slot": 3, + "type": "INT" + }, + { + "id": 160, + "origin_id": -10, + "origin_slot": 7, + "target_id": 20, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 161, + "origin_id": -10, + "origin_slot": 8, + "target_id": 20, + "target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 162, + "origin_id": -10, + "origin_slot": 9, + "target_id": 20, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 163, + "origin_id": -10, + "origin_slot": 10, + "target_id": 20, + "target_slot": 5, + "type": "BOOLEAN" + }, + { + "id": 164, + "origin_id": -10, + "origin_slot": 11, + "target_id": 20, + "target_slot": 6, + "type": "BOOLEAN" + }, + { + "id": 165, + "origin_id": -10, + "origin_slot": 12, + "target_id": 2, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 166, + "origin_id": 11, + "origin_slot": 1, + "target_id": -20, + "target_slot": 1, + "type": "BOUNDING_BOX" + }, + { + "id": 167, + "origin_id": 11, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "FACE_LANDMARKS" + } + ], + "extra": {}, + "category": "Conditioning & Preprocessors/Face Detection" + } + ] + }, + "extra": {} +} \ 
No newline at end of file diff --git a/blueprints/Video Inpaint (VOID).json b/blueprints/Video Inpaint (VOID).json new file mode 100644 index 000000000..a7cc806b5 --- /dev/null +++ b/blueprints/Video Inpaint (VOID).json @@ -0,0 +1,4340 @@ +{ + "revision": 0, + "last_node_id": 167, + "last_link_id": 0, + "nodes": [ + { + "id": 167, + "type": "c3157b75-484a-459e-b8de-57823bef5130", + "pos": [ + -430, + 690 + ], + "size": [ + 590, + 723.9375 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "label": "Source video", + "localized_name": "source_video", + "name": "source_video", + "type": "VIDEO", + "link": null + }, + { + "label": "Positive prompt (inpaint fill)", + "localized_name": "positive_prompt", + "name": "positive_prompt", + "type": "STRING", + "widget": { + "name": "positive_prompt" + }, + "link": null + }, + { + "label": "Negative prompt", + "localized_name": "negative_prompt", + "name": "negative_prompt", + "type": "STRING", + "widget": { + "name": "negative_prompt" + }, + "link": null + }, + { + "label": "SAM3 object mask prompt", + "localized_name": "sam3_text_prompt", + "name": "sam3_text_prompt", + "type": "STRING", + "widget": { + "name": "sam3_text_prompt" + }, + "link": null + }, + { + "label": "Start frame index", + "localized_name": "start_frame_index", + "name": "start_frame_index", + "type": "INT", + "widget": { + "name": "start_frame_index" + }, + "link": null + }, + { + "label": "Clip duration (seconds)", + "localized_name": "duration_seconds", + "name": "duration_seconds", + "type": "INT", + "widget": { + "name": "duration_seconds" + }, + "link": null + }, + { + "label": "Width (pass 2)", + "localized_name": "latent_width", + "name": "latent_width", + "type": "INT", + "widget": { + "name": "latent_width" + }, + "link": null + }, + { + "label": "Height (pass 2)", + "localized_name": "latent_height", + "name": "latent_height", + "type": "INT", + "widget": { + "name": "latent_height" + }, + "link": null + }, + { + "label": "Skip 
pass 2 (reuse pass 1)", + "localized_name": "skip_pass_2", + "name": "skip_pass_2", + "type": "BOOLEAN", + "widget": { + "name": "skip_pass_2" + }, + "link": null + }, + { + "label": "Noise seed", + "localized_name": "noise_seed", + "name": "noise_seed", + "type": "INT", + "widget": { + "name": "noise_seed" + }, + "link": null + }, + { + "label": "SAM3 checkpoint", + "localized_name": "sam3_checkpoint", + "name": "sam3_checkpoint", + "type": "COMBO", + "widget": { + "name": "sam3_checkpoint" + }, + "link": null + }, + { + "label": "VOID UNet — pass 1", + "localized_name": "void_unet_pass1", + "name": "void_unet_pass1", + "type": "COMBO", + "widget": { + "name": "void_unet_pass1" + }, + "link": null + }, + { + "label": "VOID UNet — pass 2", + "localized_name": "void_unet_pass2", + "name": "void_unet_pass2", + "type": "COMBO", + "widget": { + "name": "void_unet_pass2" + }, + "link": null + }, + { + "label": "Optical flow model", + "localized_name": "optical_flow_model", + "name": "optical_flow_model", + "type": "COMBO", + "widget": { + "name": "optical_flow_model" + }, + "link": null + }, + { + "label": "CLIP / T5 weights", + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": null + }, + { + "label": "VAE weights", + "localized_name": "vae_name", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": null + } + ], + "outputs": [ + { + "label": "Pass 1 (intermediate)", + "localized_name": "pass_1_video", + "name": "pass_1_video", + "type": "VIDEO", + "links": [] + }, + { + "label": "Pass 2 (final)", + "localized_name": "final_pass_2_video", + "name": "final_pass_2_video", + "type": "VIDEO", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "6", + "text" + ], + [ + "7", + "text" + ], + [ + "149", + "text" + ], + [ + "168", + "value" + ], + [ + "163", + "value" + ], + [ + "147", + "value" + ], + [ + "148", + "value" + ], + [ + "153", + "value" + 
], + [ + "141", + "noise_seed" + ], + [ + "149", + "ckpt_name" + ], + [ + "144", + "unet_name" + ], + [ + "143", + "unet_name" + ], + [ + "142", + "model_name" + ], + [ + "2", + "clip_name" + ], + [ + "3", + "vae_name" + ] + ] + }, + "widgets_values": [], + "title": "Video Inpaint (VOID)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "c3157b75-484a-459e-b8de-57823bef5130", + "version": 1, + "state": { + "lastGroupId": 13, + "lastNodeId": 171, + "lastLinkId": 406, + "lastRerouteId": 0 + }, + "revision": 5, + "config": {}, + "name": "Video Inpaint (VOID)", + "inputNode": { + "id": -10, + "bounding": [ + -1530, + 800, + 203.1796875, + 368 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 2030, + 710, + 166.130859375, + 88 + ] + }, + "inputs": [ + { + "id": "1865ea29-14b1-4471-b5e0-d35bba595b9c", + "name": "source_video", + "type": "VIDEO", + "linkIds": [ + 373 + ], + "localized_name": "source_video", + "label": "Source video", + "pos": [ + -1350.8203125, + 824 + ] + }, + { + "id": "f1b2b2c4-bc2e-4e72-b16c-7e560e58d2d6", + "name": "positive_prompt", + "type": "STRING", + "linkIds": [ + 377 + ], + "localized_name": "positive_prompt", + "label": "Positive prompt (inpaint fill)", + "pos": [ + -1350.8203125, + 844 + ] + }, + { + "id": "931ac4dd-3cb6-4555-a1f0-619be81d64f6", + "name": "negative_prompt", + "type": "STRING", + "linkIds": [ + 387 + ], + "localized_name": "negative_prompt", + "label": "Negative prompt", + "pos": [ + -1350.8203125, + 864 + ] + }, + { + "id": "7a0963c3-bf2f-464d-80c2-6a6c90569883", + "name": "sam3_text_prompt", + "type": "STRING", + "linkIds": [ + 388 + ], + "localized_name": "sam3_text_prompt", + "label": "SAM3 object mask prompt", + "pos": [ + -1350.8203125, + 884 + ] + }, + { + "id": "f53f340f-2031-401d-b613-157622ef336f", + "name": "start_frame_index", + "type": "INT", + "linkIds": [ + 389 + ], + "localized_name": "start_frame_index", + "label": "Start frame index", + "pos": [ + 
-1350.8203125, + 904 + ] + }, + { + "id": "d5b8704b-7c8c-4cf0-87cd-26b293f65f83", + "name": "duration_seconds", + "type": "INT", + "linkIds": [ + 390 + ], + "localized_name": "duration_seconds", + "label": "Clip duration (seconds)", + "pos": [ + -1350.8203125, + 924 + ] + }, + { + "id": "7140209f-5058-4933-ae06-438256f77f23", + "name": "latent_width", + "type": "INT", + "linkIds": [ + 391 + ], + "localized_name": "latent_width", + "label": "Width (pass 2)", + "pos": [ + -1350.8203125, + 944 + ] + }, + { + "id": "084a140a-6fa9-4676-9483-ad30e0b14947", + "name": "latent_height", + "type": "INT", + "linkIds": [ + 392 + ], + "localized_name": "latent_height", + "label": "Height (pass 2)", + "pos": [ + -1350.8203125, + 964 + ] + }, + { + "id": "a8109321-e101-4ed8-b6f3-8ad1c815f35c", + "name": "skip_pass_2", + "type": "BOOLEAN", + "linkIds": [ + 393 + ], + "localized_name": "skip_pass_2", + "label": "Skip pass 2 (reuse pass 1)", + "pos": [ + -1350.8203125, + 984 + ] + }, + { + "id": "6964ab42-0662-47f2-9c2a-96782fdcb883", + "name": "noise_seed", + "type": "INT", + "linkIds": [ + 400 + ], + "localized_name": "noise_seed", + "label": "Noise seed", + "pos": [ + -1350.8203125, + 1004 + ] + }, + { + "id": "dccde360-461d-417e-b3f5-e1a4d6cece39", + "name": "sam3_checkpoint", + "type": "COMBO", + "linkIds": [ + 401 + ], + "localized_name": "sam3_checkpoint", + "label": "SAM3 checkpoint", + "pos": [ + -1350.8203125, + 1024 + ] + }, + { + "id": "5ce0d036-be08-4539-9ec6-e923fcdb8825", + "name": "void_unet_pass1", + "type": "COMBO", + "linkIds": [ + 402 + ], + "localized_name": "void_unet_pass1", + "label": "VOID UNet — pass 1", + "pos": [ + -1350.8203125, + 1044 + ] + }, + { + "id": "c1de695a-a08a-40bc-b9e4-d156fef73cd0", + "name": "void_unet_pass2", + "type": "COMBO", + "linkIds": [ + 403 + ], + "localized_name": "void_unet_pass2", + "label": "VOID UNet — pass 2", + "pos": [ + -1350.8203125, + 1064 + ] + }, + { + "id": "99da50bc-db57-4a21-9831-0f77b3c4fe99", + "name": 
"optical_flow_model", + "type": "COMBO", + "linkIds": [ + 404 + ], + "localized_name": "optical_flow_model", + "label": "Optical flow model", + "pos": [ + -1350.8203125, + 1084 + ] + }, + { + "id": "c756ce20-cfa6-4fe0-9eb0-543d56781cb7", + "name": "clip_name", + "type": "COMBO", + "linkIds": [ + 405 + ], + "localized_name": "clip_name", + "label": "CLIP / T5 weights", + "pos": [ + -1350.8203125, + 1104 + ] + }, + { + "id": "d8eb12ad-a805-42d9-86b4-6f2c2cc5a231", + "name": "vae_name", + "type": "COMBO", + "linkIds": [ + 406 + ], + "localized_name": "vae_name", + "label": "VAE weights", + "pos": [ + -1350.8203125, + 1124 + ] + } + ], + "outputs": [ + { + "id": "a21e83df-8c95-43a3-bd73-feeea67e90cd", + "name": "pass_1_video", + "type": "VIDEO", + "linkIds": [ + 77 + ], + "localized_name": "pass_1_video", + "label": "Pass 1 (intermediate)", + "pos": [ + 2054, + 734 + ] + }, + { + "id": "02c265f3-012f-499f-a4e8-a6d6aaf72885", + "name": "final_pass_2_video", + "type": "VIDEO", + "linkIds": [ + 362 + ], + "localized_name": "final_pass_2_video", + "label": "Pass 2 (final)", + "pos": [ + 2054, + 754 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 2, + "type": "CLIPLoader", + "pos": [ + -710, + 30 + ], + "size": [ + 320, + 150 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 405 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "slot_index": 0, + "links": [ + 2, + 3 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "models": [ + { + "name": 
"t5xxl_fp16.safetensors", + "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors", + "directory": "text_encoders" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "t5xxl_fp16.safetensors", + "cogvideox", + "default" + ] + }, + { + "id": 3, + "type": "VAELoader", + "pos": [ + -710, + 220 + ], + "size": [ + 320, + 90 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "vae_name", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": 406 + } + ], + "outputs": [ + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "slot_index": 0, + "links": [ + 4, + 45, + 70 + ] + } + ], + "properties": { + "Node name for S&R": "VAELoader", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "models": [ + { + "name": "cogvideox_vae.safetensors", + "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/vae/cogvideox_vae.safetensors", + "directory": "vae" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "cogvideox_vae.safetensors" + ] + }, + { + "id": 7, + "type": "CLIPTextEncode", + "pos": [ + -260, + 200 + ], + "size": [ + 590, + 180 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 3 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 387 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 9 + ] + } + ], + "title": "Negative Prompt", + "properties": { + "Node name for S&R": "CLIPTextEncode", + 
"cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 136, + "type": "CFGGuider", + "pos": [ + 410, + 1640 + ], + "size": [ + 300, + 130 + ], + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 322 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 309 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 310 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "GUIDER", + "name": "GUIDER", + "type": "GUIDER", + "links": [ + 311 + ] + } + ], + "title": "CFGGuider (Pass 2 cfg=6)", + "properties": { + "Node name for S&R": "CFGGuider", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 6 + ] + }, + { + "id": 138, + "type": "BasicScheduler", + "pos": [ + 410, + 160 + ], + "size": [ + 270, + 150 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 324 + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + 
"outputs": [ + { + "localized_name": "SIGMAS", + "name": "SIGMAS", + "type": "SIGMAS", + "links": [ + 315 + ] + } + ], + "properties": { + "Node name for S&R": "BasicScheduler", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "simple", + 30, + 1 + ] + }, + { + "id": 140, + "type": "CFGGuider", + "pos": [ + 410, + -30 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 325 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 317 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 318 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "GUIDER", + "name": "GUIDER", + "type": "GUIDER", + "links": [ + 319 + ] + } + ], + "properties": { + "Node name for S&R": "CFGGuider", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 6 + ] + }, + { + "id": 141, + "type": "RandomNoise", + "pos": [ + 410, + -180 + ], + "size": [ + 270, + 90 + ], + "flags": {}, + "order": 20, + "mode": 0, + "inputs": [ + { + "localized_name": "noise_seed", + "name": "noise_seed", + "type": "INT", + "widget": { + "name": "noise_seed" + }, + "link": 400 + } + ], + "outputs": [ + { + "localized_name": "NOISE", + "name": "NOISE", + "type": "NOISE", + "links": [ + 320 + ] + } + ], + "properties": { + "Node name for S&R": "RandomNoise", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + 
"tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 43, + "fixed" + ] + }, + { + "id": 31, + "type": "VOIDWarpedNoise", + "pos": [ + 410, + 1090 + ], + "size": [ + 300, + 200 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "optical_flow", + "name": "optical_flow", + "type": "OPTICAL_FLOW", + "link": 321 + }, + { + "localized_name": "video", + "name": "video", + "type": "IMAGE", + "link": 72 + }, + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": 333 + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": 335 + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 67 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "warped_noise", + "name": "warped_noise", + "type": "LATENT", + "slot_index": 0, + "links": [ + 53 + ] + } + ], + "title": "Warped Noise (from Pass 1 output)", + "properties": { + "Node name for S&R": "VOIDWarpedNoise", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 672, + 384, + 45, + 1 + ] + }, + { + "id": 35, + "type": "SamplerCustomAdvanced", + "pos": [ + 870, + 1110 + ], + "size": [ + 250, + 170 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "noise", + "name": "noise", + "type": "NOISE", + "link": 54 + }, + { + "localized_name": "guider", + "name": "guider", + "type": "GUIDER", + "link": 311 + }, + { + "localized_name": "sampler", + 
"name": "sampler", + "type": "SAMPLER", + "link": 305 + }, + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "SIGMAS", + "link": 313 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 48 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "LATENT", + "slot_index": 0, + "links": [ + 49 + ] + }, + { + "localized_name": "denoised_output", + "name": "denoised_output", + "type": "LATENT", + "slot_index": 1, + "links": [] + } + ], + "title": "Pass 2 Sample", + "properties": { + "Node name for S&R": "SamplerCustomAdvanced", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 132, + "type": "MaskPreview", + "pos": [ + 390, + 560 + ], + "size": [ + 790, + 430 + ], + "flags": {}, + "order": 15, + "mode": 4, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": "MASK", + "link": 340 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "MaskPreview", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 142, + "type": "OpticalFlowLoader", + "pos": [ + -710, + 410 + ], + "size": [ + 320, + 90 + ], + "flags": {}, + "order": 21, + "mode": 0, + "inputs": [ + { + "localized_name": "model_name", + "name": "model_name", + "type": "COMBO", + "widget": { + "name": "model_name" + }, + "link": 404 + } + ], + "outputs": [ + { + "localized_name": "OPTICAL_FLOW", + "name": "OPTICAL_FLOW", + "type": "OPTICAL_FLOW", + "links": [ + 321 + ] + } + ], + "properties": { + "Node name for S&R": "OpticalFlowLoader", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "models": [ + { + "name": 
"raft_large_C_T_SKHT_V2-ff5fadd5.safetensors", + "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/optical_flow/raft_large_C_T_SKHT_V2-ff5fadd5.safetensors", + "directory": "optical_flow" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "raft_large_C_T_SKHT_V2-ff5fadd5.safetensors" + ] + }, + { + "id": 10, + "type": "VOIDInpaintConditioning", + "pos": [ + -110, + 430 + ], + "size": [ + 300, + 280 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 8 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 9 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 4 + }, + { + "localized_name": "video", + "name": "video", + "type": "IMAGE", + "link": 326 + }, + { + "localized_name": "quadmask", + "name": "quadmask", + "type": "MASK", + "link": 339 + }, + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": 332 + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": 334 + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 63 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 309, + 317 + ] + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "slot_index": 1, + "links": [ + 310, + 318 + ] + }, + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + 
"slot_index": 2, + "links": [ + 48, + 82 + ] + } + ], + "properties": { + "Node name for S&R": "VOIDInpaintConditioning", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 672, + 384, + 45, + 1 + ] + }, + { + "id": 32, + "type": "VOIDWarpedNoiseSource", + "pos": [ + 410, + 1350 + ], + "size": [ + 300, + 50 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "warped_noise", + "name": "warped_noise", + "type": "LATENT", + "link": 53 + } + ], + "outputs": [ + { + "localized_name": "NOISE", + "name": "NOISE", + "type": "NOISE", + "slot_index": 0, + "links": [ + 54 + ] + } + ], + "title": "Warped Noise → NOISE", + "properties": { + "Node name for S&R": "VOIDWarpedNoiseSource", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 137, + "type": "BasicScheduler", + "pos": [ + 410, + 1470 + ], + "size": [ + 300, + 150 + ], + "flags": {}, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 323 + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "steps", + "name": "steps", + "type": "INT", + "widget": { + "name": "steps" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "SIGMAS", + "name": "SIGMAS", + "type": "SIGMAS", + "links": [ + 313 + ] + } + ], + "properties": { + "Node name for S&R": "BasicScheduler", + "cnr_id": "comfy-core", + "ver": 
"0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "simple", + 30, + 1 + ] + }, + { + "id": 134, + "type": "VOIDSampler", + "pos": [ + 410, + 1800 + ], + "size": [ + 300, + 50 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "localized_name": "SAMPLER", + "name": "SAMPLER", + "type": "SAMPLER", + "links": [ + 305 + ] + } + ], + "properties": { + "Node name for S&R": "VOIDSampler", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 143, + "type": "UNETLoader", + "pos": [ + -710, + 550 + ], + "size": [ + 320, + 120 + ], + "flags": {}, + "order": 22, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 403 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 322, + 323 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "models": [ + { + "name": "void_pass2.safetensors", + "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/diffusion_models/void_pass2.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "void_pass2.safetensors", + "default" + ] + }, + { + "id": 144, + "type": "UNETLoader", + "pos": [ + -720, + -150 + ], + "size": [ 
+ 320, + 120 + ], + "flags": {}, + "order": 23, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 402 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 324, + 325 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "models": [ + { + "name": "void_pass1.safetensors", + "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/diffusion_models/void_pass1.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "void_pass1.safetensors", + "default" + ] + }, + { + "id": 46, + "type": "CreateVideo", + "pos": [ + 1230, + -20 + ], + "size": [ + 240, + 110 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 73 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 355 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { + "name": "fps" + }, + "link": 368 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 77 + ] + } + ], + "properties": { + "Node name for S&R": "CreateVideo", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 30 + ] + }, + { + "id": 133, + "type": "VOIDSampler", + "pos": [ + 
410, + 370 + ], + "size": [ + 280, + 50 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "localized_name": "SAMPLER", + "name": "SAMPLER", + "type": "SAMPLER", + "links": [ + 304 + ] + } + ], + "properties": { + "Node name for S&R": "VOIDSampler", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 49, + "type": "SamplerCustomAdvanced", + "pos": [ + 880, + -180 + ], + "size": [ + 250, + 270 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "noise", + "name": "noise", + "type": "NOISE", + "link": 320 + }, + { + "localized_name": "guider", + "name": "guider", + "type": "GUIDER", + "link": 319 + }, + { + "localized_name": "sampler", + "name": "sampler", + "type": "SAMPLER", + "link": 304 + }, + { + "localized_name": "sigmas", + "name": "sigmas", + "type": "SIGMAS", + "link": 315 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 82 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "LATENT", + "links": [ + 83 + ] + }, + { + "localized_name": "denoised_output", + "name": "denoised_output", + "type": "LATENT", + "links": null + } + ], + "properties": { + "Node name for S&R": "SamplerCustomAdvanced", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 45, + "type": "VAEDecode", + "pos": [ + 1230, + -180 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 83 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 70 + 
} + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 72, + 73, + 342 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecode", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 6, + "type": "CLIPTextEncode", + "pos": [ + -260, + -180 + ], + "size": [ + 580, + 310 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 2 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 377 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 8 + ] + } + ], + "title": "Positive Prompt", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 145, + "type": "ImageFromBatch", + "pos": [ + -410, + 850 + ], + "size": [ + 230, + 120 + ], + "flags": {}, + "order": 24, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 366 + }, + { + "localized_name": "batch_index", + "name": "batch_index", + "type": "INT", + "widget": { + "name": "batch_index" + }, + "link": 384 + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 361 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 326, + 327, + 336 + ] + } + ], + "properties": { 
+ "Node name for S&R": "ImageFromBatch", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 197 + ] + }, + { + "id": 36, + "type": "VAEDecode", + "pos": [ + 1220, + 1110 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 49 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 45 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 341 + ] + } + ], + "title": "Pass 2 VAE Decode", + "properties": { + "Node name for S&R": "VAEDecode", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 149, + "type": "c3e0d783-9aa3-4e75-a94d-19937968ef86", + "pos": [ + -20, + 840 + ], + "size": [ + 290, + 370 + ], + "flags": {}, + "order": 27, + "mode": 0, + "inputs": [ + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 336 + }, + { + "label": "object", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 388 + }, + { + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": null + }, + { + "name": "positive_coords", + "shape": 7, + "type": "STRING", + "link": null + }, + { + "name": "negative_coords", + "shape": 7, + "type": "STRING", + "link": null + }, + { + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": null + }, + { + "name": "refine_iterations", + "type": "INT", + "widget": { + "name": "refine_iterations" + }, + "link": null + }, + { + "name": 
"individual_masks", + "type": "BOOLEAN", + "widget": { + "name": "individual_masks" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 401 + } + ], + "outputs": [ + { + "localized_name": "masks", + "name": "masks", + "type": "MASK", + "links": [ + 339, + 340 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "78", + "text" + ], + [ + "75", + "threshold" + ], + [ + "75", + "refine_iterations" + ], + [ + "75", + "individual_masks" + ], + [ + "77", + "ckpt_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": { + "text": true + }, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [] + }, + { + "id": 43, + "type": "GetImageSize", + "pos": [ + -410, + 1140 + ], + "size": [ + 230, + 160 + ], + "flags": { + "collapsed": false + }, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 327 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": null + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": null + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": [ + 63, + 67 + ] + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.20.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 147, + "type": "PrimitiveInt", + "pos": [ + -570, + 1660 + ], + "size": [ + 270, + 90 + ], + 
"flags": {}, + "order": 25, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 391 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 332, + 333 + ] + } + ], + "title": "Int (Width)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 672, + "fixed" + ] + }, + { + "id": 148, + "type": "PrimitiveInt", + "pos": [ + -570, + 1790 + ], + "size": [ + 270, + 90 + ], + "flags": {}, + "order": 26, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 392 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 334, + 335 + ] + } + ], + "title": "Int (Height)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 384, + "fixed" + ] + }, + { + "id": 150, + "type": "ComfySwitchNode", + "pos": [ + 1510, + 1080 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 28, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 342 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 341 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 346 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 363 + ] + } 
+ ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 153, + "type": "PrimitiveBoolean", + "pos": [ + -580, + 1440 + ], + "size": [ + 270, + 80 + ], + "flags": {}, + "order": 29, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 393 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 346 + ] + } + ], + "title": "Boolean (Skip Pass 2?)", + "properties": { + "Node name for S&R": "PrimitiveBoolean", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 158, + "type": "TrimAudioDuration", + "pos": [ + -10, + 1580 + ], + "size": [ + 270, + 120 + ], + "flags": {}, + "order": 30, + "mode": 0, + "inputs": [ + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "link": 367 + }, + { + "localized_name": "start_index", + "name": "start_index", + "type": "FLOAT", + "widget": { + "name": "start_index" + }, + "link": 386 + }, + { + "localized_name": "duration", + "name": "duration", + "type": "FLOAT", + "widget": { + "name": "duration" + }, + "link": 385 + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [ + 355, + 364 + ] + } + ], + "properties": { + "Node name for S&R": "TrimAudioDuration", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + 
"secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 60 + ] + }, + { + "id": 163, + "type": "PrimitiveInt", + "pos": [ + -740, + 1170 + ], + "size": [ + 230, + 90 + ], + "flags": {}, + "order": 31, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 390 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 360 + ] + } + ], + "title": "Int (Video duration)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 5, + "fixed" + ] + }, + { + "id": 164, + "type": "ComfyMathExpression", + "pos": [ + -740, + 1300 + ], + "size": [ + 230, + 100 + ], + "flags": { + "collapsed": true + }, + "order": 32, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 360 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": 371 + }, + { + "label": "c", + "localized_name": "values.c", + "name": "values.c", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 385 + ] + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 361 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression" + }, + "widgets_values": [ + "a * b" + ] + }, 
+ { + "id": 165, + "type": "CreateVideo", + "pos": [ + 1510, + 1270 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 33, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 363 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 364 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { + "name": "fps" + }, + "link": 372 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 362 + ] + } + ], + "properties": { + "Node name for S&R": "CreateVideo" + }, + "widgets_values": [ + 24 + ] + }, + { + "id": 166, + "type": "GetVideoComponents", + "pos": [ + -740, + 840 + ], + "size": [ + 230, + 100 + ], + "flags": {}, + "order": 34, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 373 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 366 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 367 + ] + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": [ + 368, + 371, + 372, + 383 + ] + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents" + } + }, + { + "id": 168, + "type": "PrimitiveInt", + "pos": [ + -740, + 980 + ], + "size": [ + 230, + 90 + ], + "flags": {}, + "order": 35, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 389 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 382 + ] + } + ], + "title": "Int (Index)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send 
Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + "fixed" + ] + }, + { + "id": 169, + "type": "ComfyMathExpression", + "pos": [ + -740, + 1110 + ], + "size": [ + 230, + 100 + ], + "flags": { + "collapsed": true + }, + "order": 36, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 382 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": 383 + }, + { + "label": "c", + "localized_name": "values.c", + "name": "values.c", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 386 + ] + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 384 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": null + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression" + }, + "widgets_values": [ + "a * b" + ] + } + ], + "groups": [ + { + "id": 1, + "title": "Models", + "bounding": [ + -790, + -260, + 470, + 990 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "Input videos (place files in ComfyUI/input/)", + "bounding": [ + -790, + 760, + 660, + 560 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "Shared: Text & Mask Conditioning", + "bounding": [ + -290, + -260, + 640, + 990 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 4, + "title": "Pass 1: Sample (Random Noise → DDIM)", + "bounding": [ + 380, + -260, + 810, + 750 + ], + "color": "#8A8", + "flags": {} + }, + { + "id": 6, + "title": "Pass 2: Sample (Warped Noise → DDIM)", + "bounding": [ + 380, + 1020, + 810, + 880 + ], + "color": 
"#8A8", + "flags": {} + }, + { + "id": 8, + "title": "Create Mask", + "bounding": [ + -100, + 760, + 450, + 560 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 9, + "title": "Pass 1", + "bounding": [ + -730, + -220, + 360, + 210 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 10, + "title": "Pass 2", + "bounding": [ + -720, + 340, + 340, + 340 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 11, + "title": "Output Video Size", + "bounding": [ + -790, + 1580, + 660, + 320 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 12, + "title": "Skip Pass 2", + "bounding": [ + -790, + 1350, + 660, + 200 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 13, + "title": "Trim Audio", + "bounding": [ + -100, + 1350, + 450, + 550 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 3, + "origin_id": 2, + "origin_slot": 0, + "target_id": 7, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 322, + "origin_id": 143, + "origin_slot": 0, + "target_id": 136, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 309, + "origin_id": 10, + "origin_slot": 0, + "target_id": 136, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 310, + "origin_id": 10, + "origin_slot": 1, + "target_id": 136, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 324, + "origin_id": 144, + "origin_slot": 0, + "target_id": 138, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 325, + "origin_id": 144, + "origin_slot": 0, + "target_id": 140, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 317, + "origin_id": 10, + "origin_slot": 0, + "target_id": 140, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 318, + "origin_id": 10, + "origin_slot": 1, + "target_id": 140, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 321, + "origin_id": 142, + "origin_slot": 0, + "target_id": 31, + "target_slot": 0, + "type": "OPTICAL_FLOW" + }, + { + "id": 72, + "origin_id": 45, + "origin_slot": 0, + 
"target_id": 31, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 333, + "origin_id": 147, + "origin_slot": 0, + "target_id": 31, + "target_slot": 2, + "type": "INT" + }, + { + "id": 335, + "origin_id": 148, + "origin_slot": 0, + "target_id": 31, + "target_slot": 3, + "type": "INT" + }, + { + "id": 67, + "origin_id": 43, + "origin_slot": 2, + "target_id": 31, + "target_slot": 4, + "type": "INT" + }, + { + "id": 54, + "origin_id": 32, + "origin_slot": 0, + "target_id": 35, + "target_slot": 0, + "type": "NOISE" + }, + { + "id": 311, + "origin_id": 136, + "origin_slot": 0, + "target_id": 35, + "target_slot": 1, + "type": "GUIDER" + }, + { + "id": 305, + "origin_id": 134, + "origin_slot": 0, + "target_id": 35, + "target_slot": 2, + "type": "SAMPLER" + }, + { + "id": 313, + "origin_id": 137, + "origin_slot": 0, + "target_id": 35, + "target_slot": 3, + "type": "SIGMAS" + }, + { + "id": 48, + "origin_id": 10, + "origin_slot": 2, + "target_id": 35, + "target_slot": 4, + "type": "LATENT" + }, + { + "id": 340, + "origin_id": 149, + "origin_slot": 0, + "target_id": 132, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 8, + "origin_id": 6, + "origin_slot": 0, + "target_id": 10, + "target_slot": 0, + "type": "CONDITIONING" + }, + { + "id": 9, + "origin_id": 7, + "origin_slot": 0, + "target_id": 10, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 4, + "origin_id": 3, + "origin_slot": 0, + "target_id": 10, + "target_slot": 2, + "type": "VAE" + }, + { + "id": 326, + "origin_id": 145, + "origin_slot": 0, + "target_id": 10, + "target_slot": 3, + "type": "IMAGE" + }, + { + "id": 339, + "origin_id": 149, + "origin_slot": 0, + "target_id": 10, + "target_slot": 4, + "type": "MASK" + }, + { + "id": 332, + "origin_id": 147, + "origin_slot": 0, + "target_id": 10, + "target_slot": 5, + "type": "INT" + }, + { + "id": 334, + "origin_id": 148, + "origin_slot": 0, + "target_id": 10, + "target_slot": 6, + "type": "INT" + }, + { + "id": 63, + "origin_id": 43, + 
"origin_slot": 2, + "target_id": 10, + "target_slot": 7, + "type": "INT" + }, + { + "id": 53, + "origin_id": 31, + "origin_slot": 0, + "target_id": 32, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 323, + "origin_id": 143, + "origin_slot": 0, + "target_id": 137, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 73, + "origin_id": 45, + "origin_slot": 0, + "target_id": 46, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 355, + "origin_id": 158, + "origin_slot": 0, + "target_id": 46, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 368, + "origin_id": 166, + "origin_slot": 2, + "target_id": 46, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 320, + "origin_id": 141, + "origin_slot": 0, + "target_id": 49, + "target_slot": 0, + "type": "NOISE" + }, + { + "id": 319, + "origin_id": 140, + "origin_slot": 0, + "target_id": 49, + "target_slot": 1, + "type": "GUIDER" + }, + { + "id": 304, + "origin_id": 133, + "origin_slot": 0, + "target_id": 49, + "target_slot": 2, + "type": "SAMPLER" + }, + { + "id": 315, + "origin_id": 138, + "origin_slot": 0, + "target_id": 49, + "target_slot": 3, + "type": "SIGMAS" + }, + { + "id": 82, + "origin_id": 10, + "origin_slot": 2, + "target_id": 49, + "target_slot": 4, + "type": "LATENT" + }, + { + "id": 83, + "origin_id": 49, + "origin_slot": 0, + "target_id": 45, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 70, + "origin_id": 3, + "origin_slot": 0, + "target_id": 45, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 2, + "origin_id": 2, + "origin_slot": 0, + "target_id": 6, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 366, + "origin_id": 166, + "origin_slot": 0, + "target_id": 145, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 361, + "origin_id": 164, + "origin_slot": 1, + "target_id": 145, + "target_slot": 2, + "type": "INT" + }, + { + "id": 49, + "origin_id": 35, + "origin_slot": 0, + "target_id": 36, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 45, + 
"origin_id": 3, + "origin_slot": 0, + "target_id": 36, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 336, + "origin_id": 145, + "origin_slot": 0, + "target_id": 149, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 327, + "origin_id": 145, + "origin_slot": 0, + "target_id": 43, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 342, + "origin_id": 45, + "origin_slot": 0, + "target_id": 150, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 341, + "origin_id": 36, + "origin_slot": 0, + "target_id": 150, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 346, + "origin_id": 153, + "origin_slot": 0, + "target_id": 150, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 367, + "origin_id": 166, + "origin_slot": 1, + "target_id": 158, + "target_slot": 0, + "type": "AUDIO" + }, + { + "id": 360, + "origin_id": 163, + "origin_slot": 0, + "target_id": 164, + "target_slot": 0, + "type": "INT" + }, + { + "id": 371, + "origin_id": 166, + "origin_slot": 2, + "target_id": 164, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 363, + "origin_id": 150, + "origin_slot": 0, + "target_id": 165, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 364, + "origin_id": 158, + "origin_slot": 0, + "target_id": 165, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 372, + "origin_id": 166, + "origin_slot": 2, + "target_id": 165, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 373, + "origin_id": -10, + "origin_slot": 0, + "target_id": 166, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 77, + "origin_id": 46, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 362, + "origin_id": 165, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "VIDEO" + }, + { + "id": 377, + "origin_id": -10, + "origin_slot": 1, + "target_id": 6, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 382, + "origin_id": 168, + "origin_slot": 0, + "target_id": 169, + "target_slot": 0, + "type": 
"INT" + }, + { + "id": 383, + "origin_id": 166, + "origin_slot": 2, + "target_id": 169, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 384, + "origin_id": 169, + "origin_slot": 1, + "target_id": 145, + "target_slot": 1, + "type": "INT" + }, + { + "id": 385, + "origin_id": 164, + "origin_slot": 0, + "target_id": 158, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 386, + "origin_id": 169, + "origin_slot": 0, + "target_id": 158, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 387, + "origin_id": -10, + "origin_slot": 2, + "target_id": 7, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 388, + "origin_id": -10, + "origin_slot": 3, + "target_id": 149, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 389, + "origin_id": -10, + "origin_slot": 4, + "target_id": 168, + "target_slot": 0, + "type": "INT" + }, + { + "id": 390, + "origin_id": -10, + "origin_slot": 5, + "target_id": 163, + "target_slot": 0, + "type": "INT" + }, + { + "id": 391, + "origin_id": -10, + "origin_slot": 6, + "target_id": 147, + "target_slot": 0, + "type": "INT" + }, + { + "id": 392, + "origin_id": -10, + "origin_slot": 7, + "target_id": 148, + "target_slot": 0, + "type": "INT" + }, + { + "id": 393, + "origin_id": -10, + "origin_slot": 8, + "target_id": 153, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 400, + "origin_id": -10, + "origin_slot": 9, + "target_id": 141, + "target_slot": 0, + "type": "INT" + }, + { + "id": 401, + "origin_id": -10, + "origin_slot": 10, + "target_id": 149, + "target_slot": 8, + "type": "COMBO" + }, + { + "id": 402, + "origin_id": -10, + "origin_slot": 11, + "target_id": 144, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 403, + "origin_id": -10, + "origin_slot": 12, + "target_id": 143, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 404, + "origin_id": -10, + "origin_slot": 13, + "target_id": 142, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 405, + "origin_id": -10, + "origin_slot": 14, + "target_id": 
2, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 406, + "origin_id": -10, + "origin_slot": 15, + "target_id": 3, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": {}, + "category": "Video generation and editing/Inpaint video", + "description": "Removes objects from video by inpainting masked regions using VOID (CogVideoX), with SAM3 text-guided segmentation and optional two-pass optical-flow refinement." + }, + { + "id": "c3e0d783-9aa3-4e75-a94d-19937968ef86", + "version": 1, + "state": { + "lastGroupId": 13, + "lastNodeId": 171, + "lastLinkId": 406, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image Segmentation (SAM3)", + "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes.", + "inputNode": { + "id": -10, + "bounding": [ + -2260, + -3450, + 144.369140625, + 228 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1130, + -3305, + 128, + 88 + ] + }, + "inputs": [ + { + "id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6", + "name": "image", + "type": "IMAGE", + "linkIds": [ + 264 + ], + "localized_name": "image", + "label": "image", + "pos": [ + -2139.630859375, + -3426 + ] + }, + { + "id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745", + "name": "text", + "type": "STRING", + "linkIds": [ + 265 + ], + "label": "object", + "pos": [ + -2139.630859375, + -3406 + ] + }, + { + "id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 266 + ], + "pos": [ + -2139.630859375, + -3386 + ] + }, + { + "id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899", + "name": "positive_coords", + "type": "STRING", + "linkIds": [ + 267 + ], + "pos": [ + -2139.630859375, + -3366 + ] + }, + { + "id": "c65f8b87-9bd7-48be-9fc2-823431e95019", + "name": "negative_coords", + "type": "STRING", + "linkIds": [ + 268 + ], + "pos": [ + -2139.630859375, + -3346 + ] + }, + { + "id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb", + "name": "threshold", + "type": "FLOAT", + "linkIds": [ + 269 + 
], + "pos": [ + -2139.630859375, + -3326 + ] + }, + { + "id": "b1439668-b050-490b-a5dc-fc4052c55666", + "name": "refine_iterations", + "type": "INT", + "linkIds": [ + 270 + ], + "pos": [ + -2139.630859375, + -3306 + ] + }, + { + "id": "86e239e5-c098-4302-b54d-d42a38bc0f89", + "name": "individual_masks", + "type": "BOOLEAN", + "linkIds": [ + 271 + ], + "pos": [ + -2139.630859375, + -3286 + ] + }, + { + "id": "f9e0b9d4-b2f1-4907-a4a5-305656576706", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 272 + ], + "pos": [ + -2139.630859375, + -3266 + ] + } + ], + "outputs": [ + { + "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913", + "name": "masks", + "type": "MASK", + "linkIds": [ + 231 + ], + "localized_name": "masks", + "pos": [ + -1106, + -3281 + ] + }, + { + "id": "8f622e40-8528-4078-b7d3-147e9f872194", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 232 + ], + "localized_name": "bboxes", + "pos": [ + -1106, + -3261 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 75, + "type": "SAM3_Detect", + "pos": [ + -1470, + -3460 + ], + "size": [ + 270, + 260 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "label": "model", + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 237 + }, + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 264 + }, + { + "label": "conditioning", + "localized_name": "conditioning", + "name": "conditioning", + "shape": 7, + "type": "CONDITIONING", + "link": 200 + }, + { + "label": "bboxes", + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 266 + }, + { + "label": "positive_coords", + "localized_name": "positive_coords", + "name": "positive_coords", + "shape": 7, + "type": "STRING", + "link": 267 + }, + { + "label": "negative_coords", + "localized_name": "negative_coords", + "name": "negative_coords", + "shape": 7, + "type": "STRING", + "link": 268 + }, + { + "localized_name": 
"threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": 269 + }, + { + "localized_name": "refine_iterations", + "name": "refine_iterations", + "type": "INT", + "widget": { + "name": "refine_iterations" + }, + "link": 270 + }, + { + "localized_name": "individual_masks", + "name": "individual_masks", + "type": "BOOLEAN", + "widget": { + "name": "individual_masks" + }, + "link": 271 + } + ], + "outputs": [ + { + "localized_name": "masks", + "name": "masks", + "type": "MASK", + "links": [ + 231 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 232 + ] + } + ], + "properties": { + "Node name for S&R": "SAM3_Detect", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 0.5, + 2, + false + ] + }, + { + "id": 77, + "type": "CheckpointLoaderSimple", + "pos": [ + -1970, + -3200 + ], + "size": [ + 330, + 140 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 272 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 237 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 240 + ] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": null + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "models": [ + { + "name": "sam3.1_multiplex_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors", 
+ "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "sam3.1_multiplex_fp16.safetensors" + ] + }, + { + "id": 78, + "type": "CLIPTextEncode", + "pos": [ + -2000, + -3000 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 240 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 265 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 200 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "" + ] + } + ], + "groups": [], + "links": [ + { + "id": 237, + "origin_id": 77, + "origin_slot": 0, + "target_id": 75, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 200, + "origin_id": 78, + "origin_slot": 0, + "target_id": 75, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 240, + "origin_id": 77, + "origin_slot": 1, + "target_id": 78, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 231, + "origin_id": 75, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 232, + "origin_id": 75, + "origin_slot": 1, + "target_id": -20, + "target_slot": 1, + "type": "BOUNDING_BOX" + }, + { + "id": 264, + "origin_id": -10, + 
"origin_slot": 0, + "target_id": 75, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 265, + "origin_id": -10, + "origin_slot": 1, + "target_id": 78, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 266, + "origin_id": -10, + "origin_slot": 2, + "target_id": 75, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 267, + "origin_id": -10, + "origin_slot": 3, + "target_id": 75, + "target_slot": 4, + "type": "STRING" + }, + { + "id": 268, + "origin_id": -10, + "origin_slot": 4, + "target_id": 75, + "target_slot": 5, + "type": "STRING" + }, + { + "id": 269, + "origin_id": -10, + "origin_slot": 5, + "target_id": 75, + "target_slot": 6, + "type": "FLOAT" + }, + { + "id": 270, + "origin_id": -10, + "origin_slot": 6, + "target_id": 75, + "target_slot": 7, + "type": "INT" + }, + { + "id": 271, + "origin_id": -10, + "origin_slot": 7, + "target_id": 75, + "target_slot": 8, + "type": "BOOLEAN" + }, + { + "id": 272, + "origin_id": -10, + "origin_slot": 8, + "target_id": 77, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": { + "ue_links": [] + } + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Video Inpaint(Wan2.1 VACE).json b/blueprints/Video Inpaint(Wan2.1 VACE).json deleted file mode 100644 index a658be5f8..000000000 --- a/blueprints/Video Inpaint(Wan2.1 VACE).json +++ /dev/null @@ -1,2388 +0,0 @@ -{ - "id": "2f429c60-2e03-4117-908b-31e1fab04bba", - "revision": 0, - "last_node_id": 229, - "last_link_id": 366, - "nodes": [ - { - "id": 229, - "type": "53a657f3-c9eb-40f2-9ebd-1ed77d25ed67", - "pos": [ - -230, - 160 - ], - "size": [ - 400, - 480 - ], - "flags": {}, - "order": 0, - "mode": 0, - "inputs": [ - { - "label": "video mask", - "localized_name": "mask", - "name": "mask", - "type": "MASK", - "link": null - }, - { - "localized_name": "video", - "name": "video", - "type": "VIDEO", - "link": null - }, - { - "name": "width", - "type": "INT", - "widget": { - "name": "width" - }, - "link": null - }, - { - "name": 
"height", - "type": "INT", - "widget": { - "name": "height" - }, - "link": null - }, - { - "label": "reference image", - "name": "reference_image_1", - "type": "IMAGE", - "link": null - }, - { - "name": "unet_name", - "type": "COMBO", - "widget": { - "name": "unet_name" - }, - "link": null - }, - { - "name": "lora_name", - "type": "COMBO", - "widget": { - "name": "lora_name" - }, - "link": null - }, - { - "name": "clip_name", - "type": "COMBO", - "widget": { - "name": "clip_name" - }, - "link": null - }, - { - "name": "vae_name", - "type": "COMBO", - "widget": { - "name": "vae_name" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "VIDEO", - "name": "VIDEO", - "type": "VIDEO", - "links": [] - } - ], - "properties": { - "proxyWidgets": [ - [ - "6", - "text" - ], - [ - "-1", - "width" - ], - [ - "-1", - "height" - ], - [ - "3", - "seed" - ], - [ - "3", - "control_after_generate" - ], - [ - "-1", - "unet_name" - ], - [ - "-1", - "lora_name" - ], - [ - "-1", - "clip_name" - ], - [ - "-1", - "vae_name" - ] - ], - "cnr_id": "comfy-core", - "ver": "0.13.0" - }, - "widgets_values": [ - null, - 720, - 720, - null, - null, - "wan2.1_vace_14B_fp16.safetensors", - "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", - "umt5_xxl_fp8_e4m3fn_scaled.safetensors", - "wan_2.1_vae.safetensors" - ] - } - ], - "links": [], - "groups": [], - "definitions": { - "subgraphs": [ - { - "id": "53a657f3-c9eb-40f2-9ebd-1ed77d25ed67", - "version": 1, - "state": { - "lastGroupId": 25, - "lastNodeId": 229, - "lastLinkId": 366, - "lastRerouteId": 0 - }, - "revision": 0, - "config": {}, - "name": "Video Inpaint (Wan 2.1 VACE)", - "inputNode": { - "id": -10, - "bounding": [ - -970, - 800, - 132.54296875, - 220 - ] - }, - "outputNode": { - "id": -20, - "bounding": [ - 1480, - 535, - 120, - 60 - ] - }, - "inputs": [ - { - "id": "9fdda38d-6aa7-48ad-b425-f493d8aa585c", - "name": "mask", - "type": "MASK", - "linkIds": [ - 351, - 335, - 345 - ], - "localized_name": "mask", - "label": "video 
mask", - "pos": [ - -857.45703125, - 820 - ] - }, - { - "id": "8b1788cc-46d2-4f40-8b33-70fd56b4cb24", - "name": "video", - "type": "VIDEO", - "linkIds": [ - 336 - ], - "localized_name": "video", - "pos": [ - -857.45703125, - 840 - ] - }, - { - "id": "09393f21-257e-4476-bb02-54899a8252b8", - "name": "width", - "type": "INT", - "linkIds": [ - 355 - ], - "pos": [ - -857.45703125, - 860 - ] - }, - { - "id": "07a030f7-7eac-4b3f-b8f3-f00ee87b191d", - "name": "height", - "type": "INT", - "linkIds": [ - 356 - ], - "pos": [ - -857.45703125, - 880 - ] - }, - { - "id": "255908d3-6cc9-48fc-b76b-ab9fb72695bc", - "name": "reference_image_1", - "type": "IMAGE", - "linkIds": [ - 361 - ], - "label": "reference image", - "pos": [ - -857.45703125, - 900 - ] - }, - { - "id": "18a5d241-523c-433d-ae05-25b6e69d1e29", - "name": "unet_name", - "type": "COMBO", - "linkIds": [ - 363 - ], - "pos": [ - -857.45703125, - 920 - ] - }, - { - "id": "d7576e1b-da5f-402f-81b2-d37f838b1f8f", - "name": "lora_name", - "type": "COMBO", - "linkIds": [ - 364 - ], - "pos": [ - -857.45703125, - 940 - ] - }, - { - "id": "41676a3e-c710-4723-821e-f651ad3784b1", - "name": "clip_name", - "type": "COMBO", - "linkIds": [ - 365 - ], - "pos": [ - -857.45703125, - 960 - ] - }, - { - "id": "41fc878c-9aa6-4c12-bef3-ceda6b094b7c", - "name": "vae_name", - "type": "COMBO", - "linkIds": [ - 366 - ], - "pos": [ - -857.45703125, - 980 - ] - } - ], - "outputs": [ - { - "id": "d4861f39-1011-49dc-80fd-ee318b614a8d", - "name": "VIDEO", - "type": "VIDEO", - "linkIds": [ - 129 - ], - "localized_name": "VIDEO", - "pos": [ - 1500, - 555 - ] - } - ], - "widgets": [], - "nodes": [ - { - "id": 58, - "type": "TrimVideoLatent", - "pos": [ - 760, - 390 - ], - "size": [ - 315, - 60 - ], - "flags": { - "collapsed": false - }, - "order": 13, - "mode": 0, - "inputs": [ - { - "localized_name": "samples", - "name": "samples", - "type": "LATENT", - "link": 116 - }, - { - "localized_name": "trim_amount", - "name": "trim_amount", - "type": "INT", - 
"widget": { - "name": "trim_amount" - }, - "link": 115 - } - ], - "outputs": [ - { - "localized_name": "LATENT", - "name": "LATENT", - "type": "LATENT", - "links": [ - 117 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "TrimVideoLatent", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": { - "trim_amount": true - } - }, - "widgets_values": [ - 0 - ] - }, - { - "id": 8, - "type": "VAEDecode", - "pos": [ - 770, - 500 - ], - "size": [ - 315, - 46 - ], - "flags": { - "collapsed": false - }, - "order": 11, - "mode": 0, - "inputs": [ - { - "localized_name": "samples", - "name": "samples", - "type": "LATENT", - "link": 117 - }, - { - "localized_name": "vae", - "name": "vae", - "type": "VAE", - "link": 76 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "slot_index": 0, - "links": [ - 139 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "VAEDecode", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [] - }, - { - "id": 48, - "type": "ModelSamplingSD3", - "pos": [ - 400, - 50 - ], - "size": [ - 315, - 58 - ], - "flags": {}, - "order": 9, - "mode": 0, - "inputs": [ - { - "localized_name": "model", - "name": "model", - "type": "MODEL", - "link": 279 - }, - { - "localized_name": "shift", - "name": "shift", - "type": "FLOAT", - "widget": { - "name": "shift" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "MODEL", - "name": "MODEL", - "type": "MODEL", - "slot_index": 0, - "links": [ - 280 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": 
"ModelSamplingSD3", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - 5 - ] - }, - { - "id": 219, - "type": "InvertMask", - "pos": [ - 400, - 990 - ], - "size": [ - 140, - 26 - ], - "flags": {}, - "order": 24, - "mode": 0, - "inputs": [ - { - "localized_name": "mask", - "name": "mask", - "type": "MASK", - "link": 351 - } - ], - "outputs": [ - { - "localized_name": "MASK", - "name": "MASK", - "type": "MASK", - "links": [ - 352 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "InvertMask" - }, - "widgets_values": [] - }, - { - "id": 216, - "type": "MaskToImage", - "pos": [ - 560, - 990 - ], - "size": [ - 193.2779296875, - 26 - ], - "flags": {}, - "order": 23, - "mode": 0, - "inputs": [ - { - "localized_name": "mask", - "name": "mask", - "type": "MASK", - "link": 352 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 334 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "MaskToImage" - }, - "widgets_values": [] - }, - { - "id": 213, - "type": "RebatchImages", - "pos": [ - 410, - 690 - ], - "size": [ - 230, - 60 - ], - "flags": {}, - "order": 21, - "mode": 0, - "inputs": [ - { - "localized_name": "images", - "name": "images", - "type": "IMAGE", - "link": 360 - }, - { - "localized_name": "batch_size", - "name": "batch_size", - "type": "INT", - "widget": { - "name": "batch_size" - }, - "link": 340 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "shape": 6, - "type": "IMAGE", - "links": [ - 333 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "RebatchImages" - }, - "widgets_values": [ - 1 - ] - }, - { - "id": 68, - "type": "CreateVideo", - "pos": [ - 1150, - 50 - ], 
- "size": [ - 270, - 78 - ], - "flags": { - "collapsed": false - }, - "order": 14, - "mode": 0, - "inputs": [ - { - "localized_name": "images", - "name": "images", - "type": "IMAGE", - "link": 139 - }, - { - "localized_name": "audio", - "name": "audio", - "shape": 7, - "type": "AUDIO", - "link": 362 - }, - { - "localized_name": "fps", - "name": "fps", - "type": "FLOAT", - "widget": { - "name": "fps" - }, - "link": 353 - } - ], - "outputs": [ - { - "localized_name": "VIDEO", - "name": "VIDEO", - "type": "VIDEO", - "links": [ - 129 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "CreateVideo", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - 16 - ] - }, - { - "id": 208, - "type": "ImageCompositeMasked", - "pos": [ - 410, - 790 - ], - "size": [ - 230, - 146 - ], - "flags": {}, - "order": 18, - "mode": 0, - "inputs": [ - { - "localized_name": "destination", - "name": "destination", - "type": "IMAGE", - "link": 333 - }, - { - "localized_name": "source", - "name": "source", - "type": "IMAGE", - "link": 334 - }, - { - "localized_name": "mask", - "name": "mask", - "shape": 7, - "type": "MASK", - "link": 335 - }, - { - "localized_name": "x", - "name": "x", - "type": "INT", - "widget": { - "name": "x" - }, - "link": null - }, - { - "localized_name": "y", - "name": "y", - "type": "INT", - "widget": { - "name": "y" - }, - "link": null - }, - { - "localized_name": "resize_source", - "name": "resize_source", - "type": "BOOLEAN", - "widget": { - "name": "resize_source" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 341, - 344 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "ImageCompositeMasked" - }, - "widgets_values": [ - 
0, - 0, - true - ] - }, - { - "id": 214, - "type": "PreviewImage", - "pos": [ - 760, - 690 - ], - "size": [ - 300, - 300 - ], - "flags": {}, - "order": 22, - "mode": 0, - "inputs": [ - { - "localized_name": "images", - "name": "images", - "type": "IMAGE", - "link": 341 - } - ], - "outputs": [], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "PreviewImage" - }, - "widgets_values": [] - }, - { - "id": 111, - "type": "MaskToImage", - "pos": [ - 20, - 1270 - ], - "size": [ - 240, - 26 - ], - "flags": {}, - "order": 15, - "mode": 0, - "inputs": [ - { - "localized_name": "mask", - "name": "mask", - "type": "MASK", - "link": 345 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 201 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "MaskToImage", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [] - }, - { - "id": 129, - "type": "RepeatImageBatch", - "pos": [ - 20, - 1160 - ], - "size": [ - 240, - 60 - ], - "flags": {}, - "order": 16, - "mode": 0, - "inputs": [ - { - "localized_name": "image", - "name": "image", - "type": "IMAGE", - "link": 201 - }, - { - "localized_name": "amount", - "name": "amount", - "type": "INT", - "widget": { - "name": "amount" - }, - "link": 346 - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 202 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "RepeatImageBatch", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": { - "amount": true - } - }, - "widgets_values": [ - 17 - ] - }, - { - 
"id": 130, - "type": "ImageToMask", - "pos": [ - 20, - 1050 - ], - "size": [ - 240, - 60 - ], - "flags": {}, - "order": 17, - "mode": 0, - "inputs": [ - { - "localized_name": "image", - "name": "image", - "type": "IMAGE", - "link": 202 - }, - { - "localized_name": "channel", - "name": "channel", - "type": "COMBO", - "widget": { - "name": "channel" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "MASK", - "name": "MASK", - "type": "MASK", - "links": [ - 349 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "ImageToMask", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "red" - ] - }, - { - "id": 3, - "type": "KSampler", - "pos": [ - 770, - 50 - ], - "size": [ - 315, - 262 - ], - "flags": {}, - "order": 10, - "mode": 0, - "inputs": [ - { - "localized_name": "model", - "name": "model", - "type": "MODEL", - "link": 280 - }, - { - "localized_name": "positive", - "name": "positive", - "type": "CONDITIONING", - "link": 98 - }, - { - "localized_name": "negative", - "name": "negative", - "type": "CONDITIONING", - "link": 99 - }, - { - "localized_name": "latent_image", - "name": "latent_image", - "type": "LATENT", - "link": 160 - }, - { - "localized_name": "seed", - "name": "seed", - "type": "INT", - "widget": { - "name": "seed" - }, - "link": null - }, - { - "localized_name": "steps", - "name": "steps", - "type": "INT", - "widget": { - "name": "steps" - }, - "link": null - }, - { - "localized_name": "cfg", - "name": "cfg", - "type": "FLOAT", - "widget": { - "name": "cfg" - }, - "link": null - }, - { - "localized_name": "sampler_name", - "name": "sampler_name", - "type": "COMBO", - "widget": { - "name": "sampler_name" - }, - "link": null - }, - { - "localized_name": "scheduler", - "name": "scheduler", - "type": "COMBO", - "widget": { 
- "name": "scheduler" - }, - "link": null - }, - { - "localized_name": "denoise", - "name": "denoise", - "type": "FLOAT", - "widget": { - "name": "denoise" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "LATENT", - "name": "LATENT", - "type": "LATENT", - "slot_index": 0, - "links": [ - 116 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "KSampler", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - 584027519362099, - "randomize", - 4, - 1, - "uni_pc", - "simple", - 1 - ] - }, - { - "id": 224, - "type": "MarkdownNote", - "pos": [ - 420, - -160 - ], - "size": [ - 310, - 110 - ], - "flags": {}, - "order": 0, - "mode": 0, - "inputs": [], - "outputs": [], - "title": "About Video Size", - "properties": {}, - "widgets_values": [ - "| Model | 480P | 720P |\n| ------------------------------------------------------------ | ---- | ---- |\n| [VACE-1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B) | ✅ | ❌ |\n| [VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B) | ✅ | ✅ |" - ], - "color": "#432", - "bgcolor": "#000" - }, - { - "id": 223, - "type": "MarkdownNote", - "pos": [ - 770, - -210 - ], - "size": [ - 303.90106201171875, - 158.5415802001953 - ], - "flags": {}, - "order": 1, - "mode": 0, - "inputs": [], - "outputs": [], - "title": "KSampler Setting", - "properties": {}, - "widgets_values": [ - "## Default\n\n- steps:20\n- cfg:6.0\n\n## For CausVid LoRA\n\n- steps: 2-4\n- cfg: 1.0\n\n" - ], - "color": "#432", - "bgcolor": "#000" - }, - { - "id": 6, - "type": "CLIPTextEncode", - "pos": [ - -80, - 60 - ], - "size": [ - 420, - 280 - ], - "flags": {}, - "order": 7, - "mode": 0, - "inputs": [ - { - "localized_name": "clip", - "name": "clip", - "type": "CLIP", - "link": 74 - }, - { - "localized_name": "text", - "name": "text", - 
"type": "STRING", - "widget": { - "name": "text" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "CONDITIONING", - "name": "CONDITIONING", - "type": "CONDITIONING", - "slot_index": 0, - "links": [ - 96 - ] - } - ], - "title": "CLIP Text Encode (Positive Prompt)", - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "CLIPTextEncode", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "" - ], - "color": "#232", - "bgcolor": "#353" - }, - { - "id": 140, - "type": "UNETLoader", - "pos": [ - -505.8336486816406, - 88.22794342041016 - ], - "size": [ - 360, - 82 - ], - "flags": {}, - "order": 2, - "mode": 0, - "inputs": [ - { - "localized_name": "unet_name", - "name": "unet_name", - "type": "COMBO", - "widget": { - "name": "unet_name" - }, - "link": 363 - }, - { - "localized_name": "weight_dtype", - "name": "weight_dtype", - "type": "COMBO", - "widget": { - "name": "weight_dtype" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "MODEL", - "name": "MODEL", - "type": "MODEL", - "slot_index": 0, - "links": [ - 248 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "UNETLoader", - "models": [ - { - "name": "wan2.1_vace_14B_fp16.safetensors", - "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/wan2.1_vace_14B_fp16.safetensors", - "directory": "diffusion_models" - } - ], - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "wan2.1_vace_14B_fp16.safetensors", - "fp8_e4m3fn_fast" - ] - }, - { - "id": 154, - "type": "LoraLoaderModelOnly", - "pos": [ - 
-505.8336486816406, - 228.2279510498047 - ], - "size": [ - 360, - 85.11004638671875 - ], - "flags": {}, - "order": 6, - "mode": 0, - "inputs": [ - { - "localized_name": "model", - "name": "model", - "type": "MODEL", - "link": 248 - }, - { - "localized_name": "lora_name", - "name": "lora_name", - "type": "COMBO", - "widget": { - "name": "lora_name" - }, - "link": 364 - }, - { - "localized_name": "strength_model", - "name": "strength_model", - "type": "FLOAT", - "widget": { - "name": "strength_model" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "MODEL", - "name": "MODEL", - "type": "MODEL", - "links": [ - 279 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "LoraLoaderModelOnly", - "models": [ - { - "name": "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", - "url": "https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors", - "directory": "loras" - } - ], - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", - 0.30000000000000004 - ] - }, - { - "id": 38, - "type": "CLIPLoader", - "pos": [ - -499.14141845703125, - 368.0911865234375 - ], - "size": [ - 360, - 106 - ], - "flags": {}, - "order": 3, - "mode": 0, - "inputs": [ - { - "localized_name": "clip_name", - "name": "clip_name", - "type": "COMBO", - "widget": { - "name": "clip_name" - }, - "link": 365 - }, - { - "localized_name": "type", - "name": "type", - "type": "COMBO", - "widget": { - "name": "type" - }, - "link": null - }, - { - "localized_name": "device", - "name": "device", - "shape": 7, - "type": "COMBO", - "widget": { - "name": "device" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "CLIP", - "name": "CLIP", - "type": "CLIP", - "slot_index": 0, - 
"links": [ - 74, - 75 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "CLIPLoader", - "models": [ - { - "name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors", - "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true", - "directory": "text_encoders" - } - ], - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "umt5_xxl_fp8_e4m3fn_scaled.safetensors", - "wan", - "default" - ] - }, - { - "id": 39, - "type": "VAELoader", - "pos": [ - -498.5298156738281, - 517.2576293945312 - ], - "size": [ - 360, - 60 - ], - "flags": {}, - "order": 4, - "mode": 0, - "inputs": [ - { - "localized_name": "vae_name", - "name": "vae_name", - "type": "COMBO", - "widget": { - "name": "vae_name" - }, - "link": 366 - } - ], - "outputs": [ - { - "localized_name": "VAE", - "name": "VAE", - "type": "VAE", - "slot_index": 0, - "links": [ - 76, - 101 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "VAELoader", - "models": [ - { - "name": "wan_2.1_vae.safetensors", - "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors", - "directory": "vae" - } - ], - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "wan_2.1_vae.safetensors" - ] - }, - { - "id": 221, - "type": "MarkdownNote", - "pos": [ - 380, - 1090 - ], - "size": [ - 480, - 170 - ], - "flags": {}, - "order": 5, - "mode": 0, - "inputs": [], - "outputs": [], - "title": "[EN] About video mask", - "properties": { - "widget_ue_connectable": {} - 
}, - "widgets_values": [ - "Currently, it's difficult to perfectly draw dynamic masks for different frames using only core nodes. However, to avoid requiring users to install additional custom nodes, our templates only use core nodes. You can refer to this implementation idea to achieve video inpainting.\n\nYou can use KJNode’s Points Editor and Sam2Segmentation to create some dynamic mask functions.\n\nCustom node links:\n- [ComfyUI-KJNodes](https://github.com/kijai/ComfyUI-KJNodes)\n- [ComfyUI-segment-anything-2](https://github.com/kijai/ComfyUI-segment-anything-2)" - ], - "color": "#432", - "bgcolor": "#000" - }, - { - "id": 7, - "type": "CLIPTextEncode", - "pos": [ - -80, - 390 - ], - "size": [ - 425.27801513671875, - 180.6060791015625 - ], - "flags": {}, - "order": 8, - "mode": 0, - "inputs": [ - { - "localized_name": "clip", - "name": "clip", - "type": "CLIP", - "link": 75 - }, - { - "localized_name": "text", - "name": "text", - "type": "STRING", - "widget": { - "name": "text" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "CONDITIONING", - "name": "CONDITIONING", - "type": "CONDITIONING", - "slot_index": 0, - "links": [ - 97 - ] - } - ], - "title": "CLIP Text Encode (Negative Prompt)", - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "CLIPTextEncode", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": {} - }, - "widgets_values": [ - "过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝," - ], - "color": "#223", - "bgcolor": "#335" - }, - { - "id": 229, - "type": "ImageFromBatch", - "pos": [ - -510, - 800 - ], - "size": [ - 270, - 82 - ], - "flags": {}, - "order": 25, - "mode": 0, - "inputs": [ - { - "localized_name": "image", - "name": "image", - "type": "IMAGE", - "link": 358 - 
}, - { - "localized_name": "batch_index", - "name": "batch_index", - "type": "INT", - "widget": { - "name": "batch_index" - }, - "link": null - }, - { - "localized_name": "length", - "name": "length", - "type": "INT", - "widget": { - "name": "length" - }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "IMAGE", - "name": "IMAGE", - "type": "IMAGE", - "links": [ - 359, - 360 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.13.0", - "Node name for S&R": "ImageFromBatch" - }, - "widgets_values": [ - 0, - 81 - ] - }, - { - "id": 49, - "type": "WanVaceToVideo", - "pos": [ - 400, - 200 - ], - "size": [ - 315, - 254 - ], - "flags": {}, - "order": 12, - "mode": 0, - "inputs": [ - { - "localized_name": "positive", - "name": "positive", - "type": "CONDITIONING", - "link": 96 - }, - { - "localized_name": "negative", - "name": "negative", - "type": "CONDITIONING", - "link": 97 - }, - { - "localized_name": "vae", - "name": "vae", - "type": "VAE", - "link": 101 - }, - { - "localized_name": "control_video", - "name": "control_video", - "shape": 7, - "type": "IMAGE", - "link": 344 - }, - { - "localized_name": "control_masks", - "name": "control_masks", - "shape": 7, - "type": "MASK", - "link": 349 - }, - { - "localized_name": "reference_image", - "name": "reference_image", - "shape": 7, - "type": "IMAGE", - "link": 361 - }, - { - "localized_name": "width", - "name": "width", - "type": "INT", - "widget": { - "name": "width" - }, - "link": 355 - }, - { - "localized_name": "height", - "name": "height", - "type": "INT", - "widget": { - "name": "height" - }, - "link": 356 - }, - { - "localized_name": "length", - "name": "length", - "type": "INT", - "widget": { - "name": "length" - }, - "link": null - }, - { - "localized_name": "batch_size", - "name": "batch_size", - "type": "INT", - "widget": { - "name": "batch_size" - }, - "link": null - }, - { - "localized_name": "strength", - "name": "strength", - "type": "FLOAT", - "widget": { - "name": "strength" 
- }, - "link": null - } - ], - "outputs": [ - { - "localized_name": "positive", - "name": "positive", - "type": "CONDITIONING", - "links": [ - 98 - ] - }, - { - "localized_name": "negative", - "name": "negative", - "type": "CONDITIONING", - "links": [ - 99 - ] - }, - { - "localized_name": "latent", - "name": "latent", - "type": "LATENT", - "links": [ - 160 - ] - }, - { - "localized_name": "trim_latent", - "name": "trim_latent", - "type": "INT", - "links": [ - 115 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.34", - "Node name for S&R": "WanVaceToVideo", - "enableTabs": false, - "tabWidth": 65, - "tabXOffset": 10, - "hasSecondTab": false, - "secondTabText": "Send Back", - "secondTabOffset": 80, - "secondTabWidth": 65, - "widget_ue_connectable": { - "width": true, - "height": true, - "length": true - } - }, - "widgets_values": [ - 720, - 720, - 81, - 1, - 1 - ] - }, - { - "id": 211, - "type": "GetImageSize", - "pos": [ - 70, - 800 - ], - "size": [ - 190, - 66 - ], - "flags": { - "collapsed": false - }, - "order": 20, - "mode": 0, - "inputs": [ - { - "localized_name": "image", - "name": "image", - "type": "IMAGE", - "link": 359 - } - ], - "outputs": [ - { - "localized_name": "width", - "name": "width", - "type": "INT", - "links": null - }, - { - "localized_name": "height", - "name": "height", - "type": "INT", - "links": null - }, - { - "localized_name": "batch_size", - "name": "batch_size", - "type": "INT", - "links": [ - 340, - 346 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "GetImageSize" - }, - "widgets_values": [] - }, - { - "id": 210, - "type": "GetVideoComponents", - "pos": [ - -510, - 690 - ], - "size": [ - 193.530859375, - 66 - ], - "flags": {}, - "order": 19, - "mode": 0, - "inputs": [ - { - "localized_name": "video", - "name": "video", - "type": "VIDEO", - "link": 336 - } - ], - "outputs": [ - { - "localized_name": "images", - "name": "images", - "type": "IMAGE", - "links": [ - 
358 - ] - }, - { - "localized_name": "audio", - "name": "audio", - "type": "AUDIO", - "links": [ - 362 - ] - }, - { - "localized_name": "fps", - "name": "fps", - "type": "FLOAT", - "links": [ - 353 - ] - } - ], - "properties": { - "cnr_id": "comfy-core", - "ver": "0.3.40", - "Node name for S&R": "GetVideoComponents" - }, - "widgets_values": [] - } - ], - "groups": [ - { - "id": 1, - "title": "Step1 - Load models here", - "bounding": [ - -540, - -30, - 430, - 620 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 2, - "title": "Prompt", - "bounding": [ - -90, - -30, - 450, - 620 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 3, - "title": "Sampling & Decoding", - "bounding": [ - 380, - -30, - 720, - 620 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 10, - "title": "Repeat Mask Batch", - "bounding": [ - -90, - 910, - 450, - 460 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 21, - "title": "Get video info", - "bounding": [ - -540, - 610, - 900, - 290 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 22, - "title": "Composite video & masks", - "bounding": [ - 380, - 610, - 720, - 420 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - }, - { - "id": 23, - "title": "Step4 - Set video size & length", - "bounding": [ - 390, - 130, - 360, - 340 - ], - "color": "#A88", - "font_size": 24, - "flags": {} - }, - { - "id": 25, - "title": "14B", - "bounding": [ - -520, - 10, - 380, - 308.7100524902344 - ], - "color": "#3f789e", - "font_size": 24, - "flags": {} - } - ], - "links": [ - { - "id": 116, - "origin_id": 3, - "origin_slot": 0, - "target_id": 58, - "target_slot": 0, - "type": "LATENT" - }, - { - "id": 115, - "origin_id": 49, - "origin_slot": 3, - "target_id": 58, - "target_slot": 1, - "type": "INT" - }, - { - "id": 117, - "origin_id": 58, - "origin_slot": 0, - "target_id": 8, - "target_slot": 0, - "type": "LATENT" - }, - 
{ - "id": 76, - "origin_id": 39, - "origin_slot": 0, - "target_id": 8, - "target_slot": 1, - "type": "VAE" - }, - { - "id": 279, - "origin_id": 154, - "origin_slot": 0, - "target_id": 48, - "target_slot": 0, - "type": "MODEL" - }, - { - "id": 352, - "origin_id": 219, - "origin_slot": 0, - "target_id": 216, - "target_slot": 0, - "type": "MASK" - }, - { - "id": 340, - "origin_id": 211, - "origin_slot": 2, - "target_id": 213, - "target_slot": 1, - "type": "INT" - }, - { - "id": 96, - "origin_id": 6, - "origin_slot": 0, - "target_id": 49, - "target_slot": 0, - "type": "CONDITIONING" - }, - { - "id": 97, - "origin_id": 7, - "origin_slot": 0, - "target_id": 49, - "target_slot": 1, - "type": "CONDITIONING" - }, - { - "id": 101, - "origin_id": 39, - "origin_slot": 0, - "target_id": 49, - "target_slot": 2, - "type": "VAE" - }, - { - "id": 344, - "origin_id": 208, - "origin_slot": 0, - "target_id": 49, - "target_slot": 3, - "type": "IMAGE" - }, - { - "id": 349, - "origin_id": 130, - "origin_slot": 0, - "target_id": 49, - "target_slot": 4, - "type": "MASK" - }, - { - "id": 139, - "origin_id": 8, - "origin_slot": 0, - "target_id": 68, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 353, - "origin_id": 210, - "origin_slot": 2, - "target_id": 68, - "target_slot": 2, - "type": "FLOAT" - }, - { - "id": 333, - "origin_id": 213, - "origin_slot": 0, - "target_id": 208, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 334, - "origin_id": 216, - "origin_slot": 0, - "target_id": 208, - "target_slot": 1, - "type": "IMAGE" - }, - { - "id": 341, - "origin_id": 208, - "origin_slot": 0, - "target_id": 214, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 201, - "origin_id": 111, - "origin_slot": 0, - "target_id": 129, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 346, - "origin_id": 211, - "origin_slot": 2, - "target_id": 129, - "target_slot": 1, - "type": "INT" - }, - { - "id": 202, - "origin_id": 129, - "origin_slot": 0, - "target_id": 130, - "target_slot": 0, - 
"type": "IMAGE" - }, - { - "id": 280, - "origin_id": 48, - "origin_slot": 0, - "target_id": 3, - "target_slot": 0, - "type": "MODEL" - }, - { - "id": 98, - "origin_id": 49, - "origin_slot": 0, - "target_id": 3, - "target_slot": 1, - "type": "CONDITIONING" - }, - { - "id": 99, - "origin_id": 49, - "origin_slot": 1, - "target_id": 3, - "target_slot": 2, - "type": "CONDITIONING" - }, - { - "id": 160, - "origin_id": 49, - "origin_slot": 2, - "target_id": 3, - "target_slot": 3, - "type": "LATENT" - }, - { - "id": 74, - "origin_id": 38, - "origin_slot": 0, - "target_id": 6, - "target_slot": 0, - "type": "CLIP" - }, - { - "id": 248, - "origin_id": 140, - "origin_slot": 0, - "target_id": 154, - "target_slot": 0, - "type": "MODEL" - }, - { - "id": 75, - "origin_id": 38, - "origin_slot": 0, - "target_id": 7, - "target_slot": 0, - "type": "CLIP" - }, - { - "id": 351, - "origin_id": -10, - "origin_slot": 0, - "target_id": 219, - "target_slot": 0, - "type": "MASK" - }, - { - "id": 335, - "origin_id": -10, - "origin_slot": 0, - "target_id": 208, - "target_slot": 2, - "type": "MASK" - }, - { - "id": 345, - "origin_id": -10, - "origin_slot": 0, - "target_id": 111, - "target_slot": 0, - "type": "MASK" - }, - { - "id": 336, - "origin_id": -10, - "origin_slot": 1, - "target_id": 210, - "target_slot": 0, - "type": "VIDEO" - }, - { - "id": 129, - "origin_id": 68, - "origin_slot": 0, - "target_id": -20, - "target_slot": 0, - "type": "VIDEO" - }, - { - "id": 355, - "origin_id": -10, - "origin_slot": 2, - "target_id": 49, - "target_slot": 6, - "type": "INT" - }, - { - "id": 356, - "origin_id": -10, - "origin_slot": 3, - "target_id": 49, - "target_slot": 7, - "type": "INT" - }, - { - "id": 358, - "origin_id": 210, - "origin_slot": 0, - "target_id": 229, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 359, - "origin_id": 229, - "origin_slot": 0, - "target_id": 211, - "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 360, - "origin_id": 229, - "origin_slot": 0, - "target_id": 213, 
- "target_slot": 0, - "type": "IMAGE" - }, - { - "id": 361, - "origin_id": -10, - "origin_slot": 4, - "target_id": 49, - "target_slot": 5, - "type": "IMAGE" - }, - { - "id": 362, - "origin_id": 210, - "origin_slot": 1, - "target_id": 68, - "target_slot": 1, - "type": "AUDIO" - }, - { - "id": 363, - "origin_id": -10, - "origin_slot": 5, - "target_id": 140, - "target_slot": 0, - "type": "COMBO" - }, - { - "id": 364, - "origin_id": -10, - "origin_slot": 6, - "target_id": 154, - "target_slot": 1, - "type": "COMBO" - }, - { - "id": 365, - "origin_id": -10, - "origin_slot": 7, - "target_id": 38, - "target_slot": 0, - "type": "COMBO" - }, - { - "id": 366, - "origin_id": -10, - "origin_slot": 8, - "target_id": 39, - "target_slot": 0, - "type": "COMBO" - } - ], - "extra": { - "workflowRendererVersion": "LG" - }, - "category": "Video generation and editing/Inpaint video", - "description": "Inpaints masked regions in video frames using Wan 2.1 VACE." - } - ] - }, - "config": {}, - "extra": { - "workflowRendererVersion": "LG", - "ds": { - "scale": 0.8183828377358485, - "offset": [ - 1215.8643989712405, - 178.87024992690183 - ] - } - }, - "version": 0.4 -} diff --git a/blueprints/Video Inpainting (Wan2.1 VACE).json b/blueprints/Video Inpainting (Wan2.1 VACE).json new file mode 100644 index 000000000..7460f3d44 --- /dev/null +++ b/blueprints/Video Inpainting (Wan2.1 VACE).json @@ -0,0 +1,4196 @@ +{ + "revision": 0, + "last_node_id": 306, + "last_link_id": 0, + "nodes": [ + { + "id": 306, + "type": "bd7f73a0-ec67-4f46-8671-17088d8e31b7", + "pos": [ + -2950, + -410 + ], + "size": [ + 440, + 650 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "label": "source_video", + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": null + }, + { + "label": "reference_image", + "name": "reference_image_1", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "label": "prompt", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + 
}, + "link": null + }, + { + "label": "width", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": null + }, + { + "label": "height", + "name": "value_1", + "type": "INT", + "widget": { + "name": "value_1" + }, + "link": null + }, + { + "label": "frame_counts", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": null + }, + { + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + }, + { + "label": "wan_vace_model", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": null + }, + { + "label": "clip_model", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": null + }, + { + "label": "vae_model", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": null + }, + { + "label": "enable_turbo_mode", + "name": "value_2", + "type": "BOOLEAN", + "widget": { + "name": "value_2" + }, + "link": null + }, + { + "label": "lightning_lora", + "name": "lora_name", + "type": "COMBO", + "widget": { + "name": "lora_name" + }, + "link": null + }, + { + "label": "sam3_mask_object", + "name": "text_1", + "type": "STRING", + "widget": { + "name": "text_1" + }, + "link": null + }, + { + "label": "mask_expand", + "name": "expand", + "type": "INT", + "widget": { + "name": "expand" + }, + "link": null + }, + { + "label": "sam3_model", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "280", + "text" + ], + [ + "297", + "value" + ], + [ + "290", + "value" + ], + [ + "289", + "length" + ], + [ + "288", + "seed" + ], + [ + "299", + "unet_name" + ], + [ + "277", + "clip_name" + ], + [ + "278", + "vae_name" + ], + [ + "300", + "value" + ], + [ + "272", + "lora_name" + ], + [ + 
"268", + "text" + ], + [ + "269", + "expand" + ], + [ + "268", + "ckpt_name" + ], + [ + "312", + "$$canvas-image-preview" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [], + "title": "Video Inpainting (Wan2.1 VACE)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "bd7f73a0-ec67-4f46-8671-17088d8e31b7", + "version": 1, + "state": { + "lastGroupId": 31, + "lastNodeId": 315, + "lastLinkId": 499, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Video Inpainting (Wan2.1 VACE)", + "inputNode": { + "id": -10, + "bounding": [ + -3450, + 3170, + 159.744140625, + 348 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + 900, + 2840, + 128, + 68 + ] + }, + "inputs": [ + { + "id": "a636746e-5b9f-4b91-96f0-7f2657415b93", + "name": "video", + "type": "VIDEO", + "linkIds": [ + 473 + ], + "localized_name": "video", + "label": "source_video", + "pos": [ + -3314.255859375, + 3194 + ] + }, + { + "id": "46275350-98b8-4d7c-8ca4-c452dc40a6bd", + "name": "reference_image_1", + "type": "IMAGE", + "linkIds": [ + 478 + ], + "label": "reference_image", + "pos": [ + -3314.255859375, + 3214 + ] + }, + { + "id": "0f5bee71-3485-4e10-81a7-2b9f85851353", + "name": "text", + "type": "STRING", + "linkIds": [ + 479 + ], + "label": "prompt", + "pos": [ + -3314.255859375, + 3234 + ] + }, + { + "id": "16675512-c229-43ed-944e-190a7f61b571", + "name": "value", + "type": "INT", + "linkIds": [ + 480 + ], + "label": "width", + "pos": [ + -3314.255859375, + 3254 + ] + }, + { + "id": "84330129-a0c7-44cd-91fe-c033946749db", + "name": "value_1", + "type": "INT", + "linkIds": [ + 481 + ], + "label": "height", + "pos": [ + -3314.255859375, + 3274 + ] + }, + { + "id": "3bd895e6-cba9-477b-bf6e-8c77dd56bb4a", + "name": "length", + "type": "INT", + 
"linkIds": [ + 494 + ], + "label": "frame_counts", + "pos": [ + -3314.255859375, + 3294 + ] + }, + { + "id": "dbc2e9c5-f86a-48ba-874a-2991c75d1ae7", + "name": "seed", + "type": "INT", + "linkIds": [ + 483 + ], + "pos": [ + -3314.255859375, + 3314 + ] + }, + { + "id": "572db94d-e64d-464f-bf3c-23a23aeb79f1", + "name": "unet_name", + "type": "COMBO", + "linkIds": [ + 485 + ], + "label": "wan_vace_model", + "pos": [ + -3314.255859375, + 3334 + ] + }, + { + "id": "32185180-f627-47c2-971b-6ef3007e9455", + "name": "clip_name", + "type": "COMBO", + "linkIds": [ + 486 + ], + "label": "clip_model", + "pos": [ + -3314.255859375, + 3354 + ] + }, + { + "id": "2af354d3-108a-42a9-acfc-7bad158715aa", + "name": "vae_name", + "type": "COMBO", + "linkIds": [ + 487 + ], + "label": "vae_model", + "pos": [ + -3314.255859375, + 3374 + ] + }, + { + "id": "c9777a8c-267f-4c5e-b4d5-e9727d822e50", + "name": "value_2", + "type": "BOOLEAN", + "linkIds": [ + 489 + ], + "label": "enable_turbo_mode", + "pos": [ + -3314.255859375, + 3394 + ] + }, + { + "id": "84a258a3-4f25-4edb-9f50-6fcd8411394e", + "name": "lora_name", + "type": "COMBO", + "linkIds": [ + 490 + ], + "label": "lightning_lora", + "pos": [ + -3314.255859375, + 3414 + ] + }, + { + "id": "9c5fb6f8-407b-4a13-94d8-cbbba546a082", + "name": "text_1", + "type": "STRING", + "linkIds": [ + 491 + ], + "label": "sam3_mask_object", + "pos": [ + -3314.255859375, + 3434 + ] + }, + { + "id": "598323c9-2256-44bd-9745-492a74628300", + "name": "expand", + "type": "INT", + "linkIds": [ + 496 + ], + "label": "mask_expand", + "pos": [ + -3314.255859375, + 3454 + ] + }, + { + "id": "856c1937-8caa-4d85-9d8a-6a900234d6d6", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 497 + ], + "label": "sam3_model", + "pos": [ + -3314.255859375, + 3474 + ] + } + ], + "outputs": [ + { + "id": "be46c9d5-ced7-445b-996f-fff59d9b684d", + "name": "VIDEO", + "type": "VIDEO", + "linkIds": [ + 474 + ], + "localized_name": "VIDEO", + "pos": [ + 924, + 2864 + ] + } + ], 
+ "widgets": [], + "nodes": [ + { + "id": 266, + "type": "ModelSamplingSD3", + "pos": [ + -560, + 1940 + ], + "size": [ + 320, + 110 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 422 + }, + { + "localized_name": "shift", + "name": "shift", + "type": "FLOAT", + "widget": { + "name": "shift" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "slot_index": 0, + "links": [ + 454 + ] + } + ], + "properties": { + "Node name for S&R": "ModelSamplingSD3", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + 5 + ] + }, + { + "id": 267, + "type": "CreateVideo", + "pos": [ + 530, + 2590 + ], + "size": [ + 310, + 130 + ], + "flags": { + "collapsed": false + }, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 423 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 424 + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "widget": { + "name": "fps" + }, + "link": 425 + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 474 + ] + } + ], + "properties": { + "Node name for S&R": "CreateVideo", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + 16 + ] + }, + { + "id": 268, + "type": "17df2eeb-d89e-46ee-9480-a4ca2494b207", + "pos": [ + -1960, + 3220 + ], + "size": [ + 290, + 370 + ], + "flags": {}, 
+ "order": 7, + "mode": 0, + "inputs": [ + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 426 + }, + { + "label": "object", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 491 + }, + { + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": null + }, + { + "name": "positive_coords", + "shape": 7, + "type": "STRING", + "link": null + }, + { + "name": "negative_coords", + "shape": 7, + "type": "STRING", + "link": null + }, + { + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": null + }, + { + "name": "refine_iterations", + "type": "INT", + "widget": { + "name": "refine_iterations" + }, + "link": null + }, + { + "name": "individual_masks", + "type": "BOOLEAN", + "widget": { + "name": "individual_masks" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 497 + } + ], + "outputs": [ + { + "localized_name": "masks", + "name": "masks", + "type": "MASK", + "links": [ + 427 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "237", + "text" + ], + [ + "75", + "threshold" + ], + [ + "75", + "refine_iterations" + ], + [ + "75", + "individual_masks" + ], + [ + "236", + "ckpt_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": { + "text": true + }, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [] + }, + { + "id": 269, + "type": "GrowMask", + "pos": [ + -1530, + 3220 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": 
"MASK", + "link": 427 + }, + { + "localized_name": "expand", + "name": "expand", + "type": "INT", + "widget": { + "name": "expand" + }, + "link": 496 + }, + { + "localized_name": "tapered_corners", + "name": "tapered_corners", + "type": "BOOLEAN", + "widget": { + "name": "tapered_corners" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 441, + 445, + 449, + 498 + ] + } + ], + "properties": { + "Node name for S&R": "GrowMask", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 20, + true + ] + }, + { + "id": 270, + "type": "PrimitiveInt", + "pos": [ + -1350, + 1980 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 466 + ] + } + ], + "title": "Int (Steps)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 20, + "fixed" + ] + }, + { + "id": 271, + "type": "PrimitiveFloat", + "pos": [ + -1340, + 2140 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { + "name": "value" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 432 + ] + } + ], + "title": "Float (CFG)", + "properties": { + "Node name for S&R": 
"PrimitiveFloat", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 6 + ] + }, + { + "id": 272, + "type": "LoraLoaderModelOnly", + "pos": [ + -1380, + 2390 + ], + "size": [ + 350, + 140 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 428 + }, + { + "localized_name": "lora_name", + "name": "lora_name", + "type": "COMBO", + "widget": { + "name": "lora_name" + }, + "link": 490 + }, + { + "localized_name": "strength_model", + "name": "strength_model", + "type": "FLOAT", + "widget": { + "name": "strength_model" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 430 + ] + } + ], + "properties": { + "Node name for S&R": "LoraLoaderModelOnly", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "models": [ + { + "name": "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", + "url": "https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors", + "directory": "loras" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "Wan21_CausVid_14B_T2V_lora_rank32.safetensors", + 0.30000000000000004 + ] + }, + { + "id": 273, + "type": "PrimitiveInt", + "pos": [ + -1340, + 2600 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 467 + ] + } + ], + "title": 
"Int (Steps)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 6, + "fixed" + ] + }, + { + "id": 274, + "type": "PrimitiveFloat", + "pos": [ + -1340, + 2760 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "FLOAT", + "widget": { + "name": "value" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [ + 433 + ] + } + ], + "title": "Float (CFG)", + "properties": { + "Node name for S&R": "PrimitiveFloat", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 1 + ] + }, + { + "id": 275, + "type": "ComfySwitchNode", + "pos": [ + -960, + 2530 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 429 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 430 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 431 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 422 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { 
+ "id": 276, + "type": "ComfySwitchNode", + "pos": [ + -960, + 2340 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 432 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 433 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 434 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 459 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 277, + "type": "CLIPLoader", + "pos": [ + -2710, + 2210 + ], + "size": [ + 360, + 170 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "localized_name": "clip_name", + "name": "clip_name", + "type": "COMBO", + "widget": { + "name": "clip_name" + }, + "link": 486 + }, + { + "localized_name": "type", + "name": "type", + "type": "COMBO", + "widget": { + "name": "type" + }, + "link": null + }, + { + "localized_name": "device", + "name": "device", + "shape": 7, + "type": "COMBO", + "widget": { + "name": "device" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "slot_index": 0, + "links": [ + 435, + 436 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPLoader", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "models": [ + { + "name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors", + "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true", + "directory": "text_encoders" + } + ], + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "umt5_xxl_fp8_e4m3fn_scaled.safetensors", + "wan", + "default" + ] + }, + { + "id": 278, + "type": "VAELoader", + "pos": [ + -2700, + 2500 + ], + "size": [ + 360, + 110 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "localized_name": "vae_name", + "name": "vae_name", + "type": "COMBO", + "widget": { + "name": "vae_name" + }, + "link": 487 + } + ], + "outputs": [ + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "slot_index": 0, + "links": [ + 439, + 471 + ] + } + ], + "properties": { + "Node name for S&R": "VAELoader", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "models": [ + { + "name": "wan_2.1_vae.safetensors", + "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors", + "directory": "vae" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "wan_2.1_vae.safetensors" + ] + }, + { + "id": 279, + "type": "CLIPTextEncode", + "pos": [ + -2280, + 2410 + ], + "size": [ + 430, + 190 + ], + "flags": {}, + "order": 14, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 435 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 438 + ] + } + ], + "title": "CLIP Text Encode (Negative Prompt)", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.3.34", + 
"enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝," + ], + "color": "#223", + "bgcolor": "#335" + }, + { + "id": 280, + "type": "CLIPTextEncode", + "pos": [ + -2270, + 1940 + ], + "size": [ + 420, + 420 + ], + "flags": {}, + "order": 15, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 436 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 479 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "slot_index": 0, + "links": [ + 437 + ] + } + ], + "title": "CLIP Text Encode (Positive Prompt)", + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 281, + "type": "WanVaceToVideo", + "pos": [ + -1780, + 1940 + ], + "size": [ + 320, + 360 + ], + "flags": {}, + "order": 16, + "mode": 0, + "inputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 437 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 438 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 439 + }, + { + "localized_name": "control_video", + "name": "control_video", + "shape": 7, + "type": "IMAGE", + "link": 440 + }, + { + "localized_name": "control_masks", + 
"name": "control_masks", + "shape": 7, + "type": "MASK", + "link": 441 + }, + { + "localized_name": "reference_image", + "name": "reference_image", + "shape": 7, + "type": "IMAGE", + "link": 478 + }, + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": 442 + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": 443 + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 444 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + }, + { + "localized_name": "strength", + "name": "strength", + "type": "FLOAT", + "widget": { + "name": "strength" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "links": [ + 455 + ] + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "links": [ + 456 + ] + }, + { + "localized_name": "latent", + "name": "latent", + "type": "LATENT", + "links": [ + 457 + ] + }, + { + "localized_name": "trim_latent", + "name": "trim_latent", + "type": "INT", + "links": [ + 453 + ] + } + ], + "properties": { + "Node name for S&R": "WanVaceToVideo", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": { + "width": true, + "height": true, + "length": true + } + }, + "widgets_values": [ + 720, + 720, + 81, + 1, + 1 + ] + }, + { + "id": 282, + "type": "InvertMask", + "pos": [ + -1510, + 3410 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 17, + "mode": 0, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": "MASK", + "link": 445 + } + ], + 
"outputs": [ + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": [ + 446 + ] + } + ], + "properties": { + "Node name for S&R": "InvertMask", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 283, + "type": "MaskToImage", + "pos": [ + -1510, + 3550 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 18, + "mode": 0, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": "MASK", + "link": 446 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 448 + ] + } + ], + "properties": { + "Node name for S&R": "MaskToImage", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 284, + "type": "ImageCompositeMasked", + "pos": [ + -1210, + 3210 + ], + "size": [ + 230, + 220 + ], + "flags": {}, + "order": 19, + "mode": 0, + "inputs": [ + { + "localized_name": "destination", + "name": "destination", + "type": "IMAGE", + "link": 447 + }, + { + "localized_name": "source", + "name": "source", + "type": "IMAGE", + "link": 448 + }, + { + "localized_name": "mask", + "name": "mask", + "shape": 7, + "type": "MASK", + "link": 449 + }, + { + "localized_name": "x", + "name": "x", + "type": "INT", + "widget": { + "name": "x" + }, + "link": null + }, + { + "localized_name": "y", + "name": "y", + "type": "INT", + "widget": { + "name": "y" + }, + "link": null + }, + { + "localized_name": "resize_source", + "name": "resize_source", + "type": "BOOLEAN", + "widget": { + "name": "resize_source" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 440, + 499 + ] 
+ } + ], + "properties": { + "Node name for S&R": "ImageCompositeMasked", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 0, + true + ] + }, + { + "id": 287, + "type": "TrimVideoLatent", + "pos": [ + -220, + 1950 + ], + "size": [ + 320, + 110 + ], + "flags": { + "collapsed": false + }, + "order": 20, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 452 + }, + { + "localized_name": "trim_amount", + "name": "trim_amount", + "type": "INT", + "widget": { + "name": "trim_amount" + }, + "link": 453 + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "links": [ + 470 + ] + } + ], + "properties": { + "Node name for S&R": "TrimVideoLatent", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": { + "trim_amount": true + } + }, + "widgets_values": [ + 0 + ] + }, + { + "id": 288, + "type": "KSampler", + "pos": [ + -560, + 2120 + ], + "size": [ + 320, + 350 + ], + "flags": {}, + "order": 21, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 454 + }, + { + "localized_name": "positive", + "name": "positive", + "type": "CONDITIONING", + "link": 455 + }, + { + "localized_name": "negative", + "name": "negative", + "type": "CONDITIONING", + "link": 456 + }, + { + "localized_name": "latent_image", + "name": "latent_image", + "type": "LATENT", + "link": 457 + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": 483 + }, + { + "localized_name": "steps", + "name": "steps", + "type": 
"INT", + "widget": { + "name": "steps" + }, + "link": 458 + }, + { + "localized_name": "cfg", + "name": "cfg", + "type": "FLOAT", + "widget": { + "name": "cfg" + }, + "link": 459 + }, + { + "localized_name": "sampler_name", + "name": "sampler_name", + "type": "COMBO", + "widget": { + "name": "sampler_name" + }, + "link": null + }, + { + "localized_name": "scheduler", + "name": "scheduler", + "type": "COMBO", + "widget": { + "name": "scheduler" + }, + "link": null + }, + { + "localized_name": "denoise", + "name": "denoise", + "type": "FLOAT", + "widget": { + "name": "denoise" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "LATENT", + "name": "LATENT", + "type": "LATENT", + "slot_index": 0, + "links": [ + 452 + ] + } + ], + "properties": { + "Node name for S&R": "KSampler", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + 832378512055965, + "fixed", + 4, + 1, + "uni_pc", + "simple", + 1 + ] + }, + { + "id": 289, + "type": "ImageFromBatch", + "pos": [ + -2360, + 3410 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 22, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 460 + }, + { + "localized_name": "batch_index", + "name": "batch_index", + "type": "INT", + "widget": { + "name": "batch_index" + }, + "link": null + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": 494 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 463 + ] + } + ], + "properties": { + "Node name for S&R": "ImageFromBatch", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + 
"secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 81 + ] + }, + { + "id": 290, + "type": "PrimitiveInt", + "pos": [ + -2690, + 3540 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 23, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 481 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 461 + ] + } + ], + "title": "Int (Height)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 720, + "fixed" + ] + }, + { + "id": 291, + "type": "ComfyMathExpression", + "pos": [ + -2650, + 3700 + ], + "size": [ + 230, + 80 + ], + "flags": { + "collapsed": true + }, + "order": 24, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 461 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [] + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 465 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + 
"secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "floor(a/16)*16" + ] + }, + { + "id": 292, + "type": "ComfyMathExpression", + "pos": [ + -2650, + 3500 + ], + "size": [ + 230, + 80 + ], + "flags": { + "collapsed": true + }, + "order": 25, + "mode": 0, + "inputs": [ + { + "label": "a", + "localized_name": "values.a", + "name": "values.a", + "type": "FLOAT,INT,BOOLEAN", + "link": 462 + }, + { + "label": "b", + "localized_name": "values.b", + "name": "values.b", + "shape": 7, + "type": "FLOAT,INT,BOOLEAN", + "link": null + }, + { + "localized_name": "expression", + "name": "expression", + "type": "STRING", + "widget": { + "name": "expression" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "FLOAT", + "name": "FLOAT", + "type": "FLOAT", + "links": [] + }, + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 464 + ] + }, + { + "localized_name": "BOOL", + "name": "BOOL", + "type": "BOOLEAN", + "links": [] + } + ], + "properties": { + "Node name for S&R": "ComfyMathExpression", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "floor(a/16)*16" + ] + }, + { + "id": 293, + "type": "ResizeImageMaskNode", + "pos": [ + -2360, + 3590 + ], + "size": [ + 280, + 160 + ], + "flags": {}, + "order": 26, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 463 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "resize_type" + }, + "link": null + }, + { + "localized_name": "width", + "name": "resize_type.width", + "type": "INT", + "widget": { + "name": "resize_type.width" + }, + "link": 464 + }, + { + "localized_name": "height", + "name": "resize_type.height", + "type": "INT", + "widget": { + 
"name": "resize_type.height" + }, + "link": 465 + }, + { + "localized_name": "crop", + "name": "resize_type.crop", + "type": "COMBO", + "widget": { + "name": "resize_type.crop" + }, + "link": null + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "*", + "links": [ + 426, + 447, + 469 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImageMaskNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "scale dimensions", + 512, + 512, + "center", + "area" + ] + }, + { + "id": 294, + "type": "ComfySwitchNode", + "pos": [ + -960, + 2150 + ], + "size": [ + 270, + 130 + ], + "flags": {}, + "order": 27, + "mode": 0, + "inputs": [ + { + "localized_name": "on_false", + "name": "on_false", + "type": "*", + "link": 466 + }, + { + "localized_name": "on_true", + "name": "on_true", + "type": "*", + "link": 467 + }, + { + "localized_name": "switch", + "name": "switch", + "type": "BOOLEAN", + "widget": { + "name": "switch" + }, + "link": 468 + } + ], + "outputs": [ + { + "localized_name": "output", + "name": "output", + "type": "*", + "links": [ + 458 + ] + } + ], + "properties": { + "Node name for S&R": "ComfySwitchNode", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + false + ] + }, + { + "id": 295, + "type": "GetImageSize", + "pos": [ + -2010, + 2920 + ], + "size": [ + 230, + 120 + ], + "flags": {}, + "order": 28, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": 
"IMAGE", + "link": 469 + } + ], + "outputs": [ + { + "localized_name": "width", + "name": "width", + "type": "INT", + "links": [ + 442 + ] + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "links": [ + 443 + ] + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "links": [ + 444 + ] + } + ], + "properties": { + "Node name for S&R": "GetImageSize", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 296, + "type": "VAEDecode", + "pos": [ + 520, + 2450 + ], + "size": [ + 320, + 100 + ], + "flags": { + "collapsed": false + }, + "order": 29, + "mode": 0, + "inputs": [ + { + "localized_name": "samples", + "name": "samples", + "type": "LATENT", + "link": 470 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 471 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "slot_index": 0, + "links": [ + 423 + ] + } + ], + "properties": { + "Node name for S&R": "VAEDecode", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + } + }, + { + "id": 297, + "type": "PrimitiveInt", + "pos": [ + -2690, + 3350 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 30, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "INT", + "widget": { + "name": "value" + }, + "link": 480 + } + ], + "outputs": [ + { + "localized_name": "INT", + "name": "INT", + "type": "INT", + "links": [ + 462 + ] + } + ], + "title": "Int (Width)", + "properties": { + "Node name for S&R": "PrimitiveInt", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + 
"tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 720, + "fixed" + ] + }, + { + "id": 298, + "type": "GetVideoComponents", + "pos": [ + -2330, + 3210 + ], + "size": [ + 230, + 120 + ], + "flags": { + "collapsed": false + }, + "order": 31, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 473 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 460 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 424 + ] + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": [ + 425 + ] + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "cnr_id": "comfy-core", + "ver": "0.3.40", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 299, + "type": "UNETLoader", + "pos": [ + -2720, + 1980 + ], + "size": [ + 370, + 140 + ], + "flags": {}, + "order": 32, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 485 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "slot_index": 0, + "links": [ + 428, + 429 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.3.34", + "models": [ + { + "name": "wan2.1_vace_14B_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/wan2.1_vace_14B_fp16.safetensors", + "directory": 
"diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "widget_ue_connectable": {} + }, + "widgets_values": [ + "wan2.1_vace_14B_fp16.safetensors", + "fp8_e4m3fn_fast" + ] + }, + { + "id": 300, + "type": "PrimitiveBoolean", + "pos": [ + -1390, + 2980 + ], + "size": [ + 270, + 100 + ], + "flags": {}, + "order": 33, + "mode": 0, + "inputs": [ + { + "localized_name": "value", + "name": "value", + "type": "BOOLEAN", + "widget": { + "name": "value" + }, + "link": 489 + } + ], + "outputs": [ + { + "localized_name": "BOOLEAN", + "name": "BOOLEAN", + "type": "BOOLEAN", + "links": [ + 431, + 434, + 468 + ] + } + ], + "title": "Boolean (Enable Lightning LoRA)", + "properties": { + "Node name for S&R": "PrimitiveBoolean", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true + ] + }, + { + "id": 308, + "type": "ImageFromBatch", + "pos": [ + -2360, + 3410 + ], + "size": [ + 270, + 140 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": null + }, + { + "localized_name": "batch_index", + "name": "batch_index", + "type": "INT", + "widget": { + "name": "batch_index" + }, + "link": null + }, + { + "localized_name": "length", + "name": "length", + "type": "INT", + "widget": { + "name": "length" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": null + } + ], + "properties": { + "Node name for S&R": "ImageFromBatch", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + 
"secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0, + 1 + ] + }, + { + "id": 310, + "type": "MaskPreview", + "pos": [ + -900, + 3230 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 34, + "mode": 4, + "inputs": [ + { + "localized_name": "mask", + "name": "mask", + "type": "MASK", + "link": 498 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "MaskPreview", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + }, + { + "id": 312, + "type": "PreviewImage", + "pos": [ + -520, + 3230 + ], + "size": [ + 230, + 80 + ], + "flags": {}, + "order": 35, + "mode": 4, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 499 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage", + "cnr_id": "comfy-core", + "ver": "0.21.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + } + ], + "groups": [ + { + "id": 1, + "title": "Models", + "bounding": [ + -2750, + 1860, + 430, + 770 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 2, + "title": "Prompt", + "bounding": [ + -2290, + 1860, + 460, + 770 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 3, + "title": "Sampling", + "bounding": [ + -590, + 1860, + 700, + 620 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 20, + "title": "Create Video Mask", + "bounding": [ + -2030, + 3110, + 440, + 550 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 23, + "title": "Conditioning", + "bounding": [ + -1800, + 1860, + 370, + 450 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 26, + "title": "Apply Mask to Video", + "bounding": [ + -1560, + 3110, + 1320, + 550 + ], + "color": "#3f789e", + "flags": {} 
+ }, + { + "id": 29, + "title": "Switch Logic", + "bounding": [ + -1400, + 1860, + 780, + 1060 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 27, + "title": "Lightning LoRA", + "bounding": [ + -1390, + 2290, + 370, + 620 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 28, + "title": "Original", + "bounding": [ + -1390, + 1900, + 370, + 370 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 31, + "title": "Video Size Preprocessing", + "bounding": [ + -2740, + 3110, + 680, + 770 + ], + "color": "#3f789e", + "flags": {} + }, + { + "id": 30, + "title": "Size", + "bounding": [ + -2710, + 3270, + 330, + 470 + ], + "color": "#3f789e", + "flags": {} + } + ], + "links": [ + { + "id": 422, + "origin_id": 275, + "origin_slot": 0, + "target_id": 266, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 423, + "origin_id": 296, + "origin_slot": 0, + "target_id": 267, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 424, + "origin_id": 298, + "origin_slot": 1, + "target_id": 267, + "target_slot": 1, + "type": "AUDIO" + }, + { + "id": 425, + "origin_id": 298, + "origin_slot": 2, + "target_id": 267, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 426, + "origin_id": 293, + "origin_slot": 0, + "target_id": 268, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 427, + "origin_id": 268, + "origin_slot": 0, + "target_id": 269, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 428, + "origin_id": 299, + "origin_slot": 0, + "target_id": 272, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 429, + "origin_id": 299, + "origin_slot": 0, + "target_id": 275, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 430, + "origin_id": 272, + "origin_slot": 0, + "target_id": 275, + "target_slot": 1, + "type": "MODEL" + }, + { + "id": 431, + "origin_id": 300, + "origin_slot": 0, + "target_id": 275, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 432, + "origin_id": 271, + "origin_slot": 0, + "target_id": 276, + "target_slot": 0, +
"type": "FLOAT" + }, + { + "id": 433, + "origin_id": 274, + "origin_slot": 0, + "target_id": 276, + "target_slot": 1, + "type": "FLOAT" + }, + { + "id": 434, + "origin_id": 300, + "origin_slot": 0, + "target_id": 276, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 435, + "origin_id": 277, + "origin_slot": 0, + "target_id": 279, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 436, + "origin_id": 277, + "origin_slot": 0, + "target_id": 280, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 437, + "origin_id": 280, + "origin_slot": 0, + "target_id": 281, + "target_slot": 0, + "type": "CONDITIONING" + }, + { + "id": 438, + "origin_id": 279, + "origin_slot": 0, + "target_id": 281, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 439, + "origin_id": 278, + "origin_slot": 0, + "target_id": 281, + "target_slot": 2, + "type": "VAE" + }, + { + "id": 440, + "origin_id": 284, + "origin_slot": 0, + "target_id": 281, + "target_slot": 3, + "type": "IMAGE" + }, + { + "id": 441, + "origin_id": 269, + "origin_slot": 0, + "target_id": 281, + "target_slot": 4, + "type": "MASK" + }, + { + "id": 442, + "origin_id": 295, + "origin_slot": 0, + "target_id": 281, + "target_slot": 6, + "type": "INT" + }, + { + "id": 443, + "origin_id": 295, + "origin_slot": 1, + "target_id": 281, + "target_slot": 7, + "type": "INT" + }, + { + "id": 444, + "origin_id": 295, + "origin_slot": 2, + "target_id": 281, + "target_slot": 8, + "type": "INT" + }, + { + "id": 445, + "origin_id": 269, + "origin_slot": 0, + "target_id": 282, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 446, + "origin_id": 282, + "origin_slot": 0, + "target_id": 283, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 447, + "origin_id": 293, + "origin_slot": 0, + "target_id": 284, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 448, + "origin_id": 283, + "origin_slot": 0, + "target_id": 284, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 449, + "origin_id": 269, + "origin_slot": 
0, + "target_id": 284, + "target_slot": 2, + "type": "MASK" + }, + { + "id": 452, + "origin_id": 288, + "origin_slot": 0, + "target_id": 287, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 453, + "origin_id": 281, + "origin_slot": 3, + "target_id": 287, + "target_slot": 1, + "type": "INT" + }, + { + "id": 454, + "origin_id": 266, + "origin_slot": 0, + "target_id": 288, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 455, + "origin_id": 281, + "origin_slot": 0, + "target_id": 288, + "target_slot": 1, + "type": "CONDITIONING" + }, + { + "id": 456, + "origin_id": 281, + "origin_slot": 1, + "target_id": 288, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 457, + "origin_id": 281, + "origin_slot": 2, + "target_id": 288, + "target_slot": 3, + "type": "LATENT" + }, + { + "id": 458, + "origin_id": 294, + "origin_slot": 0, + "target_id": 288, + "target_slot": 5, + "type": "INT" + }, + { + "id": 459, + "origin_id": 276, + "origin_slot": 0, + "target_id": 288, + "target_slot": 6, + "type": "FLOAT" + }, + { + "id": 460, + "origin_id": 298, + "origin_slot": 0, + "target_id": 289, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 461, + "origin_id": 290, + "origin_slot": 0, + "target_id": 291, + "target_slot": 0, + "type": "INT" + }, + { + "id": 462, + "origin_id": 297, + "origin_slot": 0, + "target_id": 292, + "target_slot": 0, + "type": "INT" + }, + { + "id": 463, + "origin_id": 289, + "origin_slot": 0, + "target_id": 293, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 464, + "origin_id": 292, + "origin_slot": 1, + "target_id": 293, + "target_slot": 2, + "type": "INT" + }, + { + "id": 465, + "origin_id": 291, + "origin_slot": 1, + "target_id": 293, + "target_slot": 3, + "type": "INT" + }, + { + "id": 466, + "origin_id": 270, + "origin_slot": 0, + "target_id": 294, + "target_slot": 0, + "type": "INT" + }, + { + "id": 467, + "origin_id": 273, + "origin_slot": 0, + "target_id": 294, + "target_slot": 1, + "type": "INT" + }, + { + "id": 
468, + "origin_id": 300, + "origin_slot": 0, + "target_id": 294, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 469, + "origin_id": 293, + "origin_slot": 0, + "target_id": 295, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 470, + "origin_id": 287, + "origin_slot": 0, + "target_id": 296, + "target_slot": 0, + "type": "LATENT" + }, + { + "id": 471, + "origin_id": 278, + "origin_slot": 0, + "target_id": 296, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 473, + "origin_id": -10, + "origin_slot": 0, + "target_id": 298, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 474, + "origin_id": 267, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 478, + "origin_id": -10, + "origin_slot": 1, + "target_id": 281, + "target_slot": 5, + "type": "IMAGE" + }, + { + "id": 479, + "origin_id": -10, + "origin_slot": 2, + "target_id": 280, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 480, + "origin_id": -10, + "origin_slot": 3, + "target_id": 297, + "target_slot": 0, + "type": "INT" + }, + { + "id": 481, + "origin_id": -10, + "origin_slot": 4, + "target_id": 290, + "target_slot": 0, + "type": "INT" + }, + { + "id": 494, + "origin_id": -10, + "origin_slot": 5, + "target_id": 289, + "target_slot": 2, + "type": "INT" + }, + { + "id": 483, + "origin_id": -10, + "origin_slot": 6, + "target_id": 288, + "target_slot": 4, + "type": "INT" + }, + { + "id": 485, + "origin_id": -10, + "origin_slot": 7, + "target_id": 299, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 486, + "origin_id": -10, + "origin_slot": 8, + "target_id": 277, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 487, + "origin_id": -10, + "origin_slot": 9, + "target_id": 278, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 489, + "origin_id": -10, + "origin_slot": 10, + "target_id": 300, + "target_slot": 0, + "type": "BOOLEAN" + }, + { + "id": 490, + "origin_id": -10, + "origin_slot": 11, + "target_id": 272, + "target_slot": 
1, + "type": "COMBO" + }, + { + "id": 491, + "origin_id": -10, + "origin_slot": 12, + "target_id": 268, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 496, + "origin_id": -10, + "origin_slot": 13, + "target_id": 269, + "target_slot": 1, + "type": "INT" + }, + { + "id": 497, + "origin_id": -10, + "origin_slot": 14, + "target_id": 268, + "target_slot": 8, + "type": "COMBO" + }, + { + "id": 498, + "origin_id": 269, + "origin_slot": 0, + "target_id": 310, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 499, + "origin_id": 284, + "origin_slot": 0, + "target_id": 312, + "target_slot": 0, + "type": "IMAGE" + } + ], + "extra": {}, + "category": "Video generation and editing/Inpaint video", + "description": "Removes objects from video by inpainting masked regions using Wan 2.1 VACE, with SAM3 text-guided segmentation and optional Lightning LoRA turbo mode." + }, + { + "id": "17df2eeb-d89e-46ee-9480-a4ca2494b207", + "version": 1, + "state": { + "lastGroupId": 31, + "lastNodeId": 315, + "lastLinkId": 499, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Image Segmentation (SAM3)", + "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes.", + "inputNode": { + "id": -10, + "bounding": [ + -2260, + -3450, + 136.369140625, + 220 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1130, + -3305, + 120, + 80 + ] + }, + "inputs": [ + { + "id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6", + "name": "image", + "type": "IMAGE", + "linkIds": [ + 264 + ], + "localized_name": "image", + "label": "image", + "pos": [ + -2143.630859375, + -3430 + ] + }, + { + "id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745", + "name": "text", + "type": "STRING", + "linkIds": [ + 265 + ], + "label": "object", + "pos": [ + -2143.630859375, + -3410 + ] + }, + { + "id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 266 + ], + "pos": [ + -2143.630859375, + -3390 + ] + }, + { + "id": 
"1ec7ce1a-8257-4719-8a81-60ebc8a98899", + "name": "positive_coords", + "type": "STRING", + "linkIds": [ + 267 + ], + "pos": [ + -2143.630859375, + -3370 + ] + }, + { + "id": "c65f8b87-9bd7-48be-9fc2-823431e95019", + "name": "negative_coords", + "type": "STRING", + "linkIds": [ + 268 + ], + "pos": [ + -2143.630859375, + -3350 + ] + }, + { + "id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb", + "name": "threshold", + "type": "FLOAT", + "linkIds": [ + 269 + ], + "pos": [ + -2143.630859375, + -3330 + ] + }, + { + "id": "b1439668-b050-490b-a5dc-fc4052c55666", + "name": "refine_iterations", + "type": "INT", + "linkIds": [ + 270 + ], + "pos": [ + -2143.630859375, + -3310 + ] + }, + { + "id": "86e239e5-c098-4302-b54d-d42a38bc0f89", + "name": "individual_masks", + "type": "BOOLEAN", + "linkIds": [ + 271 + ], + "pos": [ + -2143.630859375, + -3290 + ] + }, + { + "id": "f9e0b9d4-b2f1-4907-a4a5-305656576706", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 272 + ], + "pos": [ + -2143.630859375, + -3270 + ] + } + ], + "outputs": [ + { + "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913", + "name": "masks", + "type": "MASK", + "linkIds": [ + 231 + ], + "localized_name": "masks", + "pos": [ + -1110, + -3285 + ] + }, + { + "id": "8f622e40-8528-4078-b7d3-147e9f872194", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 232 + ], + "localized_name": "bboxes", + "pos": [ + -1110, + -3265 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 75, + "type": "SAM3_Detect", + "pos": [ + -1470, + -3460 + ], + "size": [ + 270, + 260 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "label": "model", + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 237 + }, + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 264 + }, + { + "label": "conditioning", + "localized_name": "conditioning", + "name": "conditioning", + "shape": 7, + "type": "CONDITIONING", + "link": 200 + }, + { + "label": 
"bboxes", + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 266 + }, + { + "label": "positive_coords", + "localized_name": "positive_coords", + "name": "positive_coords", + "shape": 7, + "type": "STRING", + "link": 267 + }, + { + "label": "negative_coords", + "localized_name": "negative_coords", + "name": "negative_coords", + "shape": 7, + "type": "STRING", + "link": 268 + }, + { + "localized_name": "threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": 269 + }, + { + "localized_name": "refine_iterations", + "name": "refine_iterations", + "type": "INT", + "widget": { + "name": "refine_iterations" + }, + "link": 270 + }, + { + "localized_name": "individual_masks", + "name": "individual_masks", + "type": "BOOLEAN", + "widget": { + "name": "individual_masks" + }, + "link": 271 + } + ], + "outputs": [ + { + "localized_name": "masks", + "name": "masks", + "type": "MASK", + "links": [ + 231 + ] + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 232 + ] + } + ], + "properties": { + "Node name for S&R": "SAM3_Detect", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + 0.5, + 2, + false + ] + }, + { + "id": 236, + "type": "CheckpointLoaderSimple", + "pos": [ + -1970, + -3200 + ], + "size": [ + 330, + 140 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 272 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 237 + ] + }, + { + 
"localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [ + 240 + ] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": null + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "models": [ + { + "name": "sam3.1_multiplex_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "sam3.1_multiplex_fp16.safetensors" + ] + }, + { + "id": 237, + "type": "CLIPTextEncode", + "pos": [ + -2000, + -3000 + ], + "size": [ + 400, + 200 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "clip", + "name": "clip", + "type": "CLIP", + "link": 240 + }, + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "widget": { + "name": "text" + }, + "link": 265 + } + ], + "outputs": [ + { + "localized_name": "CONDITIONING", + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 200 + ] + } + ], + "properties": { + "Node name for S&R": "CLIPTextEncode", + "cnr_id": "comfy-core", + "ver": "0.19.3", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65, + "ue_properties": { + "widget_ue_connectable": {}, + "version": "7.7", + "input_ue_unconnectable": {} + } + }, + "widgets_values": [ + "" + ] + } + ], + "groups": [], + "links": [ + { + "id": 237, + "origin_id": 236, + "origin_slot": 0, + "target_id": 75, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 200, + "origin_id": 237, + "origin_slot": 
0, + "target_id": 75, + "target_slot": 2, + "type": "CONDITIONING" + }, + { + "id": 240, + "origin_id": 236, + "origin_slot": 1, + "target_id": 237, + "target_slot": 0, + "type": "CLIP" + }, + { + "id": 231, + "origin_id": 75, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "MASK" + }, + { + "id": 232, + "origin_id": 75, + "origin_slot": 1, + "target_id": -20, + "target_slot": 1, + "type": "BOUNDING_BOX" + }, + { + "id": 264, + "origin_id": -10, + "origin_slot": 0, + "target_id": 75, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 265, + "origin_id": -10, + "origin_slot": 1, + "target_id": 237, + "target_slot": 1, + "type": "STRING" + }, + { + "id": 266, + "origin_id": -10, + "origin_slot": 2, + "target_id": 75, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 267, + "origin_id": -10, + "origin_slot": 3, + "target_id": 75, + "target_slot": 4, + "type": "STRING" + }, + { + "id": 268, + "origin_id": -10, + "origin_slot": 4, + "target_id": 75, + "target_slot": 5, + "type": "STRING" + }, + { + "id": 269, + "origin_id": -10, + "origin_slot": 5, + "target_id": 75, + "target_slot": 6, + "type": "FLOAT" + }, + { + "id": 270, + "origin_id": -10, + "origin_slot": 6, + "target_id": 75, + "target_slot": 7, + "type": "INT" + }, + { + "id": 271, + "origin_id": -10, + "origin_slot": 7, + "target_id": 75, + "target_slot": 8, + "type": "BOOLEAN" + }, + { + "id": 272, + "origin_id": -10, + "origin_slot": 8, + "target_id": 236, + "target_slot": 0, + "type": "COMBO" + } + ], + "extra": { + "ue_links": [] + } + } + ] + }, + "extra": {} +} \ No newline at end of file diff --git a/blueprints/Video Segmentation (SAM3).json b/blueprints/Video Segmentation (SAM3).json index 4d9a13412..4c7253869 100644 --- a/blueprints/Video Segmentation (SAM3).json +++ b/blueprints/Video Segmentation (SAM3).json @@ -818,7 +818,7 @@ } ], "extra": {}, - "category": "Video Tools", + "category": "Conditioning & Preprocessors/Segmentation & Mask", "description": 
"Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts." } ] diff --git a/blueprints/Video Upscale(GAN x4).json b/blueprints/Video Upscale(GAN x4).json index 73476e36b..fc291ac41 100644 --- a/blueprints/Video Upscale(GAN x4).json +++ b/blueprints/Video Upscale(GAN x4).json @@ -412,7 +412,7 @@ "extra": { "workflowRendererVersion": "LG" }, - "category": "Video generation and editing/Enhance video", + "category": "Video generation and editing/Upscale", "description": "Upscales video to 4× resolution using a GAN-based upscaling model." } ] diff --git a/blueprints/Video to Pose Map (SDPose Multi-Person).json b/blueprints/Video to Pose Map (SDPose Multi-Person).json new file mode 100644 index 000000000..64ef6e524 --- /dev/null +++ b/blueprints/Video to Pose Map (SDPose Multi-Person).json @@ -0,0 +1,1323 @@ +{ + "revision": 0, + "last_node_id": 675, + "last_link_id": 0, + "nodes": [ + { + "id": 675, + "type": "01b6a731-fb78-4070-9a38-c87146da9604", + "pos": [ + -2480, + 3400 + ], + "size": [ + 370, + 638.625 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "label": "resize_target_longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": null + }, + { + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": null + }, + { + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": null + }, + { + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": null + }, + { + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": null + }, + { + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": null + }, + { + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": null + }, + { + "name": "face_point_size", + "type": 
"INT", + "widget": { + "name": "face_point_size" + }, + "link": null + }, + { + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": null + }, + { + "label": "detect_threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": null + }, + { + "label": "detect_class", + "name": "class_name", + "type": "COMBO", + "widget": { + "name": "class_name" + }, + "link": null + }, + { + "name": "max_detections", + "type": "INT", + "widget": { + "name": "max_detections" + }, + "link": null + }, + { + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": null + }, + { + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": null + }, + { + "name": "video", + "type": "VIDEO", + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [] + }, + { + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": null + }, + { + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [] + }, + { + "name": "audio", + "type": "AUDIO", + "links": [] + }, + { + "name": "fps", + "type": "FLOAT", + "links": [] + } + ], + "properties": { + "proxyWidgets": [ + [ + "674", + "resize_type.longer_size" + ], + [ + "674", + "scale_method" + ], + [ + "672", + "draw_body" + ], + [ + "672", + "draw_hands" + ], + [ + "672", + "draw_face" + ], + [ + "672", + "draw_feet" + ], + [ + "672", + "stick_width" + ], + [ + "672", + "face_point_size" + ], + [ + "672", + "score_threshold" + ], + [ + "678", + "threshold" + ], + [ + "678", + "class_name" + ], + [ + "678", + "max_detections" + ], + [ + "673", + "ckpt_name" + ], + [ + "677", + "unet_name" + ] + ], + "cnr_id": "comfy-core", + "ver": "0.15.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": 
[], + "title": "Video to Pose Map (SDPose Multi-Person)" + } + ], + "links": [], + "version": 0.4, + "definitions": { + "subgraphs": [ + { + "id": "01b6a731-fb78-4070-9a38-c87146da9604", + "version": 1, + "state": { + "lastGroupId": 2, + "lastNodeId": 699, + "lastLinkId": 1754, + "lastRerouteId": 0 + }, + "revision": 0, + "config": {}, + "name": "Video to Pose Map (SDPose Multi-Person)", + "inputNode": { + "id": -10, + "bounding": [ + -3570, + 3300, + 182.8984375, + 340 + ] + }, + "outputNode": { + "id": -20, + "bounding": [ + -1890, + 3730, + 120, + 140 + ] + }, + "inputs": [ + { + "id": "088eefc1-cd8a-4573-993f-9e4da008a12d", + "name": "resize_type.longer_size", + "type": "INT", + "linkIds": [ + 1704 + ], + "label": "resize_target_longer_size", + "pos": [ + -3407.1015625, + 3320 + ] + }, + { + "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e", + "name": "scale_method", + "type": "COMBO", + "linkIds": [ + 1705 + ], + "pos": [ + -3407.1015625, + 3340 + ] + }, + { + "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0", + "name": "draw_body", + "type": "BOOLEAN", + "linkIds": [ + 1706 + ], + "pos": [ + -3407.1015625, + 3360 + ] + }, + { + "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c", + "name": "draw_hands", + "type": "BOOLEAN", + "linkIds": [ + 1707 + ], + "pos": [ + -3407.1015625, + 3380 + ] + }, + { + "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e", + "name": "draw_face", + "type": "BOOLEAN", + "linkIds": [ + 1708 + ], + "pos": [ + -3407.1015625, + 3400 + ] + }, + { + "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f", + "name": "draw_feet", + "type": "BOOLEAN", + "linkIds": [ + 1709 + ], + "pos": [ + -3407.1015625, + 3420 + ] + }, + { + "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb", + "name": "stick_width", + "type": "INT", + "linkIds": [ + 1710 + ], + "pos": [ + -3407.1015625, + 3440 + ] + }, + { + "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133", + "name": "face_point_size", + "type": "INT", + "linkIds": [ + 1711 + ], + "pos": [ + -3407.1015625, + 3460 + ] + }, + { + "id": 
"514c5503-f9e6-4d23-b1ae-1d3291acb2a3", + "name": "score_threshold", + "type": "FLOAT", + "linkIds": [ + 1712 + ], + "pos": [ + -3407.1015625, + 3480 + ] + }, + { + "id": "4eb3e4ea-7a36-4511-8483-0d12aadd32f7", + "name": "threshold", + "type": "FLOAT", + "linkIds": [ + 1718 + ], + "label": "detect_threshold", + "pos": [ + -3407.1015625, + 3500 + ] + }, + { + "id": "c76a7a05-81e6-4b17-a9e0-85f47a5844f2", + "name": "class_name", + "type": "COMBO", + "linkIds": [ + 1719 + ], + "label": "detect_class", + "pos": [ + -3407.1015625, + 3520 + ] + }, + { + "id": "4417e988-6e80-4236-be31-4c179037f5a2", + "name": "max_detections", + "type": "INT", + "linkIds": [ + 1720 + ], + "pos": [ + -3407.1015625, + 3540 + ] + }, + { + "id": "7d7c4a0b-0d1b-4c98-942b-f90548d2a492", + "name": "ckpt_name", + "type": "COMBO", + "linkIds": [ + 1721 + ], + "pos": [ + -3407.1015625, + 3560 + ] + }, + { + "id": "4d75122c-2c14-452a-98fe-d1545d3e012a", + "name": "unet_name", + "type": "COMBO", + "linkIds": [ + 1722 + ], + "pos": [ + -3407.1015625, + 3580 + ] + }, + { + "id": "6c46c988-4dd1-41a2-957e-03caf60d7657", + "name": "video", + "type": "VIDEO", + "linkIds": [ + 1741 + ], + "pos": [ + -3407.1015625, + 3600 + ] + } + ], + "outputs": [ + { + "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48", + "name": "IMAGE", + "type": "IMAGE", + "linkIds": [ + 1701 + ], + "localized_name": "IMAGE", + "pos": [ + -1870, + 3750 + ] + }, + { + "id": "4b64118e-3cef-4eeb-9dad-4cd09cfd63a2", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "linkIds": [ + 1725 + ], + "pos": [ + -1870, + 3770 + ] + }, + { + "id": "a27f7e34-dcbc-4fb0-a4e1-2c5fc423ca5f", + "name": "bboxes", + "type": "BOUNDING_BOX", + "linkIds": [ + 1726 + ], + "pos": [ + -1870, + 3790 + ] + }, + { + "id": "b7fe351d-2b38-41ea-9f4d-3be1a0aad275", + "name": "audio", + "type": "AUDIO", + "linkIds": [ + 1743 + ], + "pos": [ + -1870, + 3810 + ] + }, + { + "id": "ae187b6f-c9ca-4487-b5c1-3ad775fe945e", + "name": "fps", + "type": "FLOAT", + "linkIds": [ + 1744 + 
], + "pos": [ + -1870, + 3830 + ] + } + ], + "widgets": [], + "nodes": [ + { + "id": 671, + "type": "SDPoseKeypointExtractor", + "pos": [ + -2550, + 3080 + ], + "size": [ + 270, + 180 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 1696 + }, + { + "localized_name": "vae", + "name": "vae", + "type": "VAE", + "link": 1697 + }, + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1698 + }, + { + "localized_name": "bboxes", + "name": "bboxes", + "shape": 7, + "type": "BOUNDING_BOX", + "link": 1717 + }, + { + "localized_name": "batch_size", + "name": "batch_size", + "type": "INT", + "widget": { + "name": "batch_size" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "links": [ + 1699, + 1725 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseKeypointExtractor", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 16 + ] + }, + { + "id": 674, + "type": "ResizeImageMaskNode", + "pos": [ + -3010, + 3880 + ], + "size": [ + 270, + 110 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "input", + "name": "input", + "type": "IMAGE,MASK", + "link": 1742 + }, + { + "localized_name": "resize_type", + "name": "resize_type", + "type": "COMFY_DYNAMICCOMBO_V3", + "widget": { + "name": "resize_type" + }, + "link": null + }, + { + "localized_name": "resize_type.longer_size", + "name": "resize_type.longer_size", + "type": "INT", + "widget": { + "name": "resize_type.longer_size" + }, + "link": 1704 + }, + { + "localized_name": "scale_method", + "name": "scale_method", + "type": "COMBO", + "widget": { + "name": "scale_method" + }, + "link": 1705 + } + ], + 
"outputs": [ + { + "localized_name": "resized", + "name": "resized", + "type": "*", + "links": [ + 1698, + 1716 + ] + } + ], + "properties": { + "Node name for S&R": "ResizeImageMaskNode", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "scale longer dimension", + 1024, + "lanczos" + ] + }, + { + "id": 672, + "type": "SDPoseDrawKeypoints", + "pos": [ + -2540, + 3590 + ], + "size": [ + 270, + 280 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "keypoints", + "name": "keypoints", + "type": "POSE_KEYPOINT", + "link": 1699 + }, + { + "localized_name": "draw_body", + "name": "draw_body", + "type": "BOOLEAN", + "widget": { + "name": "draw_body" + }, + "link": 1706 + }, + { + "localized_name": "draw_hands", + "name": "draw_hands", + "type": "BOOLEAN", + "widget": { + "name": "draw_hands" + }, + "link": 1707 + }, + { + "localized_name": "draw_face", + "name": "draw_face", + "type": "BOOLEAN", + "widget": { + "name": "draw_face" + }, + "link": 1708 + }, + { + "localized_name": "draw_feet", + "name": "draw_feet", + "type": "BOOLEAN", + "widget": { + "name": "draw_feet" + }, + "link": 1709 + }, + { + "localized_name": "stick_width", + "name": "stick_width", + "type": "INT", + "widget": { + "name": "stick_width" + }, + "link": 1710 + }, + { + "localized_name": "face_point_size", + "name": "face_point_size", + "type": "INT", + "widget": { + "name": "face_point_size" + }, + "link": 1711 + }, + { + "localized_name": "score_threshold", + "name": "score_threshold", + "type": "FLOAT", + "widget": { + "name": "score_threshold" + }, + "link": 1712 + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 1701 + ] + } + ], + "properties": { + "Node name for S&R": "SDPoseDrawKeypoints", + "cnr_id": 
"comfy-core", + "ver": "0.15.0", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + true, + true, + true, + true, + 4, + 2, + 0.5 + ] + }, + { + "id": 673, + "type": "CheckpointLoaderSimple", + "pos": [ + -3040, + 3080 + ], + "size": [ + 390, + 160 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "ckpt_name", + "name": "ckpt_name", + "type": "COMBO", + "widget": { + "name": "ckpt_name" + }, + "link": 1721 + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": "MODEL", + "links": [ + 1696 + ] + }, + { + "localized_name": "CLIP", + "name": "CLIP", + "type": "CLIP", + "links": [] + }, + { + "localized_name": "VAE", + "name": "VAE", + "type": "VAE", + "links": [ + 1697 + ] + } + ], + "properties": { + "Node name for S&R": "CheckpointLoaderSimple", + "cnr_id": "comfy-core", + "ver": "0.15.0", + "models": [ + { + "name": "sdpose_wholebody_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors", + "directory": "checkpoints" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "sdpose_wholebody_fp16.safetensors" + ] + }, + { + "id": 677, + "type": "UNETLoader", + "pos": [ + -3030, + 3300 + ], + "size": [ + 370, + 110 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "unet_name", + "name": "unet_name", + "type": "COMBO", + "widget": { + "name": "unet_name" + }, + "link": 1722 + }, + { + "localized_name": "weight_dtype", + "name": "weight_dtype", + "type": "COMBO", + "widget": { + "name": "weight_dtype" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "MODEL", + "name": "MODEL", + "type": 
"MODEL", + "links": [ + 1715 + ] + } + ], + "properties": { + "Node name for S&R": "UNETLoader", + "cnr_id": "comfy-core", + "ver": "0.14.1", + "models": [ + { + "name": "rt_detr_v4-x-hgnet_fp16.safetensors", + "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/diffusion_models/rt_detr_v4-x-hgnet_fp16.safetensors", + "directory": "diffusion_models" + } + ], + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + "rt_detr_v4-x-hgnet_fp16.safetensors", + "default" + ] + }, + { + "id": 678, + "type": "RTDETR_detect", + "pos": [ + -2540, + 3320 + ], + "size": [ + 270, + 200 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "label": "model", + "localized_name": "model", + "name": "model", + "type": "MODEL", + "link": 1715 + }, + { + "label": "image", + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "link": 1716 + }, + { + "localized_name": "threshold", + "name": "threshold", + "type": "FLOAT", + "widget": { + "name": "threshold" + }, + "link": 1718 + }, + { + "localized_name": "class_name", + "name": "class_name", + "type": "COMBO", + "widget": { + "name": "class_name" + }, + "link": 1719 + }, + { + "localized_name": "max_detections", + "name": "max_detections", + "type": "INT", + "widget": { + "name": "max_detections" + }, + "link": 1720 + } + ], + "outputs": [ + { + "localized_name": "bboxes", + "name": "bboxes", + "type": "BOUNDING_BOX", + "links": [ + 1717, + 1726 + ] + } + ], + "properties": { + "Node name for S&R": "RTDETR_detect", + "cnr_id": "comfy-core", + "ver": "0.15.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + }, + "widgets_values": [ + 0.5, + "person", + 2 + ] + }, + { + "id": 692, + "type": "GetVideoComponents", + "pos": [ + -3010, + 4100 + ], + 
"size": [ + 230, + 120 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [ + { + "localized_name": "video", + "name": "video", + "type": "VIDEO", + "link": 1741 + } + ], + "outputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "links": [ + 1742 + ] + }, + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "links": [ + 1743 + ] + }, + { + "localized_name": "fps", + "name": "fps", + "type": "FLOAT", + "links": [ + 1744 + ] + } + ], + "properties": { + "Node name for S&R": "GetVideoComponents", + "cnr_id": "comfy-core", + "ver": "0.18.1", + "enableTabs": false, + "tabWidth": 65, + "tabXOffset": 10, + "hasSecondTab": false, + "secondTabText": "Send Back", + "secondTabOffset": 80, + "secondTabWidth": 65 + } + } + ], + "groups": [], + "links": [ + { + "id": 1696, + "origin_id": 673, + "origin_slot": 0, + "target_id": 671, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1697, + "origin_id": 673, + "origin_slot": 2, + "target_id": 671, + "target_slot": 1, + "type": "VAE" + }, + { + "id": 1698, + "origin_id": 674, + "origin_slot": 0, + "target_id": 671, + "target_slot": 2, + "type": "IMAGE" + }, + { + "id": 1699, + "origin_id": 671, + "origin_slot": 0, + "target_id": 672, + "target_slot": 0, + "type": "POSE_KEYPOINT" + }, + { + "id": 1701, + "origin_id": 672, + "origin_slot": 0, + "target_id": -20, + "target_slot": 0, + "type": "IMAGE" + }, + { + "id": 1704, + "origin_id": -10, + "origin_slot": 0, + "target_id": 674, + "target_slot": 2, + "type": "INT" + }, + { + "id": 1705, + "origin_id": -10, + "origin_slot": 1, + "target_id": 674, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1706, + "origin_id": -10, + "origin_slot": 2, + "target_id": 672, + "target_slot": 1, + "type": "BOOLEAN" + }, + { + "id": 1707, + "origin_id": -10, + "origin_slot": 3, + "target_id": 672, + "target_slot": 2, + "type": "BOOLEAN" + }, + { + "id": 1708, + "origin_id": -10, + "origin_slot": 4, + "target_id": 672, + 
"target_slot": 3, + "type": "BOOLEAN" + }, + { + "id": 1709, + "origin_id": -10, + "origin_slot": 5, + "target_id": 672, + "target_slot": 4, + "type": "BOOLEAN" + }, + { + "id": 1710, + "origin_id": -10, + "origin_slot": 6, + "target_id": 672, + "target_slot": 5, + "type": "INT" + }, + { + "id": 1711, + "origin_id": -10, + "origin_slot": 7, + "target_id": 672, + "target_slot": 6, + "type": "INT" + }, + { + "id": 1712, + "origin_id": -10, + "origin_slot": 8, + "target_id": 672, + "target_slot": 7, + "type": "FLOAT" + }, + { + "id": 1715, + "origin_id": 677, + "origin_slot": 0, + "target_id": 678, + "target_slot": 0, + "type": "MODEL" + }, + { + "id": 1716, + "origin_id": 674, + "origin_slot": 0, + "target_id": 678, + "target_slot": 1, + "type": "IMAGE" + }, + { + "id": 1717, + "origin_id": 678, + "origin_slot": 0, + "target_id": 671, + "target_slot": 3, + "type": "BOUNDING_BOX" + }, + { + "id": 1718, + "origin_id": -10, + "origin_slot": 9, + "target_id": 678, + "target_slot": 2, + "type": "FLOAT" + }, + { + "id": 1719, + "origin_id": -10, + "origin_slot": 10, + "target_id": 678, + "target_slot": 3, + "type": "COMBO" + }, + { + "id": 1720, + "origin_id": -10, + "origin_slot": 11, + "target_id": 678, + "target_slot": 4, + "type": "INT" + }, + { + "id": 1721, + "origin_id": -10, + "origin_slot": 12, + "target_id": 673, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1722, + "origin_id": -10, + "origin_slot": 13, + "target_id": 677, + "target_slot": 0, + "type": "COMBO" + }, + { + "id": 1725, + "origin_id": 671, + "origin_slot": 0, + "target_id": -20, + "target_slot": 1, + "type": "POSE_KEYPOINT" + }, + { + "id": 1726, + "origin_id": 678, + "origin_slot": 0, + "target_id": -20, + "target_slot": 2, + "type": "BOUNDING_BOX" + }, + { + "id": 1741, + "origin_id": -10, + "origin_slot": 14, + "target_id": 692, + "target_slot": 0, + "type": "VIDEO" + }, + { + "id": 1742, + "origin_id": 692, + "origin_slot": 0, + "target_id": 674, + "target_slot": 0, + "type": "IMAGE" + 
}, + { + "id": 1743, + "origin_id": 692, + "origin_slot": 1, + "target_id": -20, + "target_slot": 3, + "type": "AUDIO" + }, + { + "id": 1744, + "origin_id": 692, + "origin_slot": 2, + "target_id": -20, + "target_slot": 4, + "type": "FLOAT" + } + ], + "extra": { + "workflowRendererVersion": "LG" + }, + "category": "Conditioning & Preprocessors/Pose", + "description": "Extracts multi-person pose keypoints and skeleton frame sequences from video using SDPose with built-in person detection." + } + ] + }, + "extra": {} +} \ No newline at end of file From 0a2dd86e782dadfad43e4b995c12d1901ce48823 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Mon, 25 May 2026 18:26:40 -0700 Subject: [PATCH 03/13] MultiGPU Work Units For Accelerated Sampling (CORE-184) (#7063) --- comfy/cli_args.py | 2 +- comfy/controlnet.py | 65 +++- comfy/ldm/hunyuan3dv2_1/hunyuandit.py | 20 +- comfy/memory_management.py | 36 +-- comfy/model_management.py | 151 +++++++++- comfy/model_patcher.py | 177 ++++++++++- comfy/multigpu.py | 248 ++++++++++++++++ comfy/patcher_extension.py | 2 + comfy/sampler_helpers.py | 65 +++- comfy/samplers.py | 310 +++++++++++++++++-- comfy/sd.py | 381 ++++++++++++++---------- comfy/utils.py | 3 +- comfy_extras/nodes_multigpu.py | 412 ++++++++++++++++++++++++++ main.py | 2 +- nodes.py | 10 + server.py | 39 ++- 16 files changed, 1679 insertions(+), 244 deletions(-) create mode 100644 comfy/multigpu.py create mode 100644 comfy_extras/nodes_multigpu.py diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 47b8174f4..9bda414d1 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -49,7 +49,7 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. 
Overrides --base-directory.") parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.") parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.") -parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.") +parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use, as a comma-separated list (e.g. '0' or '0,1'). All other devices will not be visible.") parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.") cm_group = parser.add_mutually_exclusive_group() cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).") diff --git a/comfy/controlnet.py b/comfy/controlnet.py index ba670b16d..6dbbaa959 100644 --- a/comfy/controlnet.py +++ b/comfy/controlnet.py @@ -15,13 +15,14 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . 
""" - +from __future__ import annotations import torch from enum import Enum import math import os import logging +import copy import comfy.utils import comfy.model_management import comfy.model_detection @@ -38,7 +39,7 @@ import comfy.ldm.hydit.controlnet import comfy.ldm.flux.controlnet import comfy.ldm.qwen_image.controlnet import comfy.cldm.dit_embedder -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union if TYPE_CHECKING: from comfy.hooks import HookGroup @@ -64,6 +65,18 @@ class StrengthType(Enum): CONSTANT = 1 LINEAR_UP = 2 +class ControlIsolation: + '''Temporarily set a ControlBase object's previous_controlnet to None to prevent cascading calls.''' + def __init__(self, control: ControlBase): + self.control = control + self.orig_previous_controlnet = control.previous_controlnet + + def __enter__(self): + self.control.previous_controlnet = None + + def __exit__(self, *args): + self.control.previous_controlnet = self.orig_previous_controlnet + class ControlBase: def __init__(self): self.cond_hint_original = None @@ -77,7 +90,7 @@ class ControlBase: self.compression_ratio = 8 self.upscale_algorithm = 'nearest-exact' self.extra_args = {} - self.previous_controlnet = None + self.previous_controlnet: Union[ControlBase, None] = None self.extra_conds = [] self.strength_type = StrengthType.CONSTANT self.concat_mask = False @@ -85,6 +98,7 @@ class ControlBase: self.extra_concat = None self.extra_hooks: HookGroup = None self.preprocess_image = lambda a: a + self.multigpu_clones: dict[torch.device, ControlBase] = {} def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]): self.cond_hint_original = cond_hint @@ -111,17 +125,38 @@ class ControlBase: def cleanup(self): if self.previous_controlnet is not None: self.previous_controlnet.cleanup() - + for device_cnet in self.multigpu_clones.values(): + with ControlIsolation(device_cnet): + device_cnet.cleanup() self.cond_hint = None self.extra_concat 
= None self.timestep_range = None def get_models(self): out = [] + for device_cnet in self.multigpu_clones.values(): + out += device_cnet.get_models_only_self() if self.previous_controlnet is not None: out += self.previous_controlnet.get_models() return out + def get_models_only_self(self): + 'Calls get_models, but temporarily sets previous_controlnet to None.' + with ControlIsolation(self): + return self.get_models() + + def get_instance_for_device(self, device): + 'Returns instance of this Control object intended for selected device.' + return self.multigpu_clones.get(device, self) + + def deepclone_multigpu(self, load_device, autoregister=False): + ''' + Create deep clone of Control object where model(s) is set to other devices. + + When autoregister is set to True, the deep clone is also added to multigpu_clones dict. + ''' + raise NotImplementedError("Classes inheriting from ControlBase should define their own deepclone_multigpu funtion.") + def get_extra_hooks(self): out = [] if self.extra_hooks is not None: @@ -130,7 +165,7 @@ class ControlBase: out += self.previous_controlnet.get_extra_hooks() return out - def copy_to(self, c): + def copy_to(self, c: ControlBase): c.cond_hint_original = self.cond_hint_original c.strength = self.strength c.timestep_percent_range = self.timestep_percent_range @@ -284,6 +319,14 @@ class ControlNet(ControlBase): self.copy_to(c) return c + def deepclone_multigpu(self, load_device, autoregister=False): + c = self.copy() + c.control_model = copy.deepcopy(c.control_model) + c.control_model_wrapped = comfy.model_patcher.ModelPatcher(c.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device()) + if autoregister: + self.multigpu_clones[load_device] = c + return c + def get_models(self): out = super().get_models() out.append(self.control_model_wrapped) @@ -314,6 +357,10 @@ class QwenFunControlNet(ControlNet): super().pre_run(model, percent_to_timestep_function) self.set_extra_arg("base_model", 
model.diffusion_model) + def cleanup(self): + self.extra_args.pop("base_model", None) + super().cleanup() + def copy(self): c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype) c.control_model = self.control_model @@ -906,6 +953,14 @@ class T2IAdapter(ControlBase): self.copy_to(c) return c + def deepclone_multigpu(self, load_device, autoregister=False): + c = self.copy() + c.t2i_model = copy.deepcopy(c.t2i_model) + c.device = load_device + if autoregister: + self.multigpu_clones[load_device] = c + return c + def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options compression_ratio = 8 upscale_algorithm = 'nearest-exact' diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py index bc36b8998..4e4819fe3 100644 --- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py +++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py @@ -607,9 +607,13 @@ class HunYuanDiTPlain(nn.Module): def forward(self, x, t, context, transformer_options = {}, **kwargs): x = x.movedim(-1, -2) - if context.shape[0] >= 2: - uncond_emb, cond_emb = context.chunk(2, dim = 0) - context = torch.cat([cond_emb, uncond_emb], dim = 0) + + swap_cfg_halves = context.shape[0] >= 2 + + if swap_cfg_halves: + first_half, second_half = context.chunk(2, dim = 0) + context = torch.cat([second_half, first_half], dim = 0) + main_condition = context t = 1.0 - t @@ -657,8 +661,8 @@ class HunYuanDiTPlain(nn.Module): output = self.final_layer(combined) output = output.movedim(-2, -1) * (-1.0) - if output.shape[0] >= 2: - cond_emb, uncond_emb = output.chunk(2, dim = 0) - return torch.cat([uncond_emb, cond_emb]) - else: - return output + if swap_cfg_halves: + first_half, second_half = output.chunk(2, dim = 0) + output = torch.cat([second_half, first_half], dim = 0) + + return output diff --git a/comfy/memory_management.py b/comfy/memory_management.py index c43f0c4a2..962addb27 100644 --- 
a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -1,6 +1,5 @@ import math import ctypes -import threading import dataclasses import torch from typing import NamedTuple @@ -10,7 +9,7 @@ from comfy.quant_ops import QuantizedTensor class TensorFileSlice(NamedTuple): file_ref: object - thread_id: int + lock: object offset: int size: int @@ -43,7 +42,6 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N file_obj = info.file_ref if (destination.device.type != "cpu" or file_obj is None - or threading.get_ident() != info.thread_id or destination.numel() * destination.element_size() < info.size or tensor.numel() * tensor.element_size() != info.size or tensor.storage_offset() != 0 @@ -57,27 +55,29 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N if hostbuf is not None: stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0 device_ptr = destination2.data_ptr() if destination2 is not None else 0 - hostbuf.read_file_slice(file_obj, info.offset, info.size, - offset=destination.data_ptr() - hostbuf.get_raw_address(), - stream=stream_ptr, - device_ptr=device_ptr, - device=None if destination2 is None else destination2.device.index) + with info.lock: + hostbuf.read_file_slice(file_obj, info.offset, info.size, + offset=destination.data_ptr() - hostbuf.get_raw_address(), + stream=stream_ptr, + device_ptr=device_ptr, + device=None if destination2 is None else destination2.device.index) return True buf_type = ctypes.c_ubyte * info.size view = memoryview(buf_type.from_address(destination.data_ptr())) try: - file_obj.seek(info.offset) - done = 0 - while done < info.size: - try: - n = file_obj.readinto(view[done:]) - except OSError: - return False - if n <= 0: - return False - done += n + with info.lock: + file_obj.seek(info.offset) + done = 0 + while done < info.size: + try: + n = file_obj.readinto(view[done:]) + except OSError: + return False + if n <= 0: + return False + done += n 
return True finally: view.release() diff --git a/comfy/model_management.py b/comfy/model_management.py index cd8772d3a..b01c4d7fa 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -15,6 +15,7 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . """ +from __future__ import annotations import psutil import logging @@ -27,13 +28,18 @@ import platform import weakref import gc import os -from contextlib import nullcontext +from contextlib import contextmanager, nullcontext import comfy.memory_management import comfy.utils import comfy.quant_ops import comfy_aimdo.host_buffer import comfy_aimdo.vram_buffer +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from comfy.model_patcher import ModelPatcher + + class VRAMState(Enum): DISABLED = 0 #No vram present: no need to move models to vram NO_VRAM = 1 #Very low vram: enable all the options to save vram @@ -204,6 +210,107 @@ def get_torch_device(): else: return torch.device(torch.cuda.current_device()) +def get_all_torch_devices(exclude_current=False): + global cpu_state + devices = [] + if cpu_state == CPUState.GPU: + # NVIDIA + AMD/ROCm both expose their GPUs through torch.cuda.*; + # without the AMD arm, single-GPU ROCm users get an empty list + # which silently turns unload_all_models() into a no-op. + if is_nvidia() or is_amd(): + for i in range(torch.cuda.device_count()): + devices.append(torch.device("cuda", i)) + elif is_intel_xpu(): + for i in range(torch.xpu.device_count()): + devices.append(torch.device("xpu", i)) + elif is_ascend_npu(): + for i in range(torch.npu.device_count()): + devices.append(torch.device("npu", i)) + elif is_mlu(): + for i in range(torch.mlu.device_count()): + devices.append(torch.device("mlu", i)) + else: + # Fallback for unhandled GPU backends (e.g. DirectML): at least + # report the current device so callers like unload_all_models() + # do not silently no-op. 
+ devices.append(get_torch_device()) + else: + devices.append(get_torch_device()) + if exclude_current: + current = get_torch_device() + if current in devices: + devices.remove(current) + return devices + +def get_gpu_device_options(): + """Return list of device option strings for node widgets. + + Always includes "default" and "cpu". When multiple GPUs are present, + adds "gpu:0", "gpu:1", etc. (vendor-agnostic labels). + """ + options = ["default", "cpu"] + devices = get_all_torch_devices() + if len(devices) > 1: + for i in range(len(devices)): + options.append(f"gpu:{i}") + return options + +def get_gpu_device_options_no_cpu(): + """Variant of get_gpu_device_options that omits "cpu". + + Intended for components like the VAE selector where running on CPU + is impractical and should not be offered as a choice. + """ + return [o for o in get_gpu_device_options() if o != "cpu"] + +def resolve_gpu_device_option(option: str): + """Resolve a device option string to a torch.device. + + Returns None for "default" (let the caller use its normal default). + Returns torch.device("cpu") for "cpu". + For "gpu:N", returns the Nth torch device. Returns None if the + index is out of range, the option string is malformed, or + unrecognized (callers are expected to log their own context-rich + message before falling back to the default device). + """ + if option is None or option == "default": + return None + if option == "cpu": + return torch.device("cpu") + if option.startswith("gpu:"): + try: + idx = int(option[4:]) + except ValueError: + return None + devices = get_all_torch_devices() + if 0 <= idx < len(devices): + return devices[idx] + return None + +@contextmanager +def cuda_device_context(device): + """Context manager that sets torch.cuda.current_device to match *device*. + + Used when running operations on a non-default CUDA device so that custom + CUDA kernels (e.g. comfy_kitchen fp8 quantization) pick up the correct + device index. 
The previous device is restored on exit. + + No-op when *device* is not CUDA, has no explicit index, or already matches + the current device. + """ + prev = None + if device.type == "cuda" and device.index is not None: + prev = torch.cuda.current_device() + if prev != device.index: + torch.cuda.set_device(device) + else: + prev = None + try: + yield + finally: + if prev is not None: + torch.cuda.set_device(prev) + def get_total_memory(dev=None, torch_total_too=False): global directml_enabled if dev is None: @@ -492,9 +599,13 @@ try: logging.info("Device: {}".format(get_torch_device_name(get_torch_device()))) except: logging.warning("Could not pick default device.") +try: + for device in get_all_torch_devices(exclude_current=True): + logging.info("Device: {}".format(get_torch_device_name(device))) +except: + pass - -current_loaded_models = [] +current_loaded_models: list[LoadedModel] = [] DIRTY_MMAPS = set() @@ -554,7 +665,7 @@ def ensure_pin_registerable(size, evict_active=False): return shortfall <= REGISTERABLE_PIN_HYSTERESIS class LoadedModel: - def __init__(self, model): + def __init__(self, model: ModelPatcher): self._set_model(model) self.device = model.load_device self.real_model = None @@ -562,7 +673,7 @@ class LoadedModel: self.model_finalizer = None self._patcher_finalizer = None - def _set_model(self, model): + def _set_model(self, model: ModelPatcher): self._model = weakref.ref(model) if model.parent is not None: self._parent_model = weakref.ref(model.parent) @@ -573,6 +684,7 @@ class LoadedModel: model = self._parent_model() if model is not None: self._set_model(model) + self.device = model.load_device @property def model(self): @@ -1848,7 +1960,34 @@ def soft_empty_cache(force=False): torch.cuda.ipc_collect() def unload_all_models(): - free_memory(1e30, get_torch_device()) + for device in get_all_torch_devices(): + free_memory(1e30, device) + +def unload_model_and_clones(model: ModelPatcher, unload_additional_models=True, all_devices=False): + 'Unload 
only model and its clones - primarily for multigpu cloning purposes.' + initial_keep_loaded: list[LoadedModel] = current_loaded_models.copy() + additional_models = [] + if unload_additional_models: + additional_models = model.get_nested_additional_models() + keep_loaded = [] + for loaded_model in initial_keep_loaded: + if loaded_model.model is not None: + if model.clone_base_uuid == loaded_model.model.clone_base_uuid: + continue + # check additional models if they are a match + skip = False + for add_model in additional_models: + if add_model.clone_base_uuid == loaded_model.model.clone_base_uuid: + skip = True + break + if skip: + continue + keep_loaded.append(loaded_model) + if not all_devices: + free_memory(1e30, get_torch_device(), keep_loaded) + else: + for device in get_all_torch_devices(): + free_memory(1e30, device, keep_loaded) def debug_memory_summary(): if is_amd() or is_nvidia(): diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index b44b99e4a..00a15fa63 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -78,12 +78,15 @@ def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_ def create_model_options_clone(orig_model_options: dict): return comfy.patcher_extension.copy_nested_dicts(orig_model_options) -def create_hook_patches_clone(orig_hook_patches): +def create_hook_patches_clone(orig_hook_patches, copy_tuples=False): new_hook_patches = {} for hook_ref in orig_hook_patches: new_hook_patches[hook_ref] = {} for k in orig_hook_patches[hook_ref]: new_hook_patches[hook_ref][k] = orig_hook_patches[hook_ref][k][:] + if copy_tuples: + for i in range(len(new_hook_patches[hook_ref][k])): + new_hook_patches[hook_ref][k][i] = tuple(new_hook_patches[hook_ref][k][i]) return new_hook_patches def wipe_lowvram_weight(m): @@ -329,7 +332,10 @@ class ModelPatcher: self.is_clip = False self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed - self.cached_patcher_init: tuple[Callable, tuple] | None = None + 
self.cached_patcher_init: tuple[Callable, tuple] | tuple[Callable, tuple, int] | None = None + self.is_multigpu_base_clone = False + self.clone_base_uuid = uuid.uuid4() + if not hasattr(self.model, 'model_loaded_weight_memory'): self.model.model_loaded_weight_memory = 0 @@ -366,7 +372,8 @@ class ModelPatcher: #than pays for CFG. So return everything both torch and Aimdo could give us aimdo_mem = 0 if comfy.memory_management.aimdo_enabled: - aimdo_mem = comfy_aimdo.model_vbar.vbars_analyze() + aimdo_device = device.index if getattr(device, "type", None) == "cuda" else None + aimdo_mem = comfy_aimdo.model_vbar.vbars_analyze(aimdo_device) return comfy.model_management.get_free_memory(device) + aimdo_mem def get_clone_model_override(self): @@ -380,6 +387,8 @@ class ModelPatcher: if self.cached_patcher_init is None: raise RuntimeError("Cannot create non-dynamic delegate: cached_patcher_init is not initialized.") temp_model_patcher = self.cached_patcher_init[0](*self.cached_patcher_init[1], disable_dynamic=True) + if len(self.cached_patcher_init) > 2: + temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]] model_override = temp_model_patcher.get_clone_model_override() if model_override is None: model_override = self.get_clone_model_override() @@ -438,19 +447,113 @@ class ModelPatcher: n.hook_mode = self.hook_mode n.cached_patcher_init = self.cached_patcher_init + n.is_multigpu_base_clone = self.is_multigpu_base_clone + n.clone_base_uuid = self.clone_base_uuid for callback in self.get_all_callbacks(CallbacksMP.ON_CLONE): callback(self, n) return n + def deepclone_multigpu(self, new_load_device=None, models_cache: dict[uuid.UUID,ModelPatcher]=None): + logging.info(f"Creating deepclone of {self.model.__class__.__name__} for {new_load_device if new_load_device else self.load_device}.") + if self.cached_patcher_init is None: + raise RuntimeError( + f"Cannot create multigpu deepclone of {self.model.__class__.__name__}: " + "the loader that produced this model 
does not support multigpu " + "(cached_patcher_init is not initialized). Use a core loader " + "(CheckpointLoaderSimple, UNETLoader, CLIPLoader/DualCLIPLoader, VAELoader), " + "or have the custom loader register a cached_patcher_init factory." + ) + comfy.model_management.unload_model_and_clones(self) + # Produce a freshly-loaded patcher from the loader factory so the multigpu + # clone owns its own untainted model weights (rather than relying on + # copy.deepcopy of an already-patched/already-loaded module). + temp_model_patcher: ModelPatcher | list[ModelPatcher] = self.cached_patcher_init[0](*self.cached_patcher_init[1]) + if len(self.cached_patcher_init) > 2: + temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]] + # Override clone()'s normal "share self.model + share backup containers" with + # the pristine model from temp_model_patcher plus empty backup containers -- + # the fresh model has no patches applied, so any deepcopy of self's stale + # backup/object_patches_backup/pinned would just propagate dead state that + # no longer corresponds to anything in n.model. + model_override = (temp_model_patcher.model, ({}, {}, {}, set())) + n = self.clone(model_override=model_override) + # clone() copies hook_backup by reference from self; reset since model is pristine. + n.hook_backup = {} + # set load device, if present + if new_load_device is not None: + n.load_device = new_load_device + # Ensure any per-device bookkeeping (e.g. ModelPatcherDynamic.dynamic_pins) + # has an entry for n.load_device on the freshly-loaded n.model. temp_model_patcher's + # __init__ only registered its own (default) load_device. 
+ if hasattr(n, "register_load_device"): + n.register_load_device(n.load_device) + # multigpu clone should not have multigpu additional_models entry + n.remove_additional_models("multigpu") + # multigpu_clone all stored additional_models; make sure circular references are properly handled + if models_cache is None: + models_cache = {} + for key, model_list in n.additional_models.items(): + for i in range(len(model_list)): + add_model = n.additional_models[key][i] + if add_model.clone_base_uuid not in models_cache: + models_cache[add_model.clone_base_uuid] = add_model.deepclone_multigpu(new_load_device=new_load_device, models_cache=models_cache) + n.additional_models[key][i] = models_cache[add_model.clone_base_uuid] + for callback in self.get_all_callbacks(CallbacksMP.ON_DEEPCLONE_MULTIGPU): + callback(self, n) + return n + + def match_multigpu_clones(self): + multigpu_models = self.get_additional_models_with_key("multigpu") + if len(multigpu_models) > 0: + new_multigpu_models = [] + for mm in multigpu_models: + # clone main model, but bring over relevant props from existing multigpu clone + n = self.clone() + n.load_device = mm.load_device + n.backup = mm.backup + n.object_patches_backup = mm.object_patches_backup + n.hook_backup = mm.hook_backup + n.model = mm.model + n.is_multigpu_base_clone = mm.is_multigpu_base_clone + n.remove_additional_models("multigpu") + orig_additional_models: dict[str, list[ModelPatcher]] = comfy.patcher_extension.copy_nested_dicts(n.additional_models) + n.additional_models = comfy.patcher_extension.copy_nested_dicts(mm.additional_models) + # figure out which additional models are not present in multigpu clone + models_cache = {} + for mm_add_model in mm.get_additional_models(): + models_cache[mm_add_model.clone_base_uuid] = mm_add_model + remove_models_uuids = set(list(models_cache.keys())) + for key, model_list in orig_additional_models.items(): + for orig_add_model in model_list: + if orig_add_model.clone_base_uuid not in 
models_cache: + models_cache[orig_add_model.clone_base_uuid] = orig_add_model.deepclone_multigpu(new_load_device=n.load_device, models_cache=models_cache) + existing_list = n.get_additional_models_with_key(key) + existing_list.append(models_cache[orig_add_model.clone_base_uuid]) + n.set_additional_models(key, existing_list) + if orig_add_model.clone_base_uuid in remove_models_uuids: + remove_models_uuids.remove(orig_add_model.clone_base_uuid) + # remove duplicate additional models + for key, model_list in n.additional_models.items(): + new_model_list = [x for x in model_list if x.clone_base_uuid not in remove_models_uuids] + n.set_additional_models(key, new_model_list) + for callback in self.get_all_callbacks(CallbacksMP.ON_MATCH_MULTIGPU_CLONES): + callback(self, n) + new_multigpu_models.append(n) + self.set_additional_models("multigpu", new_multigpu_models) + def is_clone(self, other): if hasattr(other, 'model') and self.model is other.model: return True return False - def clone_has_same_weights(self, clone: 'ModelPatcher'): - if not self.is_clone(clone): - return False + def clone_has_same_weights(self, clone: ModelPatcher, allow_multigpu=False): + if allow_multigpu: + if self.clone_base_uuid != clone.clone_base_uuid: + return False + else: + if not self.is_clone(clone): + return False if self.current_hooks != clone.current_hooks: return False @@ -1232,7 +1335,7 @@ class ModelPatcher: return self.additional_models.get(key, []) def get_additional_models(self): - all_models = [] + all_models: list[ModelPatcher] = [] for models in self.additional_models.values(): all_models.extend(models) return all_models @@ -1286,9 +1389,18 @@ class ModelPatcher: for callback in self.get_all_callbacks(CallbacksMP.ON_PRE_RUN): callback(self) - def prepare_state(self, timestep): + def prepare_state(self, timestep, model_options): + ignore_multigpu = model_options.get("ignore_multigpu", False) for callback in self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE): - callback(self, 
timestep) + callback(self, timestep, model_options) + if not ignore_multigpu and "multigpu_clones" in model_options: + model_options["ignore_multigpu"] = True + try: + for p in model_options["multigpu_clones"].values(): + p: ModelPatcher + p.prepare_state(timestep, model_options) + finally: + model_options.pop("ignore_multigpu", None) def restore_hook_patches(self): if self.hook_patches_backup is not None: @@ -1301,12 +1413,18 @@ class ModelPatcher: def prepare_hook_patches_current_keyframe(self, t: torch.Tensor, hook_group: comfy.hooks.HookGroup, model_options: dict[str]): curr_t = t[0] reset_current_hooks = False + multigpu_kf_changed_cache = None transformer_options = model_options.get("transformer_options", {}) for hook in hook_group.hooks: changed = hook.hook_keyframe.prepare_current_keyframe(curr_t=curr_t, transformer_options=transformer_options) # if keyframe changed, remove any cached HookGroups that contain hook with the same hook_ref; # this will cause the weights to be recalculated when sampling if changed: + # cache changed for multigpu usage + if "multigpu_clones" in model_options: + if multigpu_kf_changed_cache is None: + multigpu_kf_changed_cache = [] + multigpu_kf_changed_cache.append(hook) # reset current_hooks if contains hook that changed if self.current_hooks is not None: for current_hook in self.current_hooks.hooks: @@ -1318,6 +1436,28 @@ class ModelPatcher: self.cached_hook_patches.pop(cached_group) if reset_current_hooks: self.patch_hooks(None) + if "multigpu_clones" in model_options: + for p in model_options["multigpu_clones"].values(): + p: ModelPatcher + p._handle_changed_hook_keyframes(multigpu_kf_changed_cache) + + def _handle_changed_hook_keyframes(self, kf_changed_cache: list[comfy.hooks.Hook]): + 'Used to handle multigpu behavior inside prepare_hook_patches_current_keyframe.' 
+ if kf_changed_cache is None: + return + reset_current_hooks = False + # reset current_hooks if contains hook that changed + for hook in kf_changed_cache: + if self.current_hooks is not None: + for current_hook in self.current_hooks.hooks: + if current_hook == hook: + reset_current_hooks = True + break + for cached_group in list(self.cached_hook_patches.keys()): + if cached_group.contains(hook): + self.cached_hook_patches.pop(cached_group) + if reset_current_hooks: + self.patch_hooks(None) def register_all_hook_patches(self, hooks: comfy.hooks.HookGroup, target_dict: dict[str], model_options: dict=None, registered: comfy.hooks.HookGroup = None): @@ -1566,16 +1706,27 @@ class ModelPatcherDynamic(ModelPatcher): self.model.dynamic_vbars = {} if not hasattr(self.model, "dynamic_pins"): self.model.dynamic_pins = {} - if self.load_device not in self.model.dynamic_pins: - self.model.dynamic_pins[self.load_device] = { + self.register_load_device(self.load_device) + self.non_dynamic_delegate_model = None + assert load_device is not None + + def register_load_device(self, device): + """Ensure dynamic_pins has an entry for *device*. + + Called from __init__ and also from any code that retargets an + already-constructed patcher to a new load_device (e.g. the + Select{Model,CLIP,VAE}Device selector nodes); without this entry + partially_unload_ram() raises KeyError when it tries to read the + per-device pin state. 
+ """ + if device not in self.model.dynamic_pins: + self.model.dynamic_pins[device] = { "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]), "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]), "hostbufs_initialized": False, "failed": False, "active": False, } - self.non_dynamic_delegate_model = None - assert load_device is not None def is_dynamic(self): return True diff --git a/comfy/multigpu.py b/comfy/multigpu.py new file mode 100644 index 000000000..e7f5b3d6f --- /dev/null +++ b/comfy/multigpu.py @@ -0,0 +1,248 @@ +from __future__ import annotations +import queue +import threading +import torch +import logging + +from collections import namedtuple +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from comfy.model_patcher import ModelPatcher +import comfy.utils +import comfy.patcher_extension +import comfy.model_management + + +class MultiGPUThreadPool: + """Persistent thread pool for multi-GPU work distribution. + + Maintains one worker thread per extra GPU device. Each thread calls + torch.cuda.set_device() once at startup so that compiled kernel caches + (inductor/triton) stay warm across diffusion steps. 
+ """ + + def __init__(self, devices: list[torch.device]): + self._workers: list[threading.Thread] = [] + self._work_queues: dict[torch.device, queue.Queue] = {} + self._result_queues: dict[torch.device, queue.Queue] = {} + + for device in devices: + wq = queue.Queue() + rq = queue.Queue() + self._work_queues[device] = wq + self._result_queues[device] = rq + t = threading.Thread(target=self._worker_loop, args=(device, wq, rq), daemon=True) + t.start() + self._workers.append(t) + + def _worker_loop(self, device: torch.device, work_q: queue.Queue, result_q: queue.Queue): + try: + torch.cuda.set_device(device) + except Exception as e: + logging.error(f"MultiGPUThreadPool: failed to set device {device}: {e}") + while True: + item = work_q.get() + if item is None: + return + result_q.put((None, e)) + return + while True: + item = work_q.get() + if item is None: + break + fn, args, kwargs = item + try: + result = fn(*args, **kwargs) + result_q.put((result, None)) + except Exception as e: + result_q.put((None, e)) + + def submit(self, device: torch.device, fn, *args, **kwargs): + self._work_queues[device].put((fn, args, kwargs)) + + def get_result(self, device: torch.device): + return self._result_queues[device].get() + + @property + def devices(self) -> list[torch.device]: + return list(self._work_queues.keys()) + + def shutdown(self): + for wq in self._work_queues.values(): + wq.put(None) # sentinel + for t in self._workers: + t.join(timeout=5.0) + + +class GPUOptions: + def __init__(self, device_index: int, relative_speed: float): + self.device_index = device_index + self.relative_speed = relative_speed + + def clone(self): + return GPUOptions(self.device_index, self.relative_speed) + + def create_dict(self): + return { + "relative_speed": self.relative_speed + } + +class GPUOptionsGroup: + def __init__(self): + self.options: dict[int, GPUOptions] = {} + + def add(self, info: GPUOptions): + self.options[info.device_index] = info + + def clone(self): + c = 
GPUOptionsGroup() + for opt in self.options.values(): + c.add(opt) + return c + + def register(self, model: ModelPatcher): + opts_dict = {} + # get devices that are valid for this model + devices: list[torch.device] = [model.load_device] + for extra_model in model.get_additional_models_with_key("multigpu"): + extra_model: ModelPatcher + devices.append(extra_model.load_device) + # create dictionary with actual device mapped to its GPUOptions + device_opts_list: list[GPUOptions] = [] + for device in devices: + device_opts = self.options.get(device.index, GPUOptions(device_index=device.index, relative_speed=1.0)) + opts_dict[device] = device_opts.create_dict() + device_opts_list.append(device_opts) + # make relative_speed relative to 1.0 + min_speed = min([x.relative_speed for x in device_opts_list]) + for value in opts_dict.values(): + value['relative_speed'] /= min_speed + model.model_options['multigpu_options'] = opts_dict + + +def create_multigpu_deepclones(model: ModelPatcher, max_gpus: int, gpu_options: GPUOptionsGroup=None, reuse_loaded=False): + 'Prepare ModelPatcher to contain deepclones of its BaseModel and related properties.' + model = model.clone() + # check if multigpu is already prepared - get the load devices from them if possible to exclude + skip_devices = set() + multigpu_models = model.get_additional_models_with_key("multigpu") + if len(multigpu_models) > 0: + for mm in multigpu_models: + skip_devices.add(mm.load_device) + skip_devices = list(skip_devices) + + # Exclude the primary model's actual device, not the global current device: + # after SelectModelDevice(gpu:N) the primary may not live on the process's + # current CUDA device, and excluding the wrong device picks bad extras. 
+ all_devices = comfy.model_management.get_all_torch_devices(exclude_current=False) + full_extra_devices = [d for d in all_devices if d != model.load_device] + limit_extra_devices = full_extra_devices[:max_gpus-1] + extra_devices = limit_extra_devices.copy() + # exclude skipped devices + for skip in skip_devices: + if skip in extra_devices: + extra_devices.remove(skip) + # create new deepclones + if len(extra_devices) > 0: + for device in extra_devices: + device_patcher = None + if reuse_loaded: + # Only reuse a previously-loaded MultiGPU clone. A SelectModelDevice + # patcher on the same device shares clone_base_uuid but has + # is_multigpu_base_clone=False, which would later be filtered out by + # prepare_model_patcher_multigpu_clones() and silently shrink the + # work split back to one GPU. + loaded_models: list[ModelPatcher] = comfy.model_management.loaded_models() + for lm in loaded_models: + if lm.model is None: + continue + if lm.load_device != device: + continue + if lm.clone_base_uuid != model.clone_base_uuid: + continue + if not getattr(lm, "is_multigpu_base_clone", False): + continue + device_patcher = lm.clone() + logging.info(f"Reusing loaded multigpu deepclone of {device_patcher.model.__class__.__name__} for {device}") + break + if device_patcher is None: + device_patcher = model.deepclone_multigpu(new_load_device=device) + # Always flag the clone; whether reused or freshly deepcloned, it must + # advertise itself as a MultiGPU base clone so the cond scheduler picks + # it up in prepare_model_patcher_multigpu_clones(). 
+ device_patcher.is_multigpu_base_clone = True + multigpu_models = model.get_additional_models_with_key("multigpu") + multigpu_models.append(device_patcher) + model.set_additional_models("multigpu", multigpu_models) + model.match_multigpu_clones() + if gpu_options is None: + gpu_options = GPUOptionsGroup() + gpu_options.register(model) + else: + logging.info("No extra torch devices need initialization, skipping initializing MultiGPU Work Units.") + # only keep model clones that don't go 'past' the intended max_gpu count; + # this prunes any inherited multigpu clones whose load_device is no longer allowed + # when max_gpus is lowered between runs. + allowed_devices = set(limit_extra_devices) + allowed_devices.add(model.load_device) + multigpu_models = model.get_additional_models_with_key("multigpu") + new_multigpu_models = [m for m in multigpu_models if m.load_device in allowed_devices] + if len(new_multigpu_models) != len(multigpu_models): + model.set_additional_models("multigpu", new_multigpu_models) + model.match_multigpu_clones() + return model + + +LoadBalance = namedtuple('LoadBalance', ['work_per_device', 'idle_time']) +def load_balance_devices(model_options: dict[str], total_work: int, return_idle_time=False, work_normalized: int=None): + 'Optimize work assigned to different devices, accounting for their relative speeds and splittable work.' 
+ opts_dict = model_options['multigpu_options'] + devices = list(model_options['multigpu_clones'].keys()) + speed_per_device = [] + work_per_device = [] + # get sum of each device's relative_speed + total_speed = 0.0 + for opts in opts_dict.values(): + total_speed += opts['relative_speed'] + # get relative work for each device; + # obtained by w = (W*r)/R + for device in devices: + relative_speed = opts_dict[device]['relative_speed'] + relative_work = (total_work*relative_speed) / total_speed + speed_per_device.append(relative_speed) + work_per_device.append(relative_work) + # relative work must be expressed in whole numbers, but likely is a decimal; + # perform rounding while maintaining total sum equal to total work (sum of relative works) + work_per_device = round_preserved(work_per_device) + dict_work_per_device = {} + for device, relative_work in zip(devices, work_per_device): + dict_work_per_device[device] = relative_work + if not return_idle_time: + return LoadBalance(dict_work_per_device, None) + # divide relative work by relative speed to get estimated completion time of said work by each device; + # time here is relative and does not correspond to real-world units + completion_time = [w/r for w,r in zip(work_per_device, speed_per_device)] + # calculate relative time spent by the devices waiting on each other after their work is completed + idle_time = abs(min(completion_time) - max(completion_time)) + # if need to compare work idle time, need to normalize to a common total work + if work_normalized: + idle_time *= (work_normalized/total_work) + + return LoadBalance(dict_work_per_device, idle_time) + +def round_preserved(values: list[float]): + 'Round all values in a list, preserving the combined sum of values.' 
+ # get floor of values; casting to int does it too + floored = [int(x) for x in values] + total_floored = sum(floored) + # get remainder to distribute + remainder = round(sum(values)) - total_floored + # pair values with fractional portions + fractional = [(i, x-floored[i]) for i, x in enumerate(values)] + # sort by fractional part in descending order + fractional.sort(key=lambda x: x[1], reverse=True) + # distribute the remainder + for i in range(remainder): + index = fractional[i][0] + floored[index] += 1 + return floored diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py index 5ee4d5ee5..4b276b175 100644 --- a/comfy/patcher_extension.py +++ b/comfy/patcher_extension.py @@ -3,6 +3,8 @@ from typing import Callable class CallbacksMP: ON_CLONE = "on_clone" + ON_DEEPCLONE_MULTIGPU = "on_deepclone_multigpu" + ON_MATCH_MULTIGPU_CLONES = "on_match_multigpu_clones" ON_LOAD = "on_load_after" ON_DETACH = "on_detach_after" ON_CLEANUP = "on_cleanup" diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py index 3782fd2d5..bdce2f2d8 100644 --- a/comfy/sampler_helpers.py +++ b/comfy/sampler_helpers.py @@ -1,16 +1,18 @@ from __future__ import annotations +import torch import uuid import math import collections import comfy.model_management import comfy.conds +import comfy.model_patcher import comfy.utils import comfy.hooks import comfy.patcher_extension from typing import TYPE_CHECKING if TYPE_CHECKING: - from comfy.model_patcher import ModelPatcher from comfy.model_base import BaseModel + from comfy.model_patcher import ModelPatcher from comfy.controlnet import ControlBase def prepare_mask(noise_mask, shape, device): @@ -119,6 +121,47 @@ def cleanup_additional_models(models): if hasattr(m, 'cleanup'): m.cleanup() +def preprocess_multigpu_conds(conds: dict[str, list[dict[str]]], model: ModelPatcher, model_options: dict[str]): + '''If multigpu acceleration required, creates deepclones of ControlNets and GLIGEN per device.''' + multigpu_models: 
list[ModelPatcher] = model.get_additional_models_with_key("multigpu") + if len(multigpu_models) == 0: + return + extra_devices = [x.load_device for x in multigpu_models] + # handle controlnets + controlnets: set[ControlBase] = set() + for k in conds: + for kk in conds[k]: + if 'control' in kk: + controlnets.add(kk['control']) + if len(controlnets) > 0: + # first, unload all controlnet clones + for cnet in list(controlnets): + cnet_models = cnet.get_models() + for cm in cnet_models: + comfy.model_management.unload_model_and_clones(cm, unload_additional_models=True) + + # next, make sure each controlnet has a deepclone for all relevant devices + for cnet in controlnets: + curr_cnet = cnet + while curr_cnet is not None: + for device in extra_devices: + if device not in curr_cnet.multigpu_clones: + curr_cnet.deepclone_multigpu(device, autoregister=True) + curr_cnet = curr_cnet.previous_controlnet + # since all device clones are now present, recreate the linked list for cloned cnets per device + for cnet in controlnets: + curr_cnet = cnet + while curr_cnet is not None: + prev_cnet = curr_cnet.previous_controlnet + for device in extra_devices: + device_cnet = curr_cnet.get_instance_for_device(device) + prev_device_cnet = None + if prev_cnet is not None: + prev_device_cnet = prev_cnet.get_instance_for_device(device) + device_cnet.set_previous_controlnet(prev_device_cnet) + curr_cnet = prev_cnet + # potentially handle gligen - since not widely used, ignored for now + def estimate_memory(model, noise_shape, conds): cond_shapes = collections.defaultdict(list) cond_shapes_min = {} @@ -143,7 +186,8 @@ def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load, force_offload=force_offload) def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False): - real_model: BaseModel = None 
+ model.match_multigpu_clones() + preprocess_multigpu_conds(conds, model, model_options) models, inference_memory = get_additional_models(conds, model.model_dtype()) models += get_additional_models_from_model_options(model_options) models += model.get_nested_additional_models() # TODO: does this require inference_memory update? @@ -155,7 +199,7 @@ def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=Non memory_required += inference_memory minimum_memory_required += inference_memory comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required, force_full_load=force_full_load) - real_model = model.model + real_model: BaseModel = model.model return real_model, conds, models @@ -201,3 +245,18 @@ def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict): comfy.patcher_extension.merge_nested_dicts(to_load_options.setdefault(wc_name, {}), model_options["transformer_options"][wc_name], copy_dict1=False) return to_load_options + +def prepare_model_patcher_multigpu_clones(model_patcher: ModelPatcher, loaded_models: list[ModelPatcher], model_options: dict): + ''' + In case multigpu acceleration is enabled, prep ModelPatchers for each device. 
+ ''' + multigpu_patchers: list[ModelPatcher] = [x for x in loaded_models if x.is_multigpu_base_clone] + if len(multigpu_patchers) > 0: + multigpu_dict: dict[torch.device, ModelPatcher] = {} + multigpu_dict[model_patcher.load_device] = model_patcher + for x in multigpu_patchers: + x.hook_patches = comfy.model_patcher.create_hook_patches_clone(model_patcher.hook_patches, copy_tuples=True) + x.hook_mode = model_patcher.hook_mode # match main model's hook_mode + multigpu_dict[x.load_device] = x + model_options["multigpu_clones"] = multigpu_dict + return multigpu_patchers diff --git a/comfy/samplers.py b/comfy/samplers.py index c5e36ff05..e31277f7b 100755 --- a/comfy/samplers.py +++ b/comfy/samplers.py @@ -1,7 +1,9 @@ from __future__ import annotations + +import comfy.model_management from .k_diffusion import sampling as k_diffusion_sampling from .extra_samplers import uni_pc -from typing import TYPE_CHECKING, Callable, NamedTuple +from typing import TYPE_CHECKING, Callable, NamedTuple, Any if TYPE_CHECKING: from comfy.model_patcher import ModelPatcher from comfy.model_base import BaseModel @@ -16,6 +18,7 @@ import comfy.model_patcher import comfy.patcher_extension import comfy.hooks import comfy.context_windows +import comfy.multigpu import comfy.utils import scipy.stats import numpy @@ -141,7 +144,7 @@ def can_concat_cond(c1, c2): return cond_equal_size(c1.conditioning, c2.conditioning) -def cond_cat(c_list): +def cond_cat(c_list, device=None): temp = {} for x in c_list: for k in x: @@ -153,6 +156,8 @@ def cond_cat(c_list): for k in temp: conds = temp[k] out[k] = conds[0].concat(conds[1:]) + if device is not None and hasattr(out[k], 'to'): + out[k] = out[k].to(device) return out @@ -212,7 +217,12 @@ def _calc_cond_batch_outer(model: BaseModel, conds: list[list[dict]], x_in: torc ) return executor.execute(model, conds, x_in, timestep, model_options) -def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options): +def 
_calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): + # NOTE: keep in sync with _calc_cond_batch_multigpu below. Shared logic + # (hooked_to_run accumulation, memory-fit batching, per-chunk output + # aggregation) is duplicated there with per-device scheduling layered on top. + if 'multigpu_clones' in model_options: + return _calc_cond_batch_multigpu(model, conds, x_in, timestep, model_options) out_conds = [] out_counts = [] # separate conds by matching hooks @@ -244,7 +254,7 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens if has_default_conds: finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options) - model.current_patcher.prepare_state(timestep) + model.current_patcher.prepare_state(timestep, model_options) # run every hooked_to_run separately for hooks, to_run in hooked_to_run.items(): @@ -344,6 +354,239 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens return out_conds +def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]): + # NOTE: keep in sync with _calc_cond_batch above. Same conds-by-hooks + # accumulation, memory-fit batching, and output aggregation, but adds a + # per-device scheduler, per-device patcher/control lookup, tensor .to(device) + # placement, and MultiGPUThreadPool dispatch around the inner loop. 
+ out_conds = [] + out_counts = [] + # separate conds by matching hooks + hooked_to_run: dict[comfy.hooks.HookGroup,list[tuple[tuple,int]]] = {} + default_conds = [] + has_default_conds = False + + output_device = x_in.device + + for i in range(len(conds)): + out_conds.append(torch.zeros_like(x_in)) + out_counts.append(torch.ones_like(x_in) * 1e-37) + + cond = conds[i] + default_c = [] + if cond is not None: + for x in cond: + if 'default' in x: + default_c.append(x) + has_default_conds = True + continue + p = get_area_and_mult(x, x_in, timestep) + if p is None: + continue + if p.hooks is not None: + model.current_patcher.prepare_hook_patches_current_keyframe(timestep, p.hooks, model_options) + hooked_to_run.setdefault(p.hooks, list()) + hooked_to_run[p.hooks] += [(p, i)] + default_conds.append(default_c) + + if has_default_conds: + finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options) + + model.current_patcher.prepare_state(timestep, model_options) + + devices = list(model_options['multigpu_clones'].keys()) + device_batched_hooked_to_run: dict[torch.device, list[tuple[comfy.hooks.HookGroup, tuple]]] = {} + # Track conds currently scheduled per device; single source of truth for capacity checks. + device_load: dict[torch.device, int] = {d: 0 for d in devices} + + total_conds = sum(len(to_run) for to_run in hooked_to_run.values()) + conds_per_device = max(1, math.ceil(total_conds / len(devices))) + + def next_available_device(start: int) -> tuple[int, torch.device]: + """Return (index, device) for the next device with remaining capacity, starting at `start`. + + Scans at most len(devices) positions, so this always terminates. Raises if no device + has remaining capacity, which would indicate a bug in conds_per_device accounting. 
+ """ + for offset in range(len(devices)): + i = (start + offset) % len(devices) + if device_load[devices[i]] < conds_per_device: + return i, devices[i] + raise RuntimeError( + f"MultiGPU scheduler: all {len(devices)} devices at capacity " + f"({conds_per_device}) but conds remain to schedule" + ) + + # run every hooked_to_run separately + index_device = 0 + for hooks, to_run in hooked_to_run.items(): + while len(to_run) > 0: + index_device, current_device = next_available_device(index_device) + remaining_capacity = conds_per_device - device_load[current_device] + + first = to_run[0] + first_shape = first[0][0].shape + # collect candidate indices that can be concatenated with `first`, up to remaining capacity + to_batch_temp = [] + for x in range(len(to_run)): + if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < remaining_capacity: + to_batch_temp += [x] + + to_batch_temp.reverse() + to_batch = to_batch_temp[:1] + + free_memory = comfy.model_management.get_free_memory(current_device) + for i in range(1, len(to_batch_temp) + 1): + batch_amount = to_batch_temp[:len(to_batch_temp)//i] + input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:] + cond_shapes = collections.defaultdict(list) + for tt in batch_amount: + for k, v in to_run[tt][0].conditioning.items(): + cond_shapes[k].append(v.size()) + if model.memory_required(input_shape, cond_shapes=cond_shapes) * 1.5 < free_memory: + to_batch = batch_amount + break + + conds_to_batch = [to_run.pop(x) for x in to_batch] + device_load[current_device] += len(conds_to_batch) + device_batched_hooked_to_run.setdefault(current_device, []).append((hooks, conds_to_batch)) + + if device_load[current_device] >= conds_per_device: + index_device += 1 + + class thread_result(NamedTuple): + output: Any + mult: Any + area: Any + batch_chunks: int + cond_or_uncond: Any + error: Exception = None + + def _handle_batch(device: torch.device, batch_tuple: tuple[comfy.hooks.HookGroup, tuple], results: 
list[thread_result]): + try: + # TODO: non-NVIDIA support -- guard with `if device.type == "cuda":` once + # we extend multigpu QA beyond CUDA. Unconditional call crashes on + # XPU/NPU/MPS/CPU/DirectML backends. + torch.cuda.set_device(device) + model_current: BaseModel = model_options["multigpu_clones"][device].model + # run every hooked_to_run separately + with torch.no_grad(): + for hooks, to_batch in batch_tuple: + input_x = [] + mult = [] + c = [] + cond_or_uncond = [] + uuids = [] + area = [] + control: ControlBase = None + patches = None + for x in to_batch: + o = x + p = o[0] + input_x.append(p.input_x) + mult.append(p.mult) + c.append(p.conditioning) + area.append(p.area) + cond_or_uncond.append(o[1]) + uuids.append(p.uuid) + control = p.control + patches = p.patches + + batch_chunks = len(cond_or_uncond) + input_x = torch.cat(input_x).to(device) + c = cond_cat(c, device=device) + timestep_ = torch.cat([timestep.to(device)] * batch_chunks) + + transformer_options = model_current.current_patcher.apply_hooks(hooks=hooks) + if 'transformer_options' in model_options: + transformer_options = comfy.patcher_extension.merge_nested_dicts(transformer_options, + model_options['transformer_options'], + copy_dict1=False) + + if patches is not None: + transformer_options["patches"] = comfy.patcher_extension.merge_nested_dicts( + transformer_options.get("patches", {}), + patches + ) + + transformer_options["cond_or_uncond"] = cond_or_uncond[:] + transformer_options["uuids"] = uuids[:] + transformer_options["sigmas"] = timestep.to(device) + transformer_options["sample_sigmas"] = transformer_options["sample_sigmas"].to(device) + transformer_options["multigpu_thread_device"] = device + + cast_transformer_options(transformer_options, device=device) + c['transformer_options'] = transformer_options + + if control is not None: + device_control = control.get_instance_for_device(device) + c['control'] = device_control.get_control(input_x, timestep_, c, len(cond_or_uncond), 
transformer_options) + + if 'model_function_wrapper' in model_options: + output = model_options['model_function_wrapper'](model_current.apply_model, {"input": input_x, "timestep": timestep_, "c": c, "cond_or_uncond": cond_or_uncond}).to(output_device).chunk(batch_chunks) + else: + output = model_current.apply_model(input_x, timestep_, **c).to(output_device).chunk(batch_chunks) + # TODO: non-NVIDIA support -- the `.to(output_device)` copies + # above are async on CUDA, so the main thread's aggregation + # could race with in-flight transfers. CUDA-only QA has not + # surfaced this in practice, but before extending multigpu + # beyond NVIDIA add a `torch.cuda.synchronize(output_device)` + # here (guarded by `output_device.type == "cuda"`). + results.append(thread_result(output, mult, area, batch_chunks, cond_or_uncond)) + except Exception as e: + results.append(thread_result(None, None, None, None, None, error=e)) + raise + + + def _handle_batch_pooled(device, batch_tuple): + worker_results = [] + _handle_batch(device, batch_tuple, worker_results) + return worker_results + + results: list[thread_result] = [] + thread_pool: comfy.multigpu.MultiGPUThreadPool = model_options.get("multigpu_thread_pool") + + # Submit all GPU work to pool threads + pool_devices = [] + for device, batch_tuple in device_batched_hooked_to_run.items(): + if thread_pool is not None: + thread_pool.submit(device, _handle_batch_pooled, device, batch_tuple) + pool_devices.append(device) + else: + # Fallback: no pool, run everything on main thread + _handle_batch(device, batch_tuple, results) + + # Collect results from pool workers + for device in pool_devices: + worker_results, error = thread_pool.get_result(device) + if error is not None: + raise error + results.extend(worker_results) + + for output, mult, area, batch_chunks, cond_or_uncond, error in results: + if error is not None: + raise error + for o in range(batch_chunks): + cond_index = cond_or_uncond[o] + a = area[o] + if a is None: + 
out_conds[cond_index] += output[o] * mult[o] + out_counts[cond_index] += mult[o] + else: + out_c = out_conds[cond_index] + out_cts = out_counts[cond_index] + dims = len(a) // 2 + for i in range(dims): + out_c = out_c.narrow(i + 2, a[i + dims], a[i]) + out_cts = out_cts.narrow(i + 2, a[i + dims], a[i]) + out_c += output[o] * mult[o] + out_cts += mult[o] + + for i in range(len(out_conds)): + out_conds[i] /= out_counts[i] + + return out_conds + def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options): #TODO: remove logging.warning("WARNING: The comfy.samplers.calc_cond_uncond_batch function is deprecated please use the calc_cond_batch one instead.") return tuple(calc_cond_batch(model, [cond, uncond], x_in, timestep, model_options)) @@ -642,12 +885,21 @@ def calculate_start_end_timesteps(model, conds): def pre_run_control(model, conds): s = model.model_sampling + # Per-device model lookup so multigpu control clones get the matching + # diffusion_model (e.g. QwenFunControlNet stashes it into extra_args). 
+ device_models: dict = {} + patcher = getattr(model, "current_patcher", None) + if patcher is not None: + for p in patcher.get_additional_models_with_key("multigpu"): + device_models[p.load_device] = p.model for t in range(len(conds)): x = conds[t] percent_to_timestep_function = lambda a: s.percent_to_sigma(a) if 'control' in x: x['control'].pre_run(model, percent_to_timestep_function) + for device, device_cnet in x['control'].multigpu_clones.items(): + device_cnet.pre_run(device_models.get(device, model), percent_to_timestep_function) def apply_empty_x_to_equal_area(conds, uncond, name, uncond_fill_func): cond_cnets = [] @@ -890,7 +1142,9 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None): to_load_options = model_options.get("to_load_options", None) if to_load_options is None: return + cast_transformer_options(to_load_options, device, dtype) +def cast_transformer_options(transformer_options: dict[str], device=None, dtype=None): casts = [] if device is not None: casts.append(device) @@ -899,18 +1153,17 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None): # if nothing to apply, do nothing if len(casts) == 0: return - # try to call .to on patches - if "patches" in to_load_options: - patches = to_load_options["patches"] + if "patches" in transformer_options: + patches = transformer_options["patches"] for name in patches: patch_list = patches[name] for i in range(len(patch_list)): if hasattr(patch_list[i], "to"): for cast in casts: patch_list[i] = patch_list[i].to(cast) - if "patches_replace" in to_load_options: - patches = to_load_options["patches_replace"] + if "patches_replace" in transformer_options: + patches = transformer_options["patches_replace"] for name in patches: patch_list = patches[name] for k in patch_list: @@ -920,8 +1173,8 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None): # try to call .to on any wrappers/callbacks wrappers_and_callbacks = ["wrappers", "callbacks"] 
for wc_name in wrappers_and_callbacks: - if wc_name in to_load_options: - wc: dict[str, list] = to_load_options[wc_name] + if wc_name in transformer_options: + wc: dict[str, list] = transformer_options[wc_name] for wc_dict in wc.values(): for wc_list in wc_dict.values(): for i in range(len(wc_list)): @@ -929,7 +1182,6 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None): for cast in casts: wc_list[i] = wc_list[i].to(cast) - class CFGGuider: def __init__(self, model_patcher: ModelPatcher): self.model_patcher = model_patcher @@ -984,16 +1236,32 @@ class CFGGuider: self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options) device = self.model_patcher.load_device - noise = noise.to(device=device, dtype=torch.float32) - latent_image = latent_image.to(device=device, dtype=torch.float32) - sigmas = sigmas.to(device) - cast_to_load_options(self.model_options, device=device, dtype=self.model_patcher.model_dtype()) + multigpu_patchers = comfy.sampler_helpers.prepare_model_patcher_multigpu_clones(self.model_patcher, self.loaded_models, self.model_options) - try: - self.model_patcher.pre_run() - output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) - finally: - self.model_patcher.cleanup() + # Create persistent thread pool for all GPU devices (main + extras) + if multigpu_patchers: + extra_devices = [p.load_device for p in multigpu_patchers] + all_devices = [device] + extra_devices + self.model_options["multigpu_thread_pool"] = comfy.multigpu.MultiGPUThreadPool(all_devices) + + with comfy.model_management.cuda_device_context(device): + try: + noise = noise.to(device=device, dtype=torch.float32) + latent_image = latent_image.to(device=device, dtype=torch.float32) + sigmas = sigmas.to(device) + cast_to_load_options(self.model_options, device=device, 
dtype=self.model_patcher.model_dtype()) + + self.model_patcher.pre_run() + for multigpu_patcher in multigpu_patchers: + multigpu_patcher.pre_run() + output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes) + finally: + thread_pool = self.model_options.pop("multigpu_thread_pool", None) + if thread_pool is not None: + thread_pool.shutdown() + self.model_patcher.cleanup() + for multigpu_patcher in multigpu_patchers: + multigpu_patcher.cleanup() comfy.sampler_helpers.cleanup_models(self.conds, self.loaded_models) del self.inner_model diff --git a/comfy/sd.py b/comfy/sd.py index 7bd07ed3a..084170c62 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -335,41 +335,43 @@ class CLIP: self.cond_stage_model.set_clip_options({"projected_pooled": False}) self.load_model(tokens) - self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device}) + device = self.patcher.load_device + self.cond_stage_model.set_clip_options({"execution_device": device}) all_hooks.reset() self.patcher.patch_hooks(None) if show_pbar: pbar = ProgressBar(len(scheduled_keyframes)) - for scheduled_opts in scheduled_keyframes: - t_range = scheduled_opts[0] - # don't bother encoding any conds outside of start_percent and end_percent bounds - if "start_percent" in add_dict: - if t_range[1] < add_dict["start_percent"]: - continue - if "end_percent" in add_dict: - if t_range[0] > add_dict["end_percent"]: - continue - hooks_keyframes = scheduled_opts[1] - for hook, keyframe in hooks_keyframes: - hook.hook_keyframe._current_keyframe = keyframe - # apply appropriate hooks with values that match new hook_keyframe - self.patcher.patch_hooks(all_hooks) - # perform encoding as normal - o = self.cond_stage_model.encode_token_weights(tokens) - cond, pooled = o[:2] - pooled_dict = {"pooled_output": pooled} - # add clip_start_percent and clip_end_percent in pooled - pooled_dict["clip_start_percent"] = t_range[0] - 
pooled_dict["clip_end_percent"] = t_range[1] - # add/update any keys with the provided add_dict - pooled_dict.update(add_dict) - # add hooks stored on clip - self.add_hooks_to_dict(pooled_dict) - all_cond_pooled.append([cond, pooled_dict]) - if show_pbar: - pbar.update(1) - model_management.throw_exception_if_processing_interrupted() + with model_management.cuda_device_context(device): + for scheduled_opts in scheduled_keyframes: + t_range = scheduled_opts[0] + # don't bother encoding any conds outside of start_percent and end_percent bounds + if "start_percent" in add_dict: + if t_range[1] < add_dict["start_percent"]: + continue + if "end_percent" in add_dict: + if t_range[0] > add_dict["end_percent"]: + continue + hooks_keyframes = scheduled_opts[1] + for hook, keyframe in hooks_keyframes: + hook.hook_keyframe._current_keyframe = keyframe + # apply appropriate hooks with values that match new hook_keyframe + self.patcher.patch_hooks(all_hooks) + # perform encoding as normal + o = self.cond_stage_model.encode_token_weights(tokens) + cond, pooled = o[:2] + pooled_dict = {"pooled_output": pooled} + # add clip_start_percent and clip_end_percent in pooled + pooled_dict["clip_start_percent"] = t_range[0] + pooled_dict["clip_end_percent"] = t_range[1] + # add/update any keys with the provided add_dict + pooled_dict.update(add_dict) + # add hooks stored on clip + self.add_hooks_to_dict(pooled_dict) + all_cond_pooled.append([cond, pooled_dict]) + if show_pbar: + pbar.update(1) + model_management.throw_exception_if_processing_interrupted() all_hooks.reset() return all_cond_pooled @@ -383,8 +385,12 @@ class CLIP: self.cond_stage_model.set_clip_options({"projected_pooled": False}) self.load_model(tokens) - self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device}) - o = self.cond_stage_model.encode_token_weights(tokens) + device = self.patcher.load_device + self.cond_stage_model.set_clip_options({"execution_device": device}) + + with 
model_management.cuda_device_context(device): + o = self.cond_stage_model.encode_token_weights(tokens) + cond, pooled = o[:2] if return_dict: out = {"cond": cond, "pooled_output": pooled} @@ -446,9 +452,12 @@ class CLIP: self.cond_stage_model.reset_clip_options() self.load_model(tokens) + device = self.patcher.load_device self.cond_stage_model.set_clip_options({"layer": None}) - self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device}) - return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, presence_penalty=presence_penalty) + self.cond_stage_model.set_clip_options({"execution_device": device}) + + with model_management.cuda_device_context(device): + return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, presence_penalty=presence_penalty) def decode(self, token_ids, skip_special_tokens=True): return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) @@ -1026,50 +1035,52 @@ class VAE: do_tile = False if self.latent_dim == 2 and samples_in.ndim == 5: samples_in = samples_in[:, :, 0] - try: - memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) - model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) - free_memory = self.patcher.get_free_memory(self.device) - batch_number = int(free_memory / memory_used) - batch_number = max(1, batch_number) - # Pre-allocate output for VAEs that support direct buffer writes - preallocated = False - if getattr(self.first_stage_model, 'comfy_has_chunked_io', False): - pixel_samples = torch.empty(self.first_stage_model.decode_output_shape(samples_in.shape), device=self.output_device, dtype=self.vae_output_dtype()) 
- preallocated = True + with model_management.cuda_device_context(self.device): + try: + memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype) + model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) + free_memory = self.patcher.get_free_memory(self.device) + batch_number = int(free_memory / memory_used) + batch_number = max(1, batch_number) - for x in range(0, samples_in.shape[0], batch_number): - samples = samples_in[x:x + batch_number].to(device=self.device, dtype=self.vae_dtype) - if preallocated: - self.first_stage_model.decode(samples, output_buffer=pixel_samples[x:x+batch_number], **vae_options) - else: - out = self.first_stage_model.decode(samples, **vae_options).to(device=self.output_device, dtype=self.vae_output_dtype(), copy=True) - if pixel_samples is None: - pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype()) - pixel_samples[x:x+batch_number].copy_(out) - del out - self.process_output(pixel_samples[x:x+batch_number]) - except Exception as e: - model_management.raise_non_oom(e) - logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.") - #NOTE: We don't know what tensors were allocated to stack variables at the time of the - #exception and the exception itself refs them all until we get out of this except block. - #So we just set a flag for tiler fallback so that tensor gc can happen once the - #exception is fully off the books. 
- do_tile = True + # Pre-allocate output for VAEs that support direct buffer writes + preallocated = False + if getattr(self.first_stage_model, 'comfy_has_chunked_io', False): + pixel_samples = torch.empty(self.first_stage_model.decode_output_shape(samples_in.shape), device=self.output_device, dtype=self.vae_output_dtype()) + preallocated = True - if do_tile: - comfy.model_management.soft_empty_cache() - dims = samples_in.ndim - 2 - if dims == 1 or self.extra_1d_channel is not None: - pixel_samples = self.decode_tiled_1d(samples_in) - elif dims == 2: - pixel_samples = self.decode_tiled_(samples_in) - elif dims == 3: - tile = 256 // self.spacial_compression_decode() - overlap = tile // 4 - pixel_samples = self.decode_tiled_3d(samples_in, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) + for x in range(0, samples_in.shape[0], batch_number): + samples = samples_in[x:x + batch_number].to(device=self.device, dtype=self.vae_dtype) + if preallocated: + self.first_stage_model.decode(samples, output_buffer=pixel_samples[x:x+batch_number], **vae_options) + else: + out = self.first_stage_model.decode(samples, **vae_options).to(device=self.output_device, dtype=self.vae_output_dtype(), copy=True) + if pixel_samples is None: + pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype()) + pixel_samples[x:x+batch_number].copy_(out) + del out + self.process_output(pixel_samples[x:x+batch_number]) + except Exception as e: + model_management.raise_non_oom(e) + logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.") + #NOTE: We don't know what tensors were allocated to stack variables at the time of the + #exception and the exception itself refs them all until we get out of this except block. + #So we just set a flag for tiler fallback so that tensor gc can happen once the + #exception is fully off the books. 
+ do_tile = True + + if do_tile: + comfy.model_management.soft_empty_cache() + dims = samples_in.ndim - 2 + if dims == 1 or self.extra_1d_channel is not None: + pixel_samples = self.decode_tiled_1d(samples_in) + elif dims == 2: + pixel_samples = self.decode_tiled_(samples_in) + elif dims == 3: + tile = 256 // self.spacial_compression_decode() + overlap = tile // 4 + pixel_samples = self.decode_tiled_3d(samples_in, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) pixel_samples = pixel_samples.to(self.output_device).movedim(1,-1) return pixel_samples @@ -1087,20 +1098,21 @@ class VAE: if overlap is not None: args["overlap"] = overlap - if dims == 1 or self.extra_1d_channel is not None: - args.pop("tile_y") - output = self.decode_tiled_1d(samples, **args) - elif dims == 2: - output = self.decode_tiled_(samples, **args) - elif dims == 3: - if overlap_t is None: - args["overlap"] = (1, overlap, overlap) - else: - args["overlap"] = (max(1, overlap_t), overlap, overlap) - if tile_t is not None: - args["tile_t"] = max(2, tile_t) + with model_management.cuda_device_context(self.device): + if dims == 1 or self.extra_1d_channel is not None: + args.pop("tile_y") + output = self.decode_tiled_1d(samples, **args) + elif dims == 2: + output = self.decode_tiled_(samples, **args) + elif dims == 3: + if overlap_t is None: + args["overlap"] = (1, overlap, overlap) + else: + args["overlap"] = (max(1, overlap_t), overlap, overlap) + if tile_t is not None: + args["tile_t"] = max(2, tile_t) - output = self.decode_tiled_3d(samples, **args) + output = self.decode_tiled_3d(samples, **args) return output.movedim(1, -1) def encode(self, pixel_samples): @@ -1113,44 +1125,46 @@ class VAE: pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0) else: pixel_samples = pixel_samples.unsqueeze(2) - try: - memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) - model_management.load_models_gpu([self.patcher], memory_required=memory_used, 
force_full_load=self.disable_offload) - free_memory = self.patcher.get_free_memory(self.device) - batch_number = int(free_memory / max(1, memory_used)) - batch_number = max(1, batch_number) - samples = None - for x in range(0, pixel_samples.shape[0], batch_number): - pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype) - if getattr(self.first_stage_model, 'comfy_has_chunked_io', False): - out = self.first_stage_model.encode(pixels_in, device=self.device) + + with model_management.cuda_device_context(self.device): + try: + memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype) + model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload) + free_memory = self.patcher.get_free_memory(self.device) + batch_number = int(free_memory / max(1, memory_used)) + batch_number = max(1, batch_number) + samples = None + for x in range(0, pixel_samples.shape[0], batch_number): + pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype) + if getattr(self.first_stage_model, 'comfy_has_chunked_io', False): + out = self.first_stage_model.encode(pixels_in, device=self.device) + else: + pixels_in = pixels_in.to(self.device) + out = self.first_stage_model.encode(pixels_in) + out = out.to(self.output_device).to(dtype=self.vae_output_dtype()) + if samples is None: + samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype()) + samples[x:x + batch_number] = out + + except Exception as e: + model_management.raise_non_oom(e) + logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.") + #NOTE: We don't know what tensors were allocated to stack variables at the time of the + #exception and the exception itself refs them all until we get out of this except block. 
+ #So we just set a flag for tiler fallback so that tensor gc can happen once the + #exception is fully off the books. + do_tile = True + + if do_tile: + comfy.model_management.soft_empty_cache() + if self.latent_dim == 3: + tile = 256 + overlap = tile // 4 + samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) + elif self.latent_dim == 1 or self.extra_1d_channel is not None: + samples = self.encode_tiled_1d(pixel_samples) else: - pixels_in = pixels_in.to(self.device) - out = self.first_stage_model.encode(pixels_in) - out = out.to(self.output_device).to(dtype=self.vae_output_dtype()) - if samples is None: - samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype()) - samples[x:x + batch_number] = out - - except Exception as e: - model_management.raise_non_oom(e) - logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.") - #NOTE: We don't know what tensors were allocated to stack variables at the time of the - #exception and the exception itself refs them all until we get out of this except block. - #So we just set a flag for tiler fallback so that tensor gc can happen once the - #exception is fully off the books. 
- do_tile = True - - if do_tile: - comfy.model_management.soft_empty_cache() - if self.latent_dim == 3: - tile = 256 - overlap = tile // 4 - samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap)) - elif self.latent_dim == 1 or self.extra_1d_channel is not None: - samples = self.encode_tiled_1d(pixel_samples) - else: - samples = self.encode_tiled_(pixel_samples) + samples = self.encode_tiled_(pixel_samples) return samples @@ -1176,26 +1190,27 @@ class VAE: if overlap is not None: args["overlap"] = overlap - if dims == 1: - args.pop("tile_y") - samples = self.encode_tiled_1d(pixel_samples, **args) - elif dims == 2: - samples = self.encode_tiled_(pixel_samples, **args) - elif dims == 3: - if tile_t is not None: - tile_t_latent = max(2, self.downscale_ratio[0](tile_t)) - else: - tile_t_latent = 9999 - args["tile_t"] = self.upscale_ratio[0](tile_t_latent) + with model_management.cuda_device_context(self.device): + if dims == 1: + args.pop("tile_y") + samples = self.encode_tiled_1d(pixel_samples, **args) + elif dims == 2: + samples = self.encode_tiled_(pixel_samples, **args) + elif dims == 3: + if tile_t is not None: + tile_t_latent = max(2, self.downscale_ratio[0](tile_t)) + else: + tile_t_latent = 9999 + args["tile_t"] = self.upscale_ratio[0](tile_t_latent) - if overlap_t is None: - args["overlap"] = (1, overlap, overlap) - else: - args["overlap"] = (self.upscale_ratio[0](max(1, min(tile_t_latent // 2, self.downscale_ratio[0](overlap_t)))), overlap, overlap) - maximum = pixel_samples.shape[2] - maximum = self.upscale_ratio[0](self.downscale_ratio[0](maximum)) + if overlap_t is None: + args["overlap"] = (1, overlap, overlap) + else: + args["overlap"] = (self.upscale_ratio[0](max(1, min(tile_t_latent // 2, self.downscale_ratio[0](overlap_t)))), overlap, overlap) + maximum = pixel_samples.shape[2] + maximum = self.upscale_ratio[0](self.downscale_ratio[0](maximum)) - samples = self.encode_tiled_3d(pixel_samples[:,:,:maximum], 
**args) + samples = self.encode_tiled_3d(pixel_samples[:,:,:maximum], **args) return samples @@ -1710,12 +1725,52 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata, disable_dynamic=disable_dynamic) if out is None: raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd))) - if output_model and out[0] is not None: - out[0].cached_patcher_init = (load_checkpoint_guess_config_model_only, (ckpt_path, embedding_directory, model_options, te_model_options)) - if output_clip and out[1] is not None: - out[1].patcher.cached_patcher_init = (load_checkpoint_guess_config_clip_only, (ckpt_path, embedding_directory, model_options, te_model_options)) + if out[0] is not None: + out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0) + # Register reload factories for the CLIP and VAE produced by the same checkpoint so + # ModelPatcher.deepclone_multigpu can spawn per-device copies (Select{CLIP,VAE}Device, + # MultiGPU work-units, etc.) without falling back to copy.deepcopy of an + # already-loaded module. 
+ if out[1] is not None and getattr(out[1], "patcher", None) is not None: + out[1].patcher.cached_patcher_init = (load_checkpoint_clip_patcher, (ckpt_path, embedding_directory, model_options, te_model_options)) + if out[2] is not None and getattr(out[2], "patcher", None) is not None: + out[2].patcher.cached_patcher_init = (load_checkpoint_vae_patcher, (ckpt_path, embedding_directory, model_options, te_model_options)) return out + +def load_checkpoint_clip_patcher(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False): + """Reload only the CLIP patcher from a checkpoint. Used as the cached_patcher_init + factory for the CLIP returned by load_checkpoint_guess_config.""" + _, clip, _, _ = load_checkpoint_guess_config( + ckpt_path, + output_vae=False, + output_clip=True, + output_clipvision=False, + embedding_directory=embedding_directory, + output_model=False, + model_options=model_options, + te_model_options=te_model_options, + disable_dynamic=disable_dynamic, + ) + return clip.patcher + + +def load_checkpoint_vae_patcher(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False): + """Reload only the VAE patcher from a checkpoint. 
Used as the cached_patcher_init + factory for the VAE returned by load_checkpoint_guess_config.""" + _, _, vae, _ = load_checkpoint_guess_config( + ckpt_path, + output_vae=True, + output_clip=False, + output_clipvision=False, + embedding_directory=embedding_directory, + output_model=False, + model_options=model_options, + te_model_options=te_model_options, + disable_dynamic=disable_dynamic, + ) + return vae.patcher + def load_checkpoint_guess_config_model_only(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False): model, *_ = load_checkpoint_guess_config(ckpt_path, False, False, False, embedding_directory=embedding_directory, @@ -1742,7 +1797,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd) parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix) weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix) - load_device = model_management.get_torch_device() + load_device = model_options.get("load_device", model_management.get_torch_device()) custom_operations = model_options.get("custom_operations", None) if custom_operations is None: @@ -1782,13 +1837,15 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c inital_load_device = model_management.unet_inital_load_device(parameters, unet_dtype) model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device) ModelPatcher = comfy.model_patcher.ModelPatcher if disable_dynamic else comfy.model_patcher.CoreModelPatcher - model_patcher = ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device()) + offload_device = model_options.get("offload_device", model_management.unet_offload_device()) + model_patcher = ModelPatcher(model, load_device=load_device, offload_device=offload_device) model.load_model_weights(sd, diffusion_model_prefix, 
assign=model_patcher.is_dynamic()) if output_vae: vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True) vae_sd = model_config.process_vae_state_dict(vae_sd) - vae = VAE(sd=vae_sd, metadata=metadata) + vae_device = model_options.get("load_device", None) + vae = VAE(sd=vae_sd, metadata=metadata, device=vae_device) if output_clip: if te_model_options.get("custom_operations", None) is None: @@ -1872,7 +1929,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable parameters = comfy.utils.calculate_parameters(sd) weight_dtype = comfy.utils.weight_dtype(sd) - load_device = model_management.get_torch_device() + load_device = model_options.get("load_device", model_management.get_torch_device()) model_config = model_detection.model_config_from_unet(sd, "", metadata=metadata) if model_config is not None: @@ -1897,7 +1954,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable else: logging.warning("{} {}".format(diffusers_keys[k], k)) - offload_device = model_management.unet_offload_device() + offload_device = model_options.get("offload_device", model_management.unet_offload_device()) unet_weight_dtype = list(model_config.supported_inference_dtypes) if model_config.quant_config is not None: weight_dtype = None @@ -1939,6 +1996,26 @@ def load_diffusion_model(unet_path, model_options={}, disable_dynamic=False): model.cached_patcher_init = (load_diffusion_model, (unet_path, model_options)) return model + +def load_vae_patcher(vae_path, metadata=None, device=None, disable_dynamic=False): + """Reload a disk-backed VAE from ``vae_path`` and return its patcher. + + Used as the ``cached_patcher_init`` factory on ``VAE.patcher`` so + :meth:`comfy.model_patcher.ModelPatcher.deepclone_multigpu` can produce a + fresh, untainted VAE patcher (no inherited per-device load state, no + in-place quantization fallout) for multigpu work-units and the + SelectVAEDevice node. 
The optional ``device`` matches the source loader's + VAE initialization path; the deepclone's ``load_device`` still controls + where the cloned patcher is targeted. + """ + if metadata is None: + sd, metadata = comfy.utils.load_torch_file(vae_path, return_metadata=True) + else: + sd = comfy.utils.load_torch_file(vae_path) + vae = VAE(sd=sd, metadata=metadata, device=device) + vae.throw_exception_if_invalid() + return vae.patcher + def load_unet(unet_path, dtype=None): logging.warning("The load_unet function has been deprecated and will be removed please switch to: load_diffusion_model") return load_diffusion_model(unet_path, model_options={"dtype": dtype}) diff --git a/comfy/utils.py b/comfy/utils.py index 31052714a..49ae12b06 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -86,6 +86,7 @@ def load_safetensors(ckpt): import comfy_aimdo.model_mmap f = open(ckpt, "rb", buffering=0) + file_lock = threading.Lock() model_mmap = comfy_aimdo.model_mmap.ModelMMAP(ckpt) file_size = os.path.getsize(ckpt) mv = memoryview((ctypes.c_uint8 * file_size).from_address(model_mmap.get())) @@ -111,7 +112,7 @@ def load_safetensors(ckpt): storage = tensor.untyped_storage() setattr(storage, "_comfy_tensor_file_slice", - comfy.memory_management.TensorFileSlice(f, threading.get_ident(), data_base_offset + start, end - start)) + comfy.memory_management.TensorFileSlice(f, file_lock, data_base_offset + start, end - start)) setattr(storage, "_comfy_tensor_mmap_refs", (model_mmap, mv)) sd[name] = tensor diff --git a/comfy_extras/nodes_multigpu.py b/comfy_extras/nodes_multigpu.py new file mode 100644 index 000000000..2bd752b7d --- /dev/null +++ b/comfy_extras/nodes_multigpu.py @@ -0,0 +1,412 @@ +from __future__ import annotations + +import copy +import logging +from inspect import cleandoc +from typing import TYPE_CHECKING +from typing_extensions import override + +from comfy_api.latest import ComfyExtension, io + +if TYPE_CHECKING: + from comfy.model_patcher import ModelPatcher + from 
comfy.sd import CLIP, VAE +import torch + +import comfy.model_management +import comfy.multigpu + + +class MultiGPUCFGSplitNode(io.ComfyNode): + """ + Prepares model to have sampling accelerated via splitting work units. + + Should be placed after nodes that modify the model object itself, such as compile or attention-switch nodes. + + Other than those exceptions, this node can be placed in any order. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MultiGPU_WorkUnits", + display_name="MultiGPU CFG Split", + category="advanced/multigpu", + description=cleandoc(cls.__doc__), + inputs=[ + io.Model.Input("model"), + io.Int.Input("max_gpus", default=2, min=1, step=1), + ], + outputs=[ + io.Model.Output(), + ], + ) + + @classmethod + def execute(cls, model: ModelPatcher, max_gpus: int) -> io.NodeOutput: + model = comfy.multigpu.create_multigpu_deepclones(model, max_gpus, reuse_loaded=True) + return io.NodeOutput(model) + + +def _force_fp32_cpu_compute(patcher: ModelPatcher): + """Force fp32 inference dtype for CPU. + + PyTorch's CPU conv2d kernels fall back to software emulation for fp16/bf16 + and run ~500-600x slower than fp32, which makes a normal-sized workflow + look frozen for hours. Routing through set_model_compute_dtype leaves the + weights as-is and casts at use, so peak memory does not blow up.""" + dtype = patcher.model_dtype() + if dtype in (torch.float16, torch.bfloat16): + logging.info(f"Select Model Device: using fp32 compute dtype for CPU inference (model dtype was {dtype}).") + patcher.set_model_compute_dtype(torch.float32) + + +def _remember_base_devices(patcher: ModelPatcher): + """Stash the original load/offload device on the underlying model. + + Stored on patcher.model (which is shared with the input patcher), so + later "default" selections can recover the loader's original routing. 
+ Only the first Select on a given chain writes these attrs; subsequent + deepclones inherit them onto their freshly-loaded model below. + """ + if not hasattr(patcher.model, "_select_base_load_device"): + patcher.model._select_base_load_device = patcher.load_device + patcher.model._select_base_offload_device = patcher.offload_device + + +def _propagate_base_devices(src_model, dst_model): + """Carry the loader-original device attrs onto the freshly-deepcloned model.""" + if hasattr(src_model, "_select_base_load_device") and not hasattr(dst_model, "_select_base_load_device"): + dst_model._select_base_load_device = src_model._select_base_load_device + dst_model._select_base_offload_device = src_model._select_base_offload_device + + +def _retarget_patcher(patcher: ModelPatcher, target_load_device, target_offload_device): + """Return a patcher whose actual model weights live on *target_load_device*. + + If *patcher* is already on *target_load_device* we just retarget the + (already-cloned) patcher's metadata in place. Otherwise we call + :meth:`ModelPatcher.deepclone_multigpu` to spawn a fresh model from + the loader's ``cached_patcher_init`` factory -- the only safe way to + move weights that may already be partially loaded onto another device. + + NOTE: reusing the input patcher's model when the requested device + matches its current load_device is a deliberate fast path. Anything + that has already mutated the original model (e.g. a prior KSampler + invocation on the same model) will be observed here. This is by + design and documented on the SelectXDeviceNode docstrings -- placing + Select X Device after a node that consumes the same model is not + recommended. + """ + if patcher.load_device == target_load_device: + # Fast path: weights already on the desired device, just update offload. 
+ patcher.offload_device = target_offload_device + return patcher + src_model = patcher.model + patcher = patcher.deepclone_multigpu(new_load_device=target_load_device) + patcher.offload_device = target_offload_device + _propagate_base_devices(src_model, patcher.model) + if hasattr(patcher, "register_load_device"): + patcher.register_load_device(patcher.load_device) + return patcher + + +def _apply_patcher_device(patcher: ModelPatcher, resolved, base_offload_override=None): + """Resolve the requested device and produce a patcher routed there. + + For "default" we restore the loader's original load/offload pair. + For CPU we pin both load and offload to CPU (and, on a dynamic + patcher, downgrade to a plain ModelPatcher so the dynamic-only + code paths are bypassed). + For an explicit GPU we keep the loader's original offload but + target the requested load device; if that differs from the current + load device the patcher is deepcloned onto the new device. + """ + _remember_base_devices(patcher) + base_load = patcher.model._select_base_load_device + base_offload = base_offload_override if base_offload_override is not None else patcher.model._select_base_offload_device + + if resolved is None: + # "default" -> route back to the loader's original devices. + return _retarget_patcher(patcher, base_load, base_offload) + if resolved.type == "cpu": + if patcher.is_dynamic(): + # clone(disable_dynamic=True) requires cached_patcher_init; let the + # exception surface to the caller (Select*DeviceNode.execute), which + # will translate it into a passthrough+log so unsupported loaders + # don't hard-fail the workflow. + patcher = patcher.clone(disable_dynamic=True) + patcher.load_device = resolved + patcher.offload_device = resolved + return patcher + return _retarget_patcher(patcher, resolved, base_offload) + + +def _prune_multigpu_collision(model: ModelPatcher, primary_device): + """Drop any multigpu clone whose load_device matches *primary_device*. 
+ + Without pruning, MultiGPU CFG Split would have stacked a clone on + the same device the primary now occupies (i.e. the workflow places + MultiGPU CFG Split before Select Model Device). Keeps the clone set + consistent with the new primary placement. + """ + multigpu_models = model.get_additional_models_with_key("multigpu") + if not multigpu_models: + return + filtered = [m for m in multigpu_models if m.load_device != primary_device] + if len(filtered) != len(multigpu_models): + logging.info(f"Select Model Device: pruning MultiGPU clone on {primary_device} that now collides with the primary model.") + model.set_additional_models("multigpu", filtered) + if hasattr(model, "match_multigpu_clones"): + model.match_multigpu_clones() + + +class SelectModelDeviceNode(io.ComfyNode): + """ + Place the diffusion model on a specific device (default / cpu / gpu:N). + + - "default" restores the device assigned by the loader (even after a + prior Select Model Device call). + - "cpu" pins both the load and offload device to CPU. + - "gpu:N" pins the load device to the Nth available GPU; the offload + device is restored to the loader's original choice. + + When the requested device differs from the device the input model is + already on, a fresh model is spawned via the loader's reload factory + (cached_patcher_init) so the new patcher owns independent weights on + the new device. Loaders that don't support multigpu (no factory) will + cause the node to pass through unchanged with a warning. + + If the workflow already has MultiGPU CFG Split applied and the chosen + GPU collides with one of the existing multigpu clones, that clone is + dropped so two patchers don't end up bound to the same device. + + When the selected device does not exist on the current machine + (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box), + the node passes the model through unchanged and logs a message + instead of failing. 
+ + NOTE: Placing Select Model Device *after* a node that has already + consumed the same model (e.g. a KSampler that ran on this model on + the original device) is not recommended -- any state the prior + consumer mutated on the original model will be observed when the + selected device matches the original (fast path). Place Select Model + Device before any consumer of the model. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SelectModelDevice", + display_name="Select Model Device", + category="advanced/multigpu", + description=cleandoc(cls.__doc__), + inputs=[ + io.Model.Input("model"), + io.Combo.Input("device", options=comfy.model_management.get_gpu_device_options()), + ], + outputs=[ + io.Model.Output(), + ], + ) + + @classmethod + def validate_inputs(cls, device="default"): + # Allow unknown gpu:N values so portable workflows do not error + # at validation time; runtime fallback will handle them. + return True + + @classmethod + def execute(cls, model: ModelPatcher, device: str = "default") -> io.NodeOutput: + model = model.clone() + resolved = comfy.model_management.resolve_gpu_device_option(device) + if resolved is None and device not in (None, "default"): + logging.info(f"Select Model Device: requested device '{device}' not available, passing through unchanged.") + return io.NodeOutput(model) + try: + model = _apply_patcher_device(model, resolved) + except RuntimeError as e: + logging.warning(f"Select Model Device: cannot retarget model, passing through unchanged. ({e})") + return io.NodeOutput(model) + if resolved is not None: + if resolved.type == "cpu": + _force_fp32_cpu_compute(model) + _prune_multigpu_collision(model, model.load_device) + return io.NodeOutput(model) + + +class SelectCLIPDeviceNode(io.ComfyNode): + """ + Place the CLIP text encoder on a specific device (default / cpu / gpu:N). + + - "default" restores the device assigned by the loader. + - "cpu" pins both the load and offload device to CPU. 
+ - "gpu:N" pins the load device to the Nth available GPU. + + When the selected device does not exist on the current machine + (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box), + the node passes the CLIP through unchanged and logs a message + instead of failing. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SelectCLIPDevice", + display_name="Select CLIP Device", + category="advanced/multigpu", + description=cleandoc(cls.__doc__), + inputs=[ + io.Clip.Input("clip"), + io.Combo.Input("device", options=comfy.model_management.get_gpu_device_options()), + ], + outputs=[ + io.Clip.Output(), + ], + ) + + @classmethod + def validate_inputs(cls, device="default"): + return True + + @classmethod + def execute(cls, clip: CLIP, device: str = "default") -> io.NodeOutput: + clip = clip.clone() + resolved = comfy.model_management.resolve_gpu_device_option(device) + if resolved is None and device not in (None, "default"): + logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.") + return io.NodeOutput(clip) + try: + clip.patcher = _apply_patcher_device(clip.patcher, resolved) + except RuntimeError as e: + logging.warning(f"Select CLIP Device: cannot retarget CLIP, passing through unchanged. ({e})") + return io.NodeOutput(clip) + + +class SelectVAEDeviceNode(io.ComfyNode): + """ + Place the VAE on a specific device (default / gpu:N). + + - "default" restores the device assigned by the loader. + - "gpu:N" pins the load device to the Nth available GPU; the offload + device is set to the standard VAE offload device. + + CPU is intentionally not exposed in the UI for the VAE; if a workflow + supplies "cpu" anyway (e.g. opened from another machine), the request + is dropped with a log message and the VAE is passed through unchanged. + + When the selected device does not exist on the current machine + (e.g. 
a workflow built on a 2-GPU box opened on a 1-GPU box), + the node passes the VAE through unchanged and logs a message + instead of failing. + """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="SelectVAEDevice", + display_name="Select VAE Device", + category="advanced/multigpu", + description=cleandoc(cls.__doc__), + inputs=[ + io.Vae.Input("vae"), + io.Combo.Input("device", options=comfy.model_management.get_gpu_device_options_no_cpu()), + ], + outputs=[ + io.Vae.Output(), + ], + ) + + @classmethod + def validate_inputs(cls, device="default"): + return True + + @classmethod + def execute(cls, vae: VAE, device: str = "default") -> io.NodeOutput: + # VAE has no .clone(); shallow-copy the wrapper and clone the patcher + # so we can retarget load/offload device without affecting the input VAE. + vae = copy.copy(vae) + vae.patcher = vae.patcher.clone() + resolved = comfy.model_management.resolve_gpu_device_option(device) + if resolved is None and device not in (None, "default"): + logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.") + return io.NodeOutput(vae) + if resolved is not None and resolved.type == "cpu": + logging.info("Select VAE Device: CPU is not a supported choice, passing through unchanged.") + return io.NodeOutput(vae) + if not hasattr(vae, "_select_base_device"): + vae._select_base_device = vae.device + try: + vae.patcher = _apply_patcher_device( + vae.patcher, resolved, + base_offload_override=comfy.model_management.vae_offload_device(), + ) + except RuntimeError as e: + logging.warning(f"Select VAE Device: cannot retarget VAE, passing through unchanged. ({e})") + return io.NodeOutput(vae) + # Keep VAE wrapper in sync with whatever model the patcher now owns; + # deepclone_multigpu may have produced a fresh first_stage_model. 
+ vae.first_stage_model = vae.patcher.model + vae.device = vae._select_base_device if resolved is None else resolved + return io.NodeOutput(vae) + + +class MultiGPUOptionsNode(io.ComfyNode): + """ + Select the relative speed of GPUs in the special case they have significantly different performance from one another. + + NOTE (not registered yet, see MultiGPUExtension.get_node_list below): + The output GPUOptionsGroup is plumbed through create_multigpu_deepclones() and stored on + model.model_options['multigpu_options'] via GPUOptionsGroup.register(), but the cond + scheduler in comfy/samplers.py (calc_cond_batch_outer_multigpu) does NOT yet consult + relative_speed when distributing conds across devices; it uses a uniform conds_per_device + round-robin via next_available_device(). Before re-enabling this node, wire its + relative_speed into the scheduler (e.g. via comfy.multigpu.load_balance_devices(), + which already implements the proportional split) so the input actually affects work + distribution. 
+ """ + + @classmethod + def define_schema(cls): + return io.Schema( + node_id="MultiGPU_Options", + display_name="MultiGPU Options", + category="advanced/multigpu", + description=cleandoc(cls.__doc__), + inputs=[ + io.Int.Input("device_index", default=0, min=0, max=64), + io.Float.Input("relative_speed", default=1.0, min=0.0, step=0.01), + io.Custom("GPU_OPTIONS").Input("gpu_options", optional=True), + ], + outputs=[ + io.Custom("GPU_OPTIONS").Output(), + ], + ) + + @classmethod + def execute(cls, device_index: int, relative_speed: float, gpu_options: comfy.multigpu.GPUOptionsGroup = None) -> io.NodeOutput: + if not gpu_options: + gpu_options = comfy.multigpu.GPUOptionsGroup() + else: + gpu_options = gpu_options.clone() + + opt = comfy.multigpu.GPUOptions(device_index=device_index, relative_speed=relative_speed) + gpu_options.add(opt) + + return io.NodeOutput(gpu_options) + + +class MultiGPUExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [ + MultiGPUCFGSplitNode, + SelectModelDeviceNode, + SelectCLIPDeviceNode, + SelectVAEDeviceNode, + # MultiGPUOptionsNode, + ] + + +async def comfy_entrypoint() -> MultiGPUExtension: + return MultiGPUExtension() diff --git a/main.py b/main.py index 26d523c30..bce451a83 100644 --- a/main.py +++ b/main.py @@ -218,7 +218,7 @@ import comfy.model_patcher if args.enable_dynamic_vram or (enables_dynamic_vram() and comfy.model_management.is_nvidia() and not comfy.model_management.is_wsl()): if (not args.enable_dynamic_vram) and (comfy.model_management.torch_version_numeric < (2, 8)): logging.warning("Unsupported Pytorch detected. DynamicVRAM support requires Pytorch version 2.8 or later. Falling back to legacy ModelPatcher. 
VRAM estimates may be unreliable especially on Windows") - elif comfy_aimdo.control.init_device(comfy.model_management.get_torch_device().index): + elif comfy_aimdo.control.init_devices(d.index for d in comfy.model_management.get_all_torch_devices()): if args.verbose == 'DEBUG': comfy_aimdo.control.set_log_debug() elif args.verbose == 'CRITICAL': diff --git a/nodes.py b/nodes.py index 13e46ac8a..fd4365c90 100644 --- a/nodes.py +++ b/nodes.py @@ -795,6 +795,7 @@ class VAELoader: #TODO: scale factor? def load_vae(self, vae_name): metadata = None + vae_path = None if vae_name == "pixel_space": sd = {} sd["pixel_space_vae"] = torch.tensor(1.0) @@ -813,6 +814,14 @@ class VAELoader: metadata["tae_latent_channels"] = 128 vae = comfy.sd.VAE(sd=sd, metadata=metadata) vae.throw_exception_if_invalid() + # Register a reload factory on the patcher so multigpu deepclones + # (Select VAE Device, future MultiGPU VAE work-units) can produce + # per-device clones from the same loader context. Only set when we + # actually have a single backing file -- pixel_space and the + # image TAESDs (composed from separate encoder/decoder files via + # load_taesd) are not addressable by a single vae_path. 
+ if vae_path is not None: + vae.patcher.cached_patcher_init = (comfy.sd.load_vae_patcher, (vae_path, metadata, None)) return (vae,) class ControlNetLoader: @@ -2389,6 +2398,7 @@ async def init_builtin_extra_nodes(): "nodes_lt_audio.py", "nodes_lt.py", "nodes_hooks.py", + "nodes_multigpu.py", "nodes_load_3d.py", "nodes_cosmos.py", "nodes_video.py", diff --git a/server.py b/server.py index 44470b904..268441bd1 100644 --- a/server.py +++ b/server.py @@ -646,18 +646,37 @@ class PromptServer(): @routes.get("/system_stats") async def system_stats(request): - device = comfy.model_management.get_torch_device() - device_name = comfy.model_management.get_torch_device_name(device) + primary_device = comfy.model_management.get_torch_device() cpu_device = comfy.model_management.torch.device("cpu") ram_total = comfy.model_management.get_total_memory(cpu_device) ram_free = comfy.model_management.get_free_memory(cpu_device) - vram_total, torch_vram_total = comfy.model_management.get_total_memory(device, torch_total_too=True) - vram_free, torch_vram_free = comfy.model_management.get_free_memory(device, torch_free_too=True) required_frontend_version = FrontendManager.get_required_frontend_version() installed_templates_version = FrontendManager.get_installed_templates_version() required_templates_version = FrontendManager.get_required_templates_version() comfy_package_versions = FrontendManager.get_comfy_package_versions() + # Report every torch device visible to multigpu, with the primary + # device first so existing clients that read devices[0] keep working. 
+ torch_devices = comfy.model_management.get_all_torch_devices() + if primary_device in torch_devices: + torch_devices = [primary_device] + [d for d in torch_devices if d != primary_device] + else: + torch_devices = [primary_device] + list(torch_devices) + + device_entries = [] + for d in torch_devices: + vram_total, torch_vram_total = comfy.model_management.get_total_memory(d, torch_total_too=True) + vram_free, torch_vram_free = comfy.model_management.get_free_memory(d, torch_free_too=True) + device_entries.append({ + "name": comfy.model_management.get_torch_device_name(d), + "type": d.type, + "index": d.index, + "vram_total": vram_total, + "vram_free": vram_free, + "torch_vram_total": torch_vram_total, + "torch_vram_free": torch_vram_free, + }) + system_stats = { "system": { "os": sys.platform, @@ -673,17 +692,7 @@ class PromptServer(): "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded", "argv": sys.argv }, - "devices": [ - { - "name": device_name, - "type": device.type, - "index": device.index, - "vram_total": vram_total, - "vram_free": vram_free, - "torch_vram_total": torch_vram_total, - "torch_vram_free": torch_vram_free, - } - ] + "devices": device_entries } return web.json_response(system_stats) From da49b7d0b6a183e4b8e1520ac73fdae0e90cfb89 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Mon, 25 May 2026 19:23:29 -0700 Subject: [PATCH 04/13] Remove useless annotations imports. 
(#14105) --- app/assets/services/metadata_extract.py | 1 - app/custom_node_manager.py | 2 -- app/frontend_management.py | 1 - app/model_manager.py | 2 -- app/user_manager.py | 1 - comfy/comfy_types/node_typing.py | 1 - comfy/ldm/lightricks/vae/causal_audio_autoencoder.py | 1 - comfy/ldm/lightricks/vae/causal_video_autoencoder.py | 1 - comfy/ldm/lumina/model.py | 1 - comfy/ldm/moge/geometry.py | 1 - comfy/ldm/moge/model.py | 1 - comfy/ldm/moge/modules.py | 1 - comfy/ldm/moge/panorama.py | 1 - comfy/lora.py | 1 - comfy/patcher_extension.py | 1 - comfy/sd.py | 1 - comfy_api/latest/__init__.py | 2 -- comfy_api/latest/_input_impl/video_types.py | 1 - comfy_api/latest/_util/video_types.py | 1 - comfy_api_nodes/apis/__init__.py | 1 - comfy_api_nodes/apis/bfl.py | 2 -- comfy_api_nodes/apis/stability.py | 2 -- comfy_execution/graph.py | 1 - comfy_execution/progress.py | 2 -- comfy_execution/validation.py | 1 - comfy_extras/mediapipe/face_geometry.py | 1 - comfy_extras/mediapipe/face_landmarker.py | 1 - comfy_extras/nodes_audio.py | 2 -- comfy_extras/nodes_context_windows.py | 1 - comfy_extras/nodes_curve.py | 2 -- comfy_extras/nodes_images.py | 2 -- comfy_extras/nodes_logic.py | 1 - comfy_extras/nodes_math.py | 1 - comfy_extras/nodes_mediapipe.py | 1 - comfy_extras/nodes_moge.py | 1 - comfy_extras/nodes_number_convert.py | 1 - comfy_extras/nodes_painter.py | 2 -- comfy_extras/nodes_resolution.py | 1 - comfy_extras/nodes_toolkit.py | 1 - comfy_extras/nodes_video.py | 2 -- folder_paths.py | 2 -- nodes.py | 1 - 42 files changed, 54 deletions(-) diff --git a/app/assets/services/metadata_extract.py b/app/assets/services/metadata_extract.py index a004929bc..bdfe60218 100644 --- a/app/assets/services/metadata_extract.py +++ b/app/assets/services/metadata_extract.py @@ -4,7 +4,6 @@ Tier 1: Filesystem metadata (zero parsing) Tier 2: Safetensors header metadata (fast JSON read only) """ -from __future__ import annotations import json import logging diff --git 
a/app/custom_node_manager.py b/app/custom_node_manager.py index 281febca9..738af2abd 100644 --- a/app/custom_node_manager.py +++ b/app/custom_node_manager.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import os import folder_paths import glob diff --git a/app/frontend_management.py b/app/frontend_management.py index 483da2d29..8e84e8dd9 100644 --- a/app/frontend_management.py +++ b/app/frontend_management.py @@ -1,4 +1,3 @@ -from __future__ import annotations import argparse import logging import os diff --git a/app/model_manager.py b/app/model_manager.py index f124d1117..8f6e34b33 100644 --- a/app/model_manager.py +++ b/app/model_manager.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import os import base64 import json diff --git a/app/user_manager.py b/app/user_manager.py index 0517b3344..7b11e381c 100644 --- a/app/user_manager.py +++ b/app/user_manager.py @@ -1,4 +1,3 @@ -from __future__ import annotations import json import os import re diff --git a/comfy/comfy_types/node_typing.py b/comfy/comfy_types/node_typing.py index 57126fa4a..bb21eb1d1 100644 --- a/comfy/comfy_types/node_typing.py +++ b/comfy/comfy_types/node_typing.py @@ -1,6 +1,5 @@ """Comfy-specific type hinting""" -from __future__ import annotations from typing import Literal, TypedDict, Optional from typing_extensions import NotRequired from abc import ABC, abstractmethod diff --git a/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py b/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py index b556b128f..58b67d45a 100644 --- a/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py +++ b/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py @@ -1,4 +1,3 @@ -from __future__ import annotations import torch from torch import nn from torch.nn import functional as F diff --git a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py index 998122c85..5975015e2 100644 --- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py +++ 
b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py @@ -1,4 +1,3 @@ -from __future__ import annotations import threading import torch from torch import nn diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py index 9e432d5c0..d0ee97d33 100644 --- a/comfy/ldm/lumina/model.py +++ b/comfy/ldm/lumina/model.py @@ -1,5 +1,4 @@ # Code from: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py -from __future__ import annotations from typing import List, Optional, Tuple diff --git a/comfy/ldm/moge/geometry.py b/comfy/ldm/moge/geometry.py index 7fdc97871..d1a1e445f 100644 --- a/comfy/ldm/moge/geometry.py +++ b/comfy/ldm/moge/geometry.py @@ -1,6 +1,5 @@ """Pure-torch + scipy geometry helpers for MoGe inference and mesh export.""" -from __future__ import annotations from typing import Optional, Tuple diff --git a/comfy/ldm/moge/model.py b/comfy/ldm/moge/model.py index 6876c4af2..1695626bc 100644 --- a/comfy/ldm/moge/model.py +++ b/comfy/ldm/moge/model.py @@ -4,7 +4,6 @@ V1: DINOv2 backbone + multi-output head (points, mask). V2: DINOv2 encoder + neck + per-output heads (points, mask, normal, optional metric-scale MLP). """ -from __future__ import annotations from numbers import Number from typing import Any, Dict, List, Optional, Tuple, Union diff --git a/comfy/ldm/moge/modules.py b/comfy/ldm/moge/modules.py index 235a59212..f6443d65a 100644 --- a/comfy/ldm/moge/modules.py +++ b/comfy/ldm/moge/modules.py @@ -1,6 +1,5 @@ """Building blocks for MoGe: residual conv stack, resamplers, MLP, DINOv2 encoder, v1 head.""" -from __future__ import annotations from typing import List, Optional, Sequence, Tuple, Union diff --git a/comfy/ldm/moge/panorama.py b/comfy/ldm/moge/panorama.py index de53ebe68..18d0cb665 100644 --- a/comfy/ldm/moge/panorama.py +++ b/comfy/ldm/moge/panorama.py @@ -6,7 +6,6 @@ equirect distance map via a multi-scale Poisson + gradient sparse solve. Image sampling uses F.grid_sample (GPU); the sparse solve uses lsmr (CPU). 
""" -from __future__ import annotations from typing import Callable, List, Optional, Tuple diff --git a/comfy/lora.py b/comfy/lora.py index c0e8b865c..4e0ea29e0 100644 --- a/comfy/lora.py +++ b/comfy/lora.py @@ -16,7 +16,6 @@ along with this program. If not, see . """ -from __future__ import annotations import comfy.memory_management import comfy.utils import comfy.model_management diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py index 4b276b175..189ee84ca 100644 --- a/comfy/patcher_extension.py +++ b/comfy/patcher_extension.py @@ -1,4 +1,3 @@ -from __future__ import annotations from typing import Callable class CallbacksMP: diff --git a/comfy/sd.py b/comfy/sd.py index 084170c62..a4e49763a 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -1,4 +1,3 @@ -from __future__ import annotations import json import torch from enum import Enum diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py index 04973fea0..e0a585b10 100644 --- a/comfy_api/latest/__init__.py +++ b/comfy_api/latest/__init__.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from abc import ABC, abstractmethod from typing import TYPE_CHECKING from comfy_api.internal import ComfyAPIBase diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py index 942278d88..99e67d363 100644 --- a/comfy_api/latest/_input_impl/video_types.py +++ b/comfy_api/latest/_input_impl/video_types.py @@ -1,4 +1,3 @@ -from __future__ import annotations from av.container import InputContainer from av.subtitles.stream import SubtitleStream from fractions import Fraction diff --git a/comfy_api/latest/_util/video_types.py b/comfy_api/latest/_util/video_types.py index c92477f08..6c9d6a526 100644 --- a/comfy_api/latest/_util/video_types.py +++ b/comfy_api/latest/_util/video_types.py @@ -1,4 +1,3 @@ -from __future__ import annotations from dataclasses import dataclass from enum import Enum from fractions import Fraction diff --git 
a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py index 46a583b5e..9c4cfb9b6 100644 --- a/comfy_api_nodes/apis/__init__.py +++ b/comfy_api_nodes/apis/__init__.py @@ -3,7 +3,6 @@ # timestamp: 2025-07-30T08:54:00+00:00 # pylint: disable -from __future__ import annotations from datetime import date, datetime from enum import Enum diff --git a/comfy_api_nodes/apis/bfl.py b/comfy_api_nodes/apis/bfl.py index d8d3557b3..f0665fa09 100644 --- a/comfy_api_nodes/apis/bfl.py +++ b/comfy_api_nodes/apis/bfl.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from enum import Enum from typing import Any, Dict, Optional diff --git a/comfy_api_nodes/apis/stability.py b/comfy_api_nodes/apis/stability.py index 718360187..5b9b5ac7d 100644 --- a/comfy_api_nodes/apis/stability.py +++ b/comfy_api_nodes/apis/stability.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from enum import Enum from typing import Optional diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py index c47f3c79b..479ee8a53 100644 --- a/comfy_execution/graph.py +++ b/comfy_execution/graph.py @@ -1,4 +1,3 @@ -from __future__ import annotations from typing import Type, Literal import nodes diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py index f951a3350..731b8dc66 100644 --- a/comfy_execution/progress.py +++ b/comfy_execution/progress.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import TypedDict, Dict, Optional, Tuple from typing_extensions import override from PIL import Image diff --git a/comfy_execution/validation.py b/comfy_execution/validation.py index e73624bd1..ae9a2376c 100644 --- a/comfy_execution/validation.py +++ b/comfy_execution/validation.py @@ -1,4 +1,3 @@ -from __future__ import annotations from comfy_api.latest import IO diff --git a/comfy_extras/mediapipe/face_geometry.py b/comfy_extras/mediapipe/face_geometry.py index 04b2b0557..4f3813430 100644 --- a/comfy_extras/mediapipe/face_geometry.py +++ 
b/comfy_extras/mediapipe/face_geometry.py @@ -2,7 +2,6 @@ + weighted Procrustes solver. Computes the 4x4 facial transformation matrix. """ -from __future__ import annotations import math import numpy as np diff --git a/comfy_extras/mediapipe/face_landmarker.py b/comfy_extras/mediapipe/face_landmarker.py index a792b6046..e6b463c4c 100644 --- a/comfy_extras/mediapipe/face_landmarker.py +++ b/comfy_extras/mediapipe/face_landmarker.py @@ -1,7 +1,6 @@ """Pure-PyTorch port of MediaPipe's face_landmarker_v2_with_blendshapes.task: BlazeFace detector → FaceMesh v2 → ARKit-52 blendshapes.""" -from __future__ import annotations import math from functools import lru_cache diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py index d5084497e..f09a8a874 100644 --- a/comfy_extras/nodes_audio.py +++ b/comfy_extras/nodes_audio.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import av import torchaudio import torch diff --git a/comfy_extras/nodes_context_windows.py b/comfy_extras/nodes_context_windows.py index f7ca833dc..24729c3a7 100644 --- a/comfy_extras/nodes_context_windows.py +++ b/comfy_extras/nodes_context_windows.py @@ -1,4 +1,3 @@ -from __future__ import annotations from comfy_api.latest import ComfyExtension, io import comfy.context_windows import nodes diff --git a/comfy_extras/nodes_curve.py b/comfy_extras/nodes_curve.py index 9803e8034..099453131 100644 --- a/comfy_extras/nodes_curve.py +++ b/comfy_extras/nodes_curve.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import numpy as np from comfy_api.latest import ComfyExtension, io diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py index 33933229d..fe6008aa3 100644 --- a/comfy_extras/nodes_images.py +++ b/comfy_extras/nodes_images.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import nodes import folder_paths diff --git a/comfy_extras/nodes_logic.py b/comfy_extras/nodes_logic.py index 342cadb69..92507f1fc 100644 --- a/comfy_extras/nodes_logic.py +++ 
b/comfy_extras/nodes_logic.py @@ -1,4 +1,3 @@ -from __future__ import annotations from typing import TypedDict from typing_extensions import override from comfy_api.latest import ComfyExtension, io diff --git a/comfy_extras/nodes_math.py b/comfy_extras/nodes_math.py index 06aefa475..0040d1a92 100644 --- a/comfy_extras/nodes_math.py +++ b/comfy_extras/nodes_math.py @@ -4,7 +4,6 @@ Provides a ComfyMathExpression node that evaluates math expressions against dynamically-grown numeric inputs. """ -from __future__ import annotations import math import string diff --git a/comfy_extras/nodes_mediapipe.py b/comfy_extras/nodes_mediapipe.py index 6b7916aee..32dc22de3 100644 --- a/comfy_extras/nodes_mediapipe.py +++ b/comfy_extras/nodes_mediapipe.py @@ -10,7 +10,6 @@ Custom IO types: MediaPipeFaceLandmarker also emits the core BOUNDING_BOX type — pair with DrawBBoxes. """ -from __future__ import annotations import numpy as np import torch diff --git a/comfy_extras/nodes_moge.py b/comfy_extras/nodes_moge.py index 3508781a0..79aec5d7f 100644 --- a/comfy_extras/nodes_moge.py +++ b/comfy_extras/nodes_moge.py @@ -1,6 +1,5 @@ """ComfyUI nodes for the native MoGe (Monocular Geometry Estimation) integration.""" -from __future__ import annotations import torch diff --git a/comfy_extras/nodes_number_convert.py b/comfy_extras/nodes_number_convert.py index e38a33c15..01593b6e6 100644 --- a/comfy_extras/nodes_number_convert.py +++ b/comfy_extras/nodes_number_convert.py @@ -4,7 +4,6 @@ Provides a single node that converts INT, FLOAT, STRING, and BOOL inputs into FLOAT and INT outputs. 
""" -from __future__ import annotations import math diff --git a/comfy_extras/nodes_painter.py b/comfy_extras/nodes_painter.py index e104c8480..df7a0b76a 100644 --- a/comfy_extras/nodes_painter.py +++ b/comfy_extras/nodes_painter.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import hashlib import os diff --git a/comfy_extras/nodes_resolution.py b/comfy_extras/nodes_resolution.py index 520b4067e..1628038cc 100644 --- a/comfy_extras/nodes_resolution.py +++ b/comfy_extras/nodes_resolution.py @@ -1,4 +1,3 @@ -from __future__ import annotations import math from enum import Enum from typing_extensions import override diff --git a/comfy_extras/nodes_toolkit.py b/comfy_extras/nodes_toolkit.py index ae802896b..0548a0cf8 100644 --- a/comfy_extras/nodes_toolkit.py +++ b/comfy_extras/nodes_toolkit.py @@ -1,4 +1,3 @@ -from __future__ import annotations from typing_extensions import override from comfy_api.latest import ComfyExtension, io diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py index 78a2a28f8..ae1d826d5 100644 --- a/comfy_extras/nodes_video.py +++ b/comfy_extras/nodes_video.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import os import av import torch diff --git a/folder_paths.py b/folder_paths.py index 36d61fcd0..7304e1b73 100644 --- a/folder_paths.py +++ b/folder_paths.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import os import time import mimetypes diff --git a/nodes.py b/nodes.py index fd4365c90..669a7057b 100644 --- a/nodes.py +++ b/nodes.py @@ -1,4 +1,3 @@ -from __future__ import annotations import torch From 88956e77af4e62f29b820582927d61a5be88e956 Mon Sep 17 00:00:00 2001 From: Jedrzej Kosinski Date: Mon, 25 May 2026 20:03:37 -0700 Subject: [PATCH 05/13] multigpu: use unet_manual_cast for SelectModelDevice compute dtype (#14108) --- comfy_extras/nodes_multigpu.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/comfy_extras/nodes_multigpu.py 
b/comfy_extras/nodes_multigpu.py index 2bd752b7d..d2f6fe67a 100644 --- a/comfy_extras/nodes_multigpu.py +++ b/comfy_extras/nodes_multigpu.py @@ -48,17 +48,14 @@ class MultiGPUCFGSplitNode(io.ComfyNode): return io.NodeOutput(model) -def _force_fp32_cpu_compute(patcher: ModelPatcher): - """Force fp32 inference dtype for CPU. - - PyTorch's CPU conv2d kernels fall back to software emulation for fp16/bf16 - and run ~500-600x slower than fp32, which makes a normal-sized workflow - look frozen for hours. Routing through set_model_compute_dtype leaves the - weights as-is and casts at use, so peak memory does not blow up.""" - dtype = patcher.model_dtype() - if dtype in (torch.float16, torch.bfloat16): - logging.info(f"Select Model Device: using fp32 compute dtype for CPU inference (model dtype was {dtype}).") - patcher.set_model_compute_dtype(torch.float32) +def _force_supported_compute_dtype(patcher: ModelPatcher, device: torch.device): + """Cast compute dtype to one the device supports; no-op if already supported.""" + weight_dtype = patcher.model_dtype() + cast_dtype = comfy.model_management.unet_manual_cast(weight_dtype, device) + if cast_dtype is None: + return + logging.info(f"Select Model Device: using {cast_dtype} compute dtype on {device} (model weight dtype was {weight_dtype}).") + patcher.set_model_compute_dtype(cast_dtype) def _remember_base_devices(patcher: ModelPatcher): @@ -229,8 +226,7 @@ class SelectModelDeviceNode(io.ComfyNode): logging.warning(f"Select Model Device: cannot retarget model, passing through unchanged. 
({e})") return io.NodeOutput(model) if resolved is not None: - if resolved.type == "cpu": - _force_fp32_cpu_compute(model) + _force_supported_compute_dtype(model, resolved) _prune_multigpu_collision(model, model.load_device) return io.NodeOutput(model) From 57414dadfe732b8c37754a9680c39c7fb6691437 Mon Sep 17 00:00:00 2001 From: Ivan Zorin Date: Tue, 26 May 2026 06:07:09 +0300 Subject: [PATCH 06/13] fix: cross-attention AdaLN scale, shift, sigma parameters calculation (#14097) --- comfy/ldm/lightricks/av_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/comfy/ldm/lightricks/av_model.py b/comfy/ldm/lightricks/av_model.py index bc09fb77e..ef9938465 100644 --- a/comfy/ldm/lightricks/av_model.py +++ b/comfy/ldm/lightricks/av_model.py @@ -767,25 +767,25 @@ class LTXAVModel(LTXVModel): # Cross-attention timesteps - compress these too av_ca_audio_scale_shift_timestep, _ = self.av_ca_audio_scale_shift_adaln_single( - timestep.max().expand_as(a_timestep_flat), + a_timestep_flat, {"resolution": None, "aspect_ratio": None}, batch_size=batch_size, hidden_dtype=hidden_dtype, ) av_ca_video_scale_shift_timestep, _ = self.av_ca_video_scale_shift_adaln_single( - a_timestep.max().expand_as(timestep_flat), + timestep_flat, {"resolution": None, "aspect_ratio": None}, batch_size=batch_size, hidden_dtype=hidden_dtype, ) av_ca_a2v_gate_noise_timestep, _ = self.av_ca_a2v_gate_adaln_single( - a_timestep.max().expand_as(timestep_flat) * av_ca_factor, + a_timestep_scaled.max().expand_as(timestep_flat) * av_ca_factor, {"resolution": None, "aspect_ratio": None}, batch_size=batch_size, hidden_dtype=hidden_dtype, ) av_ca_v2a_gate_noise_timestep, _ = self.av_ca_v2a_gate_adaln_single( - timestep.max().expand_as(a_timestep_flat) * av_ca_factor, + timestep_scaled.max().expand_as(a_timestep_flat) * av_ca_factor, {"resolution": None, "aspect_ratio": None}, batch_size=batch_size, hidden_dtype=hidden_dtype, From 41812fa0ac67455391a3482f0dab111c858726ec Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Tue, 26 May 2026 09:01:51 +0300 Subject: [PATCH 07/13] feat: Microsoft Lens support (CORE-248) (#14077) --- comfy/ldm/lens/model.py | 513 ++++++++++++++++++++++++++++ comfy/model_base.py | 22 ++ comfy/model_detection.py | 24 ++ comfy/ops.py | 476 +++++++++++++++----------- comfy/sd.py | 10 + comfy/supported_models.py | 43 +++ comfy/text_encoders/gpt_oss.py | 600 +++++++++++++++++++++++++++++++++ comfy_extras/nodes_cfg.py | 49 ++- nodes.py | 4 +- 9 files changed, 1533 insertions(+), 208 deletions(-) create mode 100644 comfy/ldm/lens/model.py create mode 100644 comfy/text_encoders/gpt_oss.py diff --git a/comfy/ldm/lens/model.py b/comfy/ldm/lens/model.py new file mode 100644 index 000000000..7bff7f6af --- /dev/null +++ b/comfy/ldm/lens/model.py @@ -0,0 +1,513 @@ +"""Lens denoising transformer (DiT)""" + +from __future__ import annotations + +from typing import Any, Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ldm.flux.layers +import comfy.patcher_extension +from comfy.ldm.flux.layers import EmbedND +from comfy.ldm.flux.math import apply_rope +from comfy.ldm.modules.attention import optimized_attention + + +def _lens_time_proj(t: torch.Tensor, dim: int = 256) -> torch.Tensor: + return comfy.ldm.flux.layers.timestep_embedding(t, dim) + + +def _lens_position_ids( + frame: int, height: int, width: int, text_seq_len: int, + scale_rope: bool = True, device=None, +) -> torch.Tensor: + """Lens axial (frame, h, w) position ids for joint image + text sequence. + + With ``scale_rope=True`` h/w are centered around 0 (negative + positive + halves) and text starts at ``max(h//2, w//2)``. Result shape ``[seq, 3]``; + caller adds a batch dim for ``EmbedND``. 
+ """ + if scale_rope: + h_pos = torch.cat([torch.arange(-(height - height // 2), 0, device=device), + torch.arange(0, height // 2, device=device)]) + w_pos = torch.cat([torch.arange(-(width - width // 2), 0, device=device), + torch.arange(0, width // 2, device=device)]) + text_start = max(height // 2, width // 2) + else: + h_pos = torch.arange(height, device=device) + w_pos = torch.arange(width, device=device) + text_start = max(height, width) + + f_pos = torch.arange(frame, device=device) + img_ids = torch.zeros(frame, height, width, 3, device=device) + img_ids[..., 0] = f_pos[:, None, None] + img_ids[..., 1] = h_pos[None, :, None] + img_ids[..., 2] = w_pos[None, None, :] + img_ids = img_ids.reshape(-1, 3) + + # Text positions replicate across all 3 axes (matches original packing). + txt_pos = torch.arange(text_start, text_start + text_seq_len, device=device).float() + txt_ids = txt_pos[:, None].expand(text_seq_len, 3) + + return torch.cat([img_ids, txt_ids], dim=0) + + +class _TimestepEmbedder(nn.Module): + def __init__(self, in_channels: int, time_embed_dim: int, dtype=None, device=None, operations=None) -> None: + super().__init__() + self.linear_1 = operations.Linear(in_channels, time_embed_dim, dtype=dtype, device=device) + self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.linear_1(x) + x = F.silu(x) + return self.linear_2(x) + + +class LensTimestepProjEmbeddings(nn.Module): + def __init__(self, embedding_dim: int, dtype=None, device=None, operations=None) -> None: + super().__init__() + self.timestep_embedder = _TimestepEmbedder(256, embedding_dim, dtype=dtype, device=device, operations=operations) + + def forward(self, timestep: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor: + proj = _lens_time_proj(timestep, 256) + return self.timestep_embedder(proj.to(dtype=hidden_states.dtype)) + + +class GateMLP(nn.Module): + """SwiGLU MLP.""" + + 
def __init__(self, dim: int, hidden_dim: int, dtype=None, device=None, operations=None) -> None: + super().__init__() + self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device) + self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device) + self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device) + + def forward(self, x): + return self.w2(F.silu(self.w1(x), inplace=True).mul_(self.w3(x))) + + +class LensJointAttention(nn.Module): + """Joint image+text attention with fused QKV per stream.""" + + def __init__( + self, + query_dim: int, + added_kv_proj_dim: int, + dim_head: int = 64, + heads: int = 8, + out_dim: Optional[int] = None, + eps: float = 1e-5, + dtype=None, + device=None, + operations=None, + ) -> None: + super().__init__() + self.inner_dim = out_dim if out_dim is not None else dim_head * heads + self.heads = self.inner_dim // dim_head + self.dim_head = dim_head + self.out_dim = out_dim if out_dim is not None else query_dim + + self.norm_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device) + self.norm_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device) + self.norm_added_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device) + self.norm_added_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device) + + self.img_qkv = operations.Linear(query_dim, 3 * self.inner_dim, bias=True, dtype=dtype, device=device) + self.txt_qkv = operations.Linear(added_kv_proj_dim, 3 * self.inner_dim, bias=True, dtype=dtype, device=device) + + # ModuleList([Linear, Identity]) for state-dict key compatibility. 
+ self.to_out = nn.ModuleList([ + operations.Linear(self.inner_dim, self.out_dim, bias=True, dtype=dtype, device=device), + nn.Identity(), + ]) + self.to_add_out = operations.Linear(self.inner_dim, query_dim, bias=True, dtype=dtype, device=device) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + freqs_cis: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + transformer_options: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + bsz, seq_img, _ = hidden_states.shape + seq_txt = encoder_hidden_states.shape[1] + + # image stream + img_qkv = self.img_qkv(hidden_states).view(bsz, seq_img, 3, self.heads, self.dim_head) + img_q, img_k, img_v = img_qkv.unbind(dim=2) + img_q = self.norm_q(img_q) + img_k = self.norm_k(img_k) + img_v = img_v.contiguous() + del img_qkv + + # text stream + txt_qkv = self.txt_qkv(encoder_hidden_states).view(bsz, seq_txt, 3, self.heads, self.dim_head) + txt_q, txt_k, txt_v = txt_qkv.unbind(dim=2) + txt_q = self.norm_added_q(txt_q) + txt_k = self.norm_added_k(txt_k) + txt_v = txt_v.contiguous() + del txt_qkv + + # [B, S, H, D] → [B, H, S, D] for attention, dels to avoid VRAM peaks + q = torch.cat([img_q, txt_q], dim=1).transpose(1, 2) + del img_q, txt_q + k = torch.cat([img_k, txt_k], dim=1).transpose(1, 2) + del img_k, txt_k + v = torch.cat([img_v, txt_v], dim=1).transpose(1, 2) + del img_v, txt_v + + q, k = apply_rope(q, k, freqs_cis) + + if attention_mask is not None: + expected = (bsz, 1, 1, seq_img + seq_txt) + if attention_mask.shape != expected: + raise ValueError( + f"attention_mask must be {expected}, got {tuple(attention_mask.shape)}" + ) + attention_mask = attention_mask.to(q.dtype) + + out = optimized_attention( + q, k, v, self.heads, mask=attention_mask, skip_reshape=True, + transformer_options=transformer_options, + ) + + img_out = self.to_out[1](self.to_out[0](out[:, :seq_img, :])) + txt_out = self.to_add_out(out[:, seq_img:, :]) + return img_out, 
txt_out + + +class LensTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + eps: float = 1e-6, + rms_norm: bool = True, + dtype=None, + device=None, + operations=None, + ) -> None: + super().__init__() + + self.attn = LensJointAttention( + query_dim=dim, + added_kv_proj_dim=dim, + dim_head=attention_head_dim, + heads=num_attention_heads, + out_dim=dim, + eps=1e-5, + dtype=dtype, + device=device, + operations=operations, + ) + + if rms_norm: + NormCls = operations.RMSNorm + norm_kwargs = {} + else: + NormCls = operations.LayerNorm + norm_kwargs = {"elementwise_affine": False} + + mlp_hidden = int(dim / 3 * 8) + + # Sequential(SiLU, Linear) so state-dict lands at img_mod.1.{weight,bias}. + self.img_mod = nn.Sequential( + nn.SiLU(), + operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device), + ) + self.img_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs) + self.img_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs) + self.img_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations) + + self.txt_mod = nn.Sequential( + nn.SiLU(), + operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device), + ) + self.txt_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs) + self.txt_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs) + self.txt_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations) + + @staticmethod + def _modulate(x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + shift, scale, gate = mod_params.chunk(3, dim=-1) + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + temb: torch.Tensor, + freqs_cis: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + transformer_options: 
Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + img_mod1, img_mod2 = self.img_mod(temb).chunk(2, dim=-1) + txt_mod1, txt_mod2 = self.txt_mod(temb).chunk(2, dim=-1) + + img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1) + txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1) + + img_attn, txt_attn = self.attn( + hidden_states=img_modulated, + encoder_hidden_states=txt_modulated, + freqs_cis=freqs_cis, + attention_mask=attention_mask, + transformer_options=transformer_options, + ) + + hidden_states = hidden_states + img_gate1 * img_attn + encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn + + img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2) + hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2) + + txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2) + encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2) + + return encoder_hidden_states, hidden_states + + +class _AdaLayerNormContinuousNoAffine(nn.Module): + """AdaLayerNormContinuous(elementwise_affine=False). + + The reference uses ``scale, shift = chunk(2)`` (scale first) — opposite + to Flux's ``LastLayer``. 
+ """ + + def __init__(self, embedding_dim: int, conditioning_embedding_dim: int, eps: float = 1e-6, + dtype=None, device=None, operations=None) -> None: + super().__init__() + self.linear = operations.Linear( + conditioning_embedding_dim, embedding_dim * 2, bias=True, dtype=dtype, device=device + ) + self.eps = eps + self.embedding_dim = embedding_dim + + def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor: + emb = self.linear(F.silu(conditioning)) + scale, shift = torch.chunk(emb, 2, dim=-1) + x = F.layer_norm(x, (self.embedding_dim,), None, None, self.eps) + return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) + + +class LensTransformer2DModel(nn.Module): + """Lens dual-stream MMDiT (48 blocks, inner_dim=1536, multi-layer text).""" + + def __init__( + self, + patch_size: int = 2, + in_channels: int = 128, + out_channels: Optional[int] = 32, + num_layers: int = 48, + attention_head_dim: int = 64, + num_attention_heads: int = 24, + enc_hidden_dim: int = 2880, + axes_dims_rope: Tuple[int, int, int] = (8, 28, 28), + rms_norm: bool = True, + multi_layer_encoder_feature: bool = True, + selected_layer_index: Tuple[int, ...] = (5, 11, 17, 23), + image_model=None, # unused; accepted for detection-side configs. 
+ dtype=None, + device=None, + operations=None, + ) -> None: + super().__init__() + self.patch_size = patch_size + self.in_channels = in_channels + self.out_channels = out_channels if out_channels is not None else in_channels + self.inner_dim = num_attention_heads * attention_head_dim + self.multi_layer_encoder_feature = multi_layer_encoder_feature + self.selected_layer_index = list(selected_layer_index) + self.dtype = dtype + + self.pos_embed = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope)) + self.time_text_embed = LensTimestepProjEmbeddings( + embedding_dim=self.inner_dim, dtype=dtype, device=device, operations=operations + ) + + if self.multi_layer_encoder_feature: + self.txt_norm = nn.ModuleList( + [operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device) + for _ in self.selected_layer_index] + ) + self.txt_in = operations.Linear( + enc_hidden_dim * len(self.selected_layer_index), + self.inner_dim, bias=True, dtype=dtype, device=device, + ) + else: + self.txt_norm = operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device) + self.txt_in = operations.Linear(enc_hidden_dim, self.inner_dim, bias=True, dtype=dtype, device=device) + + self.img_in = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device) + + self.transformer_blocks = nn.ModuleList([ + LensTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + eps=1e-6, + rms_norm=rms_norm, + dtype=dtype, device=device, operations=operations, + ) + for _ in range(num_layers) + ]) + + self.norm_out = _AdaLayerNormContinuousNoAffine( + self.inner_dim, self.inner_dim, eps=1e-6, + dtype=dtype, device=device, operations=operations, + ) + self.proj_out = operations.Linear( + self.inner_dim, patch_size * patch_size * self.out_channels, bias=True, + dtype=dtype, device=device, + ) + + def forward(self, x: torch.Tensor, timestep: torch.Tensor, context: torch.Tensor, 
attention_mask: Optional[torch.Tensor] = None, + transformer_options: Optional[Dict[str, Any]] = None, **kwargs) -> torch.Tensor: + if transformer_options is None: + transformer_options = {} + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, self, + comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options), + ).execute(x, timestep, context, attention_mask, transformer_options, **kwargs) + + def _forward( + self, + x: torch.Tensor, + timestep: torch.Tensor, + context: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + transformer_options: Optional[Dict[str, Any]] = None, + control: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> torch.Tensor: + """ComfyUI bridge: ``(x[B,128,h,w], t[B], context[B,S,L*H], mask[B,S])``.""" + if transformer_options is None: + transformer_options = {} + transformer_options = transformer_options.copy() + patches = transformer_options.get("patches", {}) + patches_replace = transformer_options.get("patches_replace", {}) + blocks_replace = patches_replace.get("dit", {}) + + B, C, h, w = x.shape + hidden_states = x.permute(0, 2, 3, 1).reshape(B, h * w, C) + + if self.multi_layer_encoder_feature: + L = len(self.selected_layer_index) + enc_dim = context.shape[-1] // L + encoder_hidden_states = list( + context.reshape(B, -1, L, enc_dim).unbind(dim=2) + ) + text_seq_len = encoder_hidden_states[0].shape[1] + else: + encoder_hidden_states = context + text_seq_len = context.shape[1] + + if attention_mask is None: + attention_mask = torch.ones( + (B, text_seq_len), dtype=torch.bool, device=x.device + ) + + img_len = h * w + joint_mask = self._build_joint_attention_mask(attention_mask, img_len) + + hidden_states = self.img_in(hidden_states) + timestep = timestep.to(hidden_states.dtype) + + if self.multi_layer_encoder_feature: + normed = [self.txt_norm[i](encoder_hidden_states[i]) for i in range(L)] + encoder_hidden_states = torch.cat(normed, 
dim=-1) + else: + encoder_hidden_states = self.txt_norm(encoder_hidden_states) + encoder_hidden_states = self.txt_in(encoder_hidden_states) + + if "post_input" in patches: + for p in patches["post_input"]: + out = p({ + "img": hidden_states, + "txt": encoder_hidden_states, + "transformer_options": transformer_options, + }) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + + temb = self.time_text_embed(timestep, hidden_states) + ids = _lens_position_ids(1, h, w, text_seq_len, device=hidden_states.device).unsqueeze(0) + freqs_cis = self.pos_embed(ids) + + transformer_options["total_blocks"] = len(self.transformer_blocks) + transformer_options["block_type"] = "double" + for i, block in enumerate(self.transformer_blocks): + transformer_options["block_index"] = i + if ("double_block", i) in blocks_replace: + def block_wrap(args): + out = {} + out["txt"], out["img"] = block( + hidden_states=args["img"], + encoder_hidden_states=args["txt"], + temb=args["vec"], + freqs_cis=args["pe"], + attention_mask=args.get("attn_mask"), + transformer_options=args.get("transformer_options"), + ) + return out + out = blocks_replace[("double_block", i)]( + { + "img": hidden_states, + "txt": encoder_hidden_states, + "vec": temb, + "pe": freqs_cis, + "attn_mask": joint_mask, + "transformer_options": transformer_options, + }, + {"original_block": block_wrap}, + ) + encoder_hidden_states = out["txt"] + hidden_states = out["img"] + else: + encoder_hidden_states, hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=temb, + freqs_cis=freqs_cis, + attention_mask=joint_mask, + transformer_options=transformer_options, + ) + + if "double_block" in patches: + for p in patches["double_block"]: + out = p({ + "img": hidden_states, + "txt": encoder_hidden_states, + "x": x, + "block_index": i, + "transformer_options": transformer_options, + }) + hidden_states = out["img"] + encoder_hidden_states = out["txt"] + + if control is not 
None: + control_i = control.get("input") + if control_i is not None and i < len(control_i): + add = control_i[i] + if add is not None: + hidden_states[:, :add.shape[1]] += add + + hidden_states = self.norm_out(hidden_states, temb) + out = self.proj_out(hidden_states) + return out.reshape(B, h, w, C).permute(0, 3, 1, 2).contiguous() + + @staticmethod + def _build_joint_attention_mask(text_mask: torch.Tensor, img_len: int) -> torch.Tensor: + if text_mask.dtype != torch.bool: + text_mask = text_mask.bool() + bsz = text_mask.shape[0] + img_ones = torch.ones((bsz, img_len), dtype=torch.bool, device=text_mask.device) + joint = torch.cat([img_ones, text_mask], dim=1) + additive = torch.zeros_like(joint, dtype=torch.float32) + additive.masked_fill_(~joint, torch.finfo(torch.float32).min) + return additive[:, None, None, :] diff --git a/comfy/model_base.py b/comfy/model_base.py index d81f13c69..d4ab1499e 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -35,6 +35,7 @@ import comfy.ldm.hydit.models import comfy.ldm.audio.dit import comfy.ldm.audio.embedders import comfy.ldm.flux.model +import comfy.ldm.lens.model import comfy.ldm.lightricks.model import comfy.ldm.hunyuan_video.model import comfy.ldm.cosmos.model @@ -1058,6 +1059,27 @@ class Flux2(Flux): out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) return out + +class Lens(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLUX, device=None): + super().__init__( + model_config, model_type, device=device, + unet_model=comfy.ldm.lens.model.LensTransformer2DModel, + ) + + def encode_adm(self, **kwargs): + return None # Lens has no pooled/ADM conditioning. 
+ + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + cross_attn = kwargs.get("cross_attn", None) + if cross_attn is not None: + out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + out['attention_mask'] = comfy.conds.CONDRegular(attention_mask) + return out + class GenmoMochi(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.genmo.joint_model.asymm_models_joint.AsymmDiTJoint) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 70b4df8b3..2b0b98cd8 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -755,6 +755,30 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["timestep_scale"] = 1000.0 return dit_config + if '{}transformer_blocks.0.attn.norm_added_q.weight'.format(key_prefix) in state_dict_keys \ + and '{}transformer_blocks.0.img_mlp.w1.weight'.format(key_prefix) in state_dict_keys: # Lens + img_in_w = state_dict['{}img_in.weight'.format(key_prefix)] + proj_out_w = state_dict['{}proj_out.weight'.format(key_prefix)] + multi_layer = '{}txt_norm.0.weight'.format(key_prefix) in state_dict_keys + if multi_layer: + enc_hidden_dim = state_dict['{}txt_norm.0.weight'.format(key_prefix)].shape[0] + # Indices are TE-side; the DiT just consumes L layers in order. 
+ selected_layer_index = tuple(range(count_blocks(state_dict_keys, '{}txt_norm.'.format(key_prefix) + '{}.'))) + else: + enc_hidden_dim = state_dict['{}txt_norm.weight'.format(key_prefix)].shape[0] + selected_layer_index = (0,) + + return { + "image_model": "lens", + "in_channels": img_in_w.shape[1], + "out_channels": proj_out_w.shape[0] // 4, # patch_size ** 2 (=2² default) + "num_layers": count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.'), + "num_attention_heads": img_in_w.shape[0] // 64, # // attention_head_dim default + "enc_hidden_dim": enc_hidden_dim, + "multi_layer_encoder_feature": multi_layer, + "selected_layer_index": selected_layer_index, + } + if '{}txt_norm.weight'.format(key_prefix) in state_dict_keys: # Qwen Image dit_config = {} dit_config["image_model"] = "qwen_image" diff --git a/comfy/ops.py b/comfy/ops.py index 9bcd6c900..56445be8d 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -18,6 +18,7 @@ import torch import logging +import contextlib import comfy.model_management from comfy.cli_args import args, PerformanceFeature import comfy.float @@ -1047,6 +1048,144 @@ class QuantLinearFunc(torch.autograd.Function): return grad_input, grad_weight, grad_bias, None, None, None +# Quantized-weight module helpers + +def _quantized_apply(module, fn, recurse=True): + """Re-wrap Parameters after fn so .to()/.cuda() propagate through QuantizedTensor weights.""" + if recurse: + for child in module.children(): + child._apply(fn) + for key, param in module._parameters.items(): + if param is None: + continue + p = fn(param) + if (not torch.is_inference_mode_enabled()) and p.is_inference(): + p = p.clone() + module.register_parameter(key, torch.nn.Parameter(p, requires_grad=False)) + for key, buf in module._buffers.items(): + if buf is not None: + module._buffers[key] = fn(buf) + return module + + +def _load_quantized_module(module, super_load, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs, 
load_extra_params=False): + """Shared _load_from_state_dict body for quantized-weight modules. + + Pops weight (+ scales, +/- extras), populates module.weight as a Parameter + or Parameter-wrapped QuantizedTensor, then calls super_load and strips + consumed keys from missing_keys. Reads compute_dtype from factory_kwargs + and disabled formats from module._disabled_formats. + """ + device = module.factory_kwargs["device"] + compute_dtype = module.factory_kwargs["dtype"] + disabled_formats = module._disabled_formats + layer_name = prefix.rstrip('.') + + weight = state_dict.pop(f"{prefix}weight", None) + if weight is None: + logging.warning(f"Missing weight for layer {layer_name}") + module.weight = None + return + manually_loaded_keys = [f"{prefix}weight"] + + def pop_scale(name, dtype=None): + key = f"{prefix}{name}" + v = state_dict.pop(key, None) + if v is not None: + v = v.to(device=device) + if dtype is not None: + v = v.view(dtype=dtype) + manually_loaded_keys.append(key) + return v + + layer_conf = state_dict.pop(f"{prefix}comfy_quant", None) + if layer_conf is not None: + layer_conf = json.loads(layer_conf.numpy().tobytes()) + + if layer_conf is None: + module.weight = torch.nn.Parameter(weight.to(device=device, dtype=compute_dtype), requires_grad=False) + else: + module.quant_format = layer_conf.get("format", None) + module._full_precision_mm_config = layer_conf.get("full_precision_matrix_mult", False) + if not module._full_precision_mm: + module._full_precision_mm = module._full_precision_mm_config + if module.quant_format in disabled_formats: + module._full_precision_mm = True + if module.quant_format is None: + raise ValueError(f"Unknown quantization format for layer {layer_name}") + + qconfig = QUANT_ALGOS[module.quant_format] + module.layout_type = qconfig["comfy_tensor_layout"] + layout_cls = get_layout_class(module.layout_type) + + # Per-format scales; fp8 dtype views handle both legacy uint8-on-disk and native fp8. 
+ if module.quant_format in ("float8_e4m3fn", "float8_e5m2"): + scales = {"scale": pop_scale("weight_scale")} + elif module.quant_format == "mxfp8": + bs = pop_scale("weight_scale", torch.float8_e8m0fnu) + if bs is None: + raise ValueError(f"Missing MXFP8 block scales for layer {layer_name}") + scales = {"scale": bs} + elif module.quant_format == "nvfp4": + ts = pop_scale("weight_scale_2") + bs = pop_scale("weight_scale", torch.float8_e4m3fn) + if ts is None or bs is None: + raise ValueError(f"Missing NVFP4 scales for layer {layer_name}") + scales = {"scale": ts, "block_scale": bs} + else: + raise ValueError(f"Unsupported quantization format: {module.quant_format}") + + params = layout_cls.Params(**scales, orig_dtype=compute_dtype, orig_shape=module._orig_shape) + module.weight = torch.nn.Parameter( + QuantizedTensor(weight.to(device=device, dtype=qconfig["storage_t"]), module.layout_type, params), + requires_grad=False, + ) + + if load_extra_params: + for param_name in qconfig["parameters"]: + if param_name in {"weight_scale", "weight_scale_2"}: + continue + param_key = f"{prefix}{param_name}" + _v = state_dict.pop(param_key, None) + if _v is None: + continue + module.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) + manually_loaded_keys.append(param_key) + + super_load(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + for key in manually_loaded_keys: + if key in missing_keys: + missing_keys.remove(key) + + +def _quantized_weight_state_dict(module, sd, prefix, extra_quant_conf=None, extra_quant_params=()): + """Shared state_dict body. 
extra_quant_conf merges into the comfy_quant JSON; + extra_quant_params names attributes written as additional top-level keys.""" + if not hasattr(module, 'weight'): + logging.warning(f"Warning: state dict on uninitialized op {prefix}") + return sd + bias = getattr(module, 'bias', None) + if bias is not None: + sd[f"{prefix}bias"] = bias + if module.weight is None: + return sd + if isinstance(module.weight, QuantizedTensor): + sd.update(module.weight.state_dict(f"{prefix}weight")) + quant_conf = {"format": module.quant_format} + if getattr(module, '_full_precision_mm_config', False): + quant_conf["full_precision_matrix_mult"] = True + if extra_quant_conf: + quant_conf.update(extra_quant_conf) + sd[f"{prefix}comfy_quant"] = torch.tensor(list(json.dumps(quant_conf).encode("utf-8")), dtype=torch.uint8) + for name in extra_quant_params: + value = getattr(module, name, None) + if value is not None: + sd[f"{prefix}{name}"] = value + else: + sd[f"{prefix}weight"] = module.weight + return sd + def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False, disabled=[]): class MixedPrecisionOps(manual_cast): @@ -1056,21 +1195,16 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec _disabled = disabled class Linear(torch.nn.Module, CastWeightBiasOp): - def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - ) -> None: + _disabled_formats = disabled + + def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None): super().__init__() self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} - # self.factory_kwargs = {"device": device, "dtype": dtype} self.in_features = in_features self.out_features = out_features + self._orig_shape = (out_features, in_features) if bias: self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs)) else: @@ -1083,151 +1217,12 @@ def 
mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec def reset_parameters(self): return None - def _load_scale_param(self, state_dict, prefix, param_name, device, manually_loaded_keys, dtype=None): - key = f"{prefix}{param_name}" - value = state_dict.pop(key, None) - if value is not None: - value = value.to(device=device) - if dtype is not None: - value = value.view(dtype=dtype) - manually_loaded_keys.append(key) - return value - - def _load_from_state_dict(self, state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, error_msgs): - - device = self.factory_kwargs["device"] - layer_name = prefix.rstrip('.') - weight_key = f"{prefix}weight" - weight = state_dict.pop(weight_key, None) - if weight is None: - logging.warning(f"Missing weight for layer {layer_name}") - self.weight = None - return - - manually_loaded_keys = [weight_key] - - layer_conf = state_dict.pop(f"{prefix}comfy_quant", None) - if layer_conf is not None: - layer_conf = json.loads(layer_conf.numpy().tobytes()) - - if layer_conf is None: - self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False) - else: - self.quant_format = layer_conf.get("format", None) - self._full_precision_mm_config = layer_conf.get("full_precision_matrix_mult", False) - if not self._full_precision_mm: - self._full_precision_mm = self._full_precision_mm_config - - if self.quant_format in MixedPrecisionOps._disabled: - self._full_precision_mm = True - - if self.quant_format is None: - raise ValueError(f"Unknown quantization format for layer {layer_name}") - - qconfig = QUANT_ALGOS[self.quant_format] - self.layout_type = qconfig["comfy_tensor_layout"] - layout_cls = get_layout_class(self.layout_type) - - # Load format-specific parameters - if self.quant_format in ["float8_e4m3fn", "float8_e5m2"]: - # FP8: single tensor scale - scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys) - - 
params = layout_cls.Params( - scale=scale, - orig_dtype=MixedPrecisionOps._compute_dtype, - orig_shape=(self.out_features, self.in_features), - ) - - elif self.quant_format == "mxfp8": - # MXFP8: E8M0 block scales stored as uint8 in safetensors - block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys, - dtype=torch.uint8) - - if block_scale is None: - raise ValueError(f"Missing MXFP8 block scales for layer {layer_name}") - - block_scale = block_scale.view(torch.float8_e8m0fnu) - - params = layout_cls.Params( - scale=block_scale, - orig_dtype=MixedPrecisionOps._compute_dtype, - orig_shape=(self.out_features, self.in_features), - ) - - elif self.quant_format == "nvfp4": - # NVFP4: tensor_scale (weight_scale_2) + block_scale (weight_scale) - tensor_scale = self._load_scale_param(state_dict, prefix, "weight_scale_2", device, manually_loaded_keys) - block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys, - dtype=torch.float8_e4m3fn) - - if tensor_scale is None or block_scale is None: - raise ValueError(f"Missing NVFP4 scales for layer {layer_name}") - - params = layout_cls.Params( - scale=tensor_scale, - block_scale=block_scale, - orig_dtype=MixedPrecisionOps._compute_dtype, - orig_shape=(self.out_features, self.in_features), - ) - else: - raise ValueError(f"Unsupported quantization format: {self.quant_format}") - - self.weight = torch.nn.Parameter( - QuantizedTensor(weight.to(device=device, dtype=qconfig["storage_t"]), self.layout_type, params), - requires_grad=False - ) - - for param_name in qconfig["parameters"]: - if param_name in {"weight_scale", "weight_scale_2"}: - continue # Already handled above - - param_key = f"{prefix}{param_name}" - _v = state_dict.pop(param_key, None) - if _v is None: - continue - self.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False)) - manually_loaded_keys.append(param_key) - - 
super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) - - for key in manually_loaded_keys: - if key in missing_keys: - missing_keys.remove(key) + def _load_from_state_dict(self, *args): + _load_quantized_module(self, super()._load_from_state_dict, *args, load_extra_params=True) def state_dict(self, *args, destination=None, prefix="", **kwargs): - if destination is not None: - sd = destination - else: - sd = {} - - if not hasattr(self, 'weight'): - logging.warning("Warning: state dict on uninitialized op {}".format(prefix)) - return sd - - if self.bias is not None: - sd["{}bias".format(prefix)] = self.bias - - if self.weight is None: - return sd - - if isinstance(self.weight, QuantizedTensor): - sd_out = self.weight.state_dict("{}weight".format(prefix)) - for k in sd_out: - sd[k] = sd_out[k] - - quant_conf = {"format": self.quant_format} - if self._full_precision_mm_config: - quant_conf["full_precision_matrix_mult"] = True - sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8) - - input_scale = getattr(self, 'input_scale', None) - if input_scale is not None: - sd["{}input_scale".format(prefix)] = input_scale - else: - sd["{}weight".format(prefix)] = self.weight - return sd + sd = destination if destination is not None else {} + return _quantized_weight_state_dict(self, sd, prefix, extra_quant_params=("input_scale",)) def _forward(self, input, weight, bias): return torch.nn.functional.linear(input, weight, bias) @@ -1317,25 +1312,126 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec self.weight = torch.nn.Parameter(weight, requires_grad=False) def _apply(self, fn, recurse=True): # This is to get torch.compile + moving weights to another device working - if recurse: - for module in self.children(): - module._apply(fn) + return _quantized_apply(self, fn, recurse) - for key, param in self._parameters.items(): - if 
param is None: - continue - p = fn(param) - if (not torch.is_inference_mode_enabled()) and p.is_inference(): - p = p.clone() - self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False)) - for key, buf in self._buffers.items(): - if buf is not None: - self._buffers[key] = fn(buf) - return self + class MoEExperts(torch.nn.Module, CastWeightBiasOp): + """Container for E quantized expert weights, indexed via expert_weight(i). + + The bank lives on self.weight as a single 3D tensor — either a + compute_dtype Parameter or a Parameter wrapping a QuantizedTensor + with leading expert dim. + + State-dict layout matches mixed_precision_ops.Linear with a leading + expert dim: + {prefix}.weight quant data (storage_t), leading dim = E + {prefix}.weight_scale block / per-tensor scale + {prefix}.weight_scale_2 [E] or scalar NVFP4 only + {prefix}.bias [E, out_features] optional, compute_dtype + {prefix}.comfy_quant json -> {{"format": "...", "num_experts": E}} + + Without comfy_quant the weight loads as a plain compute_dtype 3D Parameter [E, out, in]. 
+ """ + + _disabled_formats = disabled + + def __init__(self, num_experts: int, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None): + super().__init__() + self.num_experts = num_experts + self.in_features = in_features + self.out_features = out_features + self._orig_shape = (num_experts, out_features, in_features) + self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype} + if bias: + self.bias = torch.nn.Parameter(torch.empty(num_experts, out_features, **self.factory_kwargs)) + else: + self.register_parameter("bias", None) + + # Populated by _load_from_state_dict: + self.weight = None + self.quant_format = None + self.layout_type = None + self._full_precision_mm = MixedPrecisionOps._full_precision_mm + self._full_precision_mm_config = False + self._resident_bank = None + + def reset_parameters(self): + return None + + def _apply(self, fn, recurse=True): + return _quantized_apply(self, fn, recurse) + + def _load_from_state_dict(self, *args): + _load_quantized_module(self, super()._load_from_state_dict, *args, load_extra_params=False) + + def expert_weight(self, i: int): + """Expert i's weight (Tensor or per-expert QuantizedTensor view).""" + if isinstance(self.weight, QuantizedTensor): + return self._expert_qt_from(self.weight, i) + return self.weight[i] + + @contextlib.contextmanager + def bank_resident(self, input): + """Cast the whole bank once; expert_linear inside reuses the cast. + Not re-entrant — do not nest calls on the same instance. 
+ """ + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + self._resident_bank = (weight, bias) + try: + yield self + finally: + self._resident_bank = None + uncast_bias_weight(self, weight, bias, offload_stream) + + def expert_linear(self, input: torch.Tensor, i: int) -> torch.Tensor: + """Linear against expert i's weight (with optional bias).""" + resident = getattr(self, "_resident_bank", None) + if resident is not None: + weight, bias = resident + return self._expert_linear_impl(input, weight, bias, i) + weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True) + try: + return self._expert_linear_impl(input, weight, bias, i) + finally: + uncast_bias_weight(self, weight, bias, offload_stream) + + def _expert_linear_impl(self, input, weight, bias, i): + if isinstance(weight, QuantizedTensor): + qw = self._expert_qt_from(weight, i) + else: + qw = weight[i] + b = cast_to_input(bias[i], input, copy=False) if bias is not None else None + + if isinstance(qw, QuantizedTensor): + use_fast = ( + not self._full_precision_mm + and qw.layout_cls.supports_fast_matmul() + and input.dim() == 2 + ) + if use_fast: + qin = QuantizedTensor.from_float(input, self.layout_type) + return torch.nn.functional.linear(qin, qw, b) + out = input @ qw.dequantize().t() + return out + b if b is not None else out + return torch.nn.functional.linear(input, qw, b) + + def _expert_qt_from(self, weight: QuantizedTensor, i: int) -> QuantizedTensor: + """Build a per-expert QuantizedTensor by indexing into a resident bank.""" + params = weight._params + kwargs = { + "scale": params.scale[i] if params.scale.dim() else params.scale, + "orig_dtype": params.orig_dtype, + "orig_shape": (self.out_features, self.in_features), + } + if hasattr(params, "block_scale"): # NVFP4 + kwargs["block_scale"] = params.block_scale[i] + return QuantizedTensor(weight._qdata[i], weight._layout_cls, type(params)(**kwargs)) + + def state_dict(self, *args, destination=None, 
prefix="", **kwargs): + sd = destination if destination is not None else {} + return _quantized_weight_state_dict(self, sd, prefix, extra_quant_conf={"num_experts": self.num_experts}) class Embedding(manual_cast.Embedding): - def _load_from_state_dict(self, state_dict, prefix, local_metadata, - strict, missing_keys, unexpected_keys, error_msgs): + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): weight_key = f"{prefix}weight" layer_conf = state_dict.pop(f"{prefix}comfy_quant", None) if layer_conf is not None: @@ -1343,14 +1439,16 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec # Only fp8 makes sense for embeddings (per-row dequant via index select). # Block-scaled formats (NVFP4, MXFP8) can't do per-row lookup efficiently. - quant_format = layer_conf.get("format", None) if layer_conf is not None else None - if quant_format in ["float8_e4m3fn", "float8_e5m2"] and weight_key in state_dict: + quant_format = layer_conf.get("format") if layer_conf is not None else None + manually_loaded_keys = [] + + if quant_format in ("float8_e4m3fn", "float8_e5m2") and weight_key in state_dict: self.quant_format = quant_format qconfig = QUANT_ALGOS[quant_format] self.layout_type = qconfig["comfy_tensor_layout"] layout_cls = get_layout_class(self.layout_type) weight = state_dict.pop(weight_key) - manually_loaded_keys = [weight_key] + manually_loaded_keys.append(weight_key) scale_key = f"{prefix}weight_scale" scale = state_dict.pop(scale_key, None) @@ -1366,35 +1464,19 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec self.weight = torch.nn.Parameter( QuantizedTensor(weight.to(dtype=qconfig["storage_t"]), qconfig["comfy_tensor_layout"], params), requires_grad=False) + elif layer_conf is not None: + # Unsupported format — restore the marker so it round-trips; fall through to default load. 
+ state_dict[f"{prefix}comfy_quant"] = torch.tensor( + list(json.dumps(layer_conf).encode('utf-8')), dtype=torch.uint8) - super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) - for k in manually_loaded_keys: - if k in missing_keys: - missing_keys.remove(k) - else: - if layer_conf is not None: - state_dict[f"{prefix}comfy_quant"] = torch.tensor(list(json.dumps(layer_conf).encode('utf-8')), dtype=torch.uint8) - super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) + for k in manually_loaded_keys: + if k in missing_keys: + missing_keys.remove(k) def state_dict(self, *args, destination=None, prefix="", **kwargs): - if destination is not None: - sd = destination - else: - sd = {} - - if not hasattr(self, 'weight') or self.weight is None: - return sd - - if isinstance(self.weight, QuantizedTensor): - sd_out = self.weight.state_dict("{}weight".format(prefix)) - for k in sd_out: - sd[k] = sd_out[k] - - quant_conf = {"format": self.quant_format} - sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8) - else: - sd["{}weight".format(prefix)] = self.weight - return sd + sd = destination if destination is not None else {} + return _quantized_weight_state_dict(self, sd, prefix) def forward_comfy_cast_weights(self, input, out_dtype=None): weight = self.weight diff --git a/comfy/sd.py b/comfy/sd.py index a4e49763a..beb782310 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -68,6 +68,7 @@ import comfy.text_encoders.ernie import comfy.text_encoders.gemma4 import comfy.text_encoders.cogvideo import comfy.text_encoders.sa3 +import comfy.text_encoders.gpt_oss import comfy.model_patcher import comfy.lora @@ -1283,6 +1284,7 @@ class CLIPType(Enum): FLUX2 = 25 LONGCAT_IMAGE = 26 COGVIDEOX = 
27 + LENS = 28 @@ -1335,6 +1337,7 @@ class TEModel(Enum): GEMMA_4_E2B = 30 GEMMA_4_31B = 31 T5_GEMMA = 32 + GPT_OSS_20B = 33 def detect_te_model(sd): @@ -1376,6 +1379,9 @@ def detect_te_model(sd): else: return TEModel.GEMMA_3_4B return TEModel.GEMMA_2_2B + # Must precede the Qwen2.5-7B k_proj.bias=512 check (GPT-OSS also has 8*64=512). + if "layers.0.self_attn.sinks" in sd and "layers.0.mlp.experts.gate_up_proj.weight" in sd: + return TEModel.GPT_OSS_20B if 'model.layers.0.self_attn.k_proj.bias' in sd: weight = sd['model.layers.0.self_attn.k_proj.bias'] if weight.shape[0] == 256: @@ -1558,6 +1564,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.clip = comfy.text_encoders.flux.flux2_te(**llama_detect(clip_data), pruned=te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2) clip_target.tokenizer = comfy.text_encoders.flux.Flux2Tokenizer tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None) + elif te_model == TEModel.GPT_OSS_20B: + clip_target.clip = comfy.text_encoders.gpt_oss.lens_te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.gpt_oss.LensTokenizer + tokenizer_data["tokenizer_json"] = clip_data[0].get("tokenizer_json", None) elif te_model == TEModel.QWEN3_4B: if clip_type == CLIPType.FLUX or clip_type == CLIPType.FLUX2: clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_4b") diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 617db4f28..e451892e9 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -829,6 +829,48 @@ class Flux2(Flux): return None + +class Lens(supported_models_base.BASE): + """Microsoft Lens (3.8B dual-stream MMDiT, GPT-OSS-20B text features, Flux2 VAE).""" + + unet_config = { + "image_model": "lens", + } + + sampling_settings = { + "shift": 1.829, # Default mu for 1440x1440 (and any seq_len > 4300) + } + + unet_extra_config = {} + latent_format = latent_formats.Flux2 + +
supported_inference_dtypes = [torch.bfloat16, torch.float32] # fp16 causes NaNs + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def __init__(self, unet_config): + super().__init__(unet_config) + + def get_model(self, state_dict, prefix="", device=None): + return model_base.Lens(self, model_type=model_base.ModelType.FLUX, device=device) + + def clip_target(self, state_dict={}): + pref = self.text_encoder_key_prefix[0] + for hint in ("gpt_oss.transformer.", ""): + full_prefix = "{}{}".format(pref, hint) + if "{}layers.0.self_attn.sinks".format(full_prefix) in state_dict: + detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, full_prefix) + return supported_models_base.ClipTarget( + comfy.text_encoders.gpt_oss.LensTokenizer, + comfy.text_encoders.gpt_oss.lens_te(**detect), + ) + return supported_models_base.ClipTarget( + comfy.text_encoders.gpt_oss.LensTokenizer, + comfy.text_encoders.gpt_oss.lens_te(), + ) + + class GenmoMochi(supported_models_base.BASE): unet_config = { "image_model": "mochi_preview", @@ -2096,6 +2138,7 @@ models = [ Omnigen2, QwenImage, Flux2, + Lens, Kandinsky5Image, Kandinsky5, Anima, diff --git a/comfy/text_encoders/gpt_oss.py b/comfy/text_encoders/gpt_oss.py new file mode 100644 index 000000000..d596ef9a0 --- /dev/null +++ b/comfy/text_encoders/gpt_oss.py @@ -0,0 +1,600 @@ +"""GPT-OSS text encoder for Lens.""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from typing import Any, List, Optional, Sequence + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ops +from comfy import sd1_clip +from comfy.ldm.modules.attention import TORCH_HAS_GQA, optimized_attention_for_device +from comfy.text_encoders.llama import RMSNorm, apply_rope + + +@dataclass +class GptOss20BConfig: + vocab_size: int = 201088 + hidden_size: int = 2880 + intermediate_size: int = 2880 + num_hidden_layers: int = 24 + num_attention_heads: int = 64 + 
num_key_value_heads: int = 8 + head_dim: int = 64 + num_local_experts: int = 32 + num_experts_per_tok: int = 4 + sliding_window: int = 128 + original_max_position_embeddings: int = 4096 + rope_theta: float = 150000.0 + rope_factor: float = 32.0 + rope_beta_fast: float = 32.0 + rope_beta_slow: float = 1.0 + rope_truncate: bool = False + rms_norm_eps: float = 1e-5 + attention_bias: bool = True + layer_types: Optional[List[str]] = None + moe_alpha: float = 1.702 + moe_limit: float = 7.0 + + def __post_init__(self): + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" if (i + 1) % 2 else "full_attention" + for i in range(self.num_hidden_layers) + ] + + +def _yarn_inv_freq(head_dim: int, base: float, factor: float, beta_fast: float, beta_slow: float, + original_max_position_embeddings: int, truncate: bool, device=None) -> tuple[torch.Tensor, float]: + """YARN inv_freq + attention scaling (matches transformers).""" + dim = head_dim + + def find_correction_dim(num_rotations: float) -> float: + return (dim * math.log(original_max_position_embeddings / (num_rotations * 2 * math.pi))) / ( + 2 * math.log(base) + ) + + def find_correction_range() -> tuple[float, float]: + low = find_correction_dim(beta_fast) + high = find_correction_dim(beta_slow) + if truncate: + low = math.floor(low) + high = math.ceil(high) + return max(low, 0), min(high, dim - 1) + + def linear_ramp_factor(min_: float, max_: float, n: int) -> torch.Tensor: + if min_ == max_: + max_ += 0.001 + linear = (torch.arange(n, dtype=torch.float32, device=device) - min_) / (max_ - min_) + return torch.clamp(linear, 0, 1) + + def get_mscale(scale: float) -> float: + if scale <= 1: + return 1.0 + return 0.1 * math.log(scale) + 1.0 + + attention_scaling = get_mscale(factor) + + pos_freqs = base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (factor * pos_freqs) + + low, high = 
find_correction_range() + extrap_factor = 1 - linear_ramp_factor(low, high, dim // 2) + inv_freq = inv_freq_interpolation * (1 - extrap_factor) + inv_freq_extrapolation * extrap_factor + return inv_freq, attention_scaling + + +def _build_freqs_cis(inv_freq: torch.Tensor, attention_scaling: float, position_ids: torch.Tensor, dtype: torch.dtype, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + inv_freq_e = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + pos_e = position_ids[:, None, :].float() + freqs = (inv_freq_e @ pos_e).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = (emb.cos() * attention_scaling).to(dtype).unsqueeze(1) + sin = (emb.sin() * attention_scaling).to(dtype).unsqueeze(1) + sin_split = sin.shape[-1] // 2 + return cos, sin[..., :sin_split], -sin[..., sin_split:] + + +def _attention_with_sinks(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, sinks: torch.Tensor, + attention_mask: Optional[torch.Tensor], num_heads: int, num_kv_groups: int) -> torch.Tensor: + """Attention with per-head sinks. + + Sinks add a learned term to each row's softmax denominator but contribute + nothing to the output. We fake this by appending one zero k/v position and + putting the sink logit in the mask at that column. 
+ """ + + if num_kv_groups > 1 and not TORCH_HAS_GQA: + k = k.repeat_interleave(num_kv_groups, dim=1) + v = v.repeat_interleave(num_kv_groups, dim=1) + + B, _, S_q, D = q.shape + H_kv = k.shape[1] + S_kv = k.shape[-2] + + k = torch.cat([k, k.new_zeros(B, H_kv, 1, D)], dim=-2) + v = torch.cat([v, v.new_zeros(B, H_kv, 1, D)], dim=-2) + + sinks_col = sinks.to(q.dtype).view(1, num_heads, 1, 1).expand(B, num_heads, S_q, 1) + if attention_mask is not None: + mask_left = attention_mask[..., :S_kv].expand(B, num_heads, S_q, S_kv) + else: + mask_left = q.new_zeros(B, num_heads, S_q, S_kv) + mask = torch.cat([mask_left, sinks_col], dim=-1) + + op = optimized_attention_for_device(q.device, mask=True, small_input=True) + return op(q, k, v, num_heads, mask=mask, skip_reshape=True, enable_gqa=True) + + +class GptOssAttention(nn.Module): + def __init__(self, config: GptOss20BConfig, layer_idx: int, device=None, dtype=None, ops: Any = None): + super().__init__() + self.layer_idx = layer_idx + self.layer_type = config.layer_types[layer_idx] + self.num_heads = config.num_attention_heads + self.num_kv_heads = config.num_key_value_heads + self.num_kv_groups = self.num_heads // self.num_kv_heads + self.head_dim = config.head_dim + self.hidden_size = config.hidden_size + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None + + bias = config.attention_bias + self.q_proj = ops.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=bias, device=device, dtype=dtype) + self.k_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=bias, device=device, dtype=dtype) + self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=bias, device=device, dtype=dtype) + self.o_proj = ops.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=bias, device=device, dtype=dtype) + self.sinks = nn.Parameter(torch.empty(self.num_heads, device=device, dtype=dtype)) + + def forward(self, hidden_states: 
torch.Tensor, attention_mask: Optional[torch.Tensor], freqs_cis) -> torch.Tensor: + B, S, _ = hidden_states.shape + + q = self.q_proj(hidden_states).view(B, S, self.num_heads, self.head_dim).transpose(1, 2) + k = self.k_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2) + v = self.v_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2) + + q, k = apply_rope(q, k, freqs_cis) + + out = _attention_with_sinks(q, k, v, self.sinks, attention_mask, self.num_heads, self.num_kv_groups) + return self.o_proj(out) + + +# Mixture of Experts + +class GptOssTopKRouter(nn.Module): + def __init__(self, config: GptOss20BConfig, device=None, dtype=None): + super().__init__() + self.top_k = config.num_experts_per_tok + self.num_experts = config.num_local_experts + self.weight = nn.Parameter(torch.empty(config.num_local_experts, config.hidden_size, device=device, dtype=dtype)) + self.bias = nn.Parameter(torch.empty(config.num_local_experts, device=device, dtype=dtype)) + + def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + weight = comfy.ops.cast_to_input(self.weight, hidden_states, copy=False) + bias = comfy.ops.cast_to_input(self.bias, hidden_states, copy=False) + logits = F.linear(hidden_states, weight, bias) + top_vals, top_idx = torch.topk(logits, self.top_k, dim=-1) + # Softmax over top-k slice only + scores = F.softmax(top_vals, dim=-1, dtype=top_vals.dtype) + return scores, top_idx + + +class GptOssExperts(nn.Module): + def __init__(self, config: GptOss20BConfig, device=None, dtype=None, ops: Any = None): + super().__init__() + self.num_experts = config.num_local_experts + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.alpha = config.moe_alpha + self.limit = config.moe_limit + + E = self.num_experts + H = self.hidden_size + I = self.intermediate_size + + self.gate_up_proj = ops.MoEExperts(num_experts=E, in_features=H, out_features=2 * I, 
bias=True, device=device, dtype=dtype) + self.down_proj = ops.MoEExperts(num_experts=E, in_features=I, out_features=H, bias=True, device=device, dtype=dtype) + + def _apply_gate(self, gate_up: torch.Tensor) -> torch.Tensor: + gate = gate_up[..., ::2] + up = gate_up[..., 1::2] + gate = gate.clamp(max=self.limit) + up = up.clamp(min=-self.limit, max=self.limit) + glu = gate * torch.sigmoid(gate * self.alpha) + return torch.addcmul(glu, up, glu) + + def forward(self, hidden_states: torch.Tensor, router_indices: torch.Tensor, routing_weights: torch.Tensor) -> torch.Tensor: + N = hidden_states.shape[0] + top_k = router_indices.shape[-1] + H = hidden_states.shape[-1] + + per_pair = torch.zeros((N * top_k, H), dtype=hidden_states.dtype, device=hidden_states.device) + + expert_mask = F.one_hot(router_indices, num_classes=self.num_experts).permute(2, 1, 0) + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + + with self.gate_up_proj.bank_resident(hidden_states) as gate_up_bank, \ + self.down_proj.bank_resident(hidden_states) as down_bank: + for ei in expert_hit: + expert_idx = int(ei.item()) + top_k_pos, token_idx = torch.where(expert_mask[expert_idx]) + current = hidden_states[token_idx] + + gate_up = gate_up_bank.expert_linear(current, expert_idx) + gated = self._apply_gate(gate_up) + expert_out = down_bank.expert_linear(gated, expert_idx) + + weighted = expert_out * routing_weights[token_idx, top_k_pos, None] + + flat_idx = token_idx * top_k + top_k_pos + per_pair[flat_idx] = weighted.to(per_pair.dtype) + + return per_pair.view(N, top_k, H).sum(dim=1) + + +class GptOssMLP(nn.Module): + def __init__(self, config: GptOss20BConfig, device=None, dtype=None, ops: Any = None): + super().__init__() + self.router = GptOssTopKRouter(config, device=device, dtype=dtype) + self.experts = GptOssExperts(config, device=device, dtype=dtype, ops=ops) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + B, S, H = hidden_states.shape + flat = 
hidden_states.reshape(-1, H) + scores, idx = self.router(flat) + out = self.experts(flat, idx, scores) + return out.reshape(B, S, H) + + +# Decoder layer + model + +class GptOssDecoderLayer(nn.Module): + def __init__(self, config: GptOss20BConfig, layer_idx: int, device=None, dtype=None, ops: Any = None): + super().__init__() + self.self_attn = GptOssAttention(config, layer_idx, device=device, dtype=dtype, ops=ops) + self.mlp = GptOssMLP(config, device=device, dtype=dtype, ops=ops) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype) + self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype) + self.layer_type = config.layer_types[layer_idx] + + def forward(self, x: torch.Tensor, attention_masks: dict[str, Optional[torch.Tensor]], freqs_cis) -> torch.Tensor: + residual = x + x = self.input_layernorm(x) + x = self.self_attn(x, attention_masks[self.layer_type], freqs_cis) + x = residual + x + + residual = x + x = self.post_attention_layernorm(x) + x = self.mlp(x) + x = residual + x + return x + + +def _make_full_causal_mask(B: int, S: int, key_padding_mask: Optional[torch.Tensor], dtype, device): + neg = torch.finfo(dtype).min + mask = torch.full((S, S), neg, dtype=dtype, device=device).triu_(1) + mask = mask.unsqueeze(0).unsqueeze(0).expand(B, 1, S, S).contiguous() + if key_padding_mask is not None: + kp = key_padding_mask.to(dtype=dtype) + kp = (1.0 - kp).reshape(B, 1, 1, S) * neg + mask = mask + kp + return mask + + +def _make_sliding_causal_mask(B: int, S: int, window: int, key_padding_mask: Optional[torch.Tensor], dtype, device): + neg = torch.finfo(dtype).min + i = torch.arange(S, device=device).view(-1, 1) + j = torch.arange(S, device=device).view(1, -1) + keep = (j <= i) & (j > i - window) + mask = torch.where(keep, torch.zeros((), dtype=dtype, device=device), torch.full((), neg, dtype=dtype, device=device)) + mask = 
mask.unsqueeze(0).unsqueeze(0).expand(B, 1, S, S).contiguous() + if key_padding_mask is not None: + kp = key_padding_mask.to(dtype=dtype) + kp = (1.0 - kp).reshape(B, 1, 1, S) * neg + mask = mask + kp + return mask + + +class GptOssModel(nn.Module): + """GPT-OSS decoder with multi-layer hidden-state capture + early exit.""" + + def __init__(self, config: GptOss20BConfig, device=None, dtype=None, ops: Any = None): + super().__init__() + self.config = config + self.dtype = dtype + self.embed_tokens = ops.Embedding(config.vocab_size, config.hidden_size, device=device, dtype=dtype) + self.layers = nn.ModuleList( + [ + GptOssDecoderLayer(config, i, device=device, dtype=dtype, ops=ops) + for i in range(config.num_hidden_layers) + ] + ) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype) + + # Always build on CPU so the buffer survives meta-device construction. + inv_freq, attn_scaling = _yarn_inv_freq( + head_dim=config.head_dim, + base=config.rope_theta, + factor=config.rope_factor, + beta_fast=config.rope_beta_fast, + beta_slow=config.rope_beta_slow, + original_max_position_embeddings=config.original_max_position_embeddings, + truncate=config.rope_truncate, + device=torch.device("cpu"), + ) + self.register_buffer("rope_inv_freq", inv_freq, persistent=False) + self.rope_attention_scaling = float(attn_scaling) + + @property + def num_layers(self) -> int: + return self.config.num_hidden_layers + + def get_input_embeddings(self): + return self.embed_tokens + + def _build_attention_masks(self, B: int, S: int, attention_mask: Optional[torch.Tensor], dtype: torch.dtype, device, + ) -> dict[str, torch.Tensor]: + full = _make_full_causal_mask(B, S, attention_mask, dtype, device) + masks = {"full_attention": full} + if any(t == "sliding_attention" for t in self.config.layer_types): + masks["sliding_attention"] = _make_sliding_causal_mask( + B, S, self.config.sliding_window, attention_mask, dtype, device + ) + return masks + + def 
forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None, + capture_layers: Optional[Sequence[int]] = None) -> dict[str, Any]: + B, S = input_ids.shape + device = input_ids.device + dtype = self.dtype + + hidden_states = self.embed_tokens(input_ids, out_dtype=dtype) + + position_ids = torch.arange(S, device=device).unsqueeze(0).expand(B, -1) + freqs_cis = _build_freqs_cis(self.rope_inv_freq.to(device=device), self.rope_attention_scaling, position_ids, dtype) + + attn_masks = self._build_attention_masks(B, S, attention_mask, dtype, device) + + capture_layers = list(capture_layers) if capture_layers else None + if capture_layers: + max_layer = max(capture_layers) + wanted = {idx: pos for pos, idx in enumerate(capture_layers)} + captured: List[Optional[torch.Tensor]] = [None] * len(capture_layers) + else: + max_layer = self.config.num_hidden_layers - 1 + wanted = None + captured = None + + for i, layer in enumerate(self.layers): + hidden_states = layer(hidden_states, attn_masks, freqs_cis) + if wanted is not None and i in wanted: + captured[wanted[i]] = hidden_states + if i >= max_layer: + break + + if captured is not None: + return {"hidden_states": captured} + return {"last_hidden_state": self.norm(hidden_states)} + + +# Lens chat-template constants (verbatim from the reference pipeline). +_LENS_CHAT_SYSTEM = ( + "Describe the image by detailing the color, shape, size, texture, " + "quantity, text, spatial relationships of the objects and background." +) +_LENS_CHAT_ASSISTANT_THINKING = "Need to generate one image according to the description." 
+LENS_TXT_OFFSET = 97 +LENS_SELECTED_LAYERS = (5, 11, 17, 23) +LENS_MAX_TOKENS = 512 + + +# The reference GPT-OSS Harmony template injects today's date here +_LENS_CHAT_DATE = "2026-05-23" + + +def _lens_render_chat(prompt: str) -> str: + """Render the Lens prompt in GPT-OSS Harmony format.""" + return ( + f"<|start|>system<|message|>" + f"You are ChatGPT, a large language model trained by OpenAI.\n" + f"Knowledge cutoff: 2024-06\n" + f"Current date: {_LENS_CHAT_DATE}\n\n" + f"Reasoning: medium\n\n" + f"# Valid channels: analysis, commentary, final. " + f"Channel must be included for every message.<|end|>" + f"<|start|>developer<|message|># Instructions\n\n" + f"{_LENS_CHAT_SYSTEM}\n\n<|end|>" + f"<|start|>user<|message|>{prompt}<|end|>" + f"<|start|>assistant<|channel|>analysis<|message|>" + f"{_LENS_CHAT_ASSISTANT_THINKING}<|end|>" + f"<|start|>assistant<|channel|>final<|message|>" + ) + + +# GPT-OSS-20B fixed token IDs (from the tokenizer's added-tokens table). +_LENS_PAD_TOKEN_ID = 199999 # <|endoftext|> + + +class _GptOssRawTokenizer: + """Raw ``tokenizers.Tokenizer`` wrapper. + + The tokenizer JSON ships as a byte tensor inside the encoder checkpoint + (``tokenizer_json`` key) rather than as a committed file. ``sd.py`` + extracts it and passes it here via ``tokenizer_data``. + """ + + def __init__(self, tokenizer_json_bytes=None, **kwargs): + from tokenizers import Tokenizer + if isinstance(tokenizer_json_bytes, torch.Tensor): + tokenizer_json_bytes = bytes(tokenizer_json_bytes.tolist()) + if tokenizer_json_bytes is None: + raise ValueError( + "Lens tokenizer requires the ``tokenizer_json`` byte tensor in the " + "encoder state dict. Re-bundle the encoder via bundle_te.py so it " + "embeds the tokenizer."
+ ) + self.tokenizer = Tokenizer.from_str(tokenizer_json_bytes.decode("utf-8")) + + @classmethod + def from_pretrained(cls, tokenizer_data, **kwargs): + return cls(tokenizer_json_bytes=tokenizer_data, **kwargs) + + def __call__(self, text): + return {"input_ids": self.tokenizer.encode(text, add_special_tokens=False).ids} + + def get_vocab(self): + return self.tokenizer.get_vocab() + + def convert_tokens_to_ids(self, tokens): + return [self.tokenizer.token_to_id(t) for t in tokens] + + def decode(self, ids, **kwargs): + return self.tokenizer.decode(ids, skip_special_tokens=kwargs.get("skip_special_tokens", False)) + + +class LensGptOssTokenizer(sd1_clip.SDTokenizer): + tokenizer_json_data = None + + def __init__(self, embedding_directory=None, tokenizer_data={}): + tokenizer_json = tokenizer_data.get("tokenizer_json", None) + self.tokenizer_json_data = tokenizer_json + super().__init__( + tokenizer_json, + embedding_directory=embedding_directory, + pad_with_end=False, + embedding_size=2880, + embedding_key="gpt_oss", + tokenizer_class=_GptOssRawTokenizer, + has_start_token=False, + has_end_token=False, + pad_to_max_length=False, + max_length=99999999, + min_length=1, + pad_left=False, + disable_weights=True, + tokenizer_data=tokenizer_data, + ) + self.pad_token_id = _LENS_PAD_TOKEN_ID + + def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs): + # Empty prompt -> empty list; encode_token_weights returns zeros (uncond). 
+ if not text or not text.strip(): + return [[]] + rendered = _lens_render_chat(text) + ids = self.tokenizer(rendered)["input_ids"] + if len(ids) > LENS_MAX_TOKENS: + ids = ids[:LENS_MAX_TOKENS] + return [[(int(t), 1.0) for t in ids]] + + def state_dict(self): + if self.tokenizer_json_data is not None: + return {"tokenizer_json": self.tokenizer_json_data} + return {} + + +class LensTokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data={}): + super().__init__( + embedding_directory=embedding_directory, + tokenizer_data=tokenizer_data, + name="gpt_oss", + tokenizer=LensGptOssTokenizer, + ) + + +class LensGptOssClipModel(nn.Module): + """SDClipModel-shaped Lens GPT-OSS encoder (multi-layer feature extractor).""" + + def __init__(self, device="cpu", dtype=None, model_options=None, **kwargs): + super().__init__() + model_options = dict(model_options or {}) + + operations = model_options.get("custom_operations") + if operations is None: + quant_config = model_options.get("quantization_metadata") or {} + operations = comfy.ops.mixed_precision_ops(quant_config, dtype, full_precision_mm=True) + self.operations = operations + + cfg_overrides = model_options.get("gpt_oss_config", {}) + self.config = GptOss20BConfig(**cfg_overrides) + self.selected_layers = tuple(model_options.get("selected_layers", LENS_SELECTED_LAYERS)) + self.txt_offset = int(model_options.get("txt_offset", LENS_TXT_OFFSET)) + + self.transformer = GptOssModel(self.config, device=device, dtype=dtype, ops=operations) + self.num_layers = self.config.num_hidden_layers + self.dtype = dtype + self.execution_device = None + self._pad_token_id = _LENS_PAD_TOKEN_ID + + def set_clip_options(self, options): + self.execution_device = options.get("execution_device", self.execution_device) + + def reset_clip_options(self): + self.execution_device = None + + def _gather_tokens(self, token_weight_pairs): + ids_list = [[int(t[0]) for t in batch] for batch in token_weight_pairs] + 
pad_id = self._pad_token_id + max_len = max(len(x) for x in ids_list) + device = self.execution_device + ids = torch.full((len(ids_list), max_len), pad_id, dtype=torch.long, device=device) + mask = torch.zeros((len(ids_list), max_len), dtype=torch.long, device=device) + for i, x in enumerate(ids_list): + ids[i, : len(x)] = torch.tensor(x, dtype=torch.long, device=device) + mask[i, : len(x)] = 1 + return ids, mask + + def encode_token_weights(self, token_weight_pairs): + # Empty negative: emit zero-length features + zero mask + if all(len(batch) == 0 for batch in token_weight_pairs): + device = self.execution_device + B = len(token_weight_pairs) + L = len(self.selected_layers) + H = self.config.hidden_size + flat = torch.zeros(B, 0, L * H, dtype=self.dtype, device=device) + mask = torch.zeros(B, 0, dtype=torch.long, device=device) + return flat, None, {"attention_mask": mask, "num_layers_stacked": L} + + input_ids, attn_mask = self._gather_tokens(token_weight_pairs) + out = self.transformer(input_ids, attention_mask=attn_mask, capture_layers=self.selected_layers) + layers = out["hidden_states"] # list of L × [B, S, H] + stacked = torch.stack(layers, dim=2) # [B, S, L, H] + + offset = self.txt_offset + if stacked.shape[1] > offset: + stacked = stacked[:, offset:].contiguous() + mask_trim = attn_mask[:, offset:] + else: + stacked = stacked[:, :0] + mask_trim = attn_mask[:, :0] + + B, S, L, H = stacked.shape + flat = stacked.reshape(B, S, L * H) + extra = {"attention_mask": mask_trim, "num_layers_stacked": L} + return flat, None, extra + + def load_sd(self, sd): + return self.transformer.load_state_dict(sd, strict=False, assign=True) + + +class LensTEModel(sd1_clip.SD1ClipModel): + def __init__(self, device="cpu", dtype=None, model_options=None): + super().__init__(device=device, dtype=dtype, name="gpt_oss", clip_model=LensGptOssClipModel, model_options=model_options or {}) + + +def lens_te(dtype_llama=None, llama_quantization_metadata=None): + class 
LensTEModel_(LensTEModel): + def __init__(self, device="cpu", dtype=None, model_options=None): + mo = dict(model_options or {}) + if llama_quantization_metadata is not None: + mo["quantization_metadata"] = llama_quantization_metadata + if dtype is None and dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=mo) + + return LensTEModel_ diff --git a/comfy_extras/nodes_cfg.py b/comfy_extras/nodes_cfg.py index 4ebb4b51e..b585c560f 100644 --- a/comfy_extras/nodes_cfg.py +++ b/comfy_extras/nodes_cfg.py @@ -57,24 +57,55 @@ class CFGNorm(io.ComfyNode): inputs=[ io.Model.Input("model"), io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01), + io.Boolean.Input( + "pre_cfg", + default=False, + optional=True, + tooltip=( + "If true, rescale the combined noise BEFORE the sampler's CFG combine, " + "without clamping (can amplify). Matches the norm-scaled CFG used by " + "models like Lens. Default false keeps the original post-CFG x0-space " + "attenuate-only behavior." + ), + ), ], outputs=[io.Model.Output(display_name="patched_model")], is_experimental=True, ) @classmethod - def execute(cls, model, strength) -> io.NodeOutput: + def execute(cls, model, strength, pre_cfg=False) -> io.NodeOutput: m = model.clone() - def cfg_norm(args): - cond_p = args['cond_denoised'] - pred_text_ = args["denoised"] + if pre_cfg: + def cfg_norm_pre(args): + cond = args["cond"] + uncond = args["uncond"] + cond_scale = args["cond_scale"] + comb = uncond + cond_scale * (cond - uncond) + cond_norm = torch.linalg.vector_norm(cond, dim=1, keepdim=True) + comb_norm = torch.linalg.vector_norm(comb, dim=1, keepdim=True) + rescale = torch.where( + comb_norm > 0, + cond_norm / comb_norm.clamp_min(1e-12), + torch.ones_like(comb_norm), + ) + rescaled = comb * rescale + # strength blends back toward standard linear CFG (1.0 = full rescale). 
+ if strength != 1.0: + rescaled = strength * rescaled + (1.0 - strength) * comb + return rescaled + m.set_model_sampler_cfg_function(cfg_norm_pre) + else: + def cfg_norm(args): + cond_p = args['cond_denoised'] + pred_text_ = args["denoised"] - norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True) - norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True) - scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0) - return pred_text_ * scale * strength + norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True) + norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True) + scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0) + return pred_text_ * scale * strength - m.set_model_sampler_post_cfg_function(cfg_norm) + m.set_model_sampler_post_cfg_function(cfg_norm) return io.NodeOutput(m) diff --git a/nodes.py b/nodes.py index 669a7057b..13d3864cd 100644 --- a/nodes.py +++ b/nodes.py @@ -969,7 +969,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -979,7 +979,7 @@ class CLIPLoader: CATEGORY = "advanced/loaders" - DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: 
llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B" + DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b" def load_clip(self, clip_name, type="stable_diffusion", device="default"): clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION) From f9f54cae428337ae9d9342b14141d77e1fb53ef0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Tue, 26 May 2026 10:32:53 +0300 Subject: [PATCH 08/13] Lens: some cleanup (#14112) * Lens: remove redundant memory optimization --- comfy/ldm/lens/model.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/comfy/ldm/lens/model.py b/comfy/ldm/lens/model.py index 7bff7f6af..cd5015ddc 100644 --- a/comfy/ldm/lens/model.py +++ b/comfy/ldm/lens/model.py @@ -141,7 +141,6 @@ class LensJointAttention(nn.Module): img_q, img_k, img_v = img_qkv.unbind(dim=2) img_q = self.norm_q(img_q) img_k = self.norm_k(img_k) - img_v = img_v.contiguous() del img_qkv # text stream @@ -149,8 +148,6 @@ class LensJointAttention(nn.Module): txt_q, txt_k, txt_v = txt_qkv.unbind(dim=2) txt_q = self.norm_added_q(txt_q) txt_k = self.norm_added_k(txt_k) - txt_v = txt_v.contiguous() - del txt_qkv # [B, S, H, D] → [B, H, S, D] for attention, dels to avoid VRAM peaks q = torch.cat([img_q, txt_q], dim=1).transpose(1, 2) From 921775704cd986bd49e693e49f55422dc7a82b9b Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Tue, 26 May 2026 16:25:20 -0700 Subject: [PATCH 09/13] openapi: document QueueManageResponse body on POST /api/queue (#14117) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * openapi: document QueueManageResponse body on POST /api/queue The Cloud runtime 
returns a JSON body from POST /api/queue describing which prompts were deleted and whether the queue was cleared. The spec previously declared a bare 200 with no schema, so generated clients had no type for the response. Adds a QueueManageResponse schema ({deleted, cleared}) and references it from the 200 response. Tagged x-runtime: [cloud] with a [cloud-only] description: local ComfyUI returns an empty 200 body, so both fields are nullable. * openapi: fix GET /api/hub/labels response to the label-catalog shape (#14118) * openapi: fix GET /api/hub/labels response to the label-catalog shape GET /api/hub/labels returns the catalog of available labels you can filter by, which the Cloud runtime serves as {labels: HubLabelInfo[]} (slug name, display_name, and a type category: tag/model/custom_node). The spec had this operation returning a bare array of HubLabel ({id, name, color}) — that schema models the label chips attached to a published workflow (HubWorkflow.labels), a different object. The catalog schema (HubLabelInfo) already existed but was unreferenced. Repoints the 200 response to a new HubLabelListResponse wrapper over the existing HubLabelInfo. HubLabel is unchanged and still used by HubWorkflow.labels. Endpoint remains x-runtime: [cloud]. * openapi: add Cloud-runtime fields (workflow_id, execution_error) to JobEntry (#14119) * openapi: add Cloud-runtime fields workflow_id, execution_error to JobEntry The Cloud runtime returns two additional fields on JobEntry that the spec didn't declare: - workflow_id: UUID of the Cloud workflow entity the job is associated with - execution_error: structured ComfyUI execution error for failed jobs (reuses the existing ExecutionError schema) Both tagged x-runtime: [cloud] with [cloud-only] descriptions; local ComfyUI does not populate them. 
* openapi: document Cloud-runtime request fields on POST /api/assets/export (#14120) The Cloud runtime accepts three request fields on /api/assets/export that the spec didn't declare: - job_ids: include all assets associated with the given jobs - naming_strategy: how to name files in the ZIP (enum, default group_by_job_time) - job_asset_name_filters: optional per-job asset-name allowlist Also drops asset_ids from required: the runtime supports exporting by job_ids alone, so neither field is individually required. /api/assets/export is already x-runtime: [cloud]; these are plain field additions under that endpoint-level tag. --- openapi.yaml | 74 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 8 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 502e518c7..f801a39d9 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -275,7 +275,10 @@ paths: responses: "200": description: Queue updated - + content: + application/json: + schema: + $ref: "#/components/schemas/QueueManageResponse" '400': description: Invalid request parameters content: @@ -3092,18 +3095,34 @@ paths: application/json: schema: type: object - required: - - asset_ids properties: + job_ids: + type: array + items: + type: string + description: Job IDs whose associated assets should all be included in the ZIP bundle. asset_ids: type: array items: type: string format: uuid - description: IDs of assets to export + description: Asset IDs to include in the ZIP bundle. Additive to assets associated with provided job IDs. export_name: type: string description: Name for the export archive + naming_strategy: + type: string + enum: [group_by_job_id, preserve, asset_id, group_by_job_time] + default: group_by_job_time + description: "Strategy for naming files in the ZIP: group by job ID, preserve original names, use the asset ID, or group by job creation time." 
+ job_asset_name_filters: + type: object + additionalProperties: + type: array + minItems: 1 + items: + type: string + description: Optional per-job asset name filters. When provided for a job ID, only assets whose name matches one of the listed names are included. responses: "202": description: Export task accepted @@ -3575,10 +3594,7 @@ paths: content: application/json: schema: - type: array - items: - $ref: "#/components/schemas/HubLabel" - + $ref: "#/components/schemas/HubLabelListResponse" '400': description: Bad request (e.g. invalid type parameter) content: @@ -7466,6 +7482,25 @@ components: type: string description: Array of prompt IDs to delete from queue + QueueManageResponse: + type: object + x-runtime: [cloud] + description: >- + [cloud-only] Result of a queue mutation. The Cloud runtime returns which + items were deleted and whether the queue was cleared; local ComfyUI + returns an empty 200 body. + properties: + deleted: + type: array + nullable: true + items: + type: string + description: Prompt IDs that were deleted from the queue. + cleared: + type: boolean + nullable: true + description: Whether the queue was cleared. + # ------------------------------------------------------------------- # History # ------------------------------------------------------------------- @@ -7546,6 +7581,16 @@ components: outputs_count: type: integer description: Total number of output files + workflow_id: + type: string + nullable: true + x-runtime: [cloud] + description: "[cloud-only] UUID of the Cloud workflow entity this job is associated with. Local ComfyUI returns null." + execution_error: + x-runtime: [cloud] + description: "[cloud-only] Detailed execution error from ComfyUI for failed jobs. Absent on local ComfyUI." + allOf: + - $ref: "#/components/schemas/ExecutionError" JobDetailResponse: type: object @@ -10433,6 +10478,19 @@ components: - custom_node description: Label category. 
+ HubLabelListResponse: + type: object + x-runtime: [cloud] + description: '[cloud-only] Response wrapper for the available Hub label catalog.' + required: + - labels + properties: + labels: + type: array + items: + $ref: '#/components/schemas/HubLabelInfo' + description: Available labels, optionally filtered by type. + HubProfileSummary: type: object x-runtime: [cloud] From 28f4ef277c17af8c9e2d849f3c34e1027b2abaf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jukka=20Sepp=C3=A4nen?= <40791699+kijai@users.noreply.github.com> Date: Wed, 27 May 2026 03:50:14 +0300 Subject: [PATCH 10/13] feat: Support NVIDIA PixelDiT and PiD (CORE-201) (#14103) --- comfy/latent_formats.py | 4 +- comfy/ldm/modules/diffusionmodules/mmdit.py | 5 +- comfy/ldm/pixeldit/model.py | 239 ++++++++++++++++++++ comfy/ldm/pixeldit/modules.py | 187 +++++++++++++++ comfy/ldm/pixeldit/pid.py | 226 ++++++++++++++++++ comfy/model_base.py | 32 +++ comfy/model_detection.py | 17 ++ comfy/sd.py | 10 +- comfy/supported_models.py | 69 ++++++ comfy/text_encoders/pixeldit.py | 104 +++++++++ comfy_extras/nodes_pid.py | 55 +++++ nodes.py | 5 +- 12 files changed, 946 insertions(+), 7 deletions(-) create mode 100644 comfy/ldm/pixeldit/model.py create mode 100644 comfy/ldm/pixeldit/modules.py create mode 100644 comfy/ldm/pixeldit/pid.py create mode 100644 comfy/text_encoders/pixeldit.py create mode 100644 comfy_extras/nodes_pid.py diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py index 75d459b59..12a934d71 100644 --- a/comfy/latent_formats.py +++ b/comfy/latent_formats.py @@ -799,13 +799,15 @@ class ZImagePixelSpace(ChromaRadiance): """ pass - class HiDreamO1Pixel(ChromaRadiance): """Pixel-space latent format for HiDream-O1. No VAE — model patches/unpatches raw RGB internally with patch_size=32. """ pass +class PixelDiTPixel(ChromaRadiance): + pass + class CogVideoX(LatentFormat): """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b). 
diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py index 0dc8fe789..9ab3c463c 100644 --- a/comfy/ldm/modules/diffusionmodules/mmdit.py +++ b/comfy/ldm/modules/diffusionmodules/mmdit.py @@ -211,7 +211,7 @@ class TimestepEmbedder(nn.Module): Embeds scalar timesteps into vector representations. """ - def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None): + def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None, max_period=10000): super().__init__() if output_size is None: output_size = hidden_size @@ -221,9 +221,10 @@ class TimestepEmbedder(nn.Module): operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device), ) self.frequency_embedding_size = frequency_embedding_size + self.max_period = max_period def forward(self, t, dtype, **kwargs): - t_freq = timestep_embedding(t, self.frequency_embedding_size).to(dtype) + t_freq = timestep_embedding(t, self.frequency_embedding_size, max_period=self.max_period).to(dtype) t_emb = self.mlp(t_freq) return t_emb diff --git a/comfy/ldm/pixeldit/model.py b/comfy/ldm/pixeldit/model.py new file mode 100644 index 000000000..b044b9b29 --- /dev/null +++ b/comfy/ldm/pixeldit/model.py @@ -0,0 +1,239 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +import comfy.ldm.common_dit +import comfy.patcher_extension +from comfy.ldm.flux.math import apply_rope, rope +from comfy.ldm.hidream.model import FeedForwardSwiGLU +from comfy.ldm.modules.attention import optimized_attention +from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder + +from .modules import ( + FinalLayer, + PatchTokenEmbedder, + PiTBlock, + PixelTokenEmbedder, + apply_adaln_, + precompute_freqs_cis_2d, +) + + +class MMDiTJointAttention(nn.Module): + """Joint MMDiT attention with separate Q/K/V/proj for image and text streams. 
+ + RoPE is applied to each stream before concatenation so each stream uses its own + 2D/1D positional encoding. Concat order is [text, image] (text first). + """ + def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None): + super().__init__() + assert dim % num_heads == 0 + self.num_heads = num_heads + self.head_dim = dim // num_heads + + self.qkv_x = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + self.qkv_y = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + + self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + + self.proj_x = operations.Linear(dim, dim, dtype=dtype, device=device) + self.proj_y = operations.Linear(dim, dim, dtype=dtype, device=device) + + def forward(self, x, y, pos_img, pos_txt=None, attn_mask=None, transformer_options={}): + B, Nx, _ = x.shape + _, Ny, _ = y.shape + H = self.num_heads + D = self.head_dim + + qkv_x = self.qkv_x(x).reshape(B, Nx, 3, H, D).permute(2, 0, 3, 1, 4) + qx, kx, vx = qkv_x.unbind(0) + qx = self.q_norm_x(qx) + kx = self.k_norm_x(kx) + + qkv_y = self.qkv_y(y).reshape(B, Ny, 3, H, D).permute(2, 0, 3, 1, 4) + qy, ky, vy = qkv_y.unbind(0) + qy = self.q_norm_y(qy) + ky = self.k_norm_y(ky) + + qx, kx = apply_rope(qx, kx, pos_img[None, None]) + if pos_txt is not None: + qy, ky = apply_rope(qy, ky, pos_txt[None, None]) + + q_joint = torch.cat([qy, qx], dim=2) + k_joint = torch.cat([ky, kx], dim=2) + v_joint = torch.cat([vy, vx], dim=2) + + out_joint = optimized_attention( + q_joint, k_joint, v_joint, H, + mask=attn_mask, skip_reshape=True, skip_output_reshape=True, + transformer_options=transformer_options, + ) + + out_y = 
out_joint[:, :, :Ny, :].transpose(1, 2).reshape(B, Ny, H * D) + out_x = out_joint[:, :, Ny:, :].transpose(1, 2).reshape(B, Nx, H * D) + + return self.proj_x(out_x), self.proj_y(out_y) + + +class MMDiTBlockT2I(nn.Module): + def __init__(self, hidden_size, groups, mlp_ratio=4.0, dtype=None, device=None, operations=None): + super().__init__() + self.norm_x1 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.norm_y1 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.attn = MMDiTJointAttention(hidden_size, num_heads=groups, qkv_bias=False, dtype=dtype, device=device, operations=operations) + self.norm_x2 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.norm_y2 = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + mlp_hidden_dim = int(hidden_size * mlp_ratio) + self.mlp_x = FeedForwardSwiGLU(hidden_size, mlp_hidden_dim, multiple_of=1, dtype=dtype, device=device, operations=operations) + self.mlp_y = FeedForwardSwiGLU(hidden_size, mlp_hidden_dim, multiple_of=1, dtype=dtype, device=device, operations=operations) + self.adaLN_modulation_img = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device)) + self.adaLN_modulation_txt = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, dtype=dtype, device=device)) + + def forward(self, x, y, c, pos_img, pos_txt=None, attn_mask=None, transformer_options={}): + shift_msa_x, scale_msa_x, gate_msa_x, shift_mlp_x, scale_mlp_x, gate_mlp_x = self.adaLN_modulation_img(c).chunk(6, dim=-1) + shift_msa_y, scale_msa_y, gate_msa_y, shift_mlp_y, scale_mlp_y, gate_mlp_y = self.adaLN_modulation_txt(c).chunk(6, dim=-1) + + x_norm = apply_adaln_(self.norm_x1(x), shift_msa_x, scale_msa_x) + y_norm = apply_adaln_(self.norm_y1(y), shift_msa_y, scale_msa_y) + attn_x, attn_y = self.attn(x_norm, y_norm, pos_img, pos_txt, attn_mask, transformer_options=transformer_options) + x = 
torch.addcmul(x, gate_msa_x, attn_x) + y = torch.addcmul(y, gate_msa_y, attn_y) + + x = torch.addcmul(x, gate_mlp_x, self.mlp_x(apply_adaln_(self.norm_x2(x), shift_mlp_x, scale_mlp_x))) + y = torch.addcmul(y, gate_mlp_y, self.mlp_y(apply_adaln_(self.norm_y2(y), shift_mlp_y, scale_mlp_y))) + return x, y + + +class PixDiT_T2I(nn.Module): + """PixelDiT T2I model. Hardcoded for the released 1024px Stage-3 checkpoint + (also runs at 512px when fed the appropriate latent size and flow_shift). + + Forward: + x: [B, 3, H, W] pixel-space input (no VAE) + timesteps:[B] in [0, 1000] (ComfyUI flow sampling convention) + context: [B, Ltxt, 2304] Gemma-2-2b-it hidden states (chi_prompt prepended) + Returns flow-matching velocity [B, 3, H, W]. + """ + def __init__( + self, + in_channels=3, + num_groups=24, + hidden_size=1536, + pixel_hidden_size=16, + pixel_attn_hidden_size=1152, + pixel_num_groups=16, + patch_depth=14, + pixel_depth=2, + patch_size=16, + txt_embed_dim=2304, + txt_max_length=300, + use_text_rope=True, + text_rope_theta=10000.0, + image_model=None, + dtype=None, + device=None, + operations=None, + pixel_mlp_chunks=2, + ): + super().__init__() + self.dtype = dtype + self.in_channels = in_channels + self.out_channels = in_channels + self.hidden_size = hidden_size + self.num_groups = num_groups + self.patch_depth = patch_depth + self.pixel_depth = pixel_depth + self.patch_size = patch_size + self.pixel_hidden_size = pixel_hidden_size + self.pixel_attn_hidden_size = pixel_attn_hidden_size + self.pixel_num_groups = pixel_num_groups + self.txt_embed_dim = txt_embed_dim + self.txt_max_length = txt_max_length + self.use_text_rope = use_text_rope + self.text_rope_theta = text_rope_theta + + self.pixel_embedder = PixelTokenEmbedder(self.in_channels, self.pixel_hidden_size, dtype=dtype, device=device, operations=operations) + self.s_embedder = PatchTokenEmbedder(self.in_channels * self.patch_size ** 2, self.hidden_size, bias=True, dtype=dtype, device=device, 
operations=operations) + self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device, operations=operations, max_period=10) + self.y_embedder = PatchTokenEmbedder(self.txt_embed_dim, self.hidden_size, bias=True, use_norm=True, dtype=dtype, device=device, operations=operations) + self.y_pos_embedding = nn.Parameter(torch.empty(1, self.txt_max_length, self.hidden_size, dtype=dtype, device=device)) + + self.patch_blocks = nn.ModuleList([ + MMDiTBlockT2I(self.hidden_size, self.num_groups, + dtype=dtype, device=device, operations=operations) + for _ in range(self.patch_depth) + ]) + self.pixel_blocks = nn.ModuleList([ + PiTBlock( + self.pixel_hidden_size, + self.hidden_size, + patch_size=self.patch_size, + num_heads=self.num_groups, + attn_hidden_size=self.pixel_attn_hidden_size, + attn_num_heads=self.pixel_num_groups, + dtype=dtype, device=device, operations=operations, + mlp_chunks=pixel_mlp_chunks, + ) + for _ in range(self.pixel_depth) + ]) + + self.final_layer = FinalLayer(self.pixel_hidden_size, self.out_channels, dtype=dtype, device=device, operations=operations) + + def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts): + return precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width, device=device, dtype=dtype, **rope_opts) + + def _fetch_text_pos(self, length, device, dtype): + return rope(torch.arange(length, dtype=torch.float32, device=device).reshape(1, -1), self.hidden_size // self.num_groups, self.text_rope_theta).squeeze(0).to(dtype=dtype) + + def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs): + return comfy.patcher_extension.WrapperExecutor.new_class_executor( + self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options), + ).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs) + + def _pre_patch_block(self, s, i, **kwargs): + """Hook for subclasses 
to inject per-block state into the patch stream (e.g. PiD's LQ gate).""" + return s + + def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs): + H_orig, W_orig = x.shape[2], x.shape[3] + x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size)) + B, _, H, W = x.shape + Hs = H // self.patch_size + Ws = W // self.patch_size + L = Hs * Ws + + pos_img = self._fetch_patch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {})) + x_patches = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2) + + t_emb = self.t_embedder(timesteps.view(-1), x.dtype).view(B, -1, self.hidden_size) + + if context is None or context.dim() != 3: + raise ValueError("PixDiT_T2I requires context (text embeddings) of shape [B, L, D]") + Ltxt = min(context.shape[1], self.txt_max_length) + y = context[:, :Ltxt, :] + y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size) + y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb) # y_pos_embedding is a raw nn.Parameter + + condition = F.silu(t_emb) + pos_txt = self._fetch_text_pos(Ltxt, x.device, x.dtype) if self.use_text_rope else None + + s = self.s_embedder(x_patches) + for i, blk in enumerate(self.patch_blocks): + s = self._pre_patch_block(s, i, **kwargs) + s, y_emb = blk(s, y_emb, condition, pos_img, pos_txt, None, transformer_options=transformer_options) + s = F.silu(t_emb + s) + + s_cond = s.view(B * L, self.hidden_size) + x_pixels = self.pixel_embedder(x, patch_size=self.patch_size) + for blk in self.pixel_blocks: + x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask=None, transformer_options=transformer_options) + + x_pixels = self.final_layer(x_pixels) + C_out = self.out_channels + P2 = self.patch_size * self.patch_size + x_pixels = x_pixels.view(B, L, P2, C_out).permute(0, 3, 2, 1).reshape(B, C_out * P2, L) + out = F.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size) + return 
out[:, :, :H_orig, :W_orig] diff --git a/comfy/ldm/pixeldit/modules.py b/comfy/ldm/pixeldit/modules.py new file mode 100644 index 000000000..4b1e538c7 --- /dev/null +++ b/comfy/ldm/pixeldit/modules.py @@ -0,0 +1,187 @@ +import torch +import torch.nn as nn + +from comfy.ldm.flux.math import apply_rope, rope +from comfy.ldm.modules.attention import optimized_attention +from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, get_1d_sincos_pos_embed_from_grid_torch + + +def apply_adaln_(x, shift, scale): + return x.addcmul_(x, scale).add_(shift) + + +def precompute_freqs_cis_2d(dim, height, width, theta=10000.0, scale=16.0, + ref_grid_h=None, ref_grid_w=None, + scale_x=1.0, scale_y=1.0, shift_x=0.0, shift_y=0.0, + device=None, dtype=torch.float32, **kwargs): + """2D RoPE with x/y axis frequencies interleaved at stride 2 across head dim. + + rope_options: + scale_x / scale_y multiply the position range (RoPE extrapolation). + shift_x / shift_y offset the position origin (tiled / regional inference). + With ref_grid_h/w set, also applies NTK-aware per-axis theta scaling + (rope_mode='ntk_aware'): theta_axis = theta * (current/ref)^(dim_axis/(dim_axis-2)). + Returns Flux-format rotation matrices of shape [H*W, dim/2, 2, 2]. + Layout of head-dim pairs: [x_0, y_0, x_1, y_1, ..., x_{dim/4-1}, y_{dim/4-1}]. 
+ """ + dim_axis = dim // 2 + if ref_grid_h is not None and dim_axis > 2: + h_ntk = (height / ref_grid_h) ** (dim_axis / (dim_axis - 2)) + w_ntk = (width / ref_grid_w) ** (dim_axis / (dim_axis - 2)) + else: + h_ntk = w_ntk = 1.0 + + x_lin = torch.linspace(shift_x, scale * scale_x + shift_x, width, device=device) + y_lin = torch.linspace(shift_y, scale * scale_y + shift_y, height, device=device) + y_grid, x_grid = torch.meshgrid(y_lin, x_lin, indexing="ij") + x_rope = rope(x_grid.reshape(1, -1), dim_axis, theta * w_ntk).squeeze(0) + y_rope = rope(y_grid.reshape(1, -1), dim_axis, theta * h_ntk).squeeze(0) + out = torch.stack([x_rope, y_rope], dim=2).reshape(height * width, dim // 2, 2, 2) + return out.to(dtype=dtype) + + +def get_2d_sincos_pos_embed(embed_dim, height, width, device=None, dtype=torch.float32): + """Standard 2D sin/cos absolute positional embedding (ViT-style). + + first half encodes W-coordinates, second half H. + """ + assert embed_dim % 4 == 0 + grid_h = torch.arange(height, dtype=torch.float32, device=device) + grid_w = torch.arange(width, dtype=torch.float32, device=device) + grid_y, grid_x = torch.meshgrid(grid_h, grid_w, indexing="ij") + emb_w = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_x.reshape(-1), device=device) + emb_h = get_1d_sincos_pos_embed_from_grid_torch(embed_dim // 2, grid_y.reshape(-1), device=device) + return torch.cat([emb_w, emb_h], dim=1).to(dtype=dtype) + + +class RotaryAttention(nn.Module): + """Single-stream self-attention with rotary positional encoding (used inside PiTBlock).""" + def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None): + super().__init__() + assert dim % num_heads == 0 + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, dtype=dtype, device=device) + self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.k_norm = 
operations.RMSNorm(self.head_dim, eps=1e-6, dtype=dtype, device=device) + self.proj = operations.Linear(dim, dim, dtype=dtype, device=device) + + def forward(self, x, pos, mask=None, transformer_options={}): + B, N, C = x.shape + H = self.num_heads + D = self.head_dim + qkv = self.qkv(x).reshape(B, N, 3, H, D).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + q, k = apply_rope(self.q_norm(q), self.k_norm(k), pos[None, None]) + x = optimized_attention(q, k, v, H, mask=mask, skip_reshape=True, transformer_options=transformer_options) + return self.proj(x) + + +class FinalLayer(nn.Module): + def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None): + super().__init__() + self.norm = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device) + self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device) + + def forward(self, x): + return self.linear(self.norm(x)) + + +class PatchTokenEmbedder(nn.Module): + """Linear projection used both for patchified-image tokens and text-feature tokens.""" + def __init__(self, in_chans, embed_dim, use_norm=False, bias=True, dtype=None, device=None, operations=None): + super().__init__() + self.proj = operations.Linear(in_chans, embed_dim, bias=bias, dtype=dtype, device=device) + self.norm = operations.RMSNorm(embed_dim, eps=1e-6, dtype=dtype, device=device) if use_norm else nn.Identity() + + def forward(self, x): + return self.norm(self.proj(x)) + + +class PixelTokenEmbedder(nn.Module): + """Pixel-level embedder: lifts each RGB pixel to hidden_size and packs into per-patch sequences.""" + def __init__(self, in_channels, hidden_size_output, dtype=None, device=None, operations=None): + super().__init__() + self.in_channels = in_channels + self.hidden_size_output = hidden_size_output + self.proj = operations.Linear(self.in_channels, self.hidden_size_output, bias=True, dtype=dtype, device=device) + + def forward(self, inputs, patch_size): + B, _, H, W = 
inputs.shape + Hs, Ws = H // patch_size, W // patch_size + P2 = patch_size * patch_size + x = inputs.permute(0, 2, 3, 1).contiguous() + x = self.proj(x) + pos_full = get_2d_sincos_pos_embed(self.hidden_size_output, H, W, device=x.device, dtype=x.dtype).view(H, W, self.hidden_size_output) + x = x + pos_full.unsqueeze(0) + x = x.view(B, Hs, patch_size, Ws, patch_size, self.hidden_size_output) + return x.permute(0, 1, 3, 2, 4, 5).reshape(B * Hs * Ws, P2, self.hidden_size_output) + + +class PiTBlock(nn.Module): + """Pixel-level transformer block. + + Compresses each patch's P^2 pixel tokens → 1 attention token via a linear, + runs global self-attention across patches with 2D RoPE, then expands back to P^2 tokens. + Conditioning is per-pixel adaLN from the patch-level features. + """ + def __init__(self, pixel_hidden_size, patch_hidden_size, patch_size, num_heads, mlp_ratio=4.0, + attn_hidden_size=None, attn_num_heads=None, dtype=None, device=None, operations=None, mlp_chunks=1): + super().__init__() + self.pixel_dim = pixel_hidden_size + self.context_dim = patch_hidden_size + self.attn_dim = attn_hidden_size if attn_hidden_size is not None else patch_hidden_size + self.num_heads = attn_num_heads if attn_num_heads is not None else num_heads + assert self.attn_dim % self.num_heads == 0 + + p2 = patch_size * patch_size + self.compress_to_attn = operations.Linear(p2 * self.pixel_dim, self.attn_dim, bias=True, dtype=dtype, device=device) + self.expand_from_attn = operations.Linear(self.attn_dim, p2 * self.pixel_dim, bias=True, dtype=dtype, device=device) + + self.norm1 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device) + self.attn = RotaryAttention(self.attn_dim, num_heads=self.num_heads, qkv_bias=False, dtype=dtype, device=device, operations=operations) + self.norm2 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device) + self.mlp = Mlp(self.pixel_dim, hidden_features=int(self.pixel_dim * mlp_ratio), dtype=dtype, device=device, 
operations=operations) + + self.adaLN_modulation_msa = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device) + self.adaLN_modulation_mlp = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device) + + self._rope_fn = precompute_freqs_cis_2d + self.mlp_chunks = max(1, int(mlp_chunks)) + + def _fetch_pos(self, height, width, device, dtype, **rope_opts): + return self._rope_fn(self.attn_dim // self.num_heads, height, width, device=device, dtype=dtype, **rope_opts) + + def forward(self, x, s_cond, image_height, image_width, patch_size, mask=None, transformer_options={}): + BL, P2, _ = x.shape + Hs, Ws = image_height // patch_size, image_width // patch_size + L = Hs * Ws + B = BL // L + + # Attention path uses only msa params; compute, use, free before mlp params allocate. + msa_params = self.adaLN_modulation_msa(s_cond).view(BL, P2, 3 * self.pixel_dim) + shift_msa, scale_msa, gate_msa = msa_params.chunk(3, dim=-1) + + x_norm = apply_adaln_(self.norm1(x), shift_msa, scale_msa) + x_flat = x_norm.view(BL, P2 * self.pixel_dim) + + x_comp = self.compress_to_attn(x_flat).view(B, L, self.attn_dim) + pos_comp = self._fetch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {})) + attn_out = self.attn(x_comp, pos_comp, mask=mask, transformer_options=transformer_options) + attn_flat = self.expand_from_attn(attn_out.view(B * L, self.attn_dim)) + attn_exp = attn_flat.view(BL, P2, self.pixel_dim) + x = torch.addcmul(x, gate_msa, attn_exp) + del msa_params, shift_msa, scale_msa, gate_msa + + mlp_params = self.adaLN_modulation_mlp(s_cond).view(BL, P2, 3 * self.pixel_dim) + shift_mlp, scale_mlp, gate_mlp = mlp_params.chunk(3, dim=-1) + gate_mlp = gate_mlp.contiguous() # detach from mlp_params so the del below frees shift+scale storage before the MLP + mlp_input = apply_adaln_(self.norm2(x), shift_mlp, scale_mlp) + del mlp_params, shift_mlp, scale_mlp + + # MLP in chunks 
since the peak memory usage is huge here + chunk_size = (BL + self.mlp_chunks - 1) // self.mlp_chunks + for s in range(0, BL, chunk_size): + e = min(s + chunk_size, BL) + x[s:e].addcmul_(gate_mlp[s:e], self.mlp(mlp_input[s:e])) + return x diff --git a/comfy/ldm/pixeldit/pid.py b/comfy/ldm/pixeldit/pid.py new file mode 100644 index 000000000..0ad4b7ce8 --- /dev/null +++ b/comfy/ldm/pixeldit/pid.py @@ -0,0 +1,226 @@ +"""PiD — Pixel Diffusion Decoder. Decodes a Flux/SD3/Flux2/Z-Image latent +directly to a 4x-upscaled image in 4 distilled flow-matching steps. PixDiT_T2I +body + LQ projection branch injected before each MMDiT patch block. +""" + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .model import PixDiT_T2I +from .modules import precompute_freqs_cis_2d + + +class SigmaAwareGatePerTokenPerDim(nn.Module): + """gate = sigmoid(content_proj(cat[x, lq]) - exp(log_alpha) * sigma); out = x + gate * lq. + + Trained init gives ~0.88 gate at sigma=0, ~0.05 at sigma=1. + """ + + def __init__(self, dim: int, dtype=None, device=None, operations=None): + super().__init__() + self.content_proj = operations.Linear(dim * 2, dim, dtype=dtype, device=device) + self.log_alpha = nn.Parameter(torch.empty((), dtype=dtype, device=device)) + + def forward(self, x: torch.Tensor, lq: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor: + content_logit = self.content_proj(torch.cat([x, lq], dim=-1)) + # log_alpha is a raw nn.Parameter -> doesn't auto-cast under dynamic VRAM. 
+ log_alpha = self.log_alpha.to(device=x.device, dtype=torch.float32) + sigma_offset = -log_alpha.exp() * sigma.float().view(-1, 1, 1) + gate = torch.sigmoid(content_logit + sigma_offset) + return x + (gate * lq).to(x.dtype) + + +class ResBlock(nn.Module): + """Pre-activation ResNet block: GN -> SiLU -> Conv -> GN -> SiLU -> Conv + skip.""" + + def __init__(self, channels: int, num_groups: int = 4, dtype=None, device=None, operations=None): + super().__init__() + self.block = nn.Sequential( + operations.GroupNorm(num_groups, channels, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device), + operations.GroupNorm(num_groups, channels, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.block(x) + + +class LQProjection2D(nn.Module): + """LQ latent -> per-block patch-aligned features for controlnet-style injection.""" + + def __init__( + self, + latent_channels: int, + hidden_dim: int = 512, + out_dim: int = 1536, + patch_size: int = 16, + sr_scale: int = 4, + latent_spatial_down_factor: int = 8, + num_res_blocks: int = 4, + num_outputs: int = 7, + interval: int = 2, + dtype=None, device=None, operations=None, + ): + super().__init__() + self.latent_channels = latent_channels + self.hidden_dim = hidden_dim + self.out_dim = out_dim + self.patch_size = patch_size + self.sr_scale = sr_scale + self.latent_spatial_down_factor = latent_spatial_down_factor + self.num_outputs = num_outputs + self.interval = interval + + z_to_patch_ratio = (sr_scale * latent_spatial_down_factor) / patch_size + self.z_to_patch_ratio = z_to_patch_ratio + if z_to_patch_ratio >= 1: + self.latent_fold_factor = 0 + latent_proj_in_ch = latent_channels + else: + fold_factor = int(1 / z_to_patch_ratio) + assert fold_factor * z_to_patch_ratio == 1.0 + 
self.latent_fold_factor = fold_factor + latent_proj_in_ch = latent_channels * fold_factor * fold_factor + + layers = [ + operations.Conv2d(latent_proj_in_ch, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device), + nn.SiLU(), + operations.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device), + ] + for _ in range(num_res_blocks): + layers.append(ResBlock(hidden_dim, dtype=dtype, device=device, operations=operations)) + self.latent_proj = nn.Sequential(*layers) + + self.output_heads = nn.ModuleList( + [operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device) for _ in range(num_outputs)] + ) + self.gate_modules = nn.ModuleList( + [SigmaAwareGatePerTokenPerDim(out_dim, dtype=dtype, device=device, operations=operations) + for _ in range(num_outputs)] + ) + + def is_gate_active(self, block_idx: int) -> bool: + return block_idx % self.interval == 0 + + def output_index(self, block_idx: int) -> int: + return block_idx // self.interval + + def gate(self, x: torch.Tensor, lq_feature: torch.Tensor, sigma: torch.Tensor, out_idx: int) -> torch.Tensor: + return self.gate_modules[out_idx](x, lq_feature, sigma) + + def _align_latent_to_patch_grid(self, lq_latent: torch.Tensor, pH: int, pW: int) -> torch.Tensor: + B, z_dim = lq_latent.shape[:2] + if self.z_to_patch_ratio >= 1: + if lq_latent.shape[2] != pH or lq_latent.shape[3] != pW: + z_aligned = F.interpolate(lq_latent, size=(pH, pW), mode="nearest") + else: + z_aligned = lq_latent + else: + f = self.latent_fold_factor + zH_expected, zW_expected = pH * f, pW * f + if lq_latent.shape[2] != zH_expected or lq_latent.shape[3] != zW_expected: + lq_latent = F.interpolate(lq_latent, size=(zH_expected, zW_expected), mode="nearest") + z_aligned = lq_latent.reshape(B, z_dim, pH, f, pW, f).permute(0, 1, 3, 5, 2, 4) + z_aligned = z_aligned.reshape(B, z_dim * f * f, pH, pW) + return self.latent_proj(z_aligned) + + def forward(self, lq_latent: torch.Tensor, target_pH: int, target_pW: int) 
-> List[torch.Tensor]: + feat = self._align_latent_to_patch_grid(lq_latent, target_pH, target_pW) + B, C, H, W = feat.shape + tokens = feat.permute(0, 2, 3, 1).contiguous().view(B, H * W, C) + return [head(tokens) for head in self.output_heads] + + +class PidNet(PixDiT_T2I): + """PixDiT_T2I + LQ injection (one sigma-gated feature inserted before each patch block).""" + + def __init__( + self, + lq_latent_channels: int = 16, + lq_hidden_dim: int = 512, + lq_num_res_blocks: int = 4, + lq_interval: int = 2, + sr_scale: int = 4, + latent_spatial_down_factor: int = 8, + rope_ref_h: int = 1024, # NTK ref resolution in PIXEL units: 1024px / patch=16 -> grid_ref=64. + rope_ref_w: int = 1024, + image_model=None, + dtype=None, device=None, operations=None, + **pixdit_kwargs, + ): + super().__init__(dtype=dtype, device=device, operations=operations, **pixdit_kwargs) + + self.rope_ref_grid_h = rope_ref_h // self.patch_size + self.rope_ref_grid_w = rope_ref_w // self.patch_size + + # Parent's PiTBlocks were built with plain RoPE — swap in NTK-aware. 
+ def _pit_rope_fn(head_dim, h, w, device=None, dtype=torch.float32, **rope_opts): + return precompute_freqs_cis_2d(head_dim, h, w, ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, device=device, dtype=dtype, **rope_opts) + for blk in self.pixel_blocks: + blk._rope_fn = _pit_rope_fn + + num_lq_outputs = (self.patch_depth + lq_interval - 1) // lq_interval + self.lq_proj = LQProjection2D( + latent_channels=lq_latent_channels, + hidden_dim=lq_hidden_dim, + out_dim=self.hidden_size, + patch_size=self.patch_size, + sr_scale=sr_scale, + latent_spatial_down_factor=latent_spatial_down_factor, + num_res_blocks=lq_num_res_blocks, + num_outputs=num_lq_outputs, + interval=lq_interval, + dtype=dtype, + device=device, + operations=operations, + ) + + def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts): + return precompute_freqs_cis_2d( + self.hidden_size // self.num_groups, + height, width, + ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, + device=device, dtype=dtype, **rope_opts, + ) + + def _pre_patch_block(self, s, i, pid_lq_features, pid_degrade_sigma, **kwargs): + if not self.lq_proj.is_gate_active(i): + return s + out_idx = self.lq_proj.output_index(i) + if out_idx >= len(pid_lq_features): + return s + return self.lq_proj.gate(s, pid_lq_features[out_idx], pid_degrade_sigma, out_idx) + + def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, lq_latent=None, degrade_sigma=None, **kwargs): + if lq_latent is None: + raise ValueError("PidNet requires lq_latent — attach via PiDConditioning") + expected_c = self.lq_proj.latent_channels + if lq_latent.shape[1] != expected_c: + raise ValueError( + f"Input latent has {lq_latent.shape[1]} channels, this model variant expects {expected_c}. " + f"Flux1/SD3 = 16 channels, Flux2 = 128 channels." 
+ ) + B = x.shape[0] + Hs = x.shape[2] // self.patch_size + Ws = x.shape[3] // self.patch_size + + degrade_sigma = degrade_sigma.to(device=x.device, dtype=torch.float32).reshape(-1) + if degrade_sigma.numel() == 1 and B > 1: + degrade_sigma = degrade_sigma.expand(B).contiguous() + + lq_features = self.lq_proj(lq_latent=lq_latent.to(x), target_pH=Hs, target_pW=Ws) + + return super()._forward( + x, timesteps, + context=context, attention_mask=attention_mask, + transformer_options=transformer_options, + pid_lq_features=lq_features, + pid_degrade_sigma=degrade_sigma, + **kwargs, + ) diff --git a/comfy/model_base.py b/comfy/model_base.py index d4ab1499e..e55808633 100644 --- a/comfy/model_base.py +++ b/comfy/model_base.py @@ -49,6 +49,8 @@ import comfy.ldm.hunyuan3d.model import comfy.ldm.hidream.model import comfy.ldm.chroma.model import comfy.ldm.chroma_radiance.model +import comfy.ldm.pixeldit.model +import comfy.ldm.pixeldit.pid import comfy.ldm.ace.model import comfy.ldm.omnigen.omnigen2 import comfy.ldm.qwen_image.model @@ -1397,6 +1399,36 @@ class ZImagePixelSpace(Lumina2): BaseModel.__init__(self, model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiTPixelSpace) self.memory_usage_factor_conds = ("ref_latents",) + +class PixelDiTT2I(BaseModel): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + super().__init__(model_config, model_type, device=device, + unet_model=comfy.ldm.pixeldit.model.PixDiT_T2I) + + def extra_conds(self, **kwargs): + out = super().extra_conds(**kwargs) + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + out["attention_mask"] = comfy.conds.CONDRegular(attention_mask) + return out + + +class PiD(PixelDiTT2I): + def __init__(self, model_config, model_type=ModelType.FLOW, device=None): + BaseModel.__init__(self, model_config, model_type, device=device, + unet_model=comfy.ldm.pixeldit.pid.PidNet) + + def extra_conds(self, **kwargs): + out = 
super().extra_conds(**kwargs) + lq_latent = kwargs.get("lq_latent", None) + if lq_latent is not None: + out["lq_latent"] = comfy.conds.CONDRegular(lq_latent) + degrade_sigma = kwargs.get("degrade_sigma", None) + if degrade_sigma is not None: + out["degrade_sigma"] = comfy.conds.CONDRegular(degrade_sigma) + return out + + class WAN21(BaseModel): def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None): super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel) diff --git a/comfy/model_detection.py b/comfy/model_detection.py index 2b0b98cd8..f0db7d388 100644 --- a/comfy/model_detection.py +++ b/comfy/model_detection.py @@ -463,6 +463,23 @@ def detect_unet_config(state_dict, key_prefix, metadata=None): dit_config["extra_per_block_abs_pos_emb_type"] = "learnable" return dit_config + # PiD (Pixel Diffusion Decoder). Must check BEFORE plain PixelDiT_T2I. + _lq_w_key = '{}lq_proj.latent_proj.0.weight'.format(key_prefix) + if _lq_w_key in state_dict_keys: + in_ch = int(state_dict[_lq_w_key].shape[1]) + _gate_prefix = '{}lq_proj.gate_modules.'.format(key_prefix) + num_gates = len({k[len(_gate_prefix):].split('.')[0] + for k in state_dict_keys if k.startswith(_gate_prefix)}) + dit_config = {"image_model": "pid", + "lq_latent_channels": in_ch, + "latent_spatial_down_factor": 16 if in_ch >= 64 else 8} + if num_gates > 0: + dit_config["lq_interval"] = (14 + num_gates - 1) // num_gates + return dit_config + + if '{}core.pixel_embedder.proj.weight'.format(key_prefix) in state_dict_keys: # PixelDiT T2I + return {"image_model": "pixeldit_t2i"} + if '{}cap_embedder.1.weight'.format(key_prefix) in state_dict_keys and '{}noise_refiner.0.attention.k_norm.weight'.format(key_prefix) in state_dict_keys: # Lumina 2 dit_config = {} dit_config["image_model"] = "lumina2" diff --git a/comfy/sd.py b/comfy/sd.py index beb782310..30b877b85 100644 --- a/comfy/sd.py +++ b/comfy/sd.py @@ -49,6 +49,7 @@ import 
comfy.text_encoders.lt import comfy.text_encoders.hunyuan_video import comfy.text_encoders.cosmos import comfy.text_encoders.lumina2 +import comfy.text_encoders.pixeldit import comfy.text_encoders.wan import comfy.text_encoders.hidream import comfy.text_encoders.ace @@ -1285,6 +1286,7 @@ class CLIPType(Enum): LONGCAT_IMAGE = 26 COGVIDEOX = 27 LENS = 28 + PIXELDIT = 29 @@ -1528,8 +1530,12 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip clip_target.tokenizer = variant.tokenizer tokenizer_data["tokenizer_json"] = clip_data[0].get("tokenizer_json", None) elif te_model == TEModel.GEMMA_2_2B: - clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) - clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer + if clip_type == CLIPType.PIXELDIT: + clip_target.clip = comfy.text_encoders.pixeldit.pixeldit_te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.pixeldit.PixelDiTGemma2Tokenizer + else: + clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data)) + clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None) elif te_model == TEModel.GEMMA_3_4B: clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b") diff --git a/comfy/supported_models.py b/comfy/supported_models.py index e451892e9..4723caff5 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -30,6 +30,7 @@ import comfy.text_encoders.longcat_image import comfy.text_encoders.ernie import comfy.text_encoders.cogvideo import comfy.text_encoders.hidream_o1 +import comfy.text_encoders.pixeldit from . import supported_models_base from . 
import latent_formats @@ -1201,6 +1202,72 @@ class ZImagePixelSpace(ZImage): def get_model(self, state_dict, prefix="", device=None): return model_base.ZImagePixelSpace(self, device=device) +class PixelDiTT2I(supported_models_base.BASE): + unet_config = { + "image_model": "pixeldit_t2i", + } + + unet_extra_config = {} + + sampling_settings = { + "shift": 4.0, # 1024px stage 3 default; 2.0 for 512px + } + + latent_format = latent_formats.PixelDiTPixel + memory_usage_factor = 0.18 + supported_inference_dtypes = [torch.bfloat16, torch.float32] + + vae_key_prefix = ["vae."] + text_encoder_key_prefix = ["text_encoders."] + + def get_model(self, state_dict, prefix="", device=None): + return model_base.PixelDiTT2I(self, device=device) + + def process_unet_state_dict(self, state_dict): + # pixel_dim from pixel_embedder.proj.weight = (pixel_dim, in_channels); p2 derived per-weight from total // (6 * pixel_dim). + pixel_dim = next(v for k, v in state_dict.items() if k.endswith("pixel_embedder.proj.weight")).shape[0] + + out = {} + marker = ".adaLN_modulation.0." + for k, v in state_dict.items(): + if k.startswith("_repa_projector") or k.startswith("net_ema."): + continue + if k.startswith("core."): + k = k[len("core."):] + elif k.startswith("net."): + k = k[len("net."):] + if "pixel_blocks." 
in k and marker in k: + # Split into msa (chunks 0-2) and mlp (chunks 3-5) for the two-Linear PiTBlock to reduce peak VRAM + p2 = v.shape[0] // (6 * pixel_dim) + trail = v.shape[1:] # () for bias, (in_dim,) for weight + vv = v.view(p2, 6, pixel_dim, *trail) + base, suffix = k.split(marker) + out[f"{base}.adaLN_modulation_msa.{suffix}"] = vv[:, 0:3].reshape(3 * p2 * pixel_dim, *trail).contiguous() + out[f"{base}.adaLN_modulation_mlp.{suffix}"] = vv[:, 3:6].reshape(3 * p2 * pixel_dim, *trail).contiguous() + else: + out[k] = v + return out + + def clip_target(self, state_dict={}): + return supported_models_base.ClipTarget( + comfy.text_encoders.pixeldit.PixelDiTGemma2Tokenizer, + comfy.text_encoders.pixeldit.PixelDiTGemma2TE, + ) + +class PiD(PixelDiTT2I): + unet_config = { + "image_model": "pid", + } + + sampling_settings = { + "shift": 1.5, # close approximation of the original distill 4 steps [0.999, 0.866, 0.634, 0.342, 0] + } + + memory_usage_factor = 0.07 + + def get_model(self, state_dict, prefix="", device=None): + return model_base.PiD(self, device=device) + class WAN21_T2V(supported_models_base.BASE): unet_config = { "image_model": "wan2.1", @@ -2111,6 +2178,8 @@ models = [ CosmosI2VPredict2, ZImagePixelSpace, ZImage, + PiD, + PixelDiTT2I, Lumina2, WAN22_T2V, WAN21_CausalAR_T2V, diff --git a/comfy/text_encoders/pixeldit.py b/comfy/text_encoders/pixeldit.py new file mode 100644 index 000000000..3539711e4 --- /dev/null +++ b/comfy/text_encoders/pixeldit.py @@ -0,0 +1,104 @@ +import torch + +from comfy import sd1_clip +from .lumina2 import Gemma2BTokenizer, LuminaModel +import comfy.text_encoders.llama + + +class PixelDiTGemma2_2BModel(sd1_clip.SDClipModel): + def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}): + llama_quantization_metadata = model_options.get("llama_quantization_metadata", None) + if llama_quantization_metadata is not None: + model_options = model_options.copy() + 
model_options["quantization_metadata"] = llama_quantization_metadata + + super().__init__( + device=device, layer=layer, layer_idx=layer_idx, + textmodel_json_config={}, dtype=dtype, + special_tokens={"start": 2, "pad": 0}, + layer_norm_hidden_state=False, + model_class=comfy.text_encoders.llama.Gemma2_2B, + enable_attention_masks=attention_mask, + return_attention_masks=attention_mask, + model_options=model_options, + ) + + +_PIXELDIT_CHI_PROMPT = ( + 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions ' + "suitable for image generation. Evaluate the level of detail in the user prompt:\n" + "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, " + "and spatial relationships to create vivid and concrete scenes.\n" + "- If the prompt is already detailed, refine and enhance the existing details slightly without " + "overcomplicating.\n" + "Here are examples of how to transform or refine prompts:\n" + "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, " + "sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.\n" + "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring " + "glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus " + "passing by towering glass skyscrapers.\n" + "Please generate only the enhanced description for the prompt below and avoid including any " + "additional commentary or evaluations:\n" + "User Prompt: " +) + +_PIXELDIT_MAX_LENGTH = 300 +_PIXELDIT_CHI_PROMPT_DETECT_PREFIX = 'Given a user prompt, generate an "Enhanced prompt"' + + +class PixelDiTGemma2Tokenizer(sd1_clip.SD1Tokenizer): + def __init__(self, embedding_directory=None, tokenizer_data=None): + if tokenizer_data is None: + tokenizer_data = {} + super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, + 
name="gemma2_2b", tokenizer=Gemma2BTokenizer) + + def tokenize_with_weights(self, text, return_word_ids=False, **kwargs): + if not text.strip(): + return super().tokenize_with_weights("", return_word_ids=return_word_ids, disable_weights=True, min_length=_PIXELDIT_MAX_LENGTH) + + chi_token_count = len(self.gemma2_2b.tokenizer(_PIXELDIT_CHI_PROMPT)["input_ids"]) + combined = text if text.startswith(_PIXELDIT_CHI_PROMPT_DETECT_PREFIX) else _PIXELDIT_CHI_PROMPT + text + max_length_all = chi_token_count + _PIXELDIT_MAX_LENGTH - 2 + out = super().tokenize_with_weights(combined, return_word_ids=return_word_ids, + disable_weights=True, min_length=max_length_all) + out["gemma2_2b"] = [out["gemma2_2b"][0][:max_length_all]] + return out + + def untokenize(self, token_weight_pair): + return self.gemma2_2b.untokenize(token_weight_pair) + + def state_dict(self): + return self.gemma2_2b.state_dict() + + +class PixelDiTGemma2TE(LuminaModel): + # PixelDiT's select_index: keep BOS + last 299 embeddings of the padded sequence. 
+ def __init__(self, device="cpu", dtype=None, model_options={}): + super().__init__(device=device, dtype=dtype, name="gemma2_2b", + clip_model=PixelDiTGemma2_2BModel, model_options=model_options) + + def encode_token_weights(self, token_weight_pairs): + result = super().encode_token_weights(token_weight_pairs) + cond, pooled = result[0], result[1] + extra = result[2] if len(result) > 2 else None + if cond.shape[1] > _PIXELDIT_MAX_LENGTH: + cond = torch.cat([cond[:, :1], cond[:, -(_PIXELDIT_MAX_LENGTH - 1):]], dim=1) + if extra is not None and "attention_mask" in extra: + am = extra["attention_mask"] + extra["attention_mask"] = torch.cat([am[..., :1], am[..., -(_PIXELDIT_MAX_LENGTH - 1):]], dim=-1) + if extra is not None: + return cond, pooled, extra + return cond, pooled + + +def pixeldit_te(dtype_llama=None, llama_quantization_metadata=None): + class PixelDiTTE_(PixelDiTGemma2TE): + def __init__(self, device="cpu", dtype=None, model_options={}): + if llama_quantization_metadata is not None: + model_options = model_options.copy() + model_options["llama_quantization_metadata"] = llama_quantization_metadata + if dtype_llama is not None: + dtype = dtype_llama + super().__init__(device=device, dtype=dtype, model_options=model_options) + return PixelDiTTE_ diff --git a/comfy_extras/nodes_pid.py b/comfy_extras/nodes_pid.py new file mode 100644 index 000000000..811b9ae8e --- /dev/null +++ b/comfy_extras/nodes_pid.py @@ -0,0 +1,55 @@ +"""PiD (Pixel Diffusion Decoder) node""" + +import torch +from typing_extensions import override + +import node_helpers +import comfy.latent_formats +from comfy_api.latest import ComfyExtension, io + + +class PiDConditioning(io.ComfyNode): + @classmethod + def define_schema(cls) -> io.Schema: + return io.Schema( + node_id="PiDConditioning", + display_name="PiD Conditioning", + category="advanced/conditioning", + description=( + "Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling" + ), + inputs=[ + 
io.Conditioning.Input("positive"), + io.Latent.Input("latent", tooltip="latent (from VAEEncode or a KSampler)."), + io.Combo.Input("latent_format", options=["flux", "sd3"], default="flux", + tooltip="Flux1 and Flux2 latents auto-detected from channel dim, sd3 has to be selected manually."), + io.Float.Input( + "degrade_sigma", default=0.0, min=0.0, max=1.0, step=0.01, + tooltip="0 = clean latent. Increase to denoise corrupted latent outputs.", + ), + ], + outputs=[io.Conditioning.Output()], + ) + + @classmethod + def execute(cls, positive, latent, latent_format: str, degrade_sigma: float) -> io.NodeOutput: + samples = latent["samples"] + if latent_format == "flux": + fmt_cls = comfy.latent_formats.Flux2 if samples.shape[1] == 128 else comfy.latent_formats.Flux + else: + fmt_cls = comfy.latent_formats.SD3 + lq_latent = fmt_cls().process_in(samples) + sigma_t = torch.tensor([float(degrade_sigma)], dtype=torch.float32) + return io.NodeOutput(node_helpers.conditioning_set_values( + positive, {"lq_latent": lq_latent, "degrade_sigma": sigma_t}, + )) + + +class PiDExtension(ComfyExtension): + @override + async def get_node_list(self) -> list[type[io.ComfyNode]]: + return [PiDConditioning] + + +async def comfy_entrypoint() -> PiDExtension: + return PiDExtension() diff --git a/nodes.py b/nodes.py index 13d3864cd..87d81b5b7 100644 --- a/nodes.py +++ b/nodes.py @@ -969,7 +969,7 @@ class CLIPLoader: @classmethod def INPUT_TYPES(s): return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ), - "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens"], ), + "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", 
"ovis", "longcat_image", "cogvideox", "lens", "pixeldit"], ), }, "optional": { "device": (["default", "cpu"], {"advanced": True}), @@ -979,7 +979,7 @@ class CLIPLoader: CATEGORY = "advanced/loaders" - DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b" + DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b\n pixeldit: gemma 2 2B elm" def load_clip(self, clip_name, type="stable_diffusion", device="default"): clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION) @@ -2420,6 +2420,7 @@ async def init_builtin_extra_nodes(): "nodes_context_windows.py", "nodes_qwen.py", "nodes_chroma_radiance.py", + "nodes_pid.py", "nodes_model_patch.py", "nodes_easycache.py", "nodes_audio_encoder.py", From d8d860a5883d84d1dc5b46306429f76e786a5e96 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 26 May 2026 18:04:55 -0700 Subject: [PATCH 11/13] Closer memory usage factors for PID (#14123) --- comfy/supported_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 4723caff5..776091d42 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -1214,7 +1214,7 @@ class PixelDiTT2I(supported_models_base.BASE): } latent_format = latent_formats.PixelDiTPixel - memory_usage_factor = 0.18 + memory_usage_factor = 0.04 supported_inference_dtypes = [torch.bfloat16, 
torch.float32] vae_key_prefix = ["vae."] @@ -1263,7 +1263,7 @@ class PiD(PixelDiTT2I): "shift": 1.5, # close approximation of the original distill 4 steps [0.999, 0.866, 0.634, 0.342, 0] } - memory_usage_factor = 0.07 + memory_usage_factor = 0.04 def get_model(self, state_dict, prefix="", device=None): return model_base.PiD(self, device=device) From e75a92c1b620e47f4aa96912d441072fefe5caf4 Mon Sep 17 00:00:00 2001 From: comfyanonymous <121283862+comfyanonymous@users.noreply.github.com> Date: Tue, 26 May 2026 18:06:51 -0700 Subject: [PATCH 12/13] Add memory usage factor for lens model. (#14124) --- comfy/supported_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/comfy/supported_models.py b/comfy/supported_models.py index 776091d42..00941da53 100644 --- a/comfy/supported_models.py +++ b/comfy/supported_models.py @@ -845,6 +845,8 @@ class Lens(supported_models_base.BASE): unet_extra_config = {} latent_format = latent_formats.Flux2 + memory_usage_factor = 4.0 + supported_inference_dtypes = [torch.bfloat16, torch.float32] # fp16 causes NaNs vae_key_prefix = ["vae."] From 2072d3e46d1d1c6ed9533f51726118b4c82b7113 Mon Sep 17 00:00:00 2001 From: Barish Ozbay <17261091+drozbay@users.noreply.github.com> Date: Tue, 26 May 2026 20:59:32 -0600 Subject: [PATCH 13/13] fix: Stop LTXVCropGuides leaving stray latent frames when guides share a start position (#13882) --- comfy_extras/nodes_lt.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py index 51cf7951f..48d75c9e5 100644 --- a/comfy_extras/nodes_lt.py +++ b/comfy_extras/nodes_lt.py @@ -226,10 +226,20 @@ def get_noise_mask(latent): noise_mask = noise_mask.clone() return noise_mask -def get_keyframe_idxs(cond): +def get_keyframe_idxs(cond, latent_shape=None): keyframe_idxs = conditioning_get_any_value(cond, "keyframe_idxs", None) if keyframe_idxs is None: return None, 0 + # Get number of keyframes from latent_shape or 
guide_attention_entries if available + if latent_shape is not None and len(latent_shape) == 5: + tokens_per_frame = latent_shape[-2] * latent_shape[-1] + num_keyframes = keyframe_idxs.shape[2] // tokens_per_frame + return keyframe_idxs, num_keyframes + entries = conditioning_get_any_value(cond, "guide_attention_entries", None) + if entries: + num_keyframes = sum(e["latent_shape"][0] for e in entries) + return keyframe_idxs, num_keyframes + # fallback, may under-count if keyframes share t-start # keyframe_idxs contains start/end positions (last dimension), checking for unqiue values only for start num_keyframes = torch.unique(keyframe_idxs[:, 0, :, 0]).shape[0] return keyframe_idxs, num_keyframes @@ -322,9 +332,9 @@ class LTXVAddGuide(io.ComfyNode): return factor @classmethod - def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors): + def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors, latent_shape=None): time_scale_factor, _, _ = scale_factors - _, num_keyframes = get_keyframe_idxs(cond) + _, num_keyframes = get_keyframe_idxs(cond, latent_shape) latent_count = latent_length - num_keyframes frame_idx = frame_idx if frame_idx >= 0 else max((latent_count - 1) * time_scale_factor + 1 + frame_idx, 0) if guide_length > 1 and frame_idx != 0: @@ -436,7 +446,7 @@ class LTXVAddGuide(io.ComfyNode): num_frames_to_keep = ((image.shape[0] - 1) // time_scale_factor) * time_scale_factor + 1 resolved_frame_idx = frame_idx if frame_idx < 0: - _, num_keyframes = get_keyframe_idxs(positive) + _, num_keyframes = get_keyframe_idxs(positive, latent_image.shape) resolved_frame_idx = max((latent_length - num_keyframes - 1) * time_scale_factor + 1 + frame_idx, 0) causal_fix = resolved_frame_idx == 0 or num_frames_to_keep == 1 @@ -454,7 +464,7 @@ class LTXVAddGuide(io.ComfyNode): if latent_downscale_factor > 1: t, guide_mask = cls.dilate_latent(t, latent_downscale_factor) - frame_idx, latent_idx = 
cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors) + frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors, latent_shape=latent_image.shape) assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence." positive, negative, latent_image, noise_mask = cls.append_keyframe( @@ -506,7 +516,7 @@ class LTXVCropGuides(io.ComfyNode): latent_image = latent["samples"].clone() noise_mask = get_noise_mask(latent) - _, num_keyframes = get_keyframe_idxs(positive) + _, num_keyframes = get_keyframe_idxs(positive, latent_image.shape) if num_keyframes == 0: return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)