mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-22 07:49:33 +08:00
Merge branch 'master' into feat/load-3d-advanced
Some checks failed
Python Linting / Run Ruff (push) Has been cancelled
Python Linting / Run Pylint (push) Has been cancelled
Build package / Build Test (3.10) (push) Has been cancelled
Build package / Build Test (3.11) (push) Has been cancelled
Build package / Build Test (3.12) (push) Has been cancelled
Build package / Build Test (3.13) (push) Has been cancelled
Build package / Build Test (3.14) (push) Has been cancelled
Some checks failed
Python Linting / Run Ruff (push) Has been cancelled
Python Linting / Run Pylint (push) Has been cancelled
Build package / Build Test (3.10) (push) Has been cancelled
Build package / Build Test (3.11) (push) Has been cancelled
Build package / Build Test (3.12) (push) Has been cancelled
Build package / Build Test (3.13) (push) Has been cancelled
Build package / Build Test (3.14) (push) Has been cancelled
This commit is contained in:
commit
f470e7b46c
@ -140,7 +140,7 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly chang
|
||||
- Commits outside of the stable release tags may be very unstable and break many custom nodes.
|
||||
- Serves as the foundation for the desktop release
|
||||
|
||||
2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
|
||||
2. **[Comfy Desktop](https://github.com/Comfy-Org/Comfy-Desktop)**
|
||||
- Builds a new release using the latest stable core version
|
||||
|
||||
3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
|
||||
@ -309,7 +309,7 @@ After this you should have everything installed and can proceed to running Comfy
|
||||
|
||||
#### Apple Mac silicon
|
||||
|
||||
You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS version.
|
||||
You can install ComfyUI on Apple silicon Macs (M1, M2, M3 or M4) with any recent macOS version.
|
||||
|
||||
1. Install pytorch nightly. For instructions, read the [Accelerated PyTorch training on Mac](https://developer.apple.com/metal/pytorch/) Apple Developer guide (make sure to install the latest pytorch nightly).
|
||||
1. Follow the [ComfyUI manual installation](#manual-install-windows-linux) instructions for Windows and Linux.
|
||||
|
||||
4191
blueprints/Character Replacement (SCAIL-2 Base).json
Normal file
4191
blueprints/Character Replacement (SCAIL-2 Base).json
Normal file
File diff suppressed because it is too large
Load Diff
4461
blueprints/Character Replacement (SCAIL-2 Extend).json
Normal file
4461
blueprints/Character Replacement (SCAIL-2 Extend).json
Normal file
File diff suppressed because it is too large
Load Diff
569
blueprints/Image Depth Estimation (Depth Anything 3).json
Normal file
569
blueprints/Image Depth Estimation (Depth Anything 3).json
Normal file
@ -0,0 +1,569 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 89,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 89,
|
||||
"type": "85e595bd-af9e-40ee-85c5-b98bb15da47a",
|
||||
"pos": [
|
||||
320,
|
||||
520
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
360
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "resolution",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resolution"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "resize_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "resize_method"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "output_type",
|
||||
"name": "output",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "output"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "output_normalization",
|
||||
"name": "output.normalization",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "output.normalization"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "apply_sky_clip",
|
||||
"name": "output.apply_sky_clip",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "output.apply_sky_clip"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"87",
|
||||
"resolution"
|
||||
],
|
||||
[
|
||||
"87",
|
||||
"resize_method"
|
||||
],
|
||||
[
|
||||
"86",
|
||||
"output"
|
||||
],
|
||||
[
|
||||
"86",
|
||||
"output.normalization"
|
||||
],
|
||||
[
|
||||
"86",
|
||||
"output.apply_sky_clip"
|
||||
],
|
||||
[
|
||||
"88",
|
||||
"model_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.24.0"
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Image Depth Estimation (Depth Anything 3)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "85e595bd-af9e-40ee-85c5-b98bb15da47a",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 4,
|
||||
"lastNodeId": 89,
|
||||
"lastLinkId": 109,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 2,
|
||||
"config": {},
|
||||
"name": "Image Depth Estimation (Depth Anything 3)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
400,
|
||||
90,
|
||||
166.998046875,
|
||||
188
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
1250,
|
||||
146,
|
||||
128,
|
||||
68
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "43cf3118-495a-487d-8eb3-a17c7e92f64f",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
19
|
||||
],
|
||||
"localized_name": "image",
|
||||
"pos": [
|
||||
542.998046875,
|
||||
114
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1089a0a1-6db1-45a8-84b0-0bfdc2ed920a",
|
||||
"name": "resolution",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
22
|
||||
],
|
||||
"pos": [
|
||||
542.998046875,
|
||||
134
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "25fb64ac-26d5-466d-995b-6d51b9afa2c4",
|
||||
"name": "resize_method",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
23
|
||||
],
|
||||
"pos": [
|
||||
542.998046875,
|
||||
154
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "8acafb7c-6c8b-46b3-9d74-c563498a3af1",
|
||||
"name": "output",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"linkIds": [
|
||||
24
|
||||
],
|
||||
"label": "output_type",
|
||||
"pos": [
|
||||
542.998046875,
|
||||
174
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1da5009b-4648-43e8-a257-16426630cf22",
|
||||
"name": "output.normalization",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
25
|
||||
],
|
||||
"label": "output_normalization",
|
||||
"pos": [
|
||||
542.998046875,
|
||||
194
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "fd7edb33-5fb1-4538-a411-26e5039a9321",
|
||||
"name": "output.apply_sky_clip",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
26
|
||||
],
|
||||
"label": "apply_sky_clip",
|
||||
"pos": [
|
||||
542.998046875,
|
||||
214
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "b5be4c8a-b833-4f1e-8c94-3ed1dd722190",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
106
|
||||
],
|
||||
"pos": [
|
||||
542.998046875,
|
||||
234
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "478ab537-63bc-4d74-a9f0-c975f550880f",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
7
|
||||
],
|
||||
"localized_name": "IMAGE",
|
||||
"pos": [
|
||||
1274,
|
||||
170
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 86,
|
||||
"type": "DA3Render",
|
||||
"pos": [
|
||||
800,
|
||||
310
|
||||
],
|
||||
"size": [
|
||||
380,
|
||||
130
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "da3_geometry",
|
||||
"name": "da3_geometry",
|
||||
"type": "DA3_GEOMETRY",
|
||||
"link": 12
|
||||
},
|
||||
{
|
||||
"localized_name": "output",
|
||||
"name": "output",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "output"
|
||||
},
|
||||
"link": 24
|
||||
},
|
||||
{
|
||||
"localized_name": "output.normalization",
|
||||
"name": "output.normalization",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "output.normalization"
|
||||
},
|
||||
"link": 25
|
||||
},
|
||||
{
|
||||
"localized_name": "output.apply_sky_clip",
|
||||
"name": "output.apply_sky_clip",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "output.apply_sky_clip"
|
||||
},
|
||||
"link": 26
|
||||
},
|
||||
{
|
||||
"name": "geometry",
|
||||
"type": "DA3_GEOMETRY",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"slot_index": 0,
|
||||
"links": [
|
||||
7
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DA3Render",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
"depth",
|
||||
"v2_style",
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 87,
|
||||
"type": "DA3Inference",
|
||||
"pos": [
|
||||
800,
|
||||
50
|
||||
],
|
||||
"size": [
|
||||
390,
|
||||
130
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "da3_model",
|
||||
"name": "da3_model",
|
||||
"type": "DA3_MODEL",
|
||||
"link": 107
|
||||
},
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 19
|
||||
},
|
||||
{
|
||||
"localized_name": "resolution",
|
||||
"name": "resolution",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resolution"
|
||||
},
|
||||
"link": 22
|
||||
},
|
||||
{
|
||||
"localized_name": "resize_method",
|
||||
"name": "resize_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "resize_method"
|
||||
},
|
||||
"link": 23
|
||||
},
|
||||
{
|
||||
"localized_name": "mode",
|
||||
"name": "mode",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "mode"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "da3_geometry",
|
||||
"name": "da3_geometry",
|
||||
"type": "DA3_GEOMETRY",
|
||||
"slot_index": 0,
|
||||
"links": [
|
||||
12
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DA3Inference",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
504,
|
||||
"upper_bound_resize",
|
||||
"mono"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 88,
|
||||
"type": "LoadDA3Model",
|
||||
"pos": [
|
||||
810,
|
||||
-160
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model_name",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": 106
|
||||
},
|
||||
{
|
||||
"localized_name": "weight_dtype",
|
||||
"name": "weight_dtype",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "weight_dtype"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "DA3_MODEL",
|
||||
"name": "DA3_MODEL",
|
||||
"type": "DA3_MODEL",
|
||||
"links": [
|
||||
107
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadDA3Model",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.24.0",
|
||||
"models": [
|
||||
{
|
||||
"name": "depth_anything_3_mono_large.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_mono_large.safetensors",
|
||||
"directory": "geometry_estimation"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"depth_anything_3_mono_large.safetensors",
|
||||
"default"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 12,
|
||||
"origin_id": 87,
|
||||
"origin_slot": 0,
|
||||
"target_id": 86,
|
||||
"target_slot": 0,
|
||||
"type": "DA3_GEOMETRY"
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 87,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"origin_id": 86,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 87,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 87,
|
||||
"target_slot": 3,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 86,
|
||||
"target_slot": 1,
|
||||
"type": "COMFY_DYNAMICCOMBO_V3"
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 86,
|
||||
"target_slot": 2,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 86,
|
||||
"target_slot": 3,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 106,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 88,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 107,
|
||||
"origin_id": 88,
|
||||
"origin_slot": 0,
|
||||
"target_id": 87,
|
||||
"target_slot": 0,
|
||||
"type": "DA3_MODEL"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Conditioning & Preprocessors/Depth",
|
||||
"description": "This subgraph takes an input image and produces a depth map using the Depth Anything 3 model, which recovers spatially consistent geometry from any number of views. It is ideal for single or multi-view images, videos, and 3D scenes where accurate depth estimation is needed for tasks like SLAM, novel view synthesis, or spatial perception. The model uses a plain transformer backbone and supports both monocular and multi-view inputs without."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"BlueprintDescription": "This subgraph takes an input image and produces a depth map using the Depth Anything 3 model, which recovers spatially consistent geometry from any number of views. It is ideal for single or multi-view images, videos, and 3D scenes where accurate depth estimation is needed for tasks like SLAM, novel view synthesis, or spatial perception. The model uses a plain transformer backbone and supports both monocular and multi-view inputs without."
|
||||
}
|
||||
}
|
||||
3549
blueprints/Image Edit (Bernini-R).json
Normal file
3549
blueprints/Image Edit (Bernini-R).json
Normal file
File diff suppressed because it is too large
Load Diff
1983
blueprints/Image to Gaussian Splat (TripoSplat).json
Normal file
1983
blueprints/Image to Gaussian Splat (TripoSplat).json
Normal file
File diff suppressed because it is too large
Load Diff
1088
blueprints/Text to Image (Anima Base 1.0).json
Normal file
1088
blueprints/Text to Image (Anima Base 1.0).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1077,9 +1077,12 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image generation and editing/Text to image"
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "This subgraph converts text prompts into non-photorealistic illustrations using a 2-billion-parameter model optimized for anime and artistic styles. It is ideal for generating concept art, character designs, or stylized illustrations where photorealism is not required. The model excels with anime and artistic content but performs poorly on realistic subjects."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
"extra": {
|
||||
"BlueprintDescription": "This subgraph converts text prompts into non-photorealistic illustrations using a 2-billion-parameter model optimized for anime and artistic styles. It is ideal for generating concept art, character designs, or stylized illustrations where photorealism is not required. The model excels with anime and artistic content but performs poorly on realistic subjects."
|
||||
}
|
||||
}
|
||||
2473
blueprints/Text to Image (Ideogram v4).json
Normal file
2473
blueprints/Text to Image (Ideogram v4).json
Normal file
File diff suppressed because it is too large
Load Diff
825
blueprints/Video Depth Estimation (Depth Anything 3).json
Normal file
825
blueprints/Video Depth Estimation (Depth Anything 3).json
Normal file
@ -0,0 +1,825 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 97,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 97,
|
||||
"type": "253ec5ca-8333-4ddf-a036-9fc0923651b9",
|
||||
"pos": [
|
||||
410,
|
||||
500
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
400
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "start_time",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "start_time"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "duration",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "duration"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "resolution",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resolution"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "resize_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "resize_method"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "output_type",
|
||||
"name": "output",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "output"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "normalization",
|
||||
"name": "output.normalization",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "output.normalization"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "output.apply_sky_clip",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "output.apply_sky_clip"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"96",
|
||||
"start_time"
|
||||
],
|
||||
[
|
||||
"96",
|
||||
"duration"
|
||||
],
|
||||
[
|
||||
"93",
|
||||
"resolution"
|
||||
],
|
||||
[
|
||||
"93",
|
||||
"resize_method"
|
||||
],
|
||||
[
|
||||
"92",
|
||||
"output"
|
||||
],
|
||||
[
|
||||
"92",
|
||||
"output.normalization"
|
||||
],
|
||||
[
|
||||
"92",
|
||||
"output.apply_sky_clip"
|
||||
],
|
||||
[
|
||||
"94",
|
||||
"model_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.24.0"
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Video Depth Estimation (Depth Anything 3)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "253ec5ca-8333-4ddf-a036-9fc0923651b9",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 4,
|
||||
"lastNodeId": 97,
|
||||
"lastLinkId": 129,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 2,
|
||||
"config": {},
|
||||
"name": "Video Depth Estimation (Depth Anything 3)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-230,
|
||||
130,
|
||||
167.912109375,
|
||||
228
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
1520,
|
||||
140,
|
||||
128,
|
||||
108
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "698c28c6-cf92-4039-8b39-f3062868ea7c",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
119
|
||||
],
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
154
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "97a1f63e-1585-4a40-9dec-e2700120d84a",
|
||||
"name": "start_time",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
121
|
||||
],
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
174
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "4dbbd3b3-c5ee-4a56-a0d3-3268d3b2fd64",
|
||||
"name": "duration",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
122
|
||||
],
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
194
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "16f55101-f99d-4c0c-bebf-c3b31c54f13e",
|
||||
"name": "resolution",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
124
|
||||
],
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
214
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "d9cd7693-4bb3-4ed7-9a75-276b997abcd9",
|
||||
"name": "resize_method",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
125
|
||||
],
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
234
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "a6e90532-323b-462e-ba9c-1672384d5b31",
|
||||
"name": "output",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"linkIds": [
|
||||
126
|
||||
],
|
||||
"label": "output_type",
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
254
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "69e6aeef-437d-4fde-b2fc-d5ab9369238d",
|
||||
"name": "output.normalization",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
127
|
||||
],
|
||||
"label": "normalization",
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
274
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "73206f72-f89a-4698-885e-5d9277df2998",
|
||||
"name": "output.apply_sky_clip",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
128
|
||||
],
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
294
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "dddbc7fc-9431-448a-9ed3-9aa62404288b",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
129
|
||||
],
|
||||
"pos": [
|
||||
-86.087890625,
|
||||
314
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "478ab537-63bc-4d74-a9f0-c975f550880f",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
7
|
||||
],
|
||||
"localized_name": "IMAGE",
|
||||
"pos": [
|
||||
1544,
|
||||
164
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "cdaf037e-79bc-4a94-b06c-0fd32e76f615",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"linkIds": [
|
||||
112
|
||||
],
|
||||
"pos": [
|
||||
1544,
|
||||
184
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "4c0e5484-d193-49c7-b107-92619628880a",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
113
|
||||
],
|
||||
"pos": [
|
||||
1544,
|
||||
204
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 92,
|
||||
"type": "DA3Render",
|
||||
"pos": [
|
||||
740,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
380,
|
||||
130
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "da3_geometry",
|
||||
"name": "da3_geometry",
|
||||
"type": "DA3_GEOMETRY",
|
||||
"link": 12
|
||||
},
|
||||
{
|
||||
"localized_name": "output",
|
||||
"name": "output",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "output"
|
||||
},
|
||||
"link": 126
|
||||
},
|
||||
{
|
||||
"localized_name": "output.normalization",
|
||||
"name": "output.normalization",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "output.normalization"
|
||||
},
|
||||
"link": 127
|
||||
},
|
||||
{
|
||||
"localized_name": "output.apply_sky_clip",
|
||||
"name": "output.apply_sky_clip",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "output.apply_sky_clip"
|
||||
},
|
||||
"link": 128
|
||||
},
|
||||
{
|
||||
"name": "geometry",
|
||||
"type": "DA3_GEOMETRY",
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"slot_index": 0,
|
||||
"links": [
|
||||
7
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DA3Render",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
"depth",
|
||||
"v2_style",
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 93,
|
||||
"type": "DA3Inference",
|
||||
"pos": [
|
||||
740,
|
||||
-30
|
||||
],
|
||||
"size": [
|
||||
390,
|
||||
130
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "da3_model",
|
||||
"name": "da3_model",
|
||||
"type": "DA3_MODEL",
|
||||
"link": 107
|
||||
},
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 111
|
||||
},
|
||||
{
|
||||
"localized_name": "resolution",
|
||||
"name": "resolution",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resolution"
|
||||
},
|
||||
"link": 124
|
||||
},
|
||||
{
|
||||
"localized_name": "resize_method",
|
||||
"name": "resize_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "resize_method"
|
||||
},
|
||||
"link": 125
|
||||
},
|
||||
{
|
||||
"localized_name": "mode",
|
||||
"name": "mode",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "mode"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "da3_geometry",
|
||||
"name": "da3_geometry",
|
||||
"type": "DA3_GEOMETRY",
|
||||
"slot_index": 0,
|
||||
"links": [
|
||||
12
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "DA3Inference",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
504,
|
||||
"lower_bound_resize",
|
||||
"mono"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 94,
|
||||
"type": "LoadDA3Model",
|
||||
"pos": [
|
||||
50,
|
||||
410
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model_name",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": 129
|
||||
},
|
||||
{
|
||||
"localized_name": "weight_dtype",
|
||||
"name": "weight_dtype",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "weight_dtype"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "DA3_MODEL",
|
||||
"name": "DA3_MODEL",
|
||||
"type": "DA3_MODEL",
|
||||
"links": [
|
||||
107
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadDA3Model",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.24.0",
|
||||
"models": [
|
||||
{
|
||||
"name": "depth_anything_3_mono_large.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_mono_large.safetensors",
|
||||
"directory": "geometry_estimation"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"depth_anything_3_mono_large.safetensors",
|
||||
"default"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 95,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
70,
|
||||
-140
|
||||
],
|
||||
"size": [
|
||||
260,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 120
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
111
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
112
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
113
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "bit_depth",
|
||||
"name": "bit_depth",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.24.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 96,
|
||||
"type": "Video Slice",
|
||||
"pos": [
|
||||
70,
|
||||
-360
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
170
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 119
|
||||
},
|
||||
{
|
||||
"localized_name": "start_time",
|
||||
"name": "start_time",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "start_time"
|
||||
},
|
||||
"link": 121
|
||||
},
|
||||
{
|
||||
"localized_name": "duration",
|
||||
"name": "duration",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "duration"
|
||||
},
|
||||
"link": 122
|
||||
},
|
||||
{
|
||||
"localized_name": "strict_duration",
|
||||
"name": "strict_duration",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "strict_duration"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": [
|
||||
120
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "Video Slice",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.24.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
0,
|
||||
5,
|
||||
false
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 12,
|
||||
"origin_id": 93,
|
||||
"origin_slot": 0,
|
||||
"target_id": 92,
|
||||
"target_slot": 0,
|
||||
"type": "DA3_GEOMETRY"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"origin_id": 92,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 107,
|
||||
"origin_id": 94,
|
||||
"origin_slot": 0,
|
||||
"target_id": 93,
|
||||
"target_slot": 0,
|
||||
"type": "DA3_MODEL"
|
||||
},
|
||||
{
|
||||
"id": 111,
|
||||
"origin_id": 95,
|
||||
"origin_slot": 0,
|
||||
"target_id": 93,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 112,
|
||||
"origin_id": 95,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "AUDIO"
|
||||
},
|
||||
{
|
||||
"id": 113,
|
||||
"origin_id": 95,
|
||||
"origin_slot": 2,
|
||||
"target_id": -20,
|
||||
"target_slot": 2,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 119,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 96,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 120,
|
||||
"origin_id": 96,
|
||||
"origin_slot": 0,
|
||||
"target_id": 95,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 121,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 96,
|
||||
"target_slot": 1,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 122,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 96,
|
||||
"target_slot": 2,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 124,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 93,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 125,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 93,
|
||||
"target_slot": 3,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 126,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 92,
|
||||
"target_slot": 1,
|
||||
"type": "COMFY_DYNAMICCOMBO_V3"
|
||||
},
|
||||
{
|
||||
"id": 127,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 92,
|
||||
"target_slot": 2,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 128,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 92,
|
||||
"target_slot": 3,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 129,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 94,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Conditioning & Preprocessors/Depth",
|
||||
"description": "This subgraph processes a video input through Depth Anything 3 to produce temporally consistent depth maps for each frame, outputting a depth video. It is ideal for video content requiring spatial geometry estimation, such as 3D reconstruction, SLAM, or novel view synthesis from moving cameras. The model uses a plain transformer backbone trained with a depth-ray representation, supporting any number of views without requiring known camera poses."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"BlueprintDescription": "This subgraph processes a video input through Depth Anything 3 to produce temporally consistent depth maps for each frame, outputting a depth video. It is ideal for video content requiring spatial geometry estimation, such as 3D reconstruction, SLAM, or novel view synthesis from moving cameras. The model uses a plain transformer backbone trained with a depth-ray representation, supporting any number of views without requiring known camera poses."
|
||||
}
|
||||
}
|
||||
3732
blueprints/Video Edit (Bernini-R).json
Normal file
3732
blueprints/Video Edit (Bernini-R).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -145,6 +145,7 @@ vram_group.add_argument("--novram", action="store_true", help="When lowvram isn'
|
||||
vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
|
||||
|
||||
parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
|
||||
parser.add_argument("--vram-headroom", type=float, default=0, help="Set the amount of vram in GB for DynamicVRAM to maintain as extra headroom above default. ComfyUI will try and keep this much VRAM completely free and unused, even counting VRAM from other apps.")
|
||||
|
||||
parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.")
|
||||
parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
|
||||
|
||||
321
comfy/ldm/boogu/model.py
Normal file
321
comfy/ldm/boogu/model.py
Normal file
@ -0,0 +1,321 @@
|
||||
# Boogu-Image-0.1 transformer
|
||||
# Architecture is an OmniGen2 derivative (see comfy/ldm/omnigen/omnigen2.py) with an
|
||||
# added dual-stream ("double_stream") stage before the single-stream layers, conditioned
|
||||
# by a Qwen3-VL multimodal LLM. Reuses the OmniGen2/Lumina building blocks and the Flux
|
||||
# RoPE core; the only new components are the double-stream block and the hybrid forward order.
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from einops import rearrange
|
||||
|
||||
import comfy.ldm.common_dit
|
||||
import comfy.ldm.omnigen.omnigen2
|
||||
from comfy.ldm.modules.attention import optimized_attention_masked
|
||||
from comfy.ldm.omnigen.omnigen2 import (
|
||||
OmniGen2RotaryPosEmbed,
|
||||
Lumina2CombinedTimestepCaptionEmbedding,
|
||||
LuminaRMSNormZero,
|
||||
LuminaLayerNormContinuous,
|
||||
LuminaFeedForward,
|
||||
Attention,
|
||||
OmniGen2TransformerBlock,
|
||||
apply_rotary_emb,
|
||||
)
|
||||
|
||||
class BooguDoubleStreamProcessor(nn.Module):
    # Joint attention over the concatenated [instruct ; img] token sequence.
    # Each stream owns its q/k/v projections and its own output projection; the q/k norms
    # and the final shared output projection are read from the `attn` module passed to forward().

    def __init__(self, dim, head_dim, heads, kv_heads, dtype=None, device=None, operations=None):
        # dim:       accepted for signature compatibility; not used when sizing the projections.
        # head_dim:  channels per attention head.
        # heads:     number of query heads (query width = head_dim * heads).
        # kv_heads:  number of key/value heads (k/v width = head_dim * kv_heads).
        # operations: namespace providing the `Linear` layer class.
        super().__init__()
        query_dim = head_dim * heads
        kv_dim = head_dim * kv_heads

        def project(out_features):
            # Every projection is bias-free and takes `query_dim` input features.
            return operations.Linear(query_dim, out_features, bias=False, dtype=dtype, device=device)

        # Image-stream q/k/v.
        self.img_to_q = project(query_dim)
        self.img_to_k = project(kv_dim)
        self.img_to_v = project(kv_dim)

        # Instruction-stream q/k/v.
        self.instruct_to_q = project(query_dim)
        self.instruct_to_k = project(kv_dim)
        self.instruct_to_v = project(kv_dim)

        # Per-stream output projections applied after attention.
        self.instruct_out = project(query_dim)
        self.img_out = project(query_dim)

    def forward(self, attn, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
        # attn supplies heads / kv_heads / dim_head, norm_q / norm_k and to_out.
        # Returns the joint sequence with instruction tokens first, image tokens after.
        bsz = img_hidden_states.shape[0]
        n_instruct = instruct_hidden_states.shape[1]

        # Instruction tokens come first, then image tokens (matches reference processor order).
        q = torch.cat([self.instruct_to_q(instruct_hidden_states), self.img_to_q(img_hidden_states)], dim=1)
        k = torch.cat([self.instruct_to_k(instruct_hidden_states), self.img_to_k(img_hidden_states)], dim=1)
        v = torch.cat([self.instruct_to_v(instruct_hidden_states), self.img_to_v(img_hidden_states)], dim=1)

        # Split channels into heads: (batch, tokens, n_heads, dim_head); q/k are normalised per head.
        q = attn.norm_q(q.view(bsz, -1, attn.heads, attn.dim_head))
        k = attn.norm_k(k.view(bsz, -1, attn.kv_heads, attn.dim_head))
        v = v.view(bsz, -1, attn.kv_heads, attn.dim_head)

        if rotary_emb is not None:
            q = apply_rotary_emb(q, rotary_emb)
            k = apply_rotary_emb(k, rotary_emb)

        # Move the head axis in front of the token axis.
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))

        # With fewer k/v heads than query heads, repeat each k/v head so the head counts match.
        if attn.kv_heads < attn.heads:
            group = attn.heads // attn.kv_heads
            k = k.repeat_interleave(group, dim=1)
            v = v.repeat_interleave(group, dim=1)

        attended = optimized_attention_masked(q, k, v, attn.heads, attention_mask, skip_reshape=True, transformer_options=transformer_options)

        # Split back to instruction/image along the token axis, project each stream, recombine.
        # NOTE(review): assumes the attention output is (batch, tokens, channels) - confirm against optimized_attention_masked.
        merged = torch.cat([self.instruct_out(attended[:, :n_instruct]), self.img_out(attended[:, n_instruct:])], dim=1)

        # Only the Linear of attn.to_out is applied (index 0).
        return attn.to_out[0](merged)
|
||||
|
||||
|
||||
class BooguJointAttention(nn.Module):
    # Container for the pieces shared by both streams of the joint attention:
    # the per-head q/k RMSNorm and the final output projection. The attention
    # computation itself is delegated to `self.processor`.

    def __init__(self, dim, head_dim, heads, kv_heads, eps=1e-5, dtype=None, device=None, operations=None):
        # dim:      width produced by the final output projection.
        # head_dim: channels per attention head (also the RMSNorm width).
        # heads / kv_heads: number of query heads / key-value heads.
        # eps:      epsilon handed to both RMSNorm layers.
        super().__init__()
        factory = {"dtype": dtype, "device": device}

        self.heads, self.kv_heads, self.dim_head = heads, kv_heads, head_dim
        self.scale = head_dim ** -0.5

        self.norm_q = operations.RMSNorm(head_dim, eps=eps, **factory)
        self.norm_k = operations.RMSNorm(head_dim, eps=eps, **factory)

        # Kept as a Sequential (Linear + zero-probability Dropout) so the Linear lives at index 0.
        out_proj = operations.Linear(heads * head_dim, dim, bias=False, **factory)
        self.to_out = nn.Sequential(out_proj, nn.Dropout(0.0))

        self.processor = BooguDoubleStreamProcessor(dim, head_dim, heads, kv_heads, operations=operations, **factory)

    def forward(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
        # Hands this module to the processor so it can read the head layout, norms and to_out.
        return self.processor(
            self,
            img_hidden_states,
            instruct_hidden_states,
            rotary_emb,
            attention_mask,
            transformer_options=transformer_options,
        )
|
||||
|
||||
|
||||
class BooguDoubleStreamBlock(nn.Module):
    # Dual-stream block. The image and instruction streams attend jointly over [instruct ; img];
    # the image stream additionally runs a self-attention pass. Each stream has its own
    # temb-conditioned modulation norms, RMSNorms and feed-forward network.

    def __init__(self, dim, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=None, device=None, operations=None):
        # dim:                 hidden width of both streams.
        # num_attention_heads: query heads; head_dim is dim // num_attention_heads.
        # num_kv_heads:        key/value heads.
        # multiple_of:         forwarded to LuminaFeedForward.
        # ffn_dim_multiplier:  accepted for signature parity; not read in this block.
        # norm_eps:            epsilon for the modulation norms and the plain RMSNorms.
        super().__init__()
        head_dim = dim // num_attention_heads
        tensor_kw = {"dtype": dtype, "device": device}
        module_kw = {"dtype": dtype, "device": device, "operations": operations}

        # Attention modules (both use a fixed eps of 1e-5 for their internal norms).
        self.img_instruct_attn = BooguJointAttention(dim, head_dim, num_attention_heads, num_kv_heads, eps=1e-5, **module_kw)
        self.img_self_attn = Attention(
            query_dim=dim,
            dim_head=head_dim,
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            **module_kw,
        )

        # One feed-forward network per stream, inner width fixed at 4 * dim.
        for name in ("img_feed_forward", "instruct_feed_forward"):
            setattr(self, name, LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, **module_kw))

        # temb-conditioned norms; each returns a normalised tensor plus modulation terms.
        for name in ("img_norm1", "img_norm2", "img_norm3", "instruct_norm1", "instruct_norm2"):
            setattr(self, name, LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, **module_kw))

        # Plain RMSNorms applied to sub-layer inputs/outputs.
        for name in (
            "img_attn_norm", "img_self_attn_norm", "img_ffn_norm1", "img_ffn_norm2",
            "instruct_attn_norm", "instruct_ffn_norm1", "instruct_ffn_norm2",
        ):
            setattr(self, name, operations.RMSNorm(dim, eps=norm_eps, **tensor_kw))

    def forward(self, img_hidden_states, instruct_hidden_states, joint_rotary_emb, img_rotary_emb, temb, joint_attention_mask=None, img_attention_mask=None, transformer_options={}):
        # Returns the updated (img_hidden_states, instruct_hidden_states) pair.

        def gated(gate, branch):
            # Residual branch scaled by a tanh-squashed gate broadcast over the token axis.
            return gate.unsqueeze(1).tanh() * branch

        def modulated(normed, scale, shift):
            # Scale/shift modulation broadcast over the token axis.
            return (1 + scale.unsqueeze(1)) * normed + shift.unsqueeze(1)

        n_instruct = instruct_hidden_states.shape[1]

        # All modulation terms are derived from the block's *input* states.
        img_attn_in, img_gate_msa, img_scale_mlp, img_gate_mlp = self.img_norm1(img_hidden_states, temb)
        img_mlp_base, img_shift_mlp, _, _ = self.img_norm2(img_hidden_states, temb)  # 2nd output used as the MLP shift
        img_self_in, img_gate_self, _, _ = self.img_norm3(img_hidden_states, temb)  # 2nd output used as the self-attn gate

        ins_attn_in, ins_gate_msa, ins_scale_mlp, ins_gate_mlp = self.instruct_norm1(instruct_hidden_states, temb)
        ins_mlp_base, ins_shift_mlp, _, _ = self.instruct_norm2(instruct_hidden_states, temb)

        # Joint attention output is laid out as [instruct ; img] along the token axis.
        joint_out = self.img_instruct_attn(img_attn_in, ins_attn_in, joint_rotary_emb, joint_attention_mask, transformer_options=transformer_options)
        ins_attn_out = joint_out[:, :n_instruct]
        img_attn_out = joint_out[:, n_instruct:]

        # Image-only self-attention.
        img_self_out = self.img_self_attn(img_self_in, img_self_in, img_attention_mask, img_rotary_emb, transformer_options=transformer_options)

        # Image stream: joint-attention residual, self-attention residual, then MLP residual.
        img_hidden_states = img_hidden_states + gated(img_gate_msa, self.img_attn_norm(img_attn_out))
        img_hidden_states = img_hidden_states + gated(img_gate_self, self.img_self_attn_norm(img_self_out))
        img_mlp_out = self.img_feed_forward(self.img_ffn_norm1(modulated(img_mlp_base, img_scale_mlp, img_shift_mlp)))
        img_hidden_states = img_hidden_states + gated(img_gate_mlp, self.img_ffn_norm2(img_mlp_out))

        # Instruction stream: joint-attention residual, then MLP residual.
        instruct_hidden_states = instruct_hidden_states + gated(ins_gate_msa, self.instruct_attn_norm(ins_attn_out))
        ins_mlp_out = self.instruct_feed_forward(self.instruct_ffn_norm1(modulated(ins_mlp_base, ins_scale_mlp, ins_shift_mlp)))
        instruct_hidden_states = instruct_hidden_states + gated(ins_gate_mlp, self.instruct_ffn_norm2(ins_mlp_out))

        return img_hidden_states, instruct_hidden_states
|
||||
|
||||
|
||||
class BooguTransformer2DModel(nn.Module):
    # Boogu-Image transformer: refiner stacks for text / noise / reference images, a
    # double-stream stage over separate (image, instruction) streams, then a
    # single-stream stage over the concatenated [instruction ; image] sequence.

    def __init__(
        self,
        patch_size: int = 2,
        in_channels: int = 16,
        out_channels: Optional[int] = None,
        hidden_size: int = 3360,
        num_layers: int = 32,
        num_double_stream_layers: int = 8,
        num_refiner_layers: int = 2,
        num_attention_heads: int = 28,
        num_kv_heads: int = 7,
        multiple_of: int = 256,
        ffn_dim_multiplier: Optional[float] = None,
        norm_eps: float = 1e-5,
        axes_dim_rope: Tuple[int, int, int] = (40, 40, 40),
        axes_lens: Tuple[int, int, int] = (2048, 1664, 1664),
        instruction_feat_dim: int = 4096,
        timestep_scale: float = 1000.0,
        image_model=None,
        device=None, dtype=None, operations=None,
    ):
        # num_layers counts the single-stream blocks; num_double_stream_layers the dual-stream
        # blocks that run before them; num_refiner_layers sizes each of the three refiner stacks.
        # `image_model` is accepted but not read here.
        super().__init__()

        self.patch_size = patch_size
        self.out_channels = out_channels or in_channels  # falls back to in_channels when unset
        self.hidden_size = hidden_size
        self.dtype = dtype

        tensor_kw = {"dtype": dtype, "device": device}
        module_kw = {"dtype": dtype, "device": device, "operations": operations}
        patch_area = patch_size * patch_size

        def omnigen_stack(depth, modulation):
            # A ModuleList of `depth` OmniGen2 transformer blocks sharing this model's geometry.
            return nn.ModuleList([
                OmniGen2TransformerBlock(
                    hidden_size, num_attention_heads, num_kv_heads, multiple_of,
                    ffn_dim_multiplier, norm_eps, modulation=modulation, **module_kw,
                )
                for _ in range(depth)
            ])

        self.rope_embedder = OmniGen2RotaryPosEmbed(
            theta=10000,
            axes_dim=axes_dim_rope,
            axes_lens=axes_lens,
            patch_size=patch_size,
        )

        # Patch embedders for the noisy latent and for reference-image latents.
        self.x_embedder = operations.Linear(patch_area * in_channels, hidden_size, **tensor_kw)
        self.ref_image_patch_embedder = operations.Linear(patch_area * in_channels, hidden_size, **tensor_kw)

        self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
            hidden_size=hidden_size,
            text_feat_dim=instruction_feat_dim,
            norm_eps=norm_eps,
            timestep_scale=timestep_scale,
            **module_kw,
        )

        # Refiners: noise and reference-image stacks are modulated, the context stack is not.
        self.noise_refiner = omnigen_stack(num_refiner_layers, modulation=True)
        self.ref_image_refiner = omnigen_stack(num_refiner_layers, modulation=True)
        self.context_refiner = omnigen_stack(num_refiner_layers, modulation=False)

        self.double_stream_layers = nn.ModuleList([
            BooguDoubleStreamBlock(
                hidden_size, num_attention_heads, num_kv_heads, multiple_of,
                ffn_dim_multiplier, norm_eps, **module_kw,
            )
            for _ in range(num_double_stream_layers)
        ])

        self.single_stream_layers = omnigen_stack(num_layers, modulation=True)

        self.norm_out = LuminaLayerNormContinuous(
            embedding_dim=hidden_size,
            conditioning_embedding_dim=min(hidden_size, 1024),
            elementwise_affine=False,
            eps=1e-6,
            out_dim=patch_area * self.out_channels,
            **module_kw,
        )

        # Allocated uninitialised (torch.empty) with 5 rows of width hidden_size.
        self.image_index_embedding = nn.Parameter(torch.empty(5, hidden_size, device=device, dtype=dtype))

    # Patchify/refine helpers are identical to OmniGen2: the OmniGen2 functions are assigned
    # as class attributes here, so they are called as ordinary methods of this class.
    flat_and_pad_to_seq = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.flat_and_pad_to_seq
    img_patch_embed_and_refine = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.img_patch_embed_and_refine

    def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, transformer_options={}, **kwargs):
        # x is unpacked as a 4-D tensor whose last two axes are height and width.
        # Returns the negated prediction cropped back to x's height and width.
        _, _, height, width = x.shape
        patch = self.patch_size

        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (patch, patch))
        padded_h, padded_w = hidden_states.shape[-2:]
        device = hidden_states.device

        # The model is fed 1 - timesteps.
        temb, text_hidden_states = self.time_caption_embed(1.0 - timesteps, context, hidden_states[0].dtype)

        (
            hidden_states, ref_image_hidden_states,
            img_mask, ref_img_mask,
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes,
        ) = self.flat_and_pad_to_seq(hidden_states, ref_latents)

        # encoder_seq_lengths / seq_lengths are returned by the embedder but not used below.
        (
            context_rotary_emb, ref_img_rotary_emb, noise_rotary_emb,
            rotary_emb, encoder_seq_lengths, seq_lengths,
        ) = self.rope_embedder(
            hidden_states.shape[0], text_hidden_states.shape[1], [num_tokens] * text_hidden_states.shape[0],
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes, device,
        )

        for refiner in self.context_refiner:
            text_hidden_states = refiner(text_hidden_states, attention_mask, context_rotary_emb, transformer_options=transformer_options)

        # Token count of the noise image; used at the end to slice it back out of the joint sequence.
        img_len = hidden_states.shape[1]
        combined_img_hidden_states = self.img_patch_embed_and_refine(
            hidden_states, ref_image_hidden_states,
            img_mask, ref_img_mask,
            noise_rotary_emb, ref_img_rotary_emb,
            l_effective_ref_img_len, l_effective_img_len,
            temb,
            transformer_options=transformer_options,
        )

        # Double-stream stage: the image self-attention only sees the [ref ; noise] tokens,
        # which sit after the instruction tokens in the joint rope.
        n_instruct = text_hidden_states.shape[1]
        combined_img_rotary_emb = rotary_emb[:, n_instruct:]
        for block in self.double_stream_layers:
            combined_img_hidden_states, text_hidden_states = block(
                combined_img_hidden_states, text_hidden_states,
                rotary_emb, combined_img_rotary_emb, temb,
                joint_attention_mask=None, img_attention_mask=None,
                transformer_options=transformer_options,
            )

        # Single-stream stage over [instruction ; image]; no attention mask is passed.
        hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1)
        for block in self.single_stream_layers:
            hidden_states = block(hidden_states, None, rotary_emb, temb, transformer_options=transformer_options)

        hidden_states = self.norm_out(hidden_states, temb)

        # Keep the trailing img_len tokens, unpatchify, and crop away the patch padding.
        noise_tokens = hidden_states[:, -img_len:]
        unpatched = rearrange(
            noise_tokens,
            'b (h w) (p1 p2 c) -> b c (h p1) (w p2)',
            h=padded_h // patch, w=padded_w // patch, p1=patch, p2=patch,
        )
        output = unpatched[:, :, :height, :width]

        return -output
|
||||
@ -22,7 +22,7 @@ def apply_rotary_emb(x, freqs_cis):
|
||||
|
||||
|
||||
def swiglu(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
|
||||
return F.silu(x) * y
|
||||
return F.silu(x, inplace=True).mul_(y)
|
||||
|
||||
|
||||
class TimestepEmbedding(nn.Module):
|
||||
|
||||
@ -1665,7 +1665,7 @@ class SCAILWanModel(WanModel):
|
||||
|
||||
# embeddings
|
||||
x = self.patch_embedding(x.float()).to(x.dtype)
|
||||
if ref_mask_latents is not None: # SCAIL-2 additive mask stream
|
||||
if ref_mask_latents is not None: # SCAIL-2 additive mask stream (one identity mask frame per reference, then video)
|
||||
x = x + self.patch_embedding_mask(ref_mask_latents.float()).to(x.dtype)
|
||||
grid_sizes = x.shape[2:]
|
||||
transformer_options["grid_sizes"] = grid_sizes
|
||||
@ -1728,22 +1728,25 @@ class SCAILWanModel(WanModel):
|
||||
|
||||
# ref_mask_flag is a scalar bool (CONDConstant, SCAIL-2 only). False => replacement mode,
|
||||
# which places ref/pose via H/W rope shifts instead of the animation-mode temporal offset.
|
||||
# reference_latent may stack several frames: the last is the primary reference adjacent to the video, the earlier frames are additional references.
|
||||
def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, pose_latents=None, reference_latent=None, ref_mask_flag=None, transformer_options={}):
|
||||
ref_t_patches = 0
|
||||
if reference_latent is not None:
|
||||
ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
|
||||
|
||||
if ref_mask_flag is not None and not bool(ref_mask_flag):
|
||||
REF_ROPE_H = 120.0
|
||||
POSE_ROPE_W = 120.0
|
||||
|
||||
ref_t_patches = 0
|
||||
if reference_latent is not None:
|
||||
ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
|
||||
main_t_patches = t - ref_t_patches
|
||||
video_t_start = max(ref_t_patches - 1, 0)
|
||||
|
||||
parts = []
|
||||
if ref_t_patches > 0:
|
||||
ref_tf = {"rope_options": {"shift_y": REF_ROPE_H, "shift_x": 0.0, "scale_y": 1.0, "scale_x": 1.0}}
|
||||
parts.append(super().rope_encode(ref_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=ref_tf))
|
||||
if main_t_patches > 0:
|
||||
parts.append(super().rope_encode(main_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=transformer_options))
|
||||
parts.append(super().rope_encode(main_t_patches, h, w, t_start=video_t_start, device=device, dtype=dtype, transformer_options=transformer_options))
|
||||
|
||||
if pose_latents is not None:
|
||||
F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]
|
||||
@ -1752,7 +1755,7 @@ class SCAILWanModel(WanModel):
|
||||
h_shift = (h_scale - 1) / 2
|
||||
w_shift = (w_scale - 1) / 2
|
||||
pose_tf = {"rope_options": {"shift_y": h_shift, "shift_x": POSE_ROPE_W + w_shift, "scale_y": h_scale, "scale_x": w_scale}}
|
||||
parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=0, device=device, dtype=dtype, transformer_options=pose_tf))
|
||||
parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=video_t_start, device=device, dtype=dtype, transformer_options=pose_tf))
|
||||
|
||||
return torch.cat(parts, dim=1)
|
||||
|
||||
@ -1761,10 +1764,6 @@ class SCAILWanModel(WanModel):
|
||||
if pose_latents is None:
|
||||
return main_freqs
|
||||
|
||||
ref_t_patches = 0
|
||||
if reference_latent is not None:
|
||||
ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
|
||||
|
||||
F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]
|
||||
|
||||
# if pose is at half resolution, scale_y/scale_x=2 stretches the position range to cover the same RoPE extent as the main frames
|
||||
|
||||
@ -54,6 +54,7 @@ import comfy.ldm.pixeldit.model
|
||||
import comfy.ldm.pixeldit.pid
|
||||
import comfy.ldm.ace.model
|
||||
import comfy.ldm.omnigen.omnigen2
|
||||
import comfy.ldm.boogu.model
|
||||
import comfy.ldm.qwen_image.model
|
||||
import comfy.ldm.ideogram4.model
|
||||
import comfy.ldm.kandinsky5.model
|
||||
@ -1747,10 +1748,14 @@ class WAN21_SCAIL(WAN21):
|
||||
|
||||
reference_latents = kwargs.get("reference_latents", None)
|
||||
if reference_latents is not None:
|
||||
ref_latent = self.process_latent_in(reference_latents[-1])
|
||||
ref_mask = torch.ones_like(ref_latent[:, :4])
|
||||
ref_latent = torch.cat([ref_latent, ref_mask], dim=1)
|
||||
out['reference_latent'] = comfy.conds.CONDRegular(ref_latent)
|
||||
# SCAIL-2 multi-reference: reference_latents[0] is the primary ref, [1:] are additional
|
||||
# references. Stack as [additional..., primary] so the primary stays adjacent to the video.
|
||||
ordered = list(reference_latents[1:]) + list(reference_latents[:1])
|
||||
stacked = []
|
||||
for lat in ordered:
|
||||
lat = self.process_latent_in(lat)
|
||||
stacked.append(torch.cat([lat, torch.ones_like(lat[:, :4])], dim=1))
|
||||
out['reference_latent'] = comfy.conds.CONDRegular(torch.cat(stacked, dim=2))
|
||||
|
||||
pose_latents = kwargs.get("pose_video_latent", None)
|
||||
if pose_latents is not None:
|
||||
@ -1792,6 +1797,7 @@ class WAN21_SCAIL2(WAN21_SCAIL):
|
||||
if driving_mask_28ch is not None:
|
||||
out['sam_latents'] = comfy.conds.CONDRegular(driving_mask_28ch.movedim(1, 2).contiguous())
|
||||
|
||||
# ref_mask_28ch holds one identity mask per stacked reference frame (additional refs first, then the primary ref), followed by zeros over the video frames.
|
||||
ref_mask_28ch = kwargs.get("ref_mask_28ch", None)
|
||||
if ref_mask_28ch is not None:
|
||||
out['ref_mask_latents'] = comfy.conds.CONDRegular(ref_mask_28ch.movedim(1, 2).contiguous())
|
||||
@ -1819,10 +1825,11 @@ class WAN21_SCAIL2(WAN21_SCAIL):
|
||||
# Return sliced view omitting retain_index_list
|
||||
return comfy.context_windows.slice_cond(cond_value, window, x_in, device, temporal_dim=2, temporal_offset=0)
|
||||
if cond_key == "ref_mask_latents" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
|
||||
# The ref mask is just a single frame padded with frames of zeros, so just grab the first frames for all windows
|
||||
# The ref mask is N leading ref frames padded with frames of zeros, so just grab the first frames for all windows
|
||||
full_ref_mask = cond_value.cond
|
||||
video_frame_count = x_in.shape[2]
|
||||
if full_ref_mask.shape[2] != video_frame_count + 1:
|
||||
ref_frame_count = full_ref_mask.shape[2] - video_frame_count
|
||||
if ref_frame_count < 1:
|
||||
return None
|
||||
window_length = len(window.index_list)
|
||||
|
||||
@ -1831,7 +1838,7 @@ class WAN21_SCAIL2(WAN21_SCAIL):
|
||||
if anchor_index is not None and anchor_index >= 0:
|
||||
window_length += 1
|
||||
|
||||
window_ref_mask = full_ref_mask[:, :, :window_length + 1].to(device)
|
||||
window_ref_mask = full_ref_mask[:, :, :window_length + ref_frame_count].to(device)
|
||||
return cond_value._copy_with(window_ref_mask)
|
||||
|
||||
return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list)
|
||||
@ -2097,6 +2104,11 @@ class Omnigen2(BaseModel):
|
||||
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
|
||||
return out
|
||||
|
||||
class Boogu(Omnigen2):
    # Omnigen2 subclass whose diffusion model is BooguTransformer2DModel.

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        # super(Omnigen2, self) starts the lookup *after* Omnigen2 in the MRO, so
        # Omnigen2.__init__ itself is not run; its parent is initialised directly
        # with the Boogu transformer class.
        transformer_cls = comfy.ldm.boogu.model.BooguTransformer2DModel
        super(Omnigen2, self).__init__(
            model_config,
            model_type,
            device=device,
            unet_model=transformer_cls,
        )
        self.memory_usage_factor_conds = ("ref_latents",)
|
||||
|
||||
class QwenImage(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel)
|
||||
|
||||
@ -761,6 +761,16 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight'.format(key_prefix) in state_dict_keys: # Boogu-Image (OmniGen2 derivative + dual-stream stage)
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "boogu"
|
||||
dit_config["hidden_size"] = state_dict['{}x_embedder.weight'.format(key_prefix)].shape[0]
|
||||
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}single_stream_layers.'.format(key_prefix) + '{}.')
|
||||
dit_config["num_double_stream_layers"] = count_blocks(state_dict_keys, '{}double_stream_layers.'.format(key_prefix) + '{}.')
|
||||
dit_config["num_refiner_layers"] = count_blocks(state_dict_keys, '{}noise_refiner.'.format(key_prefix) + '{}.')
|
||||
dit_config["instruction_feat_dim"] = state_dict['{}time_caption_embed.caption_embedder.0.weight'.format(key_prefix)].shape[0]
|
||||
return dit_config
|
||||
|
||||
if '{}time_caption_embed.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: # Omnigen2
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "omnigen2"
|
||||
|
||||
25
comfy/sd.py
25
comfy/sd.py
@ -67,6 +67,8 @@ import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.qwen35
|
||||
import comfy.text_encoders.qwen3vl
|
||||
import comfy.text_encoders.boogu
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.gemma4
|
||||
import comfy.text_encoders.cogvideo
|
||||
@ -1300,6 +1302,7 @@ class CLIPType(Enum):
|
||||
LENS = 28
|
||||
PIXELDIT = 29
|
||||
IDEOGRAM4 = 30
|
||||
BOOGU = 31
|
||||
|
||||
|
||||
|
||||
@ -1353,6 +1356,8 @@ class TEModel(Enum):
|
||||
GEMMA_4_31B = 31
|
||||
T5_GEMMA = 32
|
||||
GPT_OSS_20B = 33
|
||||
QWEN3VL_4B = 34
|
||||
QWEN3VL_8B = 35
|
||||
|
||||
|
||||
def detect_te_model(sd):
|
||||
@ -1414,6 +1419,8 @@ def detect_te_model(sd):
|
||||
if weight.shape[0] == 5120:
|
||||
return TEModel.QWEN35_27B
|
||||
return TEModel.QWEN35_2B
|
||||
if "model.visual.deepstack_merger_list.0.norm.weight" in sd: # DeepStack is unique to Qwen3-VL
|
||||
return TEModel.QWEN3VL_4B if sd["model.visual.merger.linear_fc2.weight"].shape[0] == 2560 else TEModel.QWEN3VL_8B
|
||||
if "model.layers.0.post_attention_layernorm.weight" in sd:
|
||||
weight = sd['model.layers.0.post_attention_layernorm.weight']
|
||||
if 'model.layers.0.self_attn.q_norm.weight' in sd:
|
||||
@ -1612,6 +1619,24 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
|
||||
qwen35_type = {TEModel.QWEN35_08B: "qwen35_08b", TEModel.QWEN35_2B: "qwen35_2b", TEModel.QWEN35_4B: "qwen35_4b", TEModel.QWEN35_9B: "qwen35_9b", TEModel.QWEN35_27B: "qwen35_27b"}[te_model]
|
||||
clip_target.clip = comfy.text_encoders.qwen35.te(**llama_detect(clip_data), model_type=qwen35_type)
|
||||
clip_target.tokenizer = comfy.text_encoders.qwen35.tokenizer(model_type=qwen35_type)
|
||||
elif te_model in (TEModel.QWEN3VL_4B, TEModel.QWEN3VL_8B):
|
||||
if clip_type == CLIPType.IDEOGRAM4 and te_model == TEModel.QWEN3VL_8B: # Ideogram4 reuses the full Qwen3-VL-8B (13-layer tap for conditioning + multimodal generate).
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer
|
||||
elif clip_type == CLIPType.BOOGU and te_model == TEModel.QWEN3VL_8B: # Boogu-Image: full Qwen3-VL-8B, last hidden state, no-think template.
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
clip_target.clip = comfy.text_encoders.boogu.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.boogu.BooguTokenizer
|
||||
elif clip_type in (CLIPType.FLUX, CLIPType.FLUX2): # Flux2 Klein reuses the Qwen3-VL LM (3-layer tap -> 12288); visual unused.
|
||||
klein_model_type = "qwen3_8b" if te_model == TEModel.QWEN3VL_8B else "qwen3_4b"
|
||||
clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type=klein_model_type)
|
||||
clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B if te_model == TEModel.QWEN3VL_8B else comfy.text_encoders.flux.KleinTokenizer
|
||||
else:
|
||||
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
|
||||
qwen3vl_type = {TEModel.QWEN3VL_4B: "qwen3vl_4b", TEModel.QWEN3VL_8B: "qwen3vl_8b"}[te_model]
|
||||
clip_target.clip = comfy.text_encoders.qwen3vl.te(**llama_detect(clip_data), model_type=qwen3vl_type)
|
||||
clip_target.tokenizer = comfy.text_encoders.qwen3vl.tokenizer(model_type=qwen3vl_type)
|
||||
elif te_model == TEModel.QWEN3_06B:
|
||||
clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
|
||||
clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer
|
||||
|
||||
@ -25,6 +25,7 @@ import comfy.text_encoders.hunyuan_image
|
||||
import comfy.text_encoders.kandinsky5
|
||||
import comfy.text_encoders.z_image
|
||||
import comfy.text_encoders.ideogram4
|
||||
import comfy.text_encoders.boogu
|
||||
import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
@ -1758,6 +1759,27 @@ class Omnigen2(supported_models_base.BASE):
|
||||
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))
|
||||
|
||||
class Boogu(Omnigen2):
    """Model definition for Boogu-Image, derived from the Omnigen2 definition.

    Overrides the detection config, sampling settings, memory factor, the
    model constructor and the text-encoder target (Qwen3-VL-8B based).
    """

    unet_config = {"image_model": "boogu"}

    sampling_settings = {"multiplier": 1.0, "shift": 3.16}

    memory_usage_factor = 2.15

    def get_model(self, state_dict, prefix="", device=None):
        """Instantiate the Boogu base model for this config on `device`."""
        return model_base.Boogu(self, device=device)

    def clip_target(self, state_dict={}):
        """Return the tokenizer/text-encoder pair, detecting dtype/quantization from `state_dict`."""
        te_prefix = "{}qwen3vl_8b.transformer.".format(self.text_encoder_key_prefix[0])
        detected = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, te_prefix)
        text_encoder = comfy.text_encoders.boogu.te(**detected)
        return supported_models_base.ClipTarget(comfy.text_encoders.boogu.BooguTokenizer, text_encoder)
|
||||
|
||||
class Ideogram4(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "ideogram4",
|
||||
@ -2300,6 +2322,7 @@ models = [
|
||||
ACEStep,
|
||||
ACEStep15,
|
||||
Omnigen2,
|
||||
Boogu,
|
||||
QwenImage,
|
||||
Ideogram4,
|
||||
Flux2,
|
||||
|
||||
58
comfy/text_encoders/boogu.py
Normal file
58
comfy/text_encoders/boogu.py
Normal file
@ -0,0 +1,58 @@
|
||||
"""Boogu-Image text encoder: full Qwen3-VL-8B, last hidden state (4096-dim).
|
||||
|
||||
Boogu uses the final hidden state of Qwen3-VL as the per-token instruction feature
|
||||
(num_instruction_feature_layers=1, reduce_type=mean -> just the last layer).
|
||||
The model itself is the standard Qwen3-VL TE, only the chat template differs
|
||||
(a fixed system prompt and no <think> block).
|
||||
"""
|
||||
|
||||
import comfy.text_encoders.qwen3vl
|
||||
from comfy import sd1_clip
|
||||
|
||||
|
||||
# System prompts from the reference pipeline (pipeline_boogu.py).
|
||||
# T2I (non-empty instruction, no image) uses the helpful-assistant prompt;
|
||||
# everything else (the CFG negative / "drop" condition, and any image case) uses the TI2I "describe" prompt.
|
||||
BOOGU_T2I_SYSTEM = "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows."
|
||||
BOOGU_DROP_SYSTEM = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
|
||||
|
||||
|
||||
class BooguTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
    """Qwen3-VL-8B tokenizer with Boogu's fixed system prompts and no generation prompt."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, model_type="qwen3vl_8b")
        # Pieces of the chat template (apply_chat_template without add_generation_prompt).
        sys_open = "<|im_start|>system\n"
        sys_close = "<|im_end|>\n"
        user_text = "<|im_start|>user\n{}<|im_end|>\n"
        user_image = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n"

        self.llama_template = sys_open + BOOGU_T2I_SYSTEM + sys_close + user_text
        self.llama_template_images = sys_open + BOOGU_DROP_SYSTEM + sys_close + user_image
        # Reference SYSTEM_PROMPT_DROP: used for the empty negative/uncond instruction.
        self.llama_template_drop = sys_open + BOOGU_DROP_SYSTEM + sys_close + user_text

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
        """Tokenize `text`, switching to the "drop" template for an empty, image-less prompt.

        `thinking` defaults to True here so the parent tokenizer does not append
        its empty <think> block; Boogu conditions on the no-think template.
        """
        use_drop_template = llama_template is None and len(images) == 0 and text.strip() == ""
        if use_drop_template:
            llama_template = self.llama_template_drop
        return super().tokenize_with_weights(
            text,
            return_word_ids=return_word_ids,
            llama_template=llama_template,
            images=images,
            prevent_empty_text=prevent_empty_text,
            thinking=thinking,
            **kwargs,
        )
|
||||
|
||||
|
||||
class BooguQwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
    """Qwen3-VL clip model variant that normalizes the tapped hidden state."""

    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"):
        super().__init__(
            device=device,
            dtype=dtype,
            attention_mask=attention_mask,
            model_options=model_options,
            model_type=model_type,
        )
        # Set after the parent constructor: apply the final RMSNorm to the tapped last layer.
        self.layer_norm_hidden_state = True
|
||||
|
||||
|
||||
class BooguTEModel(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper registering the Boogu text encoder under the name "qwen3vl_8b"."""

    def __init__(self, device="cpu", dtype=None, model_options={}):
        def build_clip_model(**kw):
            # Forward whatever the wrapper passes and pin the 8B model type.
            return BooguQwen3VLClipModel(**kw, model_type="qwen3vl_8b")

        super().__init__(device=device, dtype=dtype, name="qwen3vl_8b", clip_model=build_clip_model, model_options=model_options)
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None):
    """Return a BooguTEModel subclass with the detected dtype/quantization baked in.

    dtype_llama: when not None, overrides the dtype passed to the constructor.
    llama_quantization_metadata: when not None, stored in a copy of
        model_options under "quantization_metadata".
    """
    class BooguTEModel_(BooguTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            effective_dtype = dtype if dtype_llama is None else dtype_llama
            if llama_quantization_metadata is not None:
                # Copy first so the caller's dict is never mutated.
                model_options = model_options.copy()
                model_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=effective_dtype, model_options=model_options)

    return BooguTEModel_
|
||||
@ -9,6 +9,7 @@ import os
|
||||
from transformers import Qwen2Tokenizer
|
||||
|
||||
import comfy.text_encoders.llama
|
||||
import comfy.text_encoders.qwen3vl
|
||||
from comfy import sd1_clip
|
||||
|
||||
# Reference taps outputs of layers (0,3,...,35); comfy captures layer inputs, offset by +1.
|
||||
@ -77,3 +78,43 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
|
||||
model_options["quantization_metadata"] = llama_quantization_metadata
|
||||
super().__init__(device=device, dtype=dtype, model_options=model_options)
|
||||
return Ideogram4TEModel_
|
||||
|
||||
|
||||
# Full Qwen3-VL-8B variant with vision
|
||||
|
||||
class Ideogram4Qwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
    """Qwen3-VL-8B clip model that taps the IDEOGRAM4_TAP_LAYERS hidden states."""

    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}):
        super().__init__(
            device=device,
            layer=IDEOGRAM4_TAP_LAYERS,
            layer_idx=None,
            dtype=dtype,
            attention_mask=attention_mask,
            model_options=model_options,
            model_type="qwen3vl_8b",
        )
|
||||
|
||||
|
||||
class Ideogram4Qwen3VLTEModel(sd1_clip.SD1ClipModel):
    """Text-encoder wrapper that concatenates all tapped layers along the feature axis."""

    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(
            device=device,
            dtype=dtype,
            name="qwen3vl_8b",
            clip_model=Ideogram4Qwen3VLClipModel,
            model_options=model_options,
        )

    def encode_token_weights(self, token_weight_pairs):
        """Encode tokens and fold the tap dimension into the hidden dimension."""
        hidden, pooled, extra = super().encode_token_weights(token_weight_pairs)
        # (B, n_taps=13, seq, 4096), ascending layer order.
        batch, taps, seq_len, width = hidden.shape
        # Move taps last, then merge: (B, seq, 4096*13 = 53248).
        hidden = hidden.permute(0, 2, 3, 1)
        hidden = hidden.reshape(batch, seq_len, width * taps)
        return hidden, pooled, extra
|
||||
|
||||
|
||||
class Ideogram4Qwen3VLTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
    """Qwen3-VL-8B tokenizer whose `thinking` default is flipped to True."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            model_type="qwen3vl_8b",
        )

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
        """Delegate to the parent tokenizer unchanged apart from the `thinking` default.

        Ideogram 4 conditions on the no-think template; with thinking=True the
        parent does not append its empty think block.
        """
        forwarded = dict(
            return_word_ids=return_word_ids,
            llama_template=llama_template,
            images=images,
            prevent_empty_text=prevent_empty_text,
            thinking=thinking,
        )
        return super().tokenize_with_weights(text, **forwarded, **kwargs)
|
||||
|
||||
|
||||
def te_qwen3vl(dtype_llama=None, llama_quantization_metadata=None):
    """Return an Ideogram4Qwen3VLTEModel subclass bound to the detected dtype/quantization.

    dtype_llama: when not None, replaces the dtype given to the constructor.
    llama_quantization_metadata: when not None, added to a copy of
        model_options as "quantization_metadata".
    """
    class Ideogram4Qwen3VLTEModel_(Ideogram4Qwen3VLTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            options = model_options
            if llama_quantization_metadata is not None:
                # Work on a copy so the shared/default dict stays untouched.
                options = options.copy()
                options["quantization_metadata"] = llama_quantization_metadata
            if dtype_llama is not None:
                dtype = dtype_llama
            super().__init__(device=device, dtype=dtype, model_options=options)

    return Ideogram4Qwen3VLTEModel_
|
||||
|
||||
@ -251,6 +251,19 @@ class Qwen3_8BConfig:
|
||||
lm_head: bool = True
|
||||
stop_tokens = [151643, 151645]
|
||||
|
||||
# Qwen3-VL-8B language-model config: Qwen3-8B fields with overridden context
# length / rope base, plus the class-level MRoPE settings below.
@dataclass
class Qwen3VL_8BConfig(Qwen3_8BConfig):
    max_position_embeddings: int = 262_144
    rope_theta: float = 5_000_000.0
    # Left unannotated, so these are plain class attributes rather than dataclass fields.
    rope_dims = [24, 20, 20]
    interleaved_mrope = True
|
||||
|
||||
# Qwen3-VL-4B language-model config: the 8B VL config with smaller widths.
@dataclass
class Qwen3VL_4BConfig(Qwen3VL_8BConfig):
    hidden_size: int = 2_560
    intermediate_size: int = 9_728
    # 4B ties word embeddings, so there is no separate lm_head.
    lm_head: bool = False
|
||||
|
||||
@dataclass
|
||||
class Ovis25_2BConfig:
|
||||
vocab_size: int = 151936
|
||||
@ -703,7 +716,8 @@ class Llama2_(nn.Module):
|
||||
interleaved_mrope=getattr(self.config, "interleaved_mrope", False),
|
||||
device=device)
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None):
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True,
|
||||
dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None,deepstack_embeds=None, visual_pos_masks=None):
|
||||
if embeds is not None:
|
||||
x = embeds
|
||||
else:
|
||||
@ -767,6 +781,10 @@ class Llama2_(nn.Module):
|
||||
if current_kv is not None:
|
||||
next_key_values.append(current_kv)
|
||||
|
||||
# DeepStack: add per-layer visual features into the first len() decoder layers at image positions (Qwen3-VL)
|
||||
if deepstack_embeds is not None and i < len(deepstack_embeds):
|
||||
x[visual_pos_masks] = x[visual_pos_masks] + deepstack_embeds[i].to(x)
|
||||
|
||||
if i == intermediate_output:
|
||||
intermediate = x.clone()
|
||||
|
||||
@ -860,7 +878,7 @@ class BaseGenerate:
|
||||
torch.empty([batch, model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype), 0))
|
||||
return past_key_values
|
||||
|
||||
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0, initial_input_ids=None):
|
||||
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0, initial_input_ids=None, position_ids=None, deepstack_embeds=None, visual_pos_masks=None):
|
||||
device = embeds.device
|
||||
|
||||
if stop_tokens is None:
|
||||
@ -884,10 +902,18 @@ class BaseGenerate:
|
||||
generated_token_ids = []
|
||||
pbar = comfy.utils.ProgressBar(max_length)
|
||||
|
||||
# MRoPE: prefill uses explicit 3D position_ids, decode continues from the last position
|
||||
next_pos = int(position_ids[:, -1].max()) + 1 if position_ids is not None else None
|
||||
|
||||
# Generation loop
|
||||
current_input_ids = initial_input_ids
|
||||
for step in tqdm(range(max_length), desc="Generating tokens"):
|
||||
x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values, input_ids=current_input_ids)
|
||||
# DeepStack visual features are injected on the prefill only; gemma4's forward lacks these kwargs.
|
||||
extra = {}
|
||||
if step == 0 and deepstack_embeds is not None:
|
||||
extra["deepstack_embeds"] = deepstack_embeds
|
||||
extra["visual_pos_masks"] = visual_pos_masks
|
||||
x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values, input_ids=current_input_ids, position_ids=position_ids, **extra)
|
||||
logits = self.logits(x)[:, -1]
|
||||
next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample, presence_penalty=presence_penalty)
|
||||
token_id = next_token[0].item()
|
||||
@ -895,6 +921,9 @@ class BaseGenerate:
|
||||
|
||||
embeds = self.model.embed_tokens(next_token).to(execution_dtype)
|
||||
current_input_ids = next_token if initial_input_ids is not None else None
|
||||
if next_pos is not None: # advance MRoPE position for the next (decode) step
|
||||
position_ids = torch.tensor([[next_pos]], device=device)
|
||||
next_pos += 1
|
||||
pbar.update(1)
|
||||
|
||||
if token_id in stop_tokens:
|
||||
|
||||
@ -3,7 +3,6 @@ import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from dataclasses import dataclass, field
|
||||
import os
|
||||
import math
|
||||
|
||||
import comfy.model_management
|
||||
from comfy.ldm.modules.attention import optimized_attention_for_device
|
||||
@ -563,6 +562,8 @@ class Qwen35VisionModel(nn.Module):
|
||||
for _ in range(config["depth"])
|
||||
])
|
||||
self.merger = Qwen35VisionPatchMerger(self.hidden_size, self.spatial_merge_size, config["out_hidden_size"], device=device, dtype=dtype, ops=ops)
|
||||
self.deepstack_visual_indexes = [] # DeepStack, per-layer visual features (Qwen3-VL)
|
||||
self.deepstack_merger_list = None
|
||||
|
||||
def rot_pos_emb(self, grid_thw):
|
||||
merge_size = self.spatial_merge_size
|
||||
@ -664,9 +665,14 @@ class Qwen35VisionModel(nn.Module):
|
||||
).cumsum(dim=0, dtype=torch.int32)
|
||||
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
|
||||
optimized_attention = optimized_attention_for_device(x.device, mask=False, small_input=True)
|
||||
for blk in self.blocks:
|
||||
deepstack_features = []
|
||||
for layer_num, blk in enumerate(self.blocks):
|
||||
x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings, optimized_attention=optimized_attention)
|
||||
if self.deepstack_merger_list is not None and layer_num in self.deepstack_visual_indexes:
|
||||
deepstack_features.append(self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)](x))
|
||||
merged = self.merger(x)
|
||||
if self.deepstack_merger_list is not None:
|
||||
return merged, deepstack_features
|
||||
return merged
|
||||
|
||||
# Model Wrapper
|
||||
@ -690,30 +696,7 @@ class Qwen35(BaseLlama, BaseGenerate, torch.nn.Module):
|
||||
return None, None
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[], past_key_values=None):
|
||||
grid = None
|
||||
position_ids = None
|
||||
offset = 0
|
||||
for e in embeds_info:
|
||||
if e.get("type") == "image":
|
||||
grid = e.get("extra", None)
|
||||
start = e.get("index")
|
||||
if position_ids is None:
|
||||
position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
|
||||
position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
|
||||
end = e.get("size") + start
|
||||
len_max = int(grid.max()) // 2
|
||||
start_next = len_max + start
|
||||
position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
|
||||
position_ids[0, start:end] = start + offset
|
||||
max_d = int(grid[0][1]) // 2
|
||||
position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
|
||||
max_d = int(grid[0][2]) // 2
|
||||
position_ids[2, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
|
||||
offset += len_max - (end - start)
|
||||
|
||||
if grid is None:
|
||||
position_ids = None
|
||||
|
||||
position_ids = comfy.text_encoders.qwen_vl.qwen2vl_mrope_position_ids(embeds_info, embeds.shape[1], embeds.device)
|
||||
return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids, past_key_values=past_key_values)
|
||||
|
||||
def init_kv_cache(self, batch, max_cache_len, device, execution_dtype):
|
||||
|
||||
193
comfy/text_encoders/qwen3vl.py
Normal file
193
comfy/text_encoders/qwen3vl.py
Normal file
@ -0,0 +1,193 @@
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from transformers import Qwen2Tokenizer
|
||||
|
||||
from comfy import sd1_clip
|
||||
import comfy.text_encoders.qwen_vl
|
||||
from .qwen35 import Qwen35VisionModel
|
||||
from .llama import BaseLlama, BaseQwen3, BaseGenerate, Llama2_, Qwen3VL_4BConfig, Qwen3VL_8BConfig
|
||||
|
||||
|
||||
QWEN3VL_VISION = {
|
||||
"qwen3vl_4b": dict(hidden_size=1024, intermediate_size=4096, depth=24, deepstack_visual_indexes=[5, 11, 17]),
|
||||
"qwen3vl_8b": dict(hidden_size=1152, intermediate_size=4304, depth=27, deepstack_visual_indexes=[8, 16, 24]),
|
||||
}
|
||||
QWEN3VL_VISION_COMMON = dict(num_heads=16, patch_size=16, temporal_patch_size=2, in_channels=3,
|
||||
spatial_merge_size=2, num_position_embeddings=2304)
|
||||
|
||||
QWEN3VL_CONFIGS = {"qwen3vl_4b": Qwen3VL_4BConfig, "qwen3vl_8b": Qwen3VL_8BConfig}
|
||||
|
||||
|
||||
class Qwen3VLDeepstackMerger(nn.Module):
    """DeepStack merger: groups patch features, normalizes them, then applies a 2-layer MLP.

    Uses a postshuffle LayerNorm (applied after the spatial merge), unlike the main merger.

    Args:
        hidden_size: feature width of a single vision patch.
        spatial_merge_size: side length of the merge window; spatial_merge_size**2
            patches are concatenated into one merged feature.
        out_hidden_size: output feature width.
        device, dtype: forwarded to the layers created through `ops`.
        ops: namespace providing `LayerNorm` and `Linear` constructors.
    """

    def __init__(self, hidden_size, spatial_merge_size, out_hidden_size, device=None, dtype=None, ops=None):
        super().__init__()
        self.merge_dim = hidden_size * (spatial_merge_size ** 2)
        self.norm = ops.LayerNorm(self.merge_dim, eps=1e-6, device=device, dtype=dtype)
        self.linear_fc1 = ops.Linear(self.merge_dim, self.merge_dim, device=device, dtype=dtype)
        self.linear_fc2 = ops.Linear(self.merge_dim, out_hidden_size, device=device, dtype=dtype)

    def forward(self, x):
        """Return merged features of shape (x.numel() // merge_dim, out_hidden_size)."""
        # reshape (not view): identical for contiguous input, but does not raise
        # when x arrives as a non-contiguous tensor.
        x = self.norm(x.reshape(-1, self.merge_dim))
        return self.linear_fc2(F.gelu(self.linear_fc1(x)))
|
||||
|
||||
|
||||
class Qwen3VLVisionModel(Qwen35VisionModel):
    """Qwen3.5 vision tower extended with DeepStack mergers.

    One Qwen3VLDeepstackMerger is created per entry of
    config["deepstack_visual_indexes"].
    """

    def __init__(self, config, device=None, dtype=None, ops=None):
        super().__init__(config, device=device, dtype=dtype, ops=ops)
        self.deepstack_visual_indexes = config["deepstack_visual_indexes"]
        mergers = []
        for _ in self.deepstack_visual_indexes:
            mergers.append(
                Qwen3VLDeepstackMerger(
                    self.hidden_size,
                    self.spatial_merge_size,
                    config["out_hidden_size"],
                    device=device,
                    dtype=dtype,
                    ops=ops,
                )
            )
        self.deepstack_merger_list = nn.ModuleList(mergers)
|
||||
|
||||
|
||||
class Qwen3VL(BaseLlama, BaseQwen3, BaseGenerate, torch.nn.Module):
    """Qwen3-VL: Llama2_ language model plus a DeepStack-enabled vision tower.

    `model_type` selects both the language config (QWEN3VL_CONFIGS) and the
    vision config (QWEN3VL_VISION); subclasses override it.
    """

    model_type = "qwen3vl_8b"

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        config_cls = QWEN3VL_CONFIGS[self.model_type]
        config = config_cls(**config_dict)
        self.num_layers = config.num_hidden_layers
        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)

        # Common vision settings, then per-size overrides, then the LM width as output size.
        vision_config = dict(QWEN3VL_VISION_COMMON)
        vision_config.update(QWEN3VL_VISION[self.model_type])
        vision_config["out_hidden_size"] = config.hidden_size
        self.visual = Qwen3VLVisionModel(vision_config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype

    def preprocess_embed(self, embed, device):
        """Run the vision tower on an image embed; returns (None, None) for other types."""
        if embed["type"] != "image":
            return None, None
        # Qwen3-VL normalizes to [-1, 1] (mean/std 0.5), unlike Qwen2.5-VL's CLIP normalization.
        pixels, grid = comfy.text_encoders.qwen_vl.process_qwen2vl_images(
            embed["data"], patch_size=16, image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5]
        )
        merged, deepstack = self.visual(pixels.to(device, dtype=torch.float32), grid)
        return merged, {"grid": grid, "deepstack": deepstack}

    def build_image_inputs(self, embeds, embeds_info):
        """Return (position_ids, visual_pos_masks, deepstack) for the prompt.

        All three are None when `embeds_info` contains no image entries.
        """
        image_infos = sorted(
            [info for info in embeds_info if info.get("type") == "image"],
            key=lambda info: info["index"],
        )
        if len(image_infos) == 0:
            return None, None, None

        device = embeds.device
        seq = embeds.shape[1]
        position_ids = comfy.text_encoders.qwen_vl.qwen2vl_mrope_position_ids(embeds_info, seq, device)

        # DeepStack: mask of image positions + per-vision-layer features to inject there.
        visual_pos_masks = torch.zeros((1, seq), dtype=torch.bool, device=device)
        deepstack = None
        for info in image_infos:
            first = info["index"]
            visual_pos_masks[0, first:info["size"] + first] = True
            layer_feats = info["extra"]["deepstack"]
            if deepstack is None:
                deepstack = list(layer_feats)
            else:
                # Append this image's features to each layer's running tensor.
                deepstack = [torch.cat([deepstack[i], feat], dim=0) for i, feat in enumerate(layer_feats)]
        return position_ids, visual_pos_masks, deepstack
|
||||
|
||||
|
||||
def _make_qwen3vl_model(model_type):
    """Return a fresh Qwen3VL subclass whose `model_type` class attribute is `model_type`."""
    selected = model_type

    class Qwen3VL_(Qwen3VL):
        # Read through a differently named closure variable: a class body cannot
        # see an enclosing-function name that it also assigns.
        model_type = selected

    return Qwen3VL_
|
||||
|
||||
|
||||
class Qwen3VLClipModel(sd1_clip.SDClipModel):
    """SDClipModel wrapper around a Qwen3-VL model of the requested `model_type`."""

    def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"):
        super().__init__(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config={},
            dtype=dtype,
            special_tokens={"pad": 151643},
            layer_norm_hidden_state=False,
            model_class=_make_qwen3vl_model(model_type),
            enable_attention_masks=attention_mask,
            return_attention_masks=attention_mask,
            model_options=model_options,
        )

    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=0.0):
        """Generate from tokenized input, passing image position ids and DeepStack features along.

        `tokens` may be the tokenizer's dict output, in which case its first value is used.
        """
        if isinstance(tokens, dict):
            tokens = next(iter(tokens.values()))
        # Keep only the token (first element) of each entry.
        token_rows = []
        for row in tokens:
            token_rows.append([entry[0] for entry in row])
        embeds, _, _, embeds_info = self.process_tokens(token_rows, self.execution_device)
        position_ids, visual_pos_masks, deepstack = self.transformer.build_image_inputs(embeds, embeds_info)
        return self.transformer.generate(
            embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed,
            presence_penalty=presence_penalty,
            position_ids=position_ids,
            visual_pos_masks=visual_pos_masks,
            deepstack_embeds=deepstack,
        )
|
||||
|
||||
|
||||
class Qwen3VLTEModel(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper registering a Qwen3-VL clip model under the name `model_type`."""

    def __init__(self, device="cpu", dtype=None, model_options={}, model_type="qwen3vl_8b"):
        def build_clip_model(**kw):
            # Bind the requested model size onto whatever the wrapper passes in.
            return Qwen3VLClipModel(**kw, model_type=model_type)

        super().__init__(device=device, dtype=dtype, name=model_type, clip_model=build_clip_model, model_options=model_options)
|
||||
|
||||
|
||||
class Qwen3VLSDTokenizer(sd1_clip.SDTokenizer):
    """SDTokenizer using the bundled "qwen25_tokenizer" files, with no start/end tokens."""

    def __init__(self, embedding_directory=None, tokenizer_data={}, embedding_size=4096, embedding_key="qwen3vl_8b"):
        # Tokenizer files sit in a directory next to this module.
        module_dir = os.path.dirname(os.path.realpath(__file__))
        tokenizer_path = os.path.join(module_dir, "qwen25_tokenizer")
        super().__init__(
            tokenizer_path,
            pad_with_end=False,
            embedding_directory=embedding_directory,
            embedding_size=embedding_size,
            embedding_key=embedding_key,
            tokenizer_class=Qwen2Tokenizer,
            has_start_token=False,
            has_end_token=False,
            pad_to_max_length=False,
            max_length=99999999,
            min_length=1,
            pad_token=151643,
            tokenizer_data=tokenizer_data,
        )
|
||||
|
||||
|
||||
class Qwen3VLTokenizer(sd1_clip.SD1Tokenizer):
    """Chat-template tokenizer for Qwen3-VL that swaps <|image_pad|> tokens for image embeds."""

    def __init__(self, embedding_directory=None, tokenizer_data={}, model_type="qwen3vl_8b"):
        embedding_size = 2560 if model_type == "qwen3vl_4b" else 4096

        def build_tokenizer(*a, **kw):
            return Qwen3VLSDTokenizer(*a, **kw, embedding_size=embedding_size, embedding_key=model_type)

        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name=model_type, tokenizer=build_tokenizer)
        self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
        self.llama_template_images = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
        """Wrap `text` in the chat template, tokenize it, and attach images to <|image_pad|> tokens.

        text: prompt; if it already starts with '<|im_start|>' it is used verbatim.
        llama_template: optional template overriding the built-in ones.
        images: images to attach; when empty, an `image` keyword argument (if given)
            is split along its first dimension instead.
        prevent_empty_text: replace an empty prompt with a single space.
        thinking: when False, an empty <think> block is appended after the template.
        """
        batched_image = kwargs.get("image", None)
        if batched_image is not None and len(images) == 0:
            images = [batched_image[idx:idx + 1] for idx in range(batched_image.shape[0])]

        # Decided on the text as passed in, before any empty-text substitution.
        already_templated = text.startswith('<|im_start|>')
        if prevent_empty_text and text == '':
            text = ' '

        if already_templated:
            llama_text = text
        else:
            if llama_template is not None:
                template = llama_template
            elif len(images) == 0:
                template = self.llama_template
            else:
                template = self.llama_template_images
            if len(images) > 1:
                # Repeat the first vision block once per image.
                vision_block = "<|vision_start|><|image_pad|><|vision_end|>"
                template = template.replace(vision_block, vision_block * len(images), 1)
            llama_text = template.format(text)
            if not thinking:  # Qwen3 convention: empty think block suppresses reasoning
                llama_text = llama_text + "<think>\n\n</think>\n\n"

        tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
        first_key = next(iter(tokens))
        pads_seen = 0
        for row in tokens[first_key]:
            for pos, entry in enumerate(row):
                if entry[0] != 151655:  # only <|image_pad|> tokens are rewritten
                    continue
                if pads_seen < len(images):
                    image_embed = {"type": "image", "data": images[pads_seen], "original_type": "image"}
                    row[pos] = (image_embed,) + entry[1:]
                pads_seen += 1
        return tokens
|
||||
|
||||
|
||||
def tokenizer(model_type="qwen3vl_8b"):
    """Return a Qwen3VLTokenizer subclass with `model_type` bound into its constructor."""
    bound_model_type = model_type

    class Qwen3VLTokenizer_(Qwen3VLTokenizer):
        def __init__(self, embedding_directory=None, tokenizer_data={}):
            super().__init__(
                embedding_directory=embedding_directory,
                tokenizer_data=tokenizer_data,
                model_type=bound_model_type,
            )

    return Qwen3VLTokenizer_
|
||||
|
||||
|
||||
def te(dtype_llama=None, llama_quantization_metadata=None, model_type="qwen3vl_8b"):
    """Return a Qwen3VLTEModel subclass bound to a dtype, quantization metadata and model size.

    dtype_llama: when not None, overrides the dtype passed to the constructor.
    llama_quantization_metadata: when not None, stored in a copy of
        model_options under "quantization_metadata".
    model_type: key selecting the Qwen3-VL size.
    """
    class Qwen3VLTEModel_(Qwen3VLTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            chosen_dtype = dtype_llama if dtype_llama is not None else dtype
            if llama_quantization_metadata is not None:
                # Copy before writing so the caller's dict is left alone.
                model_options = model_options.copy()
                model_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=chosen_dtype, model_options=model_options, model_type=model_type)

    return Qwen3VLTEModel_
|
||||
@ -88,6 +88,32 @@ def process_qwen2vl_images(
|
||||
return flatten_patches, image_grid_thw
|
||||
|
||||
|
||||
def qwen2vl_mrope_position_ids(embeds_info, seq_len, device):
    """Build (3, seq_len) T/H/W MRoPE position ids for a prompt containing images.

    Text positions run sequentially; every image span gets positions derived
    from its grid (halved along each axis). Returns None when `embeds_info`
    has no image entries.

    embeds_info: iterable of dicts; image entries carry "index", "size" and
        "extra", where "extra" is the image grid_thw or a dict holding it under "grid".
    seq_len: total sequence length.
    device: device for the created tensors.
    """
    pos = None
    shift = 0  # running correction: image spans advance positions by less than their token count
    for info in embeds_info:
        if info.get("type") != "image":
            continue
        extra = info.get("extra", None)
        grid = extra["grid"] if isinstance(extra, dict) else extra
        first = info.get("index")
        if pos is None:
            # First image: allocate and number the text in front of it.
            pos = torch.zeros((3, seq_len), device=device)
            pos[:, :first] = torch.arange(0, first, device=device)
        last = info.get("size") + first
        span = last - first
        extent = int(grid.max()) // 2
        base = first + shift

        # Everything after the image continues from base + extent.
        tail_start = extent + first + shift
        pos[:, last:] = torch.arange(tail_start, tail_start + (seq_len - last), device=device)

        # Temporal axis: constant over the image span.
        pos[0, first:last] = base

        # Height axis: each row index repeated across the row.
        rows = int(grid[0][1]) // 2
        row_ids = torch.arange(base, base + rows, device=device)
        pos[1, first:last] = row_ids.unsqueeze(1).repeat(1, math.ceil(span / rows)).flatten(0)[:span]

        # Width axis: column indices cycling once per row.
        cols = int(grid[0][2]) // 2
        col_ids = torch.arange(base, base + cols, device=device)
        pos[2, first:last] = col_ids.unsqueeze(0).repeat(math.ceil(span / cols), 1).flatten(0)[:span]

        shift += extent - span
    return pos
|
||||
|
||||
|
||||
class VisionPatchEmbed(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@ -25,6 +25,11 @@ CLI_FEATURE_FLAG_REGISTRY: dict[str, FeatureFlagInfo] = {
|
||||
"default": False,
|
||||
"description": "Show the sign-in button in the frontend even when not signed in",
|
||||
},
|
||||
"enable_telemetry": {
|
||||
"type": "bool",
|
||||
"default": False,
|
||||
"description": "Signal the frontend that telemetry collection is enabled",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -325,21 +325,25 @@ class VideoFromFile(VideoInput):
|
||||
checked_alpha = True
|
||||
|
||||
# Fix non-deterministic video decode when the video width is not a multiple of 32
|
||||
# For non-yuvj pixel formats (all H.264/H.265 video)
|
||||
# For non-yuvj pixel formats: most H.264/H.265 video and static images (e.g. lossy WebP via LoadImage)
|
||||
# Pad both axes to a multiple of 32 and smear the border so the alignment padding never bleeds into the cropped edges
|
||||
if image_format in ('gbrpf32le', 'gbrapf32le') and frame.width % 32 != 0:
|
||||
if align_graph is None:
|
||||
pad_w = ((frame.width + 31) // 32) * 32
|
||||
pad_h = ((frame.height + 31) // 32) * 32
|
||||
g = av.filter.Graph()
|
||||
g_src = g.add_buffer(width=frame.width, height=frame.height,
|
||||
format=frame.format.name, time_base=video_stream.time_base)
|
||||
g_pad = g.add('pad', f'{pad_w}:{frame.height}:0:0')
|
||||
g_pad = g.add('pad', f'{pad_w}:{pad_h}:0:0')
|
||||
g_fill = g.add('fillborders', f'left=0:right={pad_w - frame.width}:top=0:bottom={pad_h - frame.height}:mode=smear')
|
||||
g_sink = g.add('buffersink')
|
||||
g_src.link_to(g_pad)
|
||||
g_pad.link_to(g_sink)
|
||||
g_pad.link_to(g_fill)
|
||||
g_fill.link_to(g_sink)
|
||||
g.configure()
|
||||
align_graph = (g, g_src, g_sink)
|
||||
align_graph[1].push(frame)
|
||||
img = np.ascontiguousarray(align_graph[2].pull().to_ndarray(format=image_format)[:, :frame.width])
|
||||
img = np.ascontiguousarray(align_graph[2].pull().to_ndarray(format=image_format)[:frame.height, :frame.width])
|
||||
else:
|
||||
img = frame.to_ndarray(format=image_format)
|
||||
if frame.rotation != 0:
|
||||
|
||||
@ -149,3 +149,59 @@ class MotionControlRequest(BaseModel):
|
||||
character_orientation: str = Field(...)
|
||||
mode: str = Field(..., description="'pro' or 'std'")
|
||||
model_name: str = Field(...)
|
||||
|
||||
|
||||
# Generation settings shared by the Kling 3 Turbo text-to-video and image-to-video requests.
class Kling3TurboSettings(BaseModel):
    resolution: str = Field(default="720p", description="'720p' or '1080p'")
    aspect_ratio: str | None = Field(default=None, description="'16:9'/'9:16'/'1:1'; text-to-video only")
    duration: int = Field(default=5, description="3-15 second")
|
||||
|
||||
|
||||
# Request body for Kling 3 Turbo text-to-video: a required prompt plus optional settings.
class Kling3TurboText2VideoRequest(BaseModel):
    prompt: str = Field(..., description="<=3072 chars; may use multi-shot 'shot n, m, words; ...'")
    settings: Kling3TurboSettings | None = Field(default=None)
|
||||
|
||||
|
||||
# One content item of an image-to-video request; `type` says whether `text` or `url` applies.
class Kling3TurboContent(BaseModel):
    type: str = Field(..., description="'prompt' or 'first_frame'")
    text: str | None = Field(default=None, description="for type=prompt; <=2500 chars")
    url: str | None = Field(default=None, description="for type=first_frame")
|
||||
|
||||
|
||||
# Request body for a Kling 3.0 Turbo image-to-video task: a list of content
# items plus optional generation settings.
class Kling3TurboImage2VideoRequest(BaseModel):
    contents: list[Kling3TurboContent] = Field(..., description="prompt + first_frame materials")
    settings: Kling3TurboSettings | None = None
|
||||
|
||||
|
||||
# `data` payload of a Turbo create response; every field is optional.
class Kling3TurboCreateData(BaseModel):
    id: str | None = Field(default=None, description="Task ID")
    status: str | None = None
    message: str | None = None
|
||||
|
||||
|
||||
# Envelope returned when a Turbo task is created; `data` carries the task id.
class Kling3TurboCreateResponse(BaseModel):
    code: int | None = None
    message: str | None = None
    request_id: str | None = None
    data: Kling3TurboCreateData | None = None
|
||||
|
||||
|
||||
# A single output asset listed on a Turbo task; `type` tags the asset kind.
class Kling3TurboOutput(BaseModel):
    type: str | None = Field(default=None, description="'video', 'image', 'audio', ...")
    id: str | None = None
    url: str | None = None
    duration: str | None = None
|
||||
|
||||
|
||||
# State of one Turbo task as reported by the task query endpoint.
class Kling3TurboTaskData(BaseModel):
    id: str | None = None
    status: str | None = Field(default=None, description="submitted | processing | succeeded | failed")
    message: str | None = None
    outputs: list[Kling3TurboOutput] | None = None
|
||||
|
||||
|
||||
# Envelope returned by the Turbo task query; `data` is a list of tasks.
class Kling3TurboQueryResponse(BaseModel):
    code: int | None = None
    message: str | None = None
    request_id: str | None = None
    data: list[Kling3TurboTaskData] | None = None
|
||||
|
||||
@ -10,6 +10,7 @@ from pydantic import BaseModel, Field, confloat
|
||||
class LumaIO:
    """Namespace of string identifiers for Luma-specific custom socket types."""

    LUMA_REF = "LUMA_REF"
    LUMA_CONCEPTS = "LUMA_CONCEPTS"
    # Passed to IO.Custom(...) by the Ray 3.2 keyframe node for its chain socket.
    LUMA_RAY32_KEYFRAME = "LUMA_RAY32_KEYFRAME"
|
||||
|
||||
|
||||
class LumaReference:
|
||||
@ -20,13 +21,14 @@ class LumaReference:
|
||||
def create_api_model(self, download_url: str):
|
||||
return LumaImageRef(url=download_url, weight=self.weight)
|
||||
|
||||
|
||||
class LumaReferenceChain:
|
||||
def __init__(self, first_ref: LumaReference=None):
|
||||
def __init__(self, first_ref: LumaReference = None):
|
||||
self.refs: list[LumaReference] = []
|
||||
if first_ref:
|
||||
self.refs.append(first_ref)
|
||||
|
||||
def add(self, luma_ref: LumaReference=None):
|
||||
def add(self, luma_ref: LumaReference = None):
|
||||
self.refs.append(luma_ref)
|
||||
|
||||
def create_api_model(self, download_urls: list[str], max_refs=4):
|
||||
@ -124,7 +126,7 @@ def get_luma_concepts(include_none=False):
|
||||
"pull_out",
|
||||
"aerial",
|
||||
"crane_up",
|
||||
"eye_level"
|
||||
"eye_level",
|
||||
]
|
||||
|
||||
|
||||
@ -162,8 +164,8 @@ class LumaVideoModelOutputDuration(str, Enum):
|
||||
|
||||
|
||||
class LumaGenerationType(str, Enum):
    """Kind of asset a Luma generation produces.

    Each member is declared exactly once: an Enum body raises TypeError when
    a member name is reused, so the repeated declarations were dropped.
    """

    video = "video"
    image = "image"
|
||||
|
||||
|
||||
class LumaState(str, Enum):
|
||||
@ -174,86 +176,109 @@ class LumaState(str, Enum):
|
||||
|
||||
|
||||
# URLs of the assets attached to a Luma generation; each is optional.
class LumaAssets(BaseModel):
    video: Optional[str] = Field(None, description="The URL of the video")
    image: Optional[str] = Field(None, description="The URL of the image")
    progress_video: Optional[str] = Field(None, description="The URL of the progress video")
|
||||
|
||||
|
||||
class LumaImageRef(BaseModel):
    """Used for image gen"""

    url: str = Field(..., description="The URL of the image reference")
    # confloat bounds the weight to the closed interval [0.0, 1.0].
    weight: confloat(ge=0.0, le=1.0) = Field(..., description="The weight of the image reference")
|
||||
|
||||
|
||||
class LumaImageReference(BaseModel):
    """Used for video gen"""

    # `type` defaults to the literal "image"; only `url` is required.
    type: Optional[str] = Field("image", description="Input type, defaults to image")
    url: str = Field(..., description="The URL of the image")
|
||||
|
||||
|
||||
# Image reference used by the `modify_image_ref` request field.
class LumaModifyImageRef(BaseModel):
    url: str = Field(..., description="The URL of the image reference")
    # confloat bounds the weight to the closed interval [0.0, 1.0].
    weight: confloat(ge=0.0, le=1.0) = Field(..., description="The weight of the image reference")
|
||||
|
||||
|
||||
# Character reference wrapper holding a single required identity slot.
class LumaCharacterRef(BaseModel):
    identity0: LumaImageIdentity = Field(..., description="The image identity object")
|
||||
|
||||
|
||||
# Identity described by a required list of image URLs.
class LumaImageIdentity(BaseModel):
    images: list[str] = Field(..., description="The URLs of the image identity")
|
||||
|
||||
|
||||
# Reference to an existing generation by id; `type` defaults to "generation".
class LumaGenerationReference(BaseModel):
    type: str = Field("generation", description="Input type, defaults to generation")
    id: str = Field(..., description="The ID of the generation")
|
||||
|
||||
|
||||
# Two optional keyframe slots; each accepts either an image reference or a
# generation reference.
class LumaKeyframes(BaseModel):
    frame0: Optional[Union[LumaImageReference, LumaGenerationReference]] = Field(None, description="")
    frame1: Optional[Union[LumaImageReference, LumaGenerationReference]] = Field(None, description="")
|
||||
|
||||
|
||||
# A single camera concept, identified by its required `key`.
class LumaConceptObject(BaseModel):
    key: str = Field(..., description="Camera Concept name")
|
||||
|
||||
|
||||
# Request body for a Luma image generation. Only `prompt` is required;
# `model` and `aspect_ratio` carry explicit defaults, and all reference
# inputs default to None.
class LumaImageGenerationRequest(BaseModel):
    prompt: str = Field(..., description="The prompt of the generation")
    model: LumaImageModel = Field(LumaImageModel.photon_1, description="The image model used for the generation")
    aspect_ratio: Optional[LumaAspectRatio] = Field(LumaAspectRatio.ratio_16_9)
    image_ref: Optional[list[LumaImageRef]] = Field(None, description="List of image reference objects")
    style_ref: Optional[list[LumaImageRef]] = Field(None, description="List of style reference objects")
    character_ref: Optional[LumaCharacterRef] = Field(None, description="The image identity object")
    modify_image_ref: Optional[LumaModifyImageRef] = Field(None, description="The modify image reference object")
|
||||
|
||||
|
||||
# Request body for a Luma video generation. Only `prompt` is required;
# `model` defaults to LumaVideoModel.ray_2 and every other option to None.
class LumaGenerationRequest(BaseModel):
    prompt: str = Field(..., description="The prompt of the generation")
    model: LumaVideoModel = Field(LumaVideoModel.ray_2, description="The video model used for the generation")
    duration: Optional[LumaVideoModelOutputDuration] = Field(None, description="The duration of the generation")
    aspect_ratio: Optional[LumaAspectRatio] = Field(None, description="The aspect ratio of the generation")
    resolution: Optional[LumaVideoOutputResolution] = Field(None, description="The resolution of the generation")
    loop: Optional[bool] = Field(None, description="Whether to loop the video")
    keyframes: Optional[LumaKeyframes] = Field(None, description="The keyframes of the generation")
    concepts: Optional[list[LumaConceptObject]] = Field(None, description="Camera Concepts to apply to generation")
|
||||
|
||||
|
||||
# A Luma generation record. `failure_reason` and `assets` are optional; all
# other fields are required. `request` holds either a video or an image
# generation request.
class LumaGeneration(BaseModel):
    id: str = Field(..., description="The ID of the generation")
    generation_type: LumaGenerationType = Field(..., description="Generation type, image or video")
    state: LumaState = Field(..., description="The state of the generation")
    failure_reason: Optional[str] = Field(None, description="The reason for the state of the generation")
    created_at: str = Field(..., description="The date and time when the generation was created")
    assets: Optional[LumaAssets] = Field(None, description="The assets of the generation")
    model: str = Field(..., description="The model used for the generation")
    request: Union[LumaGenerationRequest, LumaImageGenerationRequest] = Field(...)
|
||||
|
||||
|
||||
# Image/source reference for Luma 2 requests. All four fields are optional;
# the node code in this file populates `url`.
class Luma2ImageRef(BaseModel):
    url: str | None = None
    data: str | None = None
    media_type: str | None = None
    generation_id: str | None = Field(
        default=None,
        description="reference a prior generation (extend / source reuse)",
    )
|
||||
|
||||
|
||||
class Luma2VideoEdit(BaseModel):
    """Edit controls for Ray 3.2 ``video_edit`` generations."""

    auto_controls: bool | None = Field(
        default=None,
        description="derive a conditioning schedule from the source (recommended)",
    )
    strength: str | None = Field(
        default=None,
        description="'adhere_1' .. 'reimagine_3'; constrained by IO.Combo",
    )
|
||||
|
||||
|
||||
class Luma2VideoOptions(BaseModel):
    """Ray 3.2 ``video`` output settings (text / image / keyframe / edit / extend)."""

    resolution: str | None = Field(default=None, description="360p | 540p | 720p | 1080p")
    duration: str | None = Field(default=None, description="5s | 10s")
    loop: bool | None = None
    # Optional anchoring frames for the generated clip.
    start_frame: Luma2ImageRef | None = None
    end_frame: Luma2ImageRef | None = None
    # Keyframe images and their indexes, both optional lists.
    keyframes: list[Luma2ImageRef] | None = None
    keyframe_indexes: list[int] | None = None
    edit: Luma2VideoEdit | None = None
|
||||
|
||||
|
||||
class Luma2GenerationRequest(BaseModel):
|
||||
@ -266,6 +291,7 @@ class Luma2GenerationRequest(BaseModel):
|
||||
web_search: bool | None = None
|
||||
image_ref: list[Luma2ImageRef] | None = None
|
||||
source: Luma2ImageRef | None = None
|
||||
video: Luma2VideoOptions | None = Field(None)
|
||||
|
||||
|
||||
class Luma2Generation(BaseModel):
|
||||
@ -277,3 +303,31 @@ class Luma2Generation(BaseModel):
|
||||
output: list[LumaImageReference] | None = None
|
||||
failure_reason: str | None = None
|
||||
failure_code: str | None = None
|
||||
|
||||
|
||||
# --- Ray 3.2 multi-keyframe chain ---

# Keyframe position given as a value in [0.0, 1.0] of the output video duration.
LUMA_KEYFRAME_MODE_FRACTION = "fraction"
# Keyframe position given as absolute time, in seconds, from the start of the output.
LUMA_KEYFRAME_MODE_SECONDS = "seconds"
|
||||
|
||||
|
||||
class LumaRay32KeyframeItem:
    """A guide image together with where it sits on the Ray 3.2 output timeline.

    Attributes:
        image: The guide image tensor.
        mode: LUMA_KEYFRAME_MODE_FRACTION or LUMA_KEYFRAME_MODE_SECONDS.
        value: The position, interpreted according to ``mode``.
    """

    def __init__(self, image: torch.Tensor, mode: str, value: float):
        # Plain value holder: the three arguments are stored unchanged.
        self.image, self.mode, self.value = image, mode, value
|
||||
|
||||
|
||||
class LumaRay32KeyframeChain:
    """Ordered collection of keyframe items passed between keyframe nodes."""

    def __init__(self):
        self.items: list[LumaRay32KeyframeItem] = []

    def add(self, item: LumaRay32KeyframeItem) -> None:
        """Append ``item`` to the end of the chain."""
        self.items.append(item)

    def clone(self) -> "LumaRay32KeyframeChain":
        """Return a new chain with its own list holding the same item objects."""
        duplicate = LumaRay32KeyframeChain()
        # Shallow copy: the list is new, the items themselves are shared.
        duplicate.items = self.items[:]
        return duplicate
|
||||
|
||||
@ -60,6 +60,12 @@ from comfy_api_nodes.apis.kling import (
|
||||
OmniProImageRequest,
|
||||
OmniProReferences2VideoRequest,
|
||||
OmniProText2VideoRequest,
|
||||
Kling3TurboSettings,
|
||||
Kling3TurboText2VideoRequest,
|
||||
Kling3TurboContent,
|
||||
Kling3TurboImage2VideoRequest,
|
||||
Kling3TurboCreateResponse,
|
||||
Kling3TurboQueryResponse,
|
||||
TaskStatusResponse,
|
||||
TextToVideoWithAudioRequest,
|
||||
)
|
||||
@ -2847,6 +2853,67 @@ class MotionControl(IO.ComfyNode):
|
||||
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
|
||||
|
||||
|
||||
def build_turbo_shot_prompt(multi_prompt: list[MultiPromptEntry]) -> str:
    """Serialize storyboard entries as the Turbo multi-shot prompt string.

    Each entry becomes ``shot <n>, <duration>, <prompt>`` with ``n`` counted
    from 1 and the duration truncated via ``int()``. Shots are joined with
    ``"; "`` and the whole string ends with a trailing ``";"``.
    """
    shots = []
    for shot_number, entry in enumerate(multi_prompt, start=1):
        shots.append(f"shot {shot_number}, {int(entry.duration)}, {entry.prompt}")
    return "; ".join(shots) + ";"
|
||||
|
||||
|
||||
def _turbo_video_url(response: Kling3TurboQueryResponse) -> str:
|
||||
"""Extract the result video URL from a /tasks response (data[].outputs[] where type == 'video')."""
|
||||
task = response.data[0] if response.data else None
|
||||
if task and task.outputs:
|
||||
for output in task.outputs:
|
||||
if output.type == "video" and output.url:
|
||||
return output.url
|
||||
raise RuntimeError(f"Kling 3.0 Turbo task finished without a video output: {response.model_dump()}")
|
||||
|
||||
|
||||
async def execute_kling_turbo(
    cls: type[IO.ComfyNode],
    *,
    prompt: str,
    resolution: str,
    aspect_ratio: str,
    duration: int,
    start_frame: torch.Tensor | None,
) -> IO.NodeOutput:
    """Submit a Kling 3.0 Turbo task, wait for it, and return the resulting video.

    With ``start_frame`` the image-to-video endpoint is used and
    ``aspect_ratio`` is not sent; without it the text-to-video endpoint is used.

    Raises:
        RuntimeError: if the create response carries no task id, or (via
            ``_turbo_video_url``) the finished task has no video output.
    """
    if start_frame is None:
        endpoint_path = "/proxy/kling/text-to-video/kling-3.0-turbo"
        payload = Kling3TurboText2VideoRequest(
            prompt=prompt,
            settings=Kling3TurboSettings(resolution=resolution, aspect_ratio=aspect_ratio, duration=duration),
        )
    else:
        validate_image_dimensions(start_frame, min_width=300, min_height=300)
        validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
        frame_item = Kling3TurboContent(type="first_frame", url=tensor_to_base64_string(start_frame))
        # The prompt entry is optional and, when present, precedes the frame.
        prompt_items = [Kling3TurboContent(type="prompt", text=prompt)] if prompt else []
        endpoint_path = "/proxy/kling/image-to-video/kling-3.0-turbo"
        payload = Kling3TurboImage2VideoRequest(
            contents=prompt_items + [frame_item],
            settings=Kling3TurboSettings(resolution=resolution, duration=duration),  # i2v: no aspect_ratio
        )

    create = await sync_op(
        cls,
        ApiEndpoint(path=endpoint_path, method="POST"),
        response_model=Kling3TurboCreateResponse,
        data=payload,
    )
    task_id = create.data.id if create.data else None
    if not task_id:
        raise RuntimeError(f"Kling 3.0 Turbo create failed. Code: {create.code}, Message: {create.message}")

    def _first_task_status(resp):
        # Status of the first returned task, or None while no task is listed.
        return resp.data[0].status if resp.data else None

    final_response = await poll_op(
        cls,
        ApiEndpoint(path="/proxy/kling/tasks", query_params={"task_ids": task_id}),
        response_model=Kling3TurboQueryResponse,
        status_extractor=_first_task_status,
    )
    video_url = _turbo_video_url(final_response)
    return IO.NodeOutput(await download_url_to_video_output(video_url))
|
||||
|
||||
|
||||
class KlingVideoNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -2884,7 +2951,11 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
],
|
||||
tooltip="Generate a series of video segments with individual prompts and durations.",
|
||||
),
|
||||
IO.Boolean.Input("generate_audio", default=True),
|
||||
IO.Boolean.Input(
|
||||
"generate_audio",
|
||||
default=True,
|
||||
tooltip="'kling-3.0-turbo' always generates native audio, so the audio toggle is ignored.",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
@ -2899,6 +2970,17 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
),
|
||||
],
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"kling-3.0-turbo",
|
||||
[
|
||||
IO.Combo.Input("resolution", options=["1080p", "720p"], default="720p"),
|
||||
IO.Combo.Input(
|
||||
"aspect_ratio",
|
||||
options=["16:9", "9:16", "1:1"],
|
||||
tooltip="Ignored in image-to-video mode.",
|
||||
),
|
||||
],
|
||||
),
|
||||
],
|
||||
tooltip="Model and generation settings.",
|
||||
),
|
||||
@ -2930,6 +3012,7 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=[
|
||||
"model",
|
||||
"model.resolution",
|
||||
"generate_audio",
|
||||
"multi_shot",
|
||||
@ -2944,14 +3027,7 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$rates := {
|
||||
"4k": {"off": 0.42, "on": 0.42},
|
||||
"1080p": {"off": 0.112, "on": 0.168},
|
||||
"720p": {"off": 0.084, "on": 0.126}
|
||||
};
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$audio := widgets.generate_audio ? "on" : "off";
|
||||
$rate := $lookup($lookup($rates, $res), $audio);
|
||||
$ms := widgets.multi_shot;
|
||||
$isSb := $ms != "disabled";
|
||||
$n := $isSb ? $number($substring($ms, 0, 1)) : 0;
|
||||
@ -2962,7 +3038,18 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
$d5 := $n >= 5 ? $lookup(widgets, "multi_shot.storyboard_5_duration") : 0;
|
||||
$d6 := $n >= 6 ? $lookup(widgets, "multi_shot.storyboard_6_duration") : 0;
|
||||
$dur := $isSb ? $d1 + $d2 + $d3 + $d4 + $d5 + $d6 : $lookup(widgets, "multi_shot.duration");
|
||||
{"type":"usd","usd": $rate * $dur}
|
||||
widgets.model = "kling-3.0-turbo"
|
||||
? {"type":"usd","usd": ($res = "1080p" ? 0.14 : 0.112) * $dur}
|
||||
: (
|
||||
$rates := {
|
||||
"4k": {"off": 0.42, "on": 0.42},
|
||||
"1080p": {"off": 0.112, "on": 0.168},
|
||||
"720p": {"off": 0.084, "on": 0.126}
|
||||
};
|
||||
$audio := widgets.generate_audio ? "on" : "off";
|
||||
$rate := $lookup($lookup($rates, $res), $audio);
|
||||
{"type":"usd","usd": $rate * $dur}
|
||||
)
|
||||
)
|
||||
""",
|
||||
),
|
||||
@ -3015,6 +3102,17 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
duration = multi_shot["duration"]
|
||||
validate_string(multi_shot["prompt"], min_length=1, max_length=2500)
|
||||
|
||||
if model["model"] == "kling-3.0-turbo":
|
||||
turbo_prompt = build_turbo_shot_prompt(multi_prompt_list) if custom_multi_shot else multi_shot["prompt"]
|
||||
return await execute_kling_turbo(
|
||||
cls,
|
||||
prompt=turbo_prompt,
|
||||
resolution=model["resolution"],
|
||||
aspect_ratio=model["aspect_ratio"],
|
||||
duration=duration,
|
||||
start_frame=start_frame,
|
||||
)
|
||||
|
||||
if start_frame is not None:
|
||||
validate_image_dimensions(start_frame, min_width=300, min_height=300)
|
||||
validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
|
||||
|
||||
@ -3,9 +3,13 @@ from typing_extensions import override
|
||||
|
||||
from comfy_api.latest import IO, ComfyExtension, Input
|
||||
from comfy_api_nodes.apis.luma import (
|
||||
LUMA_KEYFRAME_MODE_FRACTION,
|
||||
LUMA_KEYFRAME_MODE_SECONDS,
|
||||
Luma2Generation,
|
||||
Luma2GenerationRequest,
|
||||
Luma2ImageRef,
|
||||
Luma2VideoEdit,
|
||||
Luma2VideoOptions,
|
||||
LumaAspectRatio,
|
||||
LumaCharacterRef,
|
||||
LumaConceptChain,
|
||||
@ -18,6 +22,8 @@ from comfy_api_nodes.apis.luma import (
|
||||
LumaIO,
|
||||
LumaKeyframes,
|
||||
LumaModifyImageRef,
|
||||
LumaRay32KeyframeChain,
|
||||
LumaRay32KeyframeItem,
|
||||
LumaReference,
|
||||
LumaReferenceChain,
|
||||
LumaVideoModel,
|
||||
@ -33,6 +39,7 @@ from comfy_api_nodes.util import (
|
||||
sync_op,
|
||||
upload_image_to_comfyapi,
|
||||
upload_images_to_comfyapi,
|
||||
upload_video_to_comfyapi,
|
||||
validate_string,
|
||||
)
|
||||
|
||||
@ -692,7 +699,10 @@ async def _luma2_upload_image_refs(
|
||||
async def _luma2_submit_and_poll(
|
||||
cls: type[IO.ComfyNode],
|
||||
request: Luma2GenerationRequest,
|
||||
) -> Input.Image:
|
||||
*,
|
||||
estimated_duration: int | None = None,
|
||||
) -> Luma2Generation:
|
||||
"""Submit a Luma Agents generation and poll until done; returns the completed generation."""
|
||||
initial = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/luma_2/generations", method="POST"),
|
||||
@ -700,21 +710,21 @@ async def _luma2_submit_and_poll(
|
||||
data=request,
|
||||
)
|
||||
if not initial.id:
|
||||
raise RuntimeError("Luma 2 API did not return a generation id.")
|
||||
raise RuntimeError("Luma API did not return a generation id.")
|
||||
final = await poll_op(
|
||||
cls,
|
||||
ApiEndpoint(path=f"/proxy/luma_2/generations/{initial.id}", method="GET"),
|
||||
response_model=Luma2Generation,
|
||||
status_extractor=lambda r: r.state,
|
||||
progress_extractor=lambda r: None,
|
||||
estimated_duration=estimated_duration,
|
||||
)
|
||||
if not final.output:
|
||||
if not final.output or not final.output[0].url:
|
||||
msg = final.failure_reason or "no output returned"
|
||||
raise RuntimeError(f"Luma 2 generation failed: {msg}")
|
||||
url = final.output[0].url
|
||||
if not url:
|
||||
raise RuntimeError("Luma 2 generation completed without an output URL.")
|
||||
return await download_url_to_image_tensor(url)
|
||||
if final.failure_code:
|
||||
msg = f"{msg} [{final.failure_code}]"
|
||||
raise RuntimeError(f"Luma generation failed: {msg}")
|
||||
return final
|
||||
|
||||
|
||||
class LumaImageNode(IO.ComfyNode):
|
||||
@ -843,7 +853,8 @@ class LumaImageNode(IO.ComfyNode):
|
||||
web_search=model["web_search"],
|
||||
image_ref=await _luma2_upload_image_refs(cls, model.get("image_ref"), max_count=9),
|
||||
)
|
||||
return IO.NodeOutput(await _luma2_submit_and_poll(cls, request))
|
||||
final = await _luma2_submit_and_poll(cls, request)
|
||||
return IO.NodeOutput(await download_url_to_image_tensor(final.output[0].url))
|
||||
|
||||
|
||||
class LumaImageEditNode(IO.ComfyNode):
|
||||
@ -929,7 +940,533 @@ class LumaImageEditNode(IO.ComfyNode):
|
||||
web_search=model["web_search"],
|
||||
image_ref=await _luma2_upload_image_refs(cls, model.get("image_ref"), max_count=8),
|
||||
)
|
||||
return IO.NodeOutput(await _luma2_submit_and_poll(cls, request))
|
||||
final = await _luma2_submit_and_poll(cls, request)
|
||||
return IO.NodeOutput(await download_url_to_image_tensor(final.output[0].url))
|
||||
|
||||
|
||||
# Fixed USD price looked up by the `resolution` and `duration` widgets.
_BADGE_RAY32_VIDEO = IO.PriceBadge(
    depends_on=IO.PriceBadgeDepends(widgets=["resolution", "duration"]),
    expr="""
    (
      $p := {
        "360p": {"5s": 0.06, "10s": 0.18},
        "540p": {"5s": 0.15, "10s": 0.45},
        "720p": {"5s": 0.3, "10s": 0.9},
        "1080p": {"5s": 1.2, "10s": 3.6}
      };
      {"type": "usd", "usd": $lookup($lookup($p, widgets.resolution), widgets.duration)}
    )
    """,
)

# Fixed USD price looked up by `resolution` only; the values equal the "5s"
# column of _BADGE_RAY32_VIDEO.
_BADGE_RAY32_VIDEO_5S = IO.PriceBadge(
    depends_on=IO.PriceBadgeDepends(widgets=["resolution"]),
    expr="""
    (
      $p := {"360p": 0.06, "540p": 0.15, "720p": 0.3, "1080p": 1.2};
      {"type": "usd", "usd": $lookup($p, widgets.resolution)}
    )
    """,
)

# USD price range (min/max) per resolution, annotated "(by source length)".
_BADGE_RAY32_EDIT = IO.PriceBadge(
    depends_on=IO.PriceBadgeDepends(widgets=["resolution"]),
    expr="""
    (
      $p := {
        "360p": {"min": 0.54, "max": 1.08},
        "540p": {"min": 0.72, "max": 1.44},
        "720p": {"min": 1.08, "max": 2.16},
        "1080p": {"min": 2.16, "max": 4.32}
      };
      $r := $lookup($p, widgets.resolution);
      {"type": "range_usd", "min_usd": $r.min, "max_usd": $r.max, "format": {"note": "(by source length)"}}
    )
    """,
)

# USD price per resolution, displayed with a "/second" suffix.
_BADGE_RAY32_REFRAME = IO.PriceBadge(
    depends_on=IO.PriceBadgeDepends(widgets=["resolution"]),
    expr="""
    (
      $p := {"360p": 0.03, "540p": 0.06, "720p": 0.12, "1080p": 0.36};
      {"type": "usd", "usd": $lookup($p, widgets.resolution), "format": {"suffix": "/second"}}
    )
    """,
)
|
||||
|
||||
|
||||
def _ray32_seed_input() -> IO.Input:
    """Build the shared ``seed`` integer input used by the Ray 3.2 nodes.

    The accepted range is 0 .. 0xFFFFFFFFFFFFFFFF. Per the tooltip, the seed
    only decides whether the node re-runs; it does not make results
    deterministic.
    """
    seed_widget = IO.Int.Input(
        "seed",
        default=0,
        min=0,
        max=0xFFFFFFFFFFFFFFFF,
        control_after_generate=True,
        tooltip="Seed to determine if node should re-run; results are nondeterministic regardless of seed.",
    )
    return seed_widget
|
||||
|
||||
|
||||
async def _ray32_generate(cls: type[IO.ComfyNode], request: Luma2GenerationRequest) -> IO.NodeOutput:
    """Submit ``request``, wait for completion, and return (video, generation_id).

    The first output URL of the finished generation is downloaded as the
    video. A missing generation id is returned as an empty string.
    """
    generation = await _luma2_submit_and_poll(cls, request, estimated_duration=120)
    first_output_url = generation.output[0].url
    downloaded = await download_url_to_video_output(first_output_url)
    generation_id = generation.id or ""
    return IO.NodeOutput(downloaded, generation_id)
|
||||
|
||||
|
||||
# API node: Ray 3.2 text-to-video. Outputs the video and its generation id.
class LumaRay32TextToVideoNode(IO.ComfyNode):
    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Declare the node's widgets, outputs, hidden auth inputs and price badge."""
        node_inputs = [
            IO.String.Input("prompt", multiline=True, default="", tooltip="Text prompt for the video generation."),
            IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1", "4:3", "3:4", "21:9"]),
            IO.Combo.Input("resolution", options=["360p", "540p", "720p", "1080p"], default="720p"),
            IO.Combo.Input("duration", options=["5s", "10s"]),
            IO.Boolean.Input(
                "loop",
                default=False,
                tooltip="Make the video loop seamlessly. Only available with 5s duration.",
            ),
            _ray32_seed_input(),
        ]
        hidden_inputs = [
            IO.Hidden.auth_token_comfy_org,
            IO.Hidden.api_key_comfy_org,
            IO.Hidden.unique_id,
        ]
        return IO.Schema(
            node_id="LumaRay32TextToVideoNode",
            display_name="Luma Ray 3.2 Text to Video",
            category="partner/video/Luma",
            description="Generate a video from a text prompt using Luma's Ray 3.2 model.",
            inputs=node_inputs,
            outputs=[IO.Video.Output(), IO.String.Output(display_name="generation_id")],
            hidden=hidden_inputs,
            is_api_node=True,
            price_badge=_BADGE_RAY32_VIDEO,
        )

    @classmethod
    async def execute(
        cls, prompt: str, aspect_ratio: str, resolution: str, duration: str, loop: bool, seed: int
    ) -> IO.NodeOutput:
        """Validate the inputs, build the ray-3.2 video request and run it.

        ``seed`` is accepted but not forwarded to the request.

        Raises:
            ValueError: if ``loop`` is requested together with a 10s duration.
        """
        validate_string(prompt, strip_whitespace=True, min_length=1, max_length=6000)
        if loop and duration == "10s":
            raise ValueError("Looping is only available with 5s duration on Ray 3.2.")
        # `loop` is sent only when enabled; a disabled toggle becomes None.
        video_options = Luma2VideoOptions(
            resolution=resolution,
            duration=duration,
            loop=loop if loop else None,
        )
        request = Luma2GenerationRequest(
            prompt=prompt,
            model="ray-3.2",
            type="video",
            aspect_ratio=aspect_ratio,
            video=video_options,
        )
        return await _ray32_generate(cls, request)
|
||||
|
||||
|
||||
# API node: Ray 3.2 image-to-video from a start and/or end frame. The request
# always uses a "5s" duration.
class LumaRay32ImageToVideoNode(IO.ComfyNode):
    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Declare the node's widgets, optional frame inputs, outputs and price badge."""
        node_inputs = [
            IO.String.Input("prompt", multiline=True, default="", tooltip="Text prompt for the video generation."),
            IO.Combo.Input("resolution", options=["360p", "540p", "720p", "1080p"], default="720p"),
            IO.Boolean.Input(
                "loop",
                default=False,
                tooltip="Make the video loop seamlessly. Not available when an end_frame is set.",
            ),
            _ray32_seed_input(),
            IO.Image.Input("start_frame", optional=True, tooltip="First frame of the generated video."),
            IO.Image.Input("end_frame", optional=True, tooltip="Last frame of the generated video."),
        ]
        hidden_inputs = [
            IO.Hidden.auth_token_comfy_org,
            IO.Hidden.api_key_comfy_org,
            IO.Hidden.unique_id,
        ]
        return IO.Schema(
            node_id="LumaRay32ImageToVideoNode",
            display_name="Luma Ray 3.2 Image to Video",
            category="partner/video/Luma",
            description="Generate a video from a start and/or end frame using Luma's Ray 3.2 model. "
            "Image-anchored generations are always 5 seconds.",
            inputs=node_inputs,
            outputs=[IO.Video.Output(), IO.String.Output(display_name="generation_id")],
            hidden=hidden_inputs,
            is_api_node=True,
            price_badge=_BADGE_RAY32_VIDEO_5S,
        )

    @classmethod
    async def execute(
        cls,
        prompt: str,
        resolution: str,
        loop: bool,
        seed: int,
        start_frame: torch.Tensor | None = None,
        end_frame: torch.Tensor | None = None,
    ) -> IO.NodeOutput:
        """Upload the supplied frame(s), build the ray-3.2 request and run it.

        ``seed`` is accepted but not forwarded to the request.

        Raises:
            ValueError: if neither frame is supplied, or if ``loop`` is
                requested together with an ``end_frame``.
        """
        validate_string(prompt, strip_whitespace=True, min_length=1, max_length=6000)
        if start_frame is None and end_frame is None:
            raise ValueError("Provide at least one of start_frame / end_frame.")
        if loop and end_frame is not None:
            raise ValueError("Looping is not available when an end_frame is set.")

        video = Luma2VideoOptions(resolution=resolution, duration="5s", loop=loop if loop else None)
        # Upload in start -> end order and attach each frame to its matching field.
        for field_name, frame in (("start_frame", start_frame), ("end_frame", end_frame)):
            if frame is None:
                continue
            uploaded_url = await upload_image_to_comfyapi(cls, frame, mime_type="image/png")
            setattr(video, field_name, Luma2ImageRef(url=uploaded_url))

        request = Luma2GenerationRequest(prompt=prompt, model="ray-3.2", type="video", video=video)
        return await _ray32_generate(cls, request)
|
||||
|
||||
|
||||
class LumaRay32KeyframeNode(IO.ComfyNode):
    # Builds one link of a keyframe chain: a guide image plus the position
    # (fraction of the duration, or absolute seconds) at which it applies.

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Describe the node: an image, a position selector and an optional upstream chain."""
        by_fraction = IO.DynamicCombo.Option(
            "Fraction of duration (0.0-1.0)",
            [
                IO.Float.Input(
                    "fraction",
                    default=0.0,
                    min=0.0,
                    max=1.0,
                    step=0.01,
                    display_mode=IO.NumberDisplay.number,
                    tooltip="Where in the output video this image applies (0.0 = start, 1.0 = end).",
                ),
            ],
        )
        by_seconds = IO.DynamicCombo.Option(
            "Absolute time (seconds)",
            [
                IO.Float.Input(
                    "seconds",
                    default=0.0,
                    min=0.0,
                    max=10.0,
                    step=0.1,
                    display_mode=IO.NumberDisplay.number,
                    tooltip="Time in seconds from the start of the output video where this image applies.",
                ),
            ],
        )
        chain_type = IO.Custom(LumaIO.LUMA_RAY32_KEYFRAME)
        return IO.Schema(
            node_id="LumaRay32KeyframeNode",
            display_name="Luma Ray 3.2 Keyframe",
            category="partner/video/Luma",
            description="Anchor a guide image to a position on the Ray 3.2 output video timeline. Connect this to "
            "the 'keyframes' input of the Luma Ray 3.2 Keyframes to Video node; chain several together via the "
            "optional 'keyframes' input below.",
            inputs=[
                IO.Image.Input("image", tooltip="Guide image to place at the chosen moment of the output video."),
                IO.DynamicCombo.Input(
                    "position",
                    options=[by_fraction, by_seconds],
                    tooltip="How to place this image on the output video's timeline.",
                ),
                chain_type.Input(
                    "keyframes",
                    optional=True,
                    tooltip="Optional earlier keyframes to chain with this one.",
                ),
            ],
            outputs=[chain_type.Output(display_name="keyframes")],
        )

    @classmethod
    def execute(
        cls,
        image: torch.Tensor,
        position: dict,
        keyframes: LumaRay32KeyframeChain | None = None,
    ) -> IO.NodeOutput:
        """Return a chain containing the upstream keyframes plus this one.

        The incoming chain is cloned before the new item is added, so the
        object passed in as ``keyframes`` is not the one that gets extended.
        ``position`` is the dynamic-combo payload: its ``"position"`` entry
        names the selected option and the matching ``"seconds"`` or
        ``"fraction"`` entry holds the numeric value.
        """
        chain = LumaRay32KeyframeChain() if keyframes is None else keyframes.clone()

        in_seconds = position["position"] == "Absolute time (seconds)"
        if in_seconds:
            mode = LUMA_KEYFRAME_MODE_SECONDS
            value = float(position["seconds"])
        else:
            mode = LUMA_KEYFRAME_MODE_FRACTION
            value = float(position["fraction"])

        chain.add(LumaRay32KeyframeItem(image=image, mode=mode, value=value))
        return IO.NodeOutput(chain)
|
||||
|
||||
|
||||
class LumaRay32KeyframesToVideoNode(IO.ComfyNode):
    # Turns a chain of positioned guide images into a single Ray 3.2 video request.

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Describe the node: prompt, resolution, duration, seed and the keyframe chain."""
        node_inputs = [
            IO.String.Input("prompt", multiline=True, default="", tooltip="Text prompt for the video generation."),
            IO.Combo.Input("resolution", options=["360p", "540p", "720p", "1080p"], default="720p"),
            IO.Combo.Input("duration", options=["5s", "10s"]),
            _ray32_seed_input(),
            IO.Custom(LumaIO.LUMA_RAY32_KEYFRAME).Input(
                "keyframes",
                tooltip="Keyframe sequence from Luma Ray 3.2 Keyframe nodes (at least 2).",
            ),
        ]
        return IO.Schema(
            node_id="LumaRay32KeyframesToVideoNode",
            display_name="Luma Ray 3.2 Keyframes to Video",
            category="partner/video/Luma",
            description="Generate a video that interpolates through a sequence of guide images, each anchored to a "
            "position on the timeline, using Luma Ray 3.2. Build the sequence with Luma Ray 3.2 Keyframe nodes "
            "(at least 2).",
            inputs=node_inputs,
            outputs=[IO.Video.Output(), IO.String.Output(display_name="generation_id")],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=_BADGE_RAY32_VIDEO,
        )

    @classmethod
    async def execute(
        cls,
        prompt: str,
        resolution: str,
        duration: str,
        seed: int,
        keyframes: LumaRay32KeyframeChain | None = None,
    ) -> IO.NodeOutput:
        """Resolve keyframes to output-frame indexes, upload the images and start the generation.

        Positions are converted at 24 frames per second of ``duration``
        ("5s" -> last index 120, anything else -> 240), clamped to the valid
        range and sorted, so the chain may be wired in any order. ``seed`` is
        accepted but not read in this method.

        Raises:
            ValueError: fewer than 2 or more than 64 keyframes, a seconds-mode
                keyframe beyond the end of the video, or two keyframes landing
                on the same output frame.
        """
        validate_string(prompt, strip_whitespace=True, min_length=1, max_length=6000)

        items = [] if keyframes is None else keyframes.items
        count = len(items)
        if count < 2:
            raise ValueError(
                "Connect at least 2 Luma Ray 3.2 Keyframe nodes "
                "(use Luma Ray 3.2 Image to Video for a single start/end frame)."
            )
        if count > 64:
            raise ValueError(f"Ray 3.2 supports at most 64 keyframes; got {count}.")

        last_frame = 120 if duration == "5s" else 240
        clip_seconds = last_frame / 24  # 5.0 or 10.0

        # Map every keyframe to an output-frame index first; ordering happens
        # afterwards so the position (not the wiring order) decides placement.
        unordered: list[tuple[int, torch.Tensor]] = []
        for keyframe in items:
            if keyframe.mode == LUMA_KEYFRAME_MODE_SECONDS:
                if keyframe.value > clip_seconds:
                    raise ValueError(
                        f"Keyframe position {keyframe.value:g}s is past the end of the {duration} video; "
                        f"use 0-{clip_seconds:g}s (or switch the keyframe to fraction mode)."
                    )
                frame_no = round(keyframe.value * 24)
            else:
                frame_no = round(keyframe.value * last_frame)
            unordered.append((min(max(frame_no, 0), last_frame), keyframe.image))

        placed = sorted(unordered, key=lambda pair: pair[0])
        indexes = [frame_no for frame_no, _ in placed]

        # After sorting, duplicates can only sit next to each other.
        for pos in range(1, len(indexes)):
            if indexes[pos - 1] == indexes[pos]:
                raise ValueError(
                    f"Two keyframes resolve to the same output frame ({indexes[pos]}) for a {duration} video "
                    f"(valid range 0-{last_frame}); give each keyframe a distinct position."
                )

        # Uploads run one after another, in timeline order.
        refs: list[Luma2ImageRef] = [
            Luma2ImageRef(url=await upload_image_to_comfyapi(cls, picture, mime_type="image/png"))
            for _, picture in placed
        ]

        video_options = Luma2VideoOptions(
            resolution=resolution,
            duration=duration,
            keyframes=refs,
            keyframe_indexes=indexes,
        )
        request = Luma2GenerationRequest(prompt=prompt, model="ray-3.2", type="video", video=video_options)
        return await _ray32_generate(cls, request)
|
||||
|
||||
|
||||
class LumaRay32VideoEditNode(IO.ComfyNode):
    # Re-renders an uploaded source video under a new prompt with Ray 3.2.

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Describe the node: source video, prompt, resolution, edit strength and seed."""
        # "auto" followed by three levels for each of the three strength families.
        strength_choices = ["auto"] + [
            f"{family}_{level}" for family in ("adhere", "flex", "reimagine") for level in (1, 2, 3)
        ]
        return IO.Schema(
            node_id="LumaRay32VideoEditNode",
            display_name="Luma Ray 3.2 Video Edit",
            category="partner/video/Luma",
            description="Re-render an existing video under a new prompt using Luma Ray 3.2 (restyle, relight, add "
            "or remove elements) while keeping the original motion. Source video up to 18 seconds; the edited "
            "video keeps the source's length.",
            inputs=[
                IO.Video.Input("video", tooltip="Source video to edit. Up to 18 seconds."),
                IO.String.Input("prompt", multiline=True, default="", tooltip="Describes the desired edit."),
                IO.Combo.Input("resolution", options=["360p", "540p", "720p", "1080p"], default="720p"),
                IO.Combo.Input(
                    "strength",
                    options=strength_choices,
                    default="auto",
                    tooltip="How strongly to preserve vs. reimagine the source. 'auto' lets Ray 3.2 choose; "
                    "adhere_* preserves the most, flex_* is balanced, reimagine_* changes the most.",
                ),
                _ray32_seed_input(),
            ],
            outputs=[
                IO.Video.Output(),
                IO.String.Output(display_name="generation_id"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=_BADGE_RAY32_EDIT,
        )

    @classmethod
    async def execute(
        cls, video: Input.Video, prompt: str, resolution: str, strength: str, seed: int
    ) -> IO.NodeOutput:
        """Upload the source video and submit a ``video_edit`` generation request.

        The ``duration`` sent with the request is "5s" when the source reports
        a duration of at most 5 seconds and "10s" otherwise, including when the
        duration cannot be read. ``seed`` is accepted but not read here.
        """
        validate_string(prompt, strip_whitespace=True, min_length=1, max_length=6000)

        # Best effort: any failure while probing the duration keeps the "10s" default.
        duration = "10s"
        try:
            if video.get_duration() <= 5.0:
                duration = "5s"
        except Exception:
            pass

        source_url = await upload_video_to_comfyapi(cls, video, max_duration=18)

        # "auto" delegates the choice to the service; any other value is passed through.
        edit_kwargs = {"auto_controls": True} if strength == "auto" else {"strength": strength}
        edit = Luma2VideoEdit(**edit_kwargs)

        return await _ray32_generate(
            cls,
            Luma2GenerationRequest(
                prompt=prompt,
                model="ray-3.2",
                type="video_edit",
                source=Luma2ImageRef(url=source_url, media_type="video/mp4"),
                video=Luma2VideoOptions(resolution=resolution, duration=duration, edit=edit),
            ),
        )
|
||||
|
||||
|
||||
class LumaRay32VideoReframeNode(IO.ComfyNode):
    # Changes the aspect ratio of an uploaded video, letting Ray 3.2 fill the new canvas.

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Describe the node: source video, fill prompt, target aspect ratio, resolution and seed."""
        node_inputs = [
            IO.Video.Input("video", tooltip="Source video to reframe. Up to 30 seconds."),
            IO.String.Input(
                "prompt",
                multiline=True,
                default="",
                tooltip="Describes how the newly exposed canvas areas should be filled.",
            ),
            IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1", "4:3", "3:4", "21:9"]),
            IO.Combo.Input("resolution", options=["360p", "540p", "720p", "1080p"], default="720p"),
            _ray32_seed_input(),
        ]
        return IO.Schema(
            node_id="LumaRay32VideoReframeNode",
            display_name="Luma Ray 3.2 Video Reframe",
            category="partner/video/Luma",
            description="Change the aspect ratio of an existing video, using Luma Ray 3.2 to fill the newly "
            "exposed canvas areas. Source video up to 30 seconds. Billed per second of output.",
            inputs=node_inputs,
            outputs=[
                IO.Video.Output(),
                IO.String.Output(display_name="generation_id"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=_BADGE_RAY32_REFRAME,
        )

    @classmethod
    async def execute(
        cls, video: Input.Video, prompt: str, aspect_ratio: str, resolution: str, seed: int
    ) -> IO.NodeOutput:
        """Upload the source video and submit a ``video_reframe`` generation request.

        The prompt is validated without whitespace stripping. ``seed`` is
        accepted but not read in this method.

        Raises:
            ValueError: when 1080p is combined with a vertical aspect ratio (9:16 or 3:4).
        """
        validate_string(prompt, strip_whitespace=False, min_length=1, max_length=6000)

        is_vertical = aspect_ratio in ("9:16", "3:4")
        if is_vertical and resolution == "1080p":
            raise ValueError("1080p is not available for vertical aspect ratios (9:16, 3:4) when reframing.")

        source_url = await upload_video_to_comfyapi(cls, video, max_duration=30)
        source_ref = Luma2ImageRef(url=source_url, media_type="video/mp4")

        return await _ray32_generate(
            cls,
            Luma2GenerationRequest(
                prompt=prompt,
                model="ray-3.2",
                type="video_reframe",
                aspect_ratio=aspect_ratio,
                source=source_ref,
                video=Luma2VideoOptions(resolution=resolution),
            ),
        )
|
||||
|
||||
|
||||
class LumaRay32ExtendVideoNode(IO.ComfyNode):
    # Extends an earlier Ray 3.2 generation (referenced by id) forward or backward by 5 seconds.

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Describe the node: source generation id, direction (with optional loop), prompt, resolution, seed."""
        forward = IO.DynamicCombo.Option(
            "Forward (continue after)",
            [
                IO.Boolean.Input(
                    "loop",
                    default=False,
                    tooltip="Loop the extended video seamlessly (forward extend only).",
                ),
            ],
        )
        backward = IO.DynamicCombo.Option("Backward (lead-in before)", [])
        return IO.Schema(
            node_id="LumaRay32ExtendVideoNode",
            display_name="Luma Ray 3.2 Extend Video",
            category="partner/video/Luma",
            description="Extend a previous Ray 3.2 generation forward (continue after it) or backward (lead-in "
            "before it). Connect the generation_id output of a prior Luma Ray 3.2 node."
            " Extensions are always 5 seconds.",
            inputs=[
                IO.String.Input(
                    "source_generation_id",
                    default="",
                    tooltip="generation_id of the prior Ray 3.2 video to extend."
                    " Connect the generation_id output of another Luma Ray 3.2 node.",
                ),
                IO.DynamicCombo.Input(
                    "direction",
                    options=[forward, backward],
                    tooltip="Forward continues after the prior clip; backward is prepended before it.",
                ),
                IO.String.Input("prompt", multiline=True, default="", tooltip="Text prompt for the new content."),
                IO.Combo.Input("resolution", options=["540p", "720p", "1080p"], default="720p"),
                _ray32_seed_input(),
            ],
            outputs=[
                IO.Video.Output(),
                IO.String.Output(display_name="generation_id"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=_BADGE_RAY32_VIDEO_5S,
        )

    @classmethod
    async def execute(
        cls, source_generation_id: str, direction: dict, prompt: str, resolution: str, seed: int
    ) -> IO.NodeOutput:
        """Submit a 5-second extension of a prior generation.

        A forward extension passes the prior generation as ``start_frame``
        (optionally with ``loop``); a backward extension passes it as
        ``end_frame``. ``seed`` is accepted but not read in this method.

        Raises:
            ValueError: when ``source_generation_id`` is empty or only whitespace.
        """
        validate_string(prompt, strip_whitespace=False, min_length=1, max_length=6000)

        prior_id = (source_generation_id or "").strip()
        if prior_id == "":
            raise ValueError(
                "source_generation_id is required (connect the generation_id output of a prior Luma Ray 3.2 node)."
            )

        options = Luma2VideoOptions(resolution=resolution, duration="5s")
        prior_ref = Luma2ImageRef(generation_id=prior_id)

        going_forward = direction["direction"] == "Forward (continue after)"
        if not going_forward:
            options.end_frame = prior_ref
        else:
            options.start_frame = prior_ref
            # "loop" only exists in the payload of the forward option.
            if direction.get("loop"):
                options.loop = True

        request = Luma2GenerationRequest(prompt=prompt, model="ray-3.2", type="video", video=options)
        return await _ray32_generate(cls, request)
|
||||
|
||||
|
||||
class LumaExtension(ComfyExtension):
|
||||
@ -944,6 +1481,13 @@ class LumaExtension(ComfyExtension):
|
||||
LumaConceptsNode,
|
||||
LumaImageNode,
|
||||
LumaImageEditNode,
|
||||
LumaRay32TextToVideoNode,
|
||||
LumaRay32ImageToVideoNode,
|
||||
LumaRay32KeyframeNode,
|
||||
LumaRay32KeyframesToVideoNode,
|
||||
LumaRay32VideoEditNode,
|
||||
LumaRay32VideoReframeNode,
|
||||
LumaRay32ExtendVideoNode,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -100,8 +100,7 @@ class SoniloTextToMusic(IO.ComfyNode):
|
||||
node_id="SoniloTextToMusic",
|
||||
display_name="Sonilo Text to Music",
|
||||
category="partner/audio/Sonilo",
|
||||
description="Generate music from a text prompt using Sonilo's AI model. "
|
||||
"Leave duration at 0 to let the model infer it from the prompt.",
|
||||
description="Generate music from a text prompt using Sonilo's AI model.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
@ -111,11 +110,10 @@ class SoniloTextToMusic(IO.ComfyNode):
|
||||
),
|
||||
IO.Int.Input(
|
||||
"duration",
|
||||
default=0,
|
||||
min=0,
|
||||
default=30,
|
||||
min=1,
|
||||
max=360,
|
||||
tooltip="Target duration in seconds. Set to 0 to let the model "
|
||||
"infer the duration from the prompt. Maximum: 6 minutes.",
|
||||
tooltip="Target duration in seconds. Maximum: 6 minutes.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
@ -136,13 +134,7 @@ class SoniloTextToMusic(IO.ComfyNode):
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["duration"]),
|
||||
expr="""
|
||||
(
|
||||
widgets.duration > 0
|
||||
? {"type":"usd","usd": 0.005 * widgets.duration}
|
||||
: {"type":"usd","usd": 0.005, "format":{"suffix":"/second"}}
|
||||
)
|
||||
""",
|
||||
expr='{"type":"usd","usd": 0.0025 * widgets.duration}',
|
||||
),
|
||||
)
|
||||
|
||||
@ -150,14 +142,13 @@ class SoniloTextToMusic(IO.ComfyNode):
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
duration: int = 0,
|
||||
duration: int = 1,
|
||||
seed: int = 0,
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||
validate_string(prompt, strip_whitespace=True, min_length=1, max_length=1000)
|
||||
form = aiohttp.FormData()
|
||||
form.add_field("prompt", prompt)
|
||||
if duration > 0:
|
||||
form.add_field("duration", str(duration))
|
||||
form.add_field("duration", str(duration))
|
||||
audio_bytes = await _stream_sonilo_music(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/sonilo/t2m/generate", method="POST"),
|
||||
|
||||
@ -4,6 +4,8 @@ import os
|
||||
import re
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
from io import BytesIO
|
||||
|
||||
from yarl import URL
|
||||
@ -91,6 +93,32 @@ async def sleep_with_interrupt(
|
||||
await asyncio.sleep(min(1.0, end - now))
|
||||
|
||||
|
||||
def _retry_after_wait(value: str | None, fallback: float, max_wait: float) -> float:
|
||||
"""Delay before the next retry, honoring a server ``Retry-After`` header."""
|
||||
|
||||
seconds: float | None = None
|
||||
if value is not None:
|
||||
value = value.strip()
|
||||
if value.isascii() and value.isdigit():
|
||||
# delay-seconds form. The ASCII-digit guard keeps exotic Unicode "digit" characters away from float()
|
||||
# an all-digit string always converts (huge values become inf, never raising).
|
||||
seconds = float(value)
|
||||
elif value:
|
||||
# HTTP-date form. parsedate_to_datetime raises OverflowError (not a ValueError) on absurd years/offsets
|
||||
try:
|
||||
parsed = parsedate_to_datetime(value)
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
parsed = None
|
||||
if parsed is not None:
|
||||
if parsed.tzinfo is None: # naive datetime: HTTP-date is UTC
|
||||
parsed = parsed.replace(tzinfo=timezone.utc)
|
||||
delta = (parsed - datetime.now(timezone.utc)).total_seconds()
|
||||
seconds = delta if delta > 0 else 0.0
|
||||
if seconds is None:
|
||||
return fallback
|
||||
return min(seconds, max_wait)
|
||||
|
||||
|
||||
def mimetype_to_extension(mime_type: str) -> str:
    """Return the lower-cased text after the last ``/`` of a MIME type.

    A string without any ``/`` is returned lower-cased as a whole.
    """
    _, _, subtype = mime_type.rpartition("/")
    return subtype.lower()
|
||||
|
||||
@ -21,6 +21,7 @@ from server import PromptServer
|
||||
|
||||
from . import request_logger
|
||||
from ._helpers import (
|
||||
_retry_after_wait,
|
||||
default_base_url,
|
||||
get_comfy_api_headers,
|
||||
get_node_id,
|
||||
@ -82,6 +83,7 @@ class _PollUIState:
|
||||
|
||||
|
||||
_RETRY_STATUS = {408, 500, 502, 503, 504} # status 429 is handled separately
|
||||
_MAX_RETRY_AFTER_WAIT = 150.0 # Cap a server Retry-After at this many seconds so a large hint can't block execution
|
||||
COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"]
|
||||
FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"]
|
||||
QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing", "wait", "in_queue"]
|
||||
@ -747,6 +749,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
|
||||
should_retry = True
|
||||
|
||||
if should_retry:
|
||||
wait_time = _retry_after_wait(resp.headers.get("Retry-After"), wait_time, _MAX_RETRY_AFTER_WAIT)
|
||||
logging.warning(
|
||||
"HTTP %s %s -> %s. Waiting %.2fs (%s).",
|
||||
method,
|
||||
|
||||
@ -11,7 +11,7 @@ class TextEncodeAceStepAudio(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="TextEncodeAceStepAudio",
|
||||
category="model/conditioning",
|
||||
category="model/conditioning/ace",
|
||||
inputs=[
|
||||
IO.Clip.Input("clip"),
|
||||
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
@ -33,7 +33,7 @@ class TextEncodeAceStepAudio15(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="TextEncodeAceStepAudio1.5",
|
||||
category="model/conditioning",
|
||||
category="model/conditioning/ace",
|
||||
inputs=[
|
||||
IO.Clip.Input("clip"),
|
||||
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
|
||||
@ -67,7 +67,7 @@ class EmptyAceStepLatentAudio(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="EmptyAceStepLatentAudio",
|
||||
display_name="Empty Ace Step 1.0 Latent Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ace",
|
||||
inputs=[
|
||||
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
|
||||
IO.Int.Input(
|
||||
@ -90,7 +90,7 @@ class EmptyAceStep15LatentAudio(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="EmptyAceStep1.5LatentAudio",
|
||||
display_name="Empty Ace Step 1.5 Latent Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ace",
|
||||
inputs=[
|
||||
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
|
||||
IO.Int.Input(
|
||||
@ -111,8 +111,8 @@ class ReferenceAudio(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="ReferenceTimbreAudio",
|
||||
display_name="Reference Audio",
|
||||
category="advanced/conditioning/audio",
|
||||
display_name="Set Reference Audio",
|
||||
category="model/conditioning",
|
||||
is_experimental=True,
|
||||
description="This node sets the reference audio for ace step 1.5",
|
||||
inputs=[
|
||||
|
||||
@ -16,7 +16,7 @@ class APG(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="APG",
|
||||
display_name="Adaptive Projected Guidance",
|
||||
category="model/sampling/custom_sampling",
|
||||
category="model/sampling/custom",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input(
|
||||
|
||||
@ -19,7 +19,7 @@ class EmptyARVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptyARVideoLatent",
|
||||
category="model/latent/video",
|
||||
category="model/latent/autoregressive",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=832, min=16, max=8192, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=8192, step=16),
|
||||
@ -85,7 +85,7 @@ class ARVideoI2V(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ARVideoI2V",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/autoregressive",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Vae.Input("vae"),
|
||||
|
||||
@ -16,7 +16,7 @@ class EmptyLatentAudio(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="EmptyLatentAudio",
|
||||
display_name="Empty Latent Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent",
|
||||
essentials_category="Audio",
|
||||
inputs=[
|
||||
IO.Float.Input("seconds", default=47.6, min=1.0, max=1000.0, step=0.1),
|
||||
@ -41,7 +41,7 @@ class ConditioningStableAudio(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="ConditioningStableAudio",
|
||||
category="model/conditioning",
|
||||
category="model/conditioning/stable audio",
|
||||
inputs=[
|
||||
IO.Conditioning.Input("positive"),
|
||||
IO.Conditioning.Input("negative"),
|
||||
@ -70,7 +70,7 @@ class VAEEncodeAudio(IO.ComfyNode):
|
||||
node_id="VAEEncodeAudio",
|
||||
search_aliases=["audio to latent"],
|
||||
display_name="VAE Encode Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
IO.Audio.Input("audio"),
|
||||
IO.Vae.Input("vae"),
|
||||
@ -115,7 +115,7 @@ class VAEDecodeAudio(IO.ComfyNode):
|
||||
node_id="VAEDecodeAudio",
|
||||
search_aliases=["latent to audio"],
|
||||
display_name="VAE Decode Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
IO.Latent.Input("samples"),
|
||||
IO.Vae.Input("vae"),
|
||||
@ -137,7 +137,7 @@ class VAEDecodeAudioTiled(IO.ComfyNode):
|
||||
node_id="VAEDecodeAudioTiled",
|
||||
search_aliases=["latent to audio"],
|
||||
display_name="VAE Decode Audio (Tiled)",
|
||||
category="model/latent/audio",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
IO.Latent.Input("samples"),
|
||||
IO.Vae.Input("vae"),
|
||||
|
||||
@ -39,9 +39,9 @@ class BerniniConditioning(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="BerniniConditioning",
|
||||
display_name="Bernini Conditioning",
|
||||
category="conditioning/video_models",
|
||||
category="model/conditioning/bernini",
|
||||
description="Conditioning node for Bernini in-context video/image conditioning. It can be used for the following tasks: t2v (text-to-video), v2v (video-to-video), rv2v (reference-guided video editing), r2v (reference-to-video), ads2v (insert image/video into video)."
|
||||
"Reference images injected as in-context tokens (r2v, rv2v) are encoded independently at their own native aspect ratio (long edge capped at ref_max_size)",
|
||||
"Reference images injected as in-context tokens (r2v, rv2v) are encoded independently at their own native aspect ratio (long edge capped at ref_max_size)",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -50,14 +50,11 @@ class BerniniConditioning(io.ComfyNode):
|
||||
io.Int.Input("height", default=480, min=16, max=8192, step=16),
|
||||
io.Int.Input("length", default=81, min=1, max=8192, step=4),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
io.Image.Input("source_video", optional=True, tooltip=(
|
||||
"Source video to edit or restyle (v2v, rv2v). Resized to width/height and trimmed to length.")),
|
||||
io.Image.Input("reference_video", optional=True, tooltip=(
|
||||
"Video to insert into the source video (ads2v).")),
|
||||
io.Image.Input("source_video", optional=True, tooltip=("Source video to edit or restyle (v2v, rv2v). Resized to width/height and trimmed to length.")),
|
||||
io.Image.Input("reference_video", optional=True, tooltip=("Video to insert into the source video (ads2v).")),
|
||||
io.Autogrow.Input("reference_images", optional=True,
|
||||
template=io.Autogrow.TemplatePrefix(
|
||||
input=io.Image.Input("reference_image", tooltip=(
|
||||
"Reference image injected as an in-context token (r2v, rv2v).")),
|
||||
input=io.Image.Input("reference_image", tooltip=("Reference image injected as an in-context token (r2v, rv2v).")),
|
||||
prefix="reference_image_", min=0, max=8)),
|
||||
io.Int.Input("ref_max_size", default=848, min=16, max=8192, step=16, optional=True, tooltip=(
|
||||
"Max size for the long edge of reference_video and reference_images. Resized with preserved aspect ratio and snapped to 16px.")),
|
||||
@ -70,10 +67,8 @@ class BerniniConditioning(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, positive, negative, vae, width, height, length, batch_size,
|
||||
source_video=None, reference_video=None, reference_images=None, ref_max_size=848) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8],
|
||||
device=comfy.model_management.intermediate_device())
|
||||
def execute(cls, positive, negative, vae, width, height, length, batch_size, source_video=None, reference_video=None, reference_images=None, ref_max_size=848) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
|
||||
# source_video (1), reference_video (2), reference_images (3, 4, ...).
|
||||
context = []
|
||||
@ -106,9 +101,7 @@ class BerniniConditioning(io.ComfyNode):
|
||||
class BerniniExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
BerniniConditioning,
|
||||
]
|
||||
return [BerniniConditioning,]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> BerniniExtension:
|
||||
|
||||
97
comfy_extras/nodes_boogu.py
Normal file
97
comfy_extras/nodes_boogu.py
Normal file
@ -0,0 +1,97 @@
|
||||
import math
|
||||
|
||||
import node_helpers
|
||||
import comfy.utils
|
||||
from typing_extensions import override
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
class TextEncodeBooguEdit(io.ComfyNode):
    """Conditioning for Boogu-Image Edit.

    Each edit image feeds two paths, mirroring the reference pipeline:
      - Qwen3-VL vision tokens (instruction understanding) -> positive only
      - VAE reference latent (image identity)               -> positive and negative
    Because the reference latent is present in both conds it cancels under CFG
    (identity preserved), while the vision tokens exist only in the positive
    cond so CFG amplifies the instruction. The tokenizer picks the matching
    system prompt on its own (image -> TI2I, empty negative -> DROP), so no
    template plumbing is required in this node.
    """

    @classmethod
    def define_schema(cls):
        """Describe the node: clip, prompts, vae and up to 16 auto-growing image inputs."""
        image_slots = io.Autogrow.Input(
            "images",
            template=io.Autogrow.TemplateNames(
                io.Image.Input("image"),
                names=[f"image_{slot}" for slot in range(1, 17)],
                min=0,
            ),
            tooltip="Reference image(s) to edit. Boogu focuses on one reference per sample; more are allowed.",
        )
        return io.Schema(
            node_id="TextEncodeBooguEdit",
            category="model/conditioning/boogu",
            inputs=[
                io.Clip.Input("clip"),
                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
                io.String.Input("negative_prompt", multiline=True, dynamic_prompts=True, advanced=True),
                io.Vae.Input("vae"),
                image_slots,
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
            ],
        )

    @classmethod
    def execute(cls, clip, prompt, negative_prompt, vae=None, images: io.Autogrow.Type = None) -> io.NodeOutput:
        """Encode the prompts and attach vision tokens / reference latents from the images.

        Images are visited in ascending order of the number after the last
        underscore in their slot name; slots holding None are skipped.
        """

        def slot_number(slot_name):
            return int(slot_name.rsplit("_", 1)[-1])

        vision_images = []
        reference_latents = []

        images = images or {}
        for slot_name in sorted(images, key=slot_number):
            picture = images[slot_name]
            if picture is None:
                continue
            # Move the last axis to position 1, as expected by common_upscale below.
            samples = picture.movedim(-1, 1)
            src_h, src_w = samples.shape[2], samples.shape[3]

            # Vision tower input: the reference caps the VLM image at 384x384
            # (max_vlm_input_pil_pixels in pipeline_boogu.py).
            vl_scale = math.sqrt(int(384 * 384) / (src_w * src_h))
            vl_resized = comfy.utils.common_upscale(
                samples, round(src_w * vl_scale), round(src_h * vl_scale), "area", "disabled"
            )
            vision_images.append(vl_resized.movedim(1, -1)[:, :, :, :3])

            # Reference latent: align to 16 px (VAE /8 * patch_size 2).
            if vae is None:
                continue
            ref_scale = math.sqrt(int(1024 * 1024) / (src_w * src_h))
            ref_w = round(src_w * ref_scale / 16.0) * 16
            ref_h = round(src_h * ref_scale / 16.0) * 16
            ref_resized = comfy.utils.common_upscale(samples, ref_w, ref_h, "area", "disabled")
            reference_latents.append(vae.encode(ref_resized.movedim(1, -1)[:, :, :, :3]))

        # positive: instruction + vision tokens; negative: empty (no vision). Ref latent on both.
        positive = clip.encode_from_tokens_scheduled(clip.tokenize(prompt, images=vision_images))
        negative = clip.encode_from_tokens_scheduled(clip.tokenize(negative_prompt))

        if reference_latents:
            extra = {"reference_latents": reference_latents}
            positive = node_helpers.conditioning_set_values(positive, extra, append=True)
            negative = node_helpers.conditioning_set_values(negative, extra, append=True)

        return io.NodeOutput(positive, negative)
|
||||
|
||||
|
||||
class BooguExtension(ComfyExtension):
    # Extension object exposing the Boogu nodes defined in this module.

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """Return the node classes provided by this extension."""
        node_classes: list[type[io.ComfyNode]] = [TextEncodeBooguEdit]
        return node_classes
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> BooguExtension:
    """Create and return a new BooguExtension instance."""
    extension = BooguExtension()
    return extension
|
||||
@ -153,7 +153,7 @@ class WanCameraEmbedding(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanCameraEmbedding",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/camera",
|
||||
inputs=[
|
||||
io.Combo.Input(
|
||||
"camera_pose",
|
||||
|
||||
@ -13,7 +13,7 @@ class EmptyChromaRadianceLatentImage(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="EmptyChromaRadianceLatentImage",
|
||||
category="model/latent/chroma_radiance",
|
||||
category="model/latent/chroma radiance",
|
||||
inputs=[
|
||||
io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -33,7 +33,7 @@ class ChromaRadianceOptions(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="ChromaRadianceOptions",
|
||||
category="model/patch/chroma_radiance",
|
||||
category="model/patch/chroma radiance",
|
||||
description="Allows setting advanced options for the Chroma Radiance model.",
|
||||
inputs=[
|
||||
io.Model.Input(id="model"),
|
||||
|
||||
@ -9,7 +9,8 @@ class CLIPTextEncodeSDXLRefiner(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeSDXLRefiner",
|
||||
category="advanced/conditioning",
|
||||
display_name="CLIP Text Encode (SDXL Refiner)",
|
||||
category="model/conditioning/stable diffusion",
|
||||
inputs=[
|
||||
io.Float.Input("ascore", default=6.0, min=0.0, max=1000.0, step=0.01),
|
||||
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),
|
||||
@ -30,7 +31,8 @@ class CLIPTextEncodeSDXL(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeSDXL",
|
||||
category="advanced/conditioning",
|
||||
display_name="CLIP Text Encode (SDXL)",
|
||||
category="model/conditioning/stable diffusion",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),
|
||||
|
||||
@ -66,6 +66,7 @@ class WanContextWindowsManualNode(ContextWindowsManualNode):
|
||||
schema.node_id = "WanContextWindowsManual"
|
||||
schema.display_name = "WAN Context Windows (Manual)"
|
||||
schema.description = "Manually set context windows for WAN-like models (dim=2)."
|
||||
schema.category="model/patch/wan"
|
||||
schema.inputs = [
|
||||
io.Model.Input("model", tooltip="The model to apply context windows to during sampling."),
|
||||
io.Int.Input("context_length", min=1, max=nodes.MAX_RESOLUTION, step=4, default=81, tooltip="The length of the context window.", advanced=True),
|
||||
|
||||
@ -9,6 +9,8 @@ class SetUnionControlNetType(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SetUnionControlNetType",
|
||||
search_aliases=["set controlnet type", "union controlnet type"],
|
||||
display_name="Set Union ControlNet Type",
|
||||
category="model/conditioning/controlnet",
|
||||
inputs=[
|
||||
io.ControlNet.Input("control_net"),
|
||||
@ -39,6 +41,7 @@ class ControlNetInpaintingAliMamaApply(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="ControlNetInpaintingAliMamaApply",
|
||||
search_aliases=["masked controlnet"],
|
||||
display_name="Apply ControlNet Inpainting (AliMama)",
|
||||
category="model/conditioning/controlnet",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
|
||||
@ -13,7 +13,7 @@ class EmptyCosmosLatentVideo(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="EmptyCosmosLatentVideo",
|
||||
category="model/latent/video",
|
||||
category="model/latent/cosmos",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -45,7 +45,7 @@ class CosmosImageToVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CosmosImageToVideoLatent",
|
||||
category="model/conditioning/inpaint",
|
||||
category="model/conditioning/cosmos",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -88,7 +88,7 @@ class CosmosPredict2ImageToVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="CosmosPredict2ImageToVideoLatent",
|
||||
category="model/conditioning/inpaint",
|
||||
category="model/conditioning/cosmos",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
|
||||
@ -729,7 +729,7 @@ class SamplerCustom(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SamplerCustom",
|
||||
category="model/sampling/custom_sampling",
|
||||
category="model/sampling/custom",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Boolean.Input("add_noise", default=True, advanced=True),
|
||||
@ -1015,7 +1015,7 @@ class SamplerCustomAdvanced(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SamplerCustomAdvanced",
|
||||
category="model/sampling/custom_sampling",
|
||||
category="model/sampling/custom",
|
||||
inputs=[
|
||||
io.Noise.Input("noise"),
|
||||
io.Guider.Input("guider"),
|
||||
@ -1143,7 +1143,7 @@ class CFGOverride(io.ComfyNode):
|
||||
display_name="CFG Override",
|
||||
description="Override cfg to a fixed value over a [start, end] percent (sigma) range. "
|
||||
"With multiple overrides, the one nearest the sampler wins on overlap.",
|
||||
category="sampling/custom_sampling",
|
||||
category="model/sampling/guiders",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input("cfg", default=1.0, min=0.0, max=100.0, step=0.1, round=0.01),
|
||||
|
||||
@ -1583,7 +1583,7 @@ class LoadTrainingDataset(io.ComfyNode):
|
||||
shard_path = os.path.join(dataset_dir, shard_file)
|
||||
|
||||
with open(shard_path, "rb") as f:
|
||||
shard_data = torch.load(f)
|
||||
shard_data = torch.load(f, weights_only=True)
|
||||
|
||||
all_latents.extend(shard_data["latents"])
|
||||
all_conditioning.extend(shard_data["conditioning"])
|
||||
|
||||
@ -363,7 +363,7 @@ class EasyCacheNode(io.ComfyNode):
|
||||
node_id="EasyCache",
|
||||
display_name="EasyCache",
|
||||
description="Native EasyCache implementation.",
|
||||
category="advanced/debug/model",
|
||||
category="advanced/debug",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
io.Model.Input("model", tooltip="The model to add EasyCache to."),
|
||||
@ -496,7 +496,7 @@ class LazyCacheNode(io.ComfyNode):
|
||||
node_id="LazyCache",
|
||||
display_name="LazyCache",
|
||||
description="A homebrew version of EasyCache - even 'easier' version of EasyCache to implement. Overall works worse than EasyCache, but better in some rare cases AND universal compatibility with everything in ComfyUI.",
|
||||
category="advanced/debug/model",
|
||||
category="advanced/debug",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
io.Model.Input("model", tooltip="The model to add LazyCache to."),
|
||||
|
||||
@ -8,7 +8,8 @@ class ReferenceLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ReferenceLatent",
|
||||
category="advanced/conditioning/edit_models",
|
||||
display_name="Set Reference Latent",
|
||||
category="model/conditioning",
|
||||
description="This node sets the guiding latent for an edit model. If the model supports it you can chain multiple to set multiple reference images.",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
|
||||
@ -13,7 +13,7 @@ class CLIPTextEncodeFlux(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeFlux",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
|
||||
@ -40,7 +40,7 @@ class EmptyFlux2LatentImage(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="EmptyFlux2LatentImage",
|
||||
display_name="Empty Flux 2 Latent",
|
||||
category="model/latent",
|
||||
category="model/latent/flux",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -61,7 +61,7 @@ class FluxGuidance(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="FluxGuidance",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.Float.Input("guidance", default=3.5, min=0.0, max=100.0, step=0.1),
|
||||
@ -84,7 +84,7 @@ class FluxDisableGuidance(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="FluxDisableGuidance",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
description="This node completely disables the guidance embed on Flux and Flux like models",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
@ -128,7 +128,7 @@ class FluxKontextImageScale(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="FluxKontextImageScale",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
description="This node resizes the image to one that is more optimal for flux kontext.",
|
||||
inputs=[
|
||||
io.Image.Input("image"),
|
||||
@ -156,7 +156,7 @@ class FluxKontextMultiReferenceLatentMethod(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="FluxKontextMultiReferenceLatentMethod",
|
||||
display_name="Edit Model Reference Method",
|
||||
category="advanced/conditioning/flux",
|
||||
category="model/conditioning/flux",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.Combo.Input(
|
||||
|
||||
@ -11,8 +11,9 @@ class QuadrupleCLIPLoader(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="QuadrupleCLIPLoader",
|
||||
category="advanced/loaders",
|
||||
description="[Recipes]\n\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct",
|
||||
display_name="Load CLIP (Quadruple)",
|
||||
category="model/loaders",
|
||||
description="Recipes:\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct",
|
||||
inputs=[
|
||||
io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")),
|
||||
io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")),
|
||||
@ -38,8 +39,9 @@ class CLIPTextEncodeHiDream(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeHiDream",
|
||||
display_name="CLIP Text Encode (HiDream)",
|
||||
search_aliases=["hidream prompt"],
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/hidream",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
|
||||
|
||||
@ -14,7 +14,7 @@ class EmptyHiDreamO1LatentImage(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="EmptyHiDreamO1LatentImage",
|
||||
display_name="Empty HiDream-O1 Latent Image",
|
||||
category="model/latent/image",
|
||||
category="model/latent/hidream",
|
||||
description=(
|
||||
"Empty pixel-space latent for HiDream-O1-Image. The model was "
|
||||
"trained at ~4 megapixels; lower resolutions go off-distribution "
|
||||
@ -47,7 +47,7 @@ class HiDreamO1ReferenceImages(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HiDreamO1ReferenceImages",
|
||||
display_name="HiDream-O1 Reference Images",
|
||||
category="model/conditioning/image",
|
||||
category="model/conditioning/hidream",
|
||||
description=(
|
||||
"Attach 1-10 reference images to conditioning, one for edit instruction "
|
||||
"or multiple for subject-driven personalization."
|
||||
@ -117,7 +117,7 @@ class HiDreamO1PatchSeamSmoothing(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HiDreamO1PatchSeamSmoothing",
|
||||
display_name="HiDream-O1 Patch Seam Smoothing",
|
||||
category="advanced/model",
|
||||
category="model/patch/hidream",
|
||||
is_experimental=True,
|
||||
description=(
|
||||
"Average the model output across multiple shifted patch-grid "
|
||||
|
||||
@ -14,7 +14,8 @@ class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeHunyuanDiT",
|
||||
category="advanced/conditioning",
|
||||
display_name="CLIP Text Encode (Hunyuan Image)",
|
||||
category="model/conditioning/hunyuan image",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("bert", multiline=True, dynamic_prompts=True),
|
||||
@ -41,7 +42,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="EmptyHunyuanLatentVideo",
|
||||
display_name="Empty HunyuanVideo 1.0 Latent",
|
||||
category="model/latent/video",
|
||||
category="model/latent/hunyuan video",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -67,6 +68,7 @@ class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo):
|
||||
schema = super().define_schema()
|
||||
schema.node_id = "EmptyHunyuanVideo15Latent"
|
||||
schema.display_name = "Empty HunyuanVideo 1.5 Latent"
|
||||
schema.category = "model/latent/hunyuan video"
|
||||
return schema
|
||||
|
||||
@classmethod
|
||||
@ -81,7 +83,7 @@ class HunyuanVideo15ImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="HunyuanVideo15ImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -132,7 +134,7 @@ class HunyuanVideo15SuperResolution(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HunyuanVideo15SuperResolution",
|
||||
display_name="Hunyuan Video 1.5 Super Resolution",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -227,7 +229,7 @@ class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HunyuanVideo15LatentUpscaleWithModel",
|
||||
display_name="Hunyuan Video 15 Latent Upscale With Model",
|
||||
category="model/latent",
|
||||
category="model/latent/hunyuan video",
|
||||
inputs=[
|
||||
io.LatentUpscaleModel.Input("model"),
|
||||
io.Latent.Input("samples"),
|
||||
@ -276,7 +278,7 @@ class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TextEncodeHunyuanVideo_ImageToVideo",
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.ClipVisionOutput.Input("clip_vision_output"),
|
||||
@ -308,7 +310,7 @@ class HunyuanImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="HunyuanImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Vae.Input("vae"),
|
||||
@ -359,7 +361,7 @@ class EmptyHunyuanImageLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptyHunyuanImageLatent",
|
||||
category="model/latent",
|
||||
category="model/latent/hunyuan image",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||
io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||
@ -384,7 +386,7 @@ class HunyuanRefinerLatent(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="HunyuanRefinerLatent",
|
||||
display_name="Hunyuan Latent Refiner",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/hunyuan video",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
|
||||
@ -12,7 +12,7 @@ class EmptyLatentHunyuan3Dv2(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="EmptyLatentHunyuan3Dv2",
|
||||
category="model/latent/3d",
|
||||
category="model/latent/hunyuan 3d",
|
||||
inputs=[
|
||||
IO.Int.Input("resolution", default=3072, min=1, max=8192),
|
||||
IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
|
||||
@ -35,7 +35,7 @@ class Hunyuan3Dv2Conditioning(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="Hunyuan3Dv2Conditioning",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/hunyuan 3d",
|
||||
inputs=[
|
||||
IO.ClipVisionOutput.Input("clip_vision_output"),
|
||||
],
|
||||
@ -60,7 +60,7 @@ class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="Hunyuan3Dv2ConditioningMultiView",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/hunyuan 3d",
|
||||
inputs=[
|
||||
IO.ClipVisionOutput.Input("front", optional=True),
|
||||
IO.ClipVisionOutput.Input("left", optional=True),
|
||||
@ -97,7 +97,7 @@ class VAEDecodeHunyuan3D(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="VAEDecodeHunyuan3D",
|
||||
category="model/latent/3d",
|
||||
category="model/latent/hunyuan 3d",
|
||||
inputs=[
|
||||
IO.Latent.Input("samples"),
|
||||
IO.Vae.Input("vae"),
|
||||
|
||||
@ -38,7 +38,7 @@ class Ideogram4Scheduler(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="Ideogram4Scheduler",
|
||||
display_name="Ideogram 4 Scheduler",
|
||||
category="sampling/custom_sampling/schedulers",
|
||||
category="model/sampling/schedulers",
|
||||
inputs=[
|
||||
io.Int.Input("steps", default=20, min=1, max=200),
|
||||
io.Int.Input("width", default=1024, min=256, max=8192, step=16),
|
||||
|
||||
@ -13,7 +13,7 @@ class Kandinsky5ImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="Kandinsky5ImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/kandinsky",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -71,7 +71,7 @@ class NormalizeVideoLatentStart(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="NormalizeVideoLatentStart",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning",
|
||||
description="Normalizes the initial frames of a video latent to match the mean and standard deviation of subsequent reference frames. Helps reduce differences between the starting frames and the rest of the video.",
|
||||
inputs=[
|
||||
io.Latent.Input("latent"),
|
||||
@ -104,8 +104,9 @@ class CLIPTextEncodeKandinsky5(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeKandinsky5",
|
||||
display_name="CLIP Text Encode (Kandinsky 5)",
|
||||
search_aliases=["kandinsky prompt"],
|
||||
category="advanced/conditioning/kandinsky5",
|
||||
category="model/conditioning/kandinsky",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
|
||||
|
||||
@ -262,6 +262,7 @@ class LatentBatch(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="LatentBatch",
|
||||
search_aliases=["combine latents", "merge latents", "join latents"],
|
||||
display_name="Batch Latents (DEPRECATED)",
|
||||
category="model/latent/batch",
|
||||
is_deprecated=True,
|
||||
inputs=[
|
||||
@ -447,6 +448,7 @@ class ReplaceVideoLatentFrames(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ReplaceVideoLatentFrames",
|
||||
display_name="Replace Video Latent Frames",
|
||||
category="model/latent/batch",
|
||||
inputs=[
|
||||
io.Latent.Input("destination", tooltip="The destination latent where frames will be replaced."),
|
||||
|
||||
@ -25,7 +25,7 @@ class GetICLoRAParameters(io.ComfyNode):
|
||||
display_name="Get IC-LoRA Parameters",
|
||||
description="Extracts IC-LoRA parameters from the safetensors metadata of a LoRA-loaded "
|
||||
"model and outputs them for LTXVAddGuide (eg. reference_downscale_factor).",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
search_aliases=["ic-lora", "ic lora", "iclora", "downscale factor", "reference downscale"],
|
||||
inputs=[
|
||||
io.Model.Input(
|
||||
@ -62,7 +62,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptyLTXVLatentVideo",
|
||||
category="model/latent/video/ltxv",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||
io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32),
|
||||
@ -86,7 +86,7 @@ class LTXVImgToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVImgToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -131,7 +131,7 @@ class LTXVImgToVideoInplace(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVImgToVideoInplace",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Image.Input("image"),
|
||||
@ -251,7 +251,7 @@ class LTXVAddGuide(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVAddGuide",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -498,7 +498,7 @@ class LTXVCropGuides(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVCropGuides",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -542,7 +542,7 @@ class LTXVConditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVConditioning",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/ltxv",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -566,7 +566,7 @@ class ModelSamplingLTXV(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="ModelSamplingLTXV",
|
||||
category="advanced/model",
|
||||
category="model/patch/ltxv",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01),
|
||||
@ -746,7 +746,7 @@ class LTXVConcatAVLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVConcatAVLatent",
|
||||
category="model/latent/video/ltxv",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Latent.Input("video_latent"),
|
||||
io.Latent.Input("audio_latent"),
|
||||
@ -781,7 +781,7 @@ class LTXVSeparateAVLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="LTXVSeparateAVLatent",
|
||||
category="model/latent/video/ltxv",
|
||||
category="model/latent/ltxv",
|
||||
description="LTXV Separate AV Latent",
|
||||
inputs=[
|
||||
io.Latent.Input("av_latent"),
|
||||
@ -814,7 +814,7 @@ class LTXVReferenceAudio(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="LTXVReferenceAudio",
|
||||
display_name="LTXV Reference Audio (ID-LoRA)",
|
||||
category="model/conditioning/audio",
|
||||
category="model/conditioning/ltxv",
|
||||
description="Set reference audio for ID-LoRA speaker identity transfer. Encodes a reference audio clip into the conditioning and optionally patches the model with identity guidance (extra forward pass without reference, amplifying the speaker identity effect).",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
|
||||
@ -40,7 +40,7 @@ class LTXVAudioVAEEncode(VAEEncodeAudio):
|
||||
return io.Schema(
|
||||
node_id="LTXVAudioVAEEncode",
|
||||
display_name="LTXV Audio VAE Encode",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Audio.Input("audio", tooltip="The audio to be encoded."),
|
||||
io.Vae.Input(
|
||||
@ -63,7 +63,7 @@ class LTXVAudioVAEDecode(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="LTXVAudioVAEDecode",
|
||||
display_name="LTXV Audio VAE Decode",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Latent.Input("samples", tooltip="The latent to be decoded."),
|
||||
io.Vae.Input(
|
||||
@ -96,7 +96,7 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="LTXVEmptyLatentAudio",
|
||||
display_name="LTXV Empty Latent Audio",
|
||||
category="model/latent/audio",
|
||||
category="model/latent/ltxv",
|
||||
inputs=[
|
||||
io.Int.Input(
|
||||
"frames_number",
|
||||
@ -168,9 +168,9 @@ class LTXAVTextEncoderLoader(io.ComfyNode):
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="LTXAVTextEncoderLoader",
|
||||
display_name="LTXV Audio Text Encoder Loader",
|
||||
category="advanced/loaders",
|
||||
description="[Recipes]\n\nltxav: gemma 3 12B",
|
||||
display_name="Load LTXV Audio Text Encoder",
|
||||
category="model/loaders",
|
||||
description="Recipes:\nltxav: gemma 3 12B",
|
||||
inputs=[
|
||||
io.Combo.Input(
|
||||
"text_encoder",
|
||||
|
||||
@ -13,7 +13,7 @@ class LTXVLatentUpsampler(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="LTXVLatentUpsampler",
|
||||
category="model/latent/video",
|
||||
category="model/latent/ltxv",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
IO.Latent.Input("samples"),
|
||||
|
||||
@ -9,7 +9,7 @@ class RenormCFG(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="RenormCFG",
|
||||
category="advanced/model",
|
||||
category="model/patch",
|
||||
inputs=[
|
||||
io.Model.Input("model"),
|
||||
io.Float.Input("cfg_trunc", default=100, min=0.0, max=100.0, step=0.01, advanced=True),
|
||||
@ -80,8 +80,8 @@ class CLIPTextEncodeLumina2(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeLumina2",
|
||||
search_aliases=["lumina prompt"],
|
||||
display_name="CLIP Text Encode for Lumina2",
|
||||
category="model/conditioning",
|
||||
display_name="CLIP Text Encode (Lumina 2)",
|
||||
category="model/conditioning/lumina",
|
||||
description="Encodes a system prompt and a user prompt using a CLIP model into an embedding "
|
||||
"that can be used to guide the diffusion model towards generating specific images.",
|
||||
inputs=[
|
||||
|
||||
@ -53,6 +53,7 @@ class LatentCompositeMasked(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="LatentCompositeMasked",
|
||||
search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"],
|
||||
display_name="Latent Composite Masked",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
IO.Latent.Input("destination"),
|
||||
|
||||
@ -10,7 +10,7 @@ class EmptyMochiLatentVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptyMochiLatentVideo",
|
||||
category="model/latent/video",
|
||||
category="model/latent/mochi",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
|
||||
@ -59,7 +59,7 @@ class ModelSamplingDiscrete:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, sampling, zsnr):
|
||||
m = model.clone()
|
||||
@ -97,7 +97,7 @@ class ModelSamplingStableCascade:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch/stable cascade"
|
||||
|
||||
def patch(self, model, shift):
|
||||
m = model.clone()
|
||||
@ -123,7 +123,7 @@ class ModelSamplingSD3:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch/stable diffusion"
|
||||
|
||||
def patch(self, model, shift, multiplier=1000):
|
||||
m = model.clone()
|
||||
@ -150,6 +150,7 @@ class ModelSamplingAuraFlow(ModelSamplingSD3):
|
||||
}}
|
||||
|
||||
FUNCTION = "patch_aura"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch_aura(self, model, shift):
|
||||
return self.patch(model, shift, multiplier=1.0)
|
||||
@ -167,7 +168,7 @@ class ModelSamplingFlux:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch/flux"
|
||||
|
||||
def patch(self, model, max_shift, base_shift, width, height):
|
||||
m = model.clone()
|
||||
@ -202,7 +203,7 @@ class ModelSamplingContinuousEDM:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, sampling, sigma_max, sigma_min):
|
||||
m = model.clone()
|
||||
@ -247,7 +248,7 @@ class ModelSamplingContinuousV:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, sampling, sigma_max, sigma_min):
|
||||
m = model.clone()
|
||||
@ -273,7 +274,7 @@ class RescaleCFG:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, multiplier):
|
||||
def rescale_cfg(args):
|
||||
@ -314,7 +315,7 @@ class ModelNoiseScale:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
CATEGORY = "model/patch"
|
||||
|
||||
def patch(self, model, noise_scale):
|
||||
m = model.clone()
|
||||
@ -337,7 +338,7 @@ class ModelComputeDtype:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/debug/model"
|
||||
CATEGORY = "advanced/debug"
|
||||
|
||||
def patch(self, model, dtype):
|
||||
m = model.clone()
|
||||
|
||||
@ -21,7 +21,7 @@ class ModelMergeSimple:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, model1, model2, ratio):
|
||||
m = model1.clone()
|
||||
@ -40,7 +40,7 @@ class ModelSubtract:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, model1, model2, multiplier):
|
||||
m = model1.clone()
|
||||
@ -58,7 +58,7 @@ class ModelAdd:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, model1, model2):
|
||||
m = model1.clone()
|
||||
@ -78,7 +78,7 @@ class CLIPMergeSimple:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, clip1, clip2, ratio):
|
||||
m = clip1.clone()
|
||||
@ -101,7 +101,7 @@ class CLIPSubtract:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, clip1, clip2, multiplier):
|
||||
m = clip1.clone()
|
||||
@ -123,7 +123,7 @@ class CLIPAdd:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, clip1, clip2):
|
||||
m = clip1.clone()
|
||||
@ -147,7 +147,7 @@ class ModelMergeBlocks:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "merge"
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def merge(self, model1, model2, **kwargs):
|
||||
m = model1.clone()
|
||||
@ -242,7 +242,7 @@ class CheckpointSave:
|
||||
FUNCTION = "save"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def save(self, model, clip, vae, filename_prefix, prompt=None, extra_pnginfo=None):
|
||||
save_checkpoint(model, clip=clip, vae=vae, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)
|
||||
@ -261,7 +261,7 @@ class CLIPSave:
|
||||
FUNCTION = "save"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def save(self, clip, filename_prefix, prompt=None, extra_pnginfo=None):
|
||||
prompt_info = ""
|
||||
@ -318,7 +318,7 @@ class VAESave:
|
||||
FUNCTION = "save"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def save(self, vae, filename_prefix, prompt=None, extra_pnginfo=None):
|
||||
full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
|
||||
@ -353,7 +353,7 @@ class ModelSave:
|
||||
FUNCTION = "save"
|
||||
OUTPUT_NODE = True
|
||||
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
def save(self, model, filename_prefix, prompt=None, extra_pnginfo=None):
|
||||
save_checkpoint(model, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
import comfy_extras.nodes_model_merging
|
||||
|
||||
class ModelMergeSD1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
arg_dict = { "model1": ("MODEL",),
|
||||
@ -27,7 +27,7 @@ class ModelMergeSD1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
|
||||
|
||||
class ModelMergeSDXL(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -53,7 +53,7 @@ class ModelMergeSDXL(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeSD3_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -77,7 +77,7 @@ class ModelMergeSD3_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
|
||||
|
||||
class ModelMergeAuraflow(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -104,7 +104,7 @@ class ModelMergeAuraflow(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeFlux1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -130,7 +130,7 @@ class ModelMergeFlux1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeSD35_Large(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -153,7 +153,7 @@ class ModelMergeSD35_Large(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeMochiPreview(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -175,7 +175,7 @@ class ModelMergeMochiPreview(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeLTXV(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -197,7 +197,7 @@ class ModelMergeLTXV(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeCosmos7B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -221,7 +221,7 @@ class ModelMergeCosmos7B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeCosmos14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -245,7 +245,7 @@ class ModelMergeCosmos14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeWAN2_1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
DESCRIPTION = "1.3B model has 30 blocks, 14B model has 40 blocks. Image to video model has the extra img_emb."
|
||||
|
||||
@classmethod
|
||||
@ -269,7 +269,7 @@ class ModelMergeWAN2_1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeCosmosPredict2_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -292,7 +292,7 @@ class ModelMergeCosmosPredict2_2B(comfy_extras.nodes_model_merging.ModelMergeBlo
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeCosmosPredict2_14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -315,7 +315,7 @@ class ModelMergeCosmosPredict2_14B(comfy_extras.nodes_model_merging.ModelMergeBl
|
||||
return {"required": arg_dict}
|
||||
|
||||
class ModelMergeQwenImage(comfy_extras.nodes_model_merging.ModelMergeBlocks):
|
||||
CATEGORY = "advanced/model_merging/model_specific"
|
||||
CATEGORY = "model/merging/model specific"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
|
||||
@ -232,7 +232,7 @@ class ModelPatchLoader:
|
||||
FUNCTION = "load_model_patch"
|
||||
EXPERIMENTAL = True
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
def load_model_patch(self, name):
|
||||
model_patch_path = folder_paths.get_full_path_or_raise("model_patches", name)
|
||||
@ -479,7 +479,7 @@ class QwenImageDiffsynthControlnet:
|
||||
FUNCTION = "diffsynth_controlnet"
|
||||
EXPERIMENTAL = True
|
||||
|
||||
CATEGORY = "advanced/loaders/qwen"
|
||||
CATEGORY = "model/patch/qwen"
|
||||
|
||||
def diffsynth_controlnet(self, model, model_patch, vae, image=None, strength=1.0, inpaint_image=None, mask=None):
|
||||
model_patched = model.clone()
|
||||
@ -512,7 +512,7 @@ class ZImageFunControlnet(QwenImageDiffsynthControlnet):
|
||||
},
|
||||
"optional": {"image": ("IMAGE",), "inpaint_image": ("IMAGE",), "mask": ("MASK",)}}
|
||||
|
||||
CATEGORY = "advanced/loaders/zimage"
|
||||
CATEGORY = "model/patch/z-image"
|
||||
|
||||
class UsoStyleProjectorPatch:
|
||||
def __init__(self, model_patch, encoded_image):
|
||||
@ -675,3 +675,11 @@ NODE_CLASS_MAPPINGS = {
|
||||
"USOStyleReference": USOStyleReference,
|
||||
"SUPIRApply": SUPIRApply,
|
||||
}
|
||||
|
||||
NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"ModelPatchLoader": "Load Model Patch",
|
||||
"QwenImageDiffsynthControlnet": "Apply Qwen Image DiffSynth ControlNet",
|
||||
"ZImageFunControlnet": "Apply Z-Image Fun ControlNet",
|
||||
"USOStyleReference": "Apply USO Style Reference",
|
||||
"SUPIRApply": "Apply SUPIR Patch",
|
||||
}
|
||||
|
||||
@ -14,10 +14,8 @@ class PiDConditioning(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="PiDConditioning",
|
||||
display_name="PiD Conditioning",
|
||||
category="advanced/conditioning",
|
||||
description=(
|
||||
"Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling"
|
||||
),
|
||||
category="model/conditioning",
|
||||
description=("Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling"),
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Latent.Input("latent", tooltip="latent (from VAEEncode or a KSampler)."),
|
||||
|
||||
@ -7,8 +7,9 @@ class CLIPTextEncodePixArtAlpha(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodePixArtAlpha",
|
||||
display_name="CLIP Text Encode (PixArt Alpha)",
|
||||
search_aliases=["pixart prompt"],
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/pixart",
|
||||
description="Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma.",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),
|
||||
|
||||
@ -616,7 +616,7 @@ class BatchLatentsNode(io.ComfyNode):
|
||||
node_id="BatchLatentsNode",
|
||||
search_aliases=["combine latents", "stack latents", "merge latents"],
|
||||
display_name="Batch Latents",
|
||||
category="model/latent",
|
||||
category="model/latent/batch",
|
||||
inputs=[
|
||||
io.Autogrow.Input("latents", template=autogrow_template)
|
||||
],
|
||||
|
||||
@ -12,7 +12,7 @@ class TextEncodeQwenImageEdit(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TextEncodeQwenImageEdit",
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/qwen image",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("prompt", multiline=True, dynamic_prompts=True),
|
||||
@ -55,7 +55,7 @@ class TextEncodeQwenImageEditPlus(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TextEncodeQwenImageEditPlus",
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/qwen image",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("prompt", multiline=True, dynamic_prompts=True),
|
||||
|
||||
@ -14,7 +14,7 @@ class RTDETR_detect(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="RTDETR_detect",
|
||||
display_name="RT-DETR Detect",
|
||||
display_name="Run Real-Time Detection (RT-DETR)",
|
||||
category="image/detection",
|
||||
search_aliases=["bbox", "bounding box", "object detection", "coco"],
|
||||
inputs=[
|
||||
|
||||
@ -264,7 +264,7 @@ class SAM3_VideoTrack(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SAM3_VideoTrack",
|
||||
display_name="SAM3 Video Track",
|
||||
display_name="Run SAM3 Video Track",
|
||||
category="image/detection",
|
||||
search_aliases=["sam3", "video", "track", "propagate"],
|
||||
inputs=[
|
||||
|
||||
@ -34,14 +34,20 @@ def _unpack(track_data):
|
||||
return unpack_masks(packed)
|
||||
|
||||
|
||||
def _first_frame_cx_area(masks_bool):
|
||||
first = masks_bool[0].float()
|
||||
H, W = first.shape[-2], first.shape[-1]
|
||||
n_pixels = H * W
|
||||
grid_x = torch.arange(W, device=first.device, dtype=first.dtype).view(1, W)
|
||||
area = first.sum(dim=(-1, -2)).clamp_(min=1)
|
||||
cx = (first * grid_x).sum(dim=(-1, -2)) / area
|
||||
return (cx / W).tolist(), (area / n_pixels).tolist()
|
||||
def _first_appearance_cx_area(masks_bool):
|
||||
"""Per object: first frame it appears in, plus centroid-x and area in that frame."""
|
||||
m = masks_bool.float()
|
||||
T, H, W = m.shape[0], m.shape[-2], m.shape[-1]
|
||||
grid_x = torch.arange(W, device=m.device, dtype=m.dtype).view(1, 1, 1, W)
|
||||
area_t = m.sum(dim=(-1, -2))
|
||||
cx_t = (m * grid_x).sum(dim=(-1, -2)) / area_t.clamp(min=1)
|
||||
present = area_t > 0
|
||||
frame_idx = torch.arange(T, device=m.device).unsqueeze(1)
|
||||
first_t = torch.where(present, frame_idx, T).amin(dim=0)
|
||||
sel = first_t.clamp(max=T - 1).unsqueeze(0)
|
||||
cx = cx_t.gather(0, sel).squeeze(0)
|
||||
area = area_t.gather(0, sel).squeeze(0)
|
||||
return first_t.tolist(), (cx / W).tolist(), (area / (H * W)).tolist()
|
||||
|
||||
|
||||
def _subset_track_data(track_data, obj_indices):
|
||||
@ -81,12 +87,26 @@ def _render_colored_masks(track_data, background="black"):
|
||||
masks_full.view(T * N_obj, 1, Hm, Wm), size=(H, W), mode="nearest"
|
||||
).view(T, N_obj, H, W) > 0.5
|
||||
any_mask = masks_full.any(dim=1)
|
||||
obj_idx_map = masks_full.to(torch.uint8).argmax(dim=1)
|
||||
color_overlay = colors[obj_idx_map]
|
||||
color_overlay = colors[masks_full.to(torch.uint8).argmax(dim=1)]
|
||||
bg_tensor = torch.tensor(bg_rgb, device=device, dtype=color_overlay.dtype).view(1, 1, 1, 3)
|
||||
return torch.where(any_mask.unsqueeze(-1), color_overlay, bg_tensor.expand_as(color_overlay))
|
||||
|
||||
|
||||
def _render_mask_as_identity(mask, background="black"):
|
||||
"""Plain comfy MASK (B,H,W) or (H,W) -> (B,H,W,3) rendered as a single identity (palette[0])
|
||||
on the given background. A batch is treated as multiple views of that one subject."""
|
||||
device = comfy.model_management.intermediate_device()
|
||||
dtype = comfy.model_management.intermediate_dtype()
|
||||
if mask.ndim == 2:
|
||||
mask = mask.unsqueeze(0)
|
||||
mask = mask.to(device=device, dtype=dtype)
|
||||
B, H, W = mask.shape
|
||||
bg_rgb = (1.0, 1.0, 1.0) if background.startswith("white") else (0.0, 0.0, 0.0)
|
||||
color = torch.tensor(DEFAULT_PALETTE[0], device=device, dtype=dtype).view(1, 1, 1, 3)
|
||||
bg = torch.tensor(bg_rgb, device=device, dtype=dtype).view(1, 1, 1, 3)
|
||||
return torch.where((mask > 0.5).unsqueeze(-1), color.expand(B, H, W, 3), bg.expand(B, H, W, 3))
|
||||
|
||||
|
||||
def _extract_mask_to_28ch(rgb_video):
|
||||
"""Colored RGB mask (T, H, W, 3) in [0, 1] -> SCAIL-2 28-channel binary latent
|
||||
(1, T_lat, 28, H_lat, W_lat). 7 per-color binary channels (white/r/g/b/y/m/c)
|
||||
@ -123,7 +143,7 @@ class WanSCAILToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanSCAILToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/scail",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -138,8 +158,8 @@ class WanSCAILToVideo(io.ComfyNode):
|
||||
io.Float.Input("pose_strength", default=1.0, min=0.0, max=10.0, step=0.01, tooltip="Strength of the pose latent."),
|
||||
io.Float.Input("pose_start", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Start step of the pose conditioning."),
|
||||
io.Float.Input("pose_end", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="End step of the pose conditioning."),
|
||||
io.Image.Input("reference_image", optional=True, tooltip="Reference image, for multiple references composite all on single image."),
|
||||
io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask at the same resolution as reference_image."),
|
||||
io.Image.Input("reference_image", optional=True, tooltip="Reference image. The first image is the primary reference (composite all identities onto it). SCAIL-2: extra batch images are used as additional views (back view, close-up, occluded background), each needing a matching reference_image_mask in that identity's color."),
|
||||
io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask, batch matching reference_image (first = primary reference mask, rest = identity masks for the additional reference_image)."),
|
||||
io.ClipVisionOutput.Input("clip_vision_output", optional=True, tooltip="CLIP vision features for conditioning. Model is trained with stretch resize to aspect ratio."),
|
||||
io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="Cumulative output frame this chunk begins at. Wire from the previous chunk's video_frame_offset output."),
|
||||
io.Int.Input("previous_frame_count", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4, tooltip="Tail frames of previous_frames to anchor. SCAIL-2 trained at 5 (81-frame chunks, 76-frame step)."),
|
||||
@ -171,19 +191,21 @@ class WanSCAILToVideo(io.ComfyNode):
|
||||
video_frame_offset -= prev_trimmed.shape[0]
|
||||
video_frame_offset = max(0, video_frame_offset)
|
||||
|
||||
ref_latent = None
|
||||
if reference_image is not None:
|
||||
reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
|
||||
# Replacement Mode: composite ref on black bg using reference_image_mask as alpha matte
|
||||
if replacement_mode and reference_image_mask is not None:
|
||||
rm = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
|
||||
is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(reference_image.dtype)
|
||||
reference_image = reference_image * is_char
|
||||
ref_latent = vae.encode(reference_image[:, :, :, :3])
|
||||
ref_imgs = comfy.utils.common_upscale(reference_image.movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
|
||||
n_ref = ref_imgs.shape[0]
|
||||
# SCAIL-2 multi-reference: the first image is the primary ref, the rest are additional references.
|
||||
|
||||
if ref_latent is not None:
|
||||
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
|
||||
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)
|
||||
# Replacement Mode: composite each ref on black bg using its mask as alpha matte
|
||||
if replacement_mode and reference_image_mask is not None:
|
||||
rm = comfy.utils.common_upscale(reference_image_mask.movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
|
||||
rm = rm[[min(i, rm.shape[0] - 1) for i in range(n_ref)]]
|
||||
is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(ref_imgs.dtype)
|
||||
ref_imgs = ref_imgs * is_char
|
||||
# encode each ref individually so each stays a single latent frame (a batched encode would be treated as a video)
|
||||
ref_latents = [vae.encode(ref_imgs[i:i + 1, :, :, :3]) for i in range(n_ref)]
|
||||
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True)
|
||||
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True)
|
||||
|
||||
if clip_vision_output is not None:
|
||||
positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
|
||||
@ -221,11 +243,16 @@ class WanSCAILToVideo(io.ComfyNode):
|
||||
positive = node_helpers.conditioning_set_values(positive, {"driving_mask_28ch": driving_mask_28ch})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"driving_mask_28ch": driving_mask_28ch})
|
||||
|
||||
if reference_image_mask is not None:
|
||||
ref_mask_hw = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
|
||||
ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw)
|
||||
# The ref mask binds reference frames to identities, so it only applies when there's a reference image.
|
||||
if reference_image_mask is not None and reference_image is not None:
|
||||
ref_mask_hw = comfy.utils.common_upscale(reference_image_mask.movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
|
||||
n_masks = ref_mask_hw.shape[0]
|
||||
n_ref = reference_image.shape[0]
|
||||
|
||||
add_masks = [_extract_mask_to_28ch(ref_mask_hw[min(i, n_masks - 1)][None]) for i in range(1, n_ref)]
|
||||
ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw[:1])
|
||||
zeros = torch.zeros((1, latent.shape[2], 28, ref_mask_1f.shape[-2], ref_mask_1f.shape[-1]), device=ref_mask_1f.device, dtype=ref_mask_1f.dtype)
|
||||
ref_mask_28ch = torch.cat([ref_mask_1f, zeros], dim=1)
|
||||
ref_mask_28ch = torch.cat(add_masks + [ref_mask_1f, zeros], dim=1)
|
||||
positive = node_helpers.conditioning_set_values(positive, {"ref_mask_28ch": ref_mask_28ch})
|
||||
negative = node_helpers.conditioning_set_values(negative, {"ref_mask_28ch": ref_mask_28ch})
|
||||
|
||||
@ -244,12 +271,9 @@ class WanSCAILToVideo(io.ComfyNode):
|
||||
|
||||
|
||||
class SCAIL2ColoredMask(io.ComfyNode):
|
||||
"""Render SAM3 tracks for the driving pose video and (optionally) the reference
|
||||
image into the two colored masks WanSCAILToVideo consumes. Shared `sort_by`
|
||||
across both outputs guarantees identity K maps to the same color on both
|
||||
sides, for multi-person workflow consistency.
|
||||
reference_image_mask is always rendered black-bg (model convention)
|
||||
pose_video_mask bg follows replacement_mode: black = Animation Mode, white = Replacement Mode
|
||||
"""Render SAM3 tracks for the driving pose video and reference image(s) into the
|
||||
colored masks WanSCAILToVideo consumes. Shared `sort_by` keeps each identity on the
|
||||
same color across both outputs.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
@ -257,18 +281,18 @@ class SCAIL2ColoredMask(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="SCAIL2ColoredMask",
|
||||
display_name="Create SCAIL-2 Colored Mask",
|
||||
category="conditioning/video_models/scail",
|
||||
category="model/conditioning/wan/scail",
|
||||
inputs=[
|
||||
SAM3TrackData.Input("driving_track_data", tooltip="SAM3 track of the driving pose video. Will be rendered into the pose_video_mask output."),
|
||||
SAM3TrackData.Input("ref_track_data", optional=True,
|
||||
tooltip="SAM3 track of the reference image."),
|
||||
io.MultiType.Input("ref_track_data", [SAM3TrackData, io.Mask], optional=True, display_name="reference_masks",
|
||||
tooltip="SAM3 track of the reference image(s) (one identity per object, colored in batch order), or a plain MASK of the reference subject (rendered as a single identity)."),
|
||||
io.String.Input("object_indices", default="",
|
||||
tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
|
||||
io.Combo.Input("sort_by", options=["none", "left_to_right", "area"], default="left_to_right",
|
||||
tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). left_to_right = leftmost object (by first-frame centroid) gets the first color; area = biggest object (by first-frame mask area) gets the first color; none = keep SAM3's order."),
|
||||
tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). Objects that appear in earlier frames always come first; within a frame, left_to_right = leftmost object (by centroid at first appearance) gets the first color, area = biggest object (by mask area at first appearance) gets the first color; none = keep SAM3's order."),
|
||||
io.Boolean.Input("replacement_mode", default=False,
|
||||
tooltip="False = Animation Mode (pose_video_mask has black background, reference_image_mask has white background). "
|
||||
"True = Replacement Mode (pose_video_mask has white background, reference_image_mask has black background)."),
|
||||
tooltip="False = Animation Mode (pose_video_mask has black background, reference_image_mask has white background). "
|
||||
"True = Replacement Mode (pose_video_mask has white background, reference_image_mask has black background)."),
|
||||
],
|
||||
outputs=[
|
||||
io.Image.Output("pose_video_mask"),
|
||||
@ -282,11 +306,11 @@ class SCAIL2ColoredMask(io.ComfyNode):
|
||||
def _prep(td):
|
||||
masks_bool = _unpack(td)
|
||||
if sort_by != "none" and masks_bool is not None:
|
||||
cx, area = _first_frame_cx_area(masks_bool)
|
||||
first_t, cx, area = _first_appearance_cx_area(masks_bool)
|
||||
if sort_by == "left_to_right":
|
||||
order = sorted(range(len(cx)), key=lambda i: cx[i])
|
||||
order = sorted(range(len(cx)), key=lambda i: (first_t[i], cx[i]))
|
||||
else: # "area"
|
||||
order = sorted(range(len(area)), key=lambda i: -area[i])
|
||||
order = sorted(range(len(area)), key=lambda i: (first_t[i], -area[i]))
|
||||
td = _subset_track_data(td, order)
|
||||
if object_indices.strip():
|
||||
indices = [int(i.strip()) for i in object_indices.split(",") if i.strip().isdigit()]
|
||||
@ -302,8 +326,10 @@ class SCAIL2ColoredMask(io.ComfyNode):
|
||||
ref_bg = "black" if replacement_mode else "white"
|
||||
|
||||
if ref_track_data is not None:
|
||||
ref = _prep(ref_track_data)
|
||||
reference_image_mask = _render_colored_masks(ref, ref_bg)
|
||||
if isinstance(ref_track_data, torch.Tensor): # plain comfy MASK
|
||||
reference_image_mask = _render_mask_as_identity(ref_track_data, ref_bg)
|
||||
else:
|
||||
reference_image_mask = _render_colored_masks(_prep(ref_track_data), ref_bg)
|
||||
else:
|
||||
H, W = drv["orig_size"]
|
||||
fill_value = 1.0 if ref_bg == "white" else 0.0
|
||||
|
||||
@ -13,8 +13,9 @@ class TripleCLIPLoader(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TripleCLIPLoader",
|
||||
category="advanced/loaders",
|
||||
description="[Recipes]\n\nsd3: clip-l, clip-g, t5",
|
||||
display_name="Load CLIP (Triple)",
|
||||
category="model/loaders",
|
||||
description="Recipes:\nsd3: clip-l, clip-g, t5",
|
||||
inputs=[
|
||||
io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")),
|
||||
io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")),
|
||||
@ -41,7 +42,7 @@ class EmptySD3LatentImage(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="EmptySD3LatentImage",
|
||||
category="model/latent/sd3",
|
||||
category="model/latent/stable diffusion",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
@ -66,7 +67,8 @@ class CLIPTextEncodeSD3(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="CLIPTextEncodeSD3",
|
||||
search_aliases=["sd3 prompt"],
|
||||
category="advanced/conditioning",
|
||||
display_name="CLIP Text Encode (SD3)",
|
||||
category="model/conditioning/stable diffusion",
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
|
||||
|
||||
@ -96,8 +96,12 @@ class KeypointDraw:
|
||||
# Body connections - matching DWPose limbSeq (1-indexed, converted to 0-indexed)
|
||||
self.body_limbSeq = [
|
||||
[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
|
||||
[10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
|
||||
[1, 16], [16, 18]
|
||||
[10, 11], [2, 12], [12, 13], [13, 14]
|
||||
]
|
||||
|
||||
# Head connections (1-indexed, converted to 0-indexed)
|
||||
self.head_edges = [
|
||||
[2, 1], [1, 15], [15, 17], [1, 16], [16, 18]
|
||||
]
|
||||
|
||||
# Colors matching DWPose
|
||||
@ -215,7 +219,7 @@ class KeypointDraw:
|
||||
return unique_pts if len(unique_pts) > 1 else [[center[0], center[1]], [center[0], center[1]]]
|
||||
|
||||
def draw_wholebody_keypoints(self, canvas, keypoints, scores=None, threshold=0.3,
|
||||
draw_body=True, draw_feet=True, draw_face=True, draw_hands=True, stick_width=4, face_point_size=3):
|
||||
draw_body=True, draw_head=True, draw_feet=True, draw_face=True, draw_hands=True, stick_width=4, face_point_size=3):
|
||||
"""
|
||||
Draw wholebody keypoints (134 keypoints after processing) in DWPose style.
|
||||
|
||||
@ -237,9 +241,17 @@ class KeypointDraw:
|
||||
"""
|
||||
H, W, C = canvas.shape
|
||||
|
||||
# Draw body limbs
|
||||
if draw_body and len(keypoints) >= 18:
|
||||
for i, limb in enumerate(self.body_limbSeq):
|
||||
# Draw body limbs & head connections
|
||||
if (draw_body or draw_head) and len(keypoints) >= 18:
|
||||
colorIndexOffset = 0
|
||||
edges = []
|
||||
if draw_body:
|
||||
edges += self.body_limbSeq
|
||||
else:
|
||||
colorIndexOffset += len(self.body_limbSeq)
|
||||
if draw_head:
|
||||
edges += self.head_edges
|
||||
for i, limb in enumerate(edges):
|
||||
# Convert from 1-indexed to 0-indexed
|
||||
idx1, idx2 = limb[0] - 1, limb[1] - 1
|
||||
|
||||
@ -262,11 +274,17 @@ class KeypointDraw:
|
||||
|
||||
polygon = self.draw.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stick_width), int(angle), 0, 360, 1)
|
||||
|
||||
self.draw.fillConvexPoly(canvas, polygon, self.colors[i % len(self.colors)])
|
||||
self.draw.fillConvexPoly(canvas, polygon, self.colors[(i + colorIndexOffset) % len(self.colors)])
|
||||
|
||||
# Draw body keypoints
|
||||
if draw_body and len(keypoints) >= 18:
|
||||
# Draw body & head keypoints
|
||||
if (draw_body or draw_head) and len(keypoints) >= 18:
|
||||
head_keypoints = {0, 14, 15, 16, 17} # nose, eyes, ears
|
||||
neck_point = 1
|
||||
for i in range(18):
|
||||
if not draw_head and i in head_keypoints:
|
||||
continue
|
||||
if not draw_body and i not in head_keypoints and i != neck_point:
|
||||
continue
|
||||
if scores is not None and scores[i] < threshold:
|
||||
continue
|
||||
x, y = int(keypoints[i][0]), int(keypoints[i][1])
|
||||
@ -365,6 +383,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
|
||||
io.Int.Input("stick_width", default=4, min=1, max=10, step=1),
|
||||
io.Int.Input("face_point_size", default=3, min=1, max=10, step=1),
|
||||
io.Float.Input("score_threshold", default=0.3, min=0.0, max=1.0, step=0.01),
|
||||
io.Boolean.Input("draw_head", default=True),
|
||||
],
|
||||
outputs=[
|
||||
io.Image.Output(),
|
||||
@ -372,7 +391,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, keypoints, draw_body, draw_hands, draw_face, draw_feet, stick_width, face_point_size, score_threshold) -> io.NodeOutput:
|
||||
def execute(cls, keypoints, draw_body, draw_hands, draw_face, draw_feet, stick_width, face_point_size, score_threshold, draw_head) -> io.NodeOutput:
|
||||
if not keypoints:
|
||||
return io.NodeOutput(torch.zeros((1, 64, 64, 3), dtype=torch.float32))
|
||||
height = keypoints[0]["canvas_height"]
|
||||
@ -405,7 +424,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
|
||||
canvas = drawer.draw_wholebody_keypoints(
|
||||
canvas, kp, sc,
|
||||
threshold=score_threshold,
|
||||
draw_body=draw_body, draw_feet=draw_feet,
|
||||
draw_body=draw_body, draw_head=draw_head, draw_feet=draw_feet,
|
||||
draw_face=draw_face, draw_hands=draw_hands,
|
||||
stick_width=stick_width, face_point_size=face_point_size,
|
||||
)
|
||||
|
||||
@ -9,7 +9,7 @@ class SD_4XUpscale_Conditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SD_4XUpscale_Conditioning",
|
||||
category="model/conditioning/upscale_diffusion",
|
||||
category="model/conditioning/stable diffusion upscaler",
|
||||
inputs=[
|
||||
io.Image.Input("images"),
|
||||
io.Conditioning.Input("positive"),
|
||||
|
||||
@ -27,7 +27,7 @@ class StableZero123_Conditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableZero123_Conditioning",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/stable zero123",
|
||||
inputs=[
|
||||
io.ClipVision.Input("clip_vision"),
|
||||
io.Image.Input("init_image"),
|
||||
@ -65,7 +65,7 @@ class StableZero123_Conditioning_Batched(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableZero123_Conditioning_Batched",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/stable zero123",
|
||||
inputs=[
|
||||
io.ClipVision.Input("clip_vision"),
|
||||
io.Image.Input("init_image"),
|
||||
@ -112,7 +112,7 @@ class SV3D_Conditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="SV3D_Conditioning",
|
||||
category="model/conditioning/3d_models",
|
||||
category="model/conditioning/stable video 3d",
|
||||
inputs=[
|
||||
io.ClipVision.Input("clip_vision"),
|
||||
io.Image.Input("init_image"),
|
||||
|
||||
@ -29,7 +29,7 @@ class StableCascade_EmptyLatentImage(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableCascade_EmptyLatentImage",
|
||||
category="model/latent/stable_cascade",
|
||||
category="model/latent/stable cascade",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8),
|
||||
io.Int.Input("height", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8),
|
||||
@ -58,7 +58,7 @@ class StableCascade_StageC_VAEEncode(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableCascade_StageC_VAEEncode",
|
||||
category="model/latent/stable_cascade",
|
||||
category="model/latent/stable cascade",
|
||||
inputs=[
|
||||
io.Image.Input("image"),
|
||||
io.Vae.Input("vae"),
|
||||
@ -93,7 +93,7 @@ class StableCascade_StageB_Conditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="StableCascade_StageB_Conditioning",
|
||||
category="model/conditioning/stable_cascade",
|
||||
category="model/conditioning/stable cascade",
|
||||
inputs=[
|
||||
io.Conditioning.Input("conditioning"),
|
||||
io.Latent.Input("stage_c"),
|
||||
|
||||
@ -35,7 +35,7 @@ class TextGenerate(io.ComfyNode):
|
||||
io.Image.Input("image", optional=True),
|
||||
io.Image.Input("video", optional=True, tooltip="Video frames as image batch. Assumed to be 24 FPS; subsampled to 1 FPS internally."),
|
||||
io.Audio.Input("audio", optional=True),
|
||||
io.Int.Input("max_length", default=256, min=1, max=2048),
|
||||
io.Int.Input("max_length", default=512, min=1, max=32768),
|
||||
io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
|
||||
io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."),
|
||||
io.Boolean.Input("use_default_template", optional=True, default=True, tooltip="Use the built in system prompt/template if the model has one.", advanced=True),
|
||||
|
||||
@ -1367,7 +1367,7 @@ class SaveLoRA(io.ComfyNode):
|
||||
node_id="SaveLoRA",
|
||||
search_aliases=["export lora"],
|
||||
display_name="Save LoRA Weights",
|
||||
category="advanced/model_merging",
|
||||
category="model/merging",
|
||||
is_experimental=True,
|
||||
is_output_node=True,
|
||||
inputs=[
|
||||
|
||||
@ -65,7 +65,7 @@ class TripoSplatPreprocessImage(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="TripoSplatPreprocessImage",
|
||||
display_name="TripoSplat Preprocess Image",
|
||||
category="3d/conditioning",
|
||||
category="model/conditioning/triposplat",
|
||||
description="Crop center each image to a square canvas on a black background and add padding.",
|
||||
inputs=[
|
||||
IO.Image.Input("image"),
|
||||
@ -95,7 +95,7 @@ class TripoSplatConditioning(IO.ComfyNode):
|
||||
return IO.Schema(
|
||||
node_id="TripoSplatConditioning",
|
||||
display_name="TripoSplat Conditioning",
|
||||
category="3d/conditioning",
|
||||
category="model/conditioning/triposplat",
|
||||
description="Encode the image with DINOv3 and the Flux2 VAE into TripoSplat positive/negative "
|
||||
"conditioning, and create the fixed size noise target (latent + camera) for the KSampler",
|
||||
inputs=[
|
||||
|
||||
@ -41,7 +41,7 @@ class SVD_img2vid_Conditioning:
|
||||
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "model/conditioning/video_models"
|
||||
CATEGORY = "model/conditioning/stable video"
|
||||
|
||||
def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level):
|
||||
output = clip_vision.encode_image(init_image)
|
||||
@ -108,7 +108,7 @@ class VideoTriangleCFGGuidance:
|
||||
return (m, )
|
||||
|
||||
class ImageOnlyCheckpointSave(comfy_extras.nodes_model_merging.CheckpointSave):
|
||||
CATEGORY = "advanced/model_merging"
|
||||
CATEGORY = "model/merging"
|
||||
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
@ -138,7 +138,7 @@ class ConditioningSetAreaPercentageVideo:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, width, height, temporal, x, y, z, strength):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", temporal, height, width, z, y, x),
|
||||
@ -160,4 +160,5 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"ImageOnlyCheckpointLoader": "Load Checkpoint Image Only (img2vid model)",
|
||||
"VideoLinearCFGGuidance": "Video Linear CFG Guidance",
|
||||
"VideoTriangleCFGGuidance": "Video Triangle CFG Guidance",
|
||||
"ConditioningSetAreaPercentageVideo": "Conditioning (Set Area with Percentage for Video)",
|
||||
}
|
||||
|
||||
@ -175,7 +175,7 @@ class VOIDInpaintConditioning(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDInpaintConditioning",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/void",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -288,7 +288,7 @@ class VOIDWarpedNoise(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDWarpedNoise",
|
||||
category="model/latent/video",
|
||||
category="model/latent/void",
|
||||
inputs=[
|
||||
OpticalFlow.Input(
|
||||
"optical_flow",
|
||||
@ -393,7 +393,7 @@ class VOIDWarpedNoiseSource(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="VOIDWarpedNoiseSource",
|
||||
category="model/sampling/noise",
|
||||
category="model/latent/void",
|
||||
inputs=[
|
||||
io.Latent.Input("warped_noise",
|
||||
tooltip="Warped noise latent from VOIDWarpedNoise"),
|
||||
|
||||
@ -18,7 +18,7 @@ class WanImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -66,7 +66,7 @@ class WanFunControlToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanFunControlToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/fun control",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -119,7 +119,7 @@ class Wan22FunControlToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="Wan22FunControlToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/fun control",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -184,7 +184,7 @@ class WanFirstLastFrameToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanFirstLastFrameToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -256,7 +256,7 @@ class WanFunInpaintToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanFunInpaintToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/fun inpaint",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -288,7 +288,7 @@ class WanVaceToVideo(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="WanVaceToVideo",
|
||||
search_aliases=["video conditioning", "video control"],
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/vace",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -375,7 +375,8 @@ class TrimVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TrimVideoLatent",
|
||||
category="model/latent/video",
|
||||
display_name="Trim Video Latent",
|
||||
category="model/latent",
|
||||
inputs=[
|
||||
io.Latent.Input("samples"),
|
||||
io.Int.Input("trim_amount", default=0, min=0, max=99999),
|
||||
@ -398,7 +399,7 @@ class WanCameraImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanCameraImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/camera",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -452,7 +453,7 @@ class WanPhantomSubjectToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanPhantomSubjectToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/phantom subject",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -707,7 +708,7 @@ class WanTrackToVideo(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="WanTrackToVideo",
|
||||
search_aliases=["motion tracking", "trajectory video", "point tracking", "keypoint animation"],
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -951,7 +952,7 @@ class WanSoundImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanSoundImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/sound",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -984,7 +985,7 @@ class WanSoundImageToVideoExtend(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanSoundImageToVideoExtend",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/sound",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -1046,7 +1047,7 @@ class WanHuMoImageToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanHuMoImageToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/humo",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -1112,7 +1113,7 @@ class WanAnimateToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanAnimateToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/animate",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
@ -1252,7 +1253,7 @@ class Wan22ImageToVideoLatent(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="Wan22ImageToVideoLatent",
|
||||
category="model/conditioning/inpaint",
|
||||
category="model/conditioning/wan",
|
||||
inputs=[
|
||||
io.Vae.Input("vae"),
|
||||
io.Int.Input("width", default=1280, min=32, max=nodes.MAX_RESOLUTION, step=32),
|
||||
@ -1302,7 +1303,7 @@ class WanInfiniteTalkToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanInfiniteTalkToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/infinite talk",
|
||||
inputs=[
|
||||
io.DynamicCombo.Input("mode", options=[
|
||||
io.DynamicCombo.Option("single_speaker", []),
|
||||
|
||||
@ -713,7 +713,7 @@ class WanDancerEncodeAudio(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanDancerEncodeAudio",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/dancer",
|
||||
inputs=[
|
||||
io.Audio.Input("audio"),
|
||||
io.Int.Input("video_frames", default=149, min=1, max=nodes.MAX_RESOLUTION, step=4),
|
||||
@ -787,7 +787,7 @@ class WanDancerVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanDancerVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/dancer",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
|
||||
@ -247,7 +247,7 @@ class WanMoveVisualizeTracks(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanMoveVisualizeTracks",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Image.Input("images"),
|
||||
io.Tracks.Input("tracks", optional=True),
|
||||
@ -283,7 +283,7 @@ class WanMoveTracksFromCoords(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanMoveTracksFromCoords",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.String.Input("track_coords", force_input=True, default="[]", optional=True),
|
||||
io.Mask.Input("track_mask", optional=True),
|
||||
@ -325,7 +325,8 @@ class GenerateTracks(io.ComfyNode):
|
||||
return io.Schema(
|
||||
node_id="GenerateTracks",
|
||||
search_aliases=["motion paths", "camera movement", "trajectory"],
|
||||
category="model/conditioning/video_models",
|
||||
display_name="Generate Video Tracks",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Int.Input("width", default=832, min=16, max=4096, step=16),
|
||||
io.Int.Input("height", default=480, min=16, max=4096, step=16),
|
||||
@ -434,7 +435,7 @@ class WanMoveConcatTrack(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanMoveConcatTrack",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Tracks.Input("tracks_1"),
|
||||
io.Tracks.Input("tracks_2", optional=True),
|
||||
@ -463,7 +464,7 @@ class WanMoveTrackToVideo(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="WanMoveTrackToVideo",
|
||||
category="model/conditioning/video_models",
|
||||
category="model/conditioning/wan/move",
|
||||
inputs=[
|
||||
io.Conditioning.Input("positive"),
|
||||
io.Conditioning.Input("negative"),
|
||||
|
||||
@ -10,7 +10,7 @@ class TextEncodeZImageOmni(io.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return io.Schema(
|
||||
node_id="TextEncodeZImageOmni",
|
||||
category="advanced/conditioning",
|
||||
category="model/conditioning/z-image",
|
||||
is_experimental=True,
|
||||
inputs=[
|
||||
io.Clip.Input("clip"),
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.24.0"
|
||||
__version__ = "0.25.0"
|
||||
|
||||
49
main.py
49
main.py
@ -55,7 +55,11 @@ if __name__ == "__main__" and args.debug_hang:
|
||||
import comfy_aimdo.control
|
||||
|
||||
if enables_dynamic_vram():
|
||||
comfy_aimdo.control.init()
|
||||
try:
|
||||
comfy_aimdo.control.init(simple_vram_headroom=None if args.reserve_vram is None else int(args.reserve_vram * 1024 ** 3))
|
||||
except TypeError:
|
||||
# comfy-aimdo 0.4.9 protocol.
|
||||
comfy_aimdo.control.init()
|
||||
|
||||
if os.name == "nt":
|
||||
os.environ['MIMALLOC_PURGE_DELAY'] = '0'
|
||||
@ -123,6 +127,10 @@ def apply_custom_paths():
|
||||
for config_path in itertools.chain(*args.extra_model_paths_config):
|
||||
utils.extra_config.load_extra_path_config(config_path)
|
||||
|
||||
# --base-directory
|
||||
if args.base_directory:
|
||||
logging.info(f"Setting base directory to: {folder_paths.base_path}")
|
||||
|
||||
# --output-directory, --input-directory, --user-directory
|
||||
if args.output_directory:
|
||||
output_dir = os.path.abspath(args.output_directory)
|
||||
@ -231,23 +239,30 @@ import comfy.model_patcher
|
||||
if args.enable_dynamic_vram or (enables_dynamic_vram() and comfy.model_management.is_nvidia() and not comfy.model_management.is_wsl()):
|
||||
if (not args.enable_dynamic_vram) and (comfy.model_management.torch_version_numeric < (2, 8)):
|
||||
logging.warning("Unsupported Pytorch detected. DynamicVRAM support requires Pytorch version 2.8 or later. Falling back to legacy ModelPatcher. VRAM estimates may be unreliable especially on Windows")
|
||||
elif comfy_aimdo.control.init_devices(d.index for d in comfy.model_management.get_all_torch_devices()):
|
||||
if args.verbose == 'DEBUG':
|
||||
comfy_aimdo.control.set_log_debug()
|
||||
elif args.verbose == 'CRITICAL':
|
||||
comfy_aimdo.control.set_log_critical()
|
||||
elif args.verbose == 'ERROR':
|
||||
comfy_aimdo.control.set_log_error()
|
||||
elif args.verbose == 'WARNING':
|
||||
comfy_aimdo.control.set_log_warning()
|
||||
else: #INFO
|
||||
comfy_aimdo.control.set_log_info()
|
||||
|
||||
comfy.model_patcher.CoreModelPatcher = comfy.model_patcher.ModelPatcherDynamic
|
||||
comfy.memory_management.aimdo_enabled = True
|
||||
logging.info("DynamicVRAM support detected and enabled")
|
||||
else:
|
||||
logging.warning("No working comfy-aimdo install detected. DynamicVRAM support disabled. Falling back to legacy ModelPatcher. VRAM estimates may be unreliable especially on Windows")
|
||||
try:
|
||||
aimdo_initialized = comfy_aimdo.control.init_devices((d.index, int(args.vram_headroom * 1024 ** 3)) for d in comfy.model_management.get_all_torch_devices())
|
||||
except TypeError:
|
||||
# comfy-aimdo 0.4.9 protocol.
|
||||
aimdo_initialized = comfy_aimdo.control.init_devices(d.index for d in comfy.model_management.get_all_torch_devices())
|
||||
|
||||
if aimdo_initialized:
|
||||
if args.verbose == 'DEBUG':
|
||||
comfy_aimdo.control.set_log_debug()
|
||||
elif args.verbose == 'CRITICAL':
|
||||
comfy_aimdo.control.set_log_critical()
|
||||
elif args.verbose == 'ERROR':
|
||||
comfy_aimdo.control.set_log_error()
|
||||
elif args.verbose == 'WARNING':
|
||||
comfy_aimdo.control.set_log_warning()
|
||||
else: #INFO
|
||||
comfy_aimdo.control.set_log_info()
|
||||
|
||||
comfy.model_patcher.CoreModelPatcher = comfy.model_patcher.ModelPatcherDynamic
|
||||
comfy.memory_management.aimdo_enabled = True
|
||||
logging.info("DynamicVRAM support detected and enabled")
|
||||
else:
|
||||
logging.warning("No working comfy-aimdo install detected. DynamicVRAM support disabled. Falling back to legacy ModelPatcher. VRAM estimates may be unreliable especially on Windows")
|
||||
|
||||
|
||||
def cuda_malloc_warning():
|
||||
|
||||
66
nodes.py
66
nodes.py
@ -20,8 +20,6 @@ from PIL.PngImagePlugin import PngInfo
|
||||
import numpy as np
|
||||
import safetensors.torch
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy"))
|
||||
|
||||
import comfy.diffusers_load
|
||||
import comfy.samplers
|
||||
import comfy.sample
|
||||
@ -87,7 +85,7 @@ class ConditioningCombine:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "combine"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
SEARCH_ALIASES = ["combine", "merge conditioning", "combine prompts", "merge prompts", "mix prompts", "add prompt"]
|
||||
|
||||
def combine(self, conditioning_1, conditioning_2):
|
||||
@ -104,7 +102,7 @@ class ConditioningAverage :
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "addWeighted"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def addWeighted(self, conditioning_to, conditioning_from, conditioning_to_strength):
|
||||
out = []
|
||||
@ -143,7 +141,7 @@ class ConditioningConcat:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "concat"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def concat(self, conditioning_to, conditioning_from):
|
||||
out = []
|
||||
@ -176,7 +174,7 @@ class ConditioningSetArea:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, width, height, x, y, strength):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"area": (height // 8, width // 8, y // 8, x // 8),
|
||||
@ -197,7 +195,7 @@ class ConditioningSetAreaPercentage:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, width, height, x, y, strength):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", height, width, y, x),
|
||||
@ -214,7 +212,7 @@ class ConditioningSetAreaStrength:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, strength):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"strength": strength})
|
||||
@ -234,7 +232,7 @@ class ConditioningSetMask:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "append"
|
||||
|
||||
CATEGORY = "model/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def append(self, conditioning, mask, set_cond_area, strength):
|
||||
set_area_to_bounds = False
|
||||
@ -257,7 +255,7 @@ class ConditioningZeroOut:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "zero_out"
|
||||
|
||||
CATEGORY = "advanced/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def zero_out(self, conditioning):
|
||||
c = []
|
||||
@ -283,11 +281,10 @@ class ConditioningSetTimestepRange:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "set_range"
|
||||
|
||||
CATEGORY = "advanced/conditioning"
|
||||
CATEGORY = "model/conditioning/transform"
|
||||
|
||||
def set_range(self, conditioning, start, end):
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"start_percent": start,
|
||||
"end_percent": end})
|
||||
c = node_helpers.conditioning_set_values(conditioning, {"start_percent": start, "end_percent": end})
|
||||
return (c, )
|
||||
|
||||
class VAEDecode:
|
||||
@ -389,7 +386,7 @@ class VAEEncodeForInpaint:
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "model/latent/inpaint"
|
||||
CATEGORY = "model/latent"
|
||||
|
||||
def encode(self, vae, pixels, mask, grow_mask_by=6):
|
||||
downscale_ratio = vae.spacial_compression_encode()
|
||||
@ -438,7 +435,7 @@ class InpaintModelConditioning:
|
||||
RETURN_NAMES = ("positive", "negative", "latent")
|
||||
FUNCTION = "encode"
|
||||
|
||||
CATEGORY = "model/conditioning/inpaint"
|
||||
CATEGORY = "model/conditioning"
|
||||
|
||||
def encode(self, positive, negative, pixels, vae, mask, noise_mask=True):
|
||||
x = (pixels.shape[1] // 8) * 8
|
||||
@ -576,7 +573,7 @@ class CheckpointLoader:
|
||||
RETURN_TYPES = ("MODEL", "CLIP", "VAE")
|
||||
FUNCTION = "load_checkpoint"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
DEPRECATED = True
|
||||
|
||||
def load_checkpoint(self, config_name, ckpt_name):
|
||||
@ -622,8 +619,9 @@ class DiffusersLoader:
|
||||
return {"required": {"model_path": (paths,), }}
|
||||
RETURN_TYPES = ("MODEL", "CLIP", "VAE")
|
||||
FUNCTION = "load_checkpoint"
|
||||
DEPRECATED = True
|
||||
|
||||
CATEGORY = "advanced/loaders/deprecated"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
def load_checkpoint(self, model_path, output_vae=True, output_clip=True):
|
||||
for search_path in folder_paths.get_folder_paths("diffusers"):
|
||||
@ -949,7 +947,7 @@ class UNETLoader:
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "load_unet"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
def load_unet(self, unet_name, weight_dtype):
|
||||
model_options = {}
|
||||
@ -969,7 +967,7 @@ class CLIPLoader:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4"], ),
|
||||
"type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit", "ideogram4", "boogu"], ),
|
||||
},
|
||||
"optional": {
|
||||
"device": (["default", "cpu"], {"advanced": True}),
|
||||
@ -977,9 +975,9 @@ class CLIPLoader:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "load_clip"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b\n pixeldit: gemma 2 2B elm"
|
||||
DESCRIPTION = "Recipes:\nsd: clip-l\nstable cascade: clip-g\nsd3: t5 xxl / clip-g / clip-l\nstable audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\nhidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b\npixeldit: gemma 2 2B elm"
|
||||
|
||||
def load_clip(self, clip_name, type="stable_diffusion", device="default"):
|
||||
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
|
||||
@ -1005,9 +1003,9 @@ class DualCLIPLoader:
|
||||
RETURN_TYPES = ("CLIP",)
|
||||
FUNCTION = "load_clip"
|
||||
|
||||
CATEGORY = "advanced/loaders"
|
||||
CATEGORY = "model/loaders"
|
||||
|
||||
DESCRIPTION = "[Recipes]\n\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama\nhunyuan_image: qwen2.5vl 7b and byt5 small\nnewbie: gemma-3-4b-it, jina clip v2"
|
||||
DESCRIPTION = "Recipes:\nsdxl: clip-l, clip-g\nsd3: clip-l, clip-g / clip-l, t5 / clip-g, t5\nflux: clip-l, t5\nhidream: at least one of t5 or llama, recommended t5 and llama\nhunyuan_image: qwen2.5vl 7b and byt5 small\nnewbie: gemma-3-4b-it, jina clip v2"
|
||||
|
||||
def load_clip(self, clip_name1, clip_name2, type, device="default"):
|
||||
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
|
||||
@ -1088,7 +1086,7 @@ class StyleModelApply:
|
||||
RETURN_TYPES = ("CONDITIONING",)
|
||||
FUNCTION = "apply_stylemodel"
|
||||
|
||||
CATEGORY = "model/conditioning/style_model"
|
||||
CATEGORY = "model/conditioning"
|
||||
|
||||
def apply_stylemodel(self, conditioning, style_model, clip_vision_output, strength, strength_type):
|
||||
cond = style_model.get_cond(clip_vision_output).flatten(start_dim=0, end_dim=1).unsqueeze(dim=0)
|
||||
@ -1518,13 +1516,11 @@ class LatentCrop:
|
||||
class SetLatentNoiseMask:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "samples": ("LATENT",),
|
||||
"mask": ("MASK",),
|
||||
}}
|
||||
return {"required": { "samples": ("LATENT",), "mask": ("MASK",), }}
|
||||
RETURN_TYPES = ("LATENT",)
|
||||
FUNCTION = "set_mask"
|
||||
|
||||
CATEGORY = "model/latent/inpaint"
|
||||
CATEGORY = "model/latent"
|
||||
|
||||
def set_mask(self, samples, mask):
|
||||
s = samples.copy()
|
||||
@ -2045,7 +2041,7 @@ NODE_CLASS_MAPPINGS = {
|
||||
"ImageBatch": ImageBatch,
|
||||
"ImagePadForOutpaint": ImagePadForOutpaint,
|
||||
"EmptyImage": EmptyImage,
|
||||
"ConditioningAverage": ConditioningAverage ,
|
||||
"ConditioningAverage": ConditioningAverage,
|
||||
"ConditioningCombine": ConditioningCombine,
|
||||
"ConditioningConcat": ConditioningConcat,
|
||||
"ConditioningSetArea": ConditioningSetArea,
|
||||
@ -2101,6 +2097,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"LoraLoader": "Load LoRA (Model and CLIP)",
|
||||
"LoraLoaderModelOnly": "Load LoRA",
|
||||
"CLIPLoader": "Load CLIP",
|
||||
"DualCLIPLoader": "Load CLIP (Dual)",
|
||||
"ControlNetLoader": "Load ControlNet Model",
|
||||
"DiffControlNetLoader": "Load ControlNet Model (diff)",
|
||||
"StyleModelLoader": "Load Style Model",
|
||||
@ -2108,6 +2105,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"UNETLoader": "Load Diffusion Model",
|
||||
"unCLIPCheckpointLoader": "Load unCLIP Checkpoint",
|
||||
"GLIGENLoader": "Load GLIGEN Model",
|
||||
"DiffusersLoader": "Load Diffusers Model (DEPRECATED)",
|
||||
# Conditioning
|
||||
"CLIPVisionEncode": "CLIP Vision Encode",
|
||||
"StyleModelApply": "Apply Style Model",
|
||||
@ -2115,12 +2113,16 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"CLIPSetLastLayer": "CLIP Set Last Layer",
|
||||
"ConditioningCombine": "Conditioning (Combine)",
|
||||
"ConditioningAverage ": "Conditioning (Average)",
|
||||
"ConditioningAverage": "Conditioning (Average)",
|
||||
"ConditioningConcat": "Conditioning (Concat)",
|
||||
"ConditioningSetArea": "Conditioning (Set Area)",
|
||||
"ConditioningSetAreaPercentage": "Conditioning (Set Area with Percentage)",
|
||||
"ConditioningSetAreaStrength": "Conditioning (Set Area Strength)",
|
||||
"ConditioningSetMask": "Conditioning (Set Mask)",
|
||||
"ControlNetApply": "Apply ControlNet (DEPRECATED)",
|
||||
"ControlNetApplyAdvanced": "Apply ControlNet",
|
||||
"GLIGENTextBoxApply": "Apply GLIGEN Text Box",
|
||||
"ConditioningZeroOut": "Conditioning Zero Out",
|
||||
# Latent
|
||||
"VAEEncodeForInpaint": "VAE Encode (for Inpainting)",
|
||||
"SetLatentNoiseMask": "Set Latent Noise Mask",
|
||||
@ -2134,7 +2136,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
|
||||
"LatentUpscaleBy": "Upscale Latent By",
|
||||
"LatentComposite": "Latent Composite",
|
||||
"LatentBlend": "Latent Blend",
|
||||
"LatentFromBatch" : "Latent From Batch",
|
||||
"LatentFromBatch" : "Get Latent From Batch",
|
||||
"RepeatLatentBatch": "Repeat Latent Batch",
|
||||
# Image
|
||||
"EmptyImage": "Empty Image",
|
||||
@ -2295,6 +2297,9 @@ async def init_external_custom_nodes():
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
# TODO: remove at some point when custom nodes don't break.
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), "comfy"))
|
||||
|
||||
base_node_names = set(NODE_CLASS_MAPPINGS.keys())
|
||||
node_paths = folder_paths.get_folder_paths("custom_nodes")
|
||||
node_import_times = []
|
||||
@ -2421,6 +2426,7 @@ async def init_builtin_extra_nodes():
|
||||
"nodes_tcfg.py",
|
||||
"nodes_context_windows.py",
|
||||
"nodes_qwen.py",
|
||||
"nodes_boogu.py",
|
||||
"nodes_chroma_radiance.py",
|
||||
"nodes_pid.py",
|
||||
"nodes_model_patch.py",
|
||||
|
||||
215
openapi.yaml
215
openapi.yaml
@ -673,6 +673,35 @@ components:
|
||||
- created_at
|
||||
- updated_at
|
||||
type: object
|
||||
JobsCancelRequest:
|
||||
additionalProperties: false
|
||||
description: Request to cancel multiple jobs by ID.
|
||||
properties:
|
||||
job_ids:
|
||||
description: Job identifiers (UUIDs) to cancel.
|
||||
items:
|
||||
format: uuid
|
||||
type: string
|
||||
maxItems: 100
|
||||
minItems: 1
|
||||
type: array
|
||||
required:
|
||||
- job_ids
|
||||
type: object
|
||||
JobsCancelResponse:
|
||||
description: Response for POST /api/jobs/cancel.
|
||||
properties:
|
||||
cancelled:
|
||||
description: |
|
||||
Job IDs for which a cancel event was successfully dispatched by this
|
||||
call. Jobs already in a terminal or cancelling state are idempotently
|
||||
skipped and will not appear here.
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
required:
|
||||
- cancelled
|
||||
type: object
|
||||
JobsListResponse:
|
||||
description: Paginated list of jobs for the authenticated user.
|
||||
properties:
|
||||
@ -896,11 +925,6 @@ components:
|
||||
additionalProperties: true
|
||||
description: The workflow graph to execute
|
||||
type: object
|
||||
prompt_id:
|
||||
description: Optional client-supplied job id. Must be a UUID in canonical lowercase hyphenated form; it is echoed back in the response. Omitted or null means the server generates one.
|
||||
format: uuid
|
||||
nullable: true
|
||||
type: string
|
||||
workflow_id:
|
||||
description: UUID identifying the cloud workflow entity to associate with this job
|
||||
type: string
|
||||
@ -1011,7 +1035,7 @@ components:
|
||||
description: If true, clear all pending jobs from the queue
|
||||
type: boolean
|
||||
delete:
|
||||
description: Array of PENDING job IDs to cancel
|
||||
description: Array of job IDs to cancel; pending and running jobs transition to cancelled
|
||||
items:
|
||||
type: string
|
||||
type: array
|
||||
@ -1800,13 +1824,91 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Invalid request (no fields provided)
|
||||
description: |
|
||||
Invalid request — no fields provided, or `preview_id` is the zero UUID
|
||||
(`INVALID_PREVIEW_ID`).
|
||||
"401":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Unauthorized
|
||||
"404":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: |
|
||||
Asset not found — returned both when the asset being updated does
|
||||
not exist and when `preview_id` does not reference an asset
|
||||
accessible to the caller.
|
||||
"500":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Internal server error
|
||||
summary: Update asset metadata
|
||||
tags:
|
||||
- file
|
||||
/api/assets/{id}/content:
|
||||
get:
|
||||
description: |
|
||||
Returns the binary content of an asset by ID.
|
||||
|
||||
The contract is the same across runtimes — "GET this path and you
|
||||
receive the asset's bytes" — but the mechanism differs:
|
||||
- **Local ComfyUI** streams the bytes directly (`200`,
|
||||
`application/octet-stream`).
|
||||
- **Cloud** does not proxy large files; it responds `302` with a
|
||||
`Location` redirect to a short-lived signed storage URL. Clients that
|
||||
follow redirects (browsers, `fetch`/XHR, `<img>`/`<video>`) receive
|
||||
the bytes transparently.
|
||||
|
||||
Prefer this over the filename-addressed `/api/view` when you have an
|
||||
asset ID.
|
||||
operationId: getAssetContent
|
||||
parameters:
|
||||
- description: Asset ID
|
||||
in: path
|
||||
name: id
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- description: |
|
||||
Content-Disposition for the response: `attachment` (download) or
|
||||
`inline` (render in browser). Defaults to `attachment`.
|
||||
in: query
|
||||
name: disposition
|
||||
schema:
|
||||
default: attachment
|
||||
enum:
|
||||
- inline
|
||||
- attachment
|
||||
type: string
|
||||
responses:
|
||||
"200":
|
||||
content:
|
||||
application/octet-stream:
|
||||
schema:
|
||||
format: binary
|
||||
type: string
|
||||
description: Asset content stream (local runtime streams the bytes directly)
|
||||
"302":
|
||||
description: Redirect to a signed storage URL (cloud runtime)
|
||||
headers:
|
||||
Cache-Control:
|
||||
description: Private caching directive scoped to the signed URL lifetime
|
||||
schema:
|
||||
type: string
|
||||
Location:
|
||||
description: Short-lived signed URL to the asset content in storage
|
||||
schema:
|
||||
type: string
|
||||
Vary:
|
||||
description: Partitions any cached redirect by auth credentials so a private redirect is not reused across users
|
||||
schema:
|
||||
type: string
|
||||
"404":
|
||||
content:
|
||||
application/json:
|
||||
@ -1819,7 +1921,11 @@ paths:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Internal server error
|
||||
summary: Update asset metadata
|
||||
security:
|
||||
- ApiKeyAuth: []
|
||||
- BearerAuth: []
|
||||
- CookieAuth: []
|
||||
summary: Get asset content
|
||||
tags:
|
||||
- file
|
||||
/api/assets/{id}/tags:
|
||||
@ -2675,14 +2781,20 @@ paths:
|
||||
summary: Get internationalisation translation strings
|
||||
/api/interrupt:
|
||||
post:
|
||||
deprecated: true
|
||||
description: |
|
||||
Cancel all currently RUNNING jobs for the authenticated user.
|
||||
This will interrupt any job that is currently in 'in_progress' status.
|
||||
Note: This endpoint only affects running jobs. To cancel pending jobs, use /api/queue.
|
||||
Deprecated. Prefer the jobs-namespace cancel endpoints:
|
||||
POST /api/jobs/{job_id}/cancel for a single job, or
|
||||
POST /api/jobs/cancel to cancel jobs by ID.
|
||||
|
||||
Cancels the first active job for the authenticated user (the currently
|
||||
running job if there is one, otherwise the next pending job). Takes no
|
||||
body and cannot target a specific job — use the jobs-namespace endpoints
|
||||
for that.
|
||||
operationId: interruptJob
|
||||
responses:
|
||||
"200":
|
||||
description: Success - Job interrupted or no running job found
|
||||
description: Success - first active job cancelled, or no active job found
|
||||
"401":
|
||||
content:
|
||||
application/json:
|
||||
@ -2695,7 +2807,7 @@ paths:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Internal server error
|
||||
summary: Interrupt currently running jobs
|
||||
summary: Interrupt the first active job
|
||||
tags:
|
||||
- queue
|
||||
/api/job/{job_id}/status:
|
||||
@ -2954,6 +3066,64 @@ paths:
|
||||
summary: Cancel a job
|
||||
tags:
|
||||
- workflow
|
||||
/api/jobs/cancel:
|
||||
post:
|
||||
description: |
|
||||
Cancel one or more jobs for the authenticated user in a single request.
|
||||
|
||||
State-agnostic: cancels both pending and running jobs (both transition to
|
||||
the cancelled state via the same mechanism as the single-job endpoint).
|
||||
|
||||
Idempotent per job: a job already in a terminal or cancelling state is a
|
||||
no-op and simply will not appear in the returned `cancelled` list.
|
||||
|
||||
Fail-fast on unknown IDs: if any provided job ID does not exist for this
|
||||
user, the request returns 404 and no jobs are cancelled. This surfaces
|
||||
bad IDs to the caller rather than silently dropping them.
|
||||
|
||||
This is the canonical batch-cancel endpoint. The delete operation on
|
||||
POST /api/queue is deprecated in favour of this.
|
||||
operationId: cancelJobs
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/JobsCancelRequest'
|
||||
required: true
|
||||
responses:
|
||||
"200":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/JobsCancelResponse'
|
||||
description: Success - cancel requests dispatched (or jobs were already terminal)
|
||||
"400":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Bad Request - job_ids is missing, empty, exceeds the maximum count, or contains an invalid UUID
|
||||
"401":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Unauthorized - Authentication required
|
||||
"404":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: One or more job IDs not found for this user (no jobs cancelled)
|
||||
"500":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/ErrorResponse'
|
||||
description: Internal server error - cancellation failed
|
||||
summary: Cancel multiple jobs
|
||||
tags:
|
||||
- workflow
|
||||
/api/node_replacements:
|
||||
get:
|
||||
description: |
|
||||
@ -3050,6 +3220,12 @@ paths:
|
||||
schema:
|
||||
$ref: '#/components/schemas/PromptErrorResponse'
|
||||
description: Payment required - Insufficient credits
|
||||
"413":
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: '#/components/schemas/PromptErrorResponse'
|
||||
description: Workflow JSON too large
|
||||
"429":
|
||||
content:
|
||||
application/json:
|
||||
@ -3098,9 +3274,18 @@ paths:
|
||||
tags:
|
||||
- queue
|
||||
post:
|
||||
deprecated: true
|
||||
description: |
|
||||
Cancel specific PENDING jobs by ID or clear all pending jobs in the queue.
|
||||
Note: This endpoint only affects pending jobs. To cancel running jobs, use /api/interrupt.
|
||||
Deprecated. Prefer the jobs-namespace cancel endpoints:
|
||||
POST /api/jobs/cancel for cancelling jobs by ID, and
|
||||
POST /api/jobs/{job_id}/cancel for a single job.
|
||||
|
||||
Cancel specific jobs by ID (the `delete` field) or clear all pending
|
||||
jobs in the queue (the `clear` field). Despite the `delete` naming, this
|
||||
does not delete anything — listed jobs transition to the cancelled state,
|
||||
and `delete` cancels both pending and running jobs (not pending-only as
|
||||
previously documented). Job-by-ID cancellation is superseded by
|
||||
POST /api/jobs/cancel; `clear` has no jobs-namespace replacement yet.
|
||||
operationId: manageQueue
|
||||
requestBody:
|
||||
content:
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "ComfyUI"
|
||||
version = "0.24.0"
|
||||
version = "0.25.0"
|
||||
readme = "README.md"
|
||||
license = { file = "LICENSE" }
|
||||
requires-python = ">=3.10"
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
comfyui-frontend-package==1.45.15
|
||||
comfyui-workflow-templates==0.9.98
|
||||
comfyui-frontend-package==1.45.19
|
||||
comfyui-workflow-templates==0.10.0
|
||||
comfyui-embedded-docs==0.5.4
|
||||
torch
|
||||
torchsde
|
||||
@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0
|
||||
filelock
|
||||
av>=16.0.0
|
||||
comfy-kitchen==0.2.10
|
||||
comfy-aimdo==0.4.9
|
||||
comfy-aimdo==0.4.10
|
||||
requests
|
||||
simpleeval>=1.0.0
|
||||
blake3
|
||||
|
||||
Loading…
Reference in New Issue
Block a user