Merge branch 'master' into seedvr2-native-support-v5

This commit is contained in:
John Pollock 2026-06-18 09:58:04 -05:00
commit ad04a6199e
115 changed files with 25844 additions and 483 deletions

View File

@ -140,7 +140,7 @@ ComfyUI follows a weekly release cycle targeting Monday but this regularly chang
- Commits outside of the stable release tags may be very unstable and break many custom nodes.
- Serves as the foundation for the desktop release
2. **[ComfyUI Desktop](https://github.com/Comfy-Org/desktop)**
2. **[Comfy Desktop](https://github.com/Comfy-Org/Comfy-Desktop)**
- Builds a new release using the latest stable core version
3. **[ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend)**
@ -309,7 +309,7 @@ After this you should have everything installed and can proceed to running Comfy
#### Apple Mac silicon
You can install ComfyUI in Apple Mac silicon (M1 or M2) with any recent macOS version.
You can install ComfyUI in Apple Mac silicon (M1, M2, M3 or M4) with any recent macOS version.
1. Install pytorch nightly. For instructions, read the [Accelerated PyTorch training on Mac](https://developer.apple.com/metal/pytorch/) Apple Developer guide (make sure to install the latest pytorch nightly).
1. Follow the [ComfyUI manual installation](#manual-install-windows-linux) instructions for Windows and Linux.
@ -364,7 +364,7 @@ For models compatible with Iluvatar Extension for PyTorch. Here's a step-by-step
| Flag | Description |
|------|-------------|
| `--enable-manager` | Enable ComfyUI-Manager |
| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (requires `--enable-manager`) |
| `--enable-manager-legacy-ui` | Use the legacy manager UI instead of the new UI (implies `--enable-manager`) |
| `--disable-manager-ui` | Disable the manager UI and endpoints while keeping background features like security checks and scheduled installation completion (requires `--enable-manager`) |
@ -382,11 +382,7 @@ For AMD 7600 and maybe other RDNA3 cards: ```HSA_OVERRIDE_GFX_VERSION=11.0.0 pyt
### AMD ROCm Tips
You can enable experimental memory efficient attention on recent pytorch in ComfyUI on some AMD GPUs using this command, it should already be enabled by default on RDNA3. If this improves speed for you on latest pytorch on your GPU please report it so that I can enable it by default.
```TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 python main.py --use-pytorch-cross-attention```
You can also try setting this env variable `PYTORCH_TUNABLEOP_ENABLED=1` which might speed things up at the cost of a very slow initial run.
You can try setting this env variable `PYTORCH_TUNABLEOP_ENABLED=1` which might speed things up at the cost of a very slow initial run.
# Notes
@ -462,16 +458,6 @@ To use the most up-to-date frontend version:
This approach allows you to easily switch between the stable fortnightly release and the cutting-edge daily updates, or even specific versions for testing purposes.
### Accessing the Legacy Frontend
If you need to use the legacy frontend for any reason, you can access it using the following command line argument:
```
--front-end-version Comfy-Org/ComfyUI_legacy_frontend@latest
```
This will use a snapshot of the legacy frontend preserved in the [ComfyUI Legacy Frontend repository](https://github.com/Comfy-Org/ComfyUI_legacy_frontend).
# QA
### Which GPU should I buy for this?

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,569 @@
{
"revision": 0,
"last_node_id": 89,
"last_link_id": 0,
"nodes": [
{
"id": 89,
"type": "85e595bd-af9e-40ee-85c5-b98bb15da47a",
"pos": [
320,
520
],
"size": [
400,
360
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": null
},
{
"name": "resolution",
"type": "INT",
"widget": {
"name": "resolution"
},
"link": null
},
{
"name": "resize_method",
"type": "COMBO",
"widget": {
"name": "resize_method"
},
"link": null
},
{
"label": "output_type",
"name": "output",
"type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
"name": "output"
},
"link": null
},
{
"label": "output_normalization",
"name": "output.normalization",
"type": "COMBO",
"widget": {
"name": "output.normalization"
},
"link": null
},
{
"label": "apply_sky_clip",
"name": "output.apply_sky_clip",
"type": "BOOLEAN",
"widget": {
"name": "output.apply_sky_clip"
},
"link": null
},
{
"name": "model_name",
"type": "COMBO",
"widget": {
"name": "model_name"
},
"link": null
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"links": []
}
],
"properties": {
"proxyWidgets": [
[
"87",
"resolution"
],
[
"87",
"resize_method"
],
[
"86",
"output"
],
[
"86",
"output.normalization"
],
[
"86",
"output.apply_sky_clip"
],
[
"88",
"model_name"
]
],
"cnr_id": "comfy-core",
"ver": "0.24.0"
},
"widgets_values": [],
"title": "Image Depth Estimation (Depth Anything 3)"
}
],
"links": [],
"version": 0.4,
"definitions": {
"subgraphs": [
{
"id": "85e595bd-af9e-40ee-85c5-b98bb15da47a",
"version": 1,
"state": {
"lastGroupId": 4,
"lastNodeId": 89,
"lastLinkId": 109,
"lastRerouteId": 0
},
"revision": 2,
"config": {},
"name": "Image Depth Estimation (Depth Anything 3)",
"inputNode": {
"id": -10,
"bounding": [
400,
90,
166.998046875,
188
]
},
"outputNode": {
"id": -20,
"bounding": [
1250,
146,
128,
68
]
},
"inputs": [
{
"id": "43cf3118-495a-487d-8eb3-a17c7e92f64f",
"name": "image",
"type": "IMAGE",
"linkIds": [
19
],
"localized_name": "image",
"pos": [
542.998046875,
114
]
},
{
"id": "1089a0a1-6db1-45a8-84b0-0bfdc2ed920a",
"name": "resolution",
"type": "INT",
"linkIds": [
22
],
"pos": [
542.998046875,
134
]
},
{
"id": "25fb64ac-26d5-466d-995b-6d51b9afa2c4",
"name": "resize_method",
"type": "COMBO",
"linkIds": [
23
],
"pos": [
542.998046875,
154
]
},
{
"id": "8acafb7c-6c8b-46b3-9d74-c563498a3af1",
"name": "output",
"type": "COMFY_DYNAMICCOMBO_V3",
"linkIds": [
24
],
"label": "output_type",
"pos": [
542.998046875,
174
]
},
{
"id": "1da5009b-4648-43e8-a257-16426630cf22",
"name": "output.normalization",
"type": "COMBO",
"linkIds": [
25
],
"label": "output_normalization",
"pos": [
542.998046875,
194
]
},
{
"id": "fd7edb33-5fb1-4538-a411-26e5039a9321",
"name": "output.apply_sky_clip",
"type": "BOOLEAN",
"linkIds": [
26
],
"label": "apply_sky_clip",
"pos": [
542.998046875,
214
]
},
{
"id": "b5be4c8a-b833-4f1e-8c94-3ed1dd722190",
"name": "model_name",
"type": "COMBO",
"linkIds": [
106
],
"pos": [
542.998046875,
234
]
}
],
"outputs": [
{
"id": "478ab537-63bc-4d74-a9f0-c975f550880f",
"name": "IMAGE",
"type": "IMAGE",
"linkIds": [
7
],
"localized_name": "IMAGE",
"pos": [
1274,
170
]
}
],
"widgets": [],
"nodes": [
{
"id": 86,
"type": "DA3Render",
"pos": [
800,
310
],
"size": [
380,
130
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [
{
"localized_name": "da3_geometry",
"name": "da3_geometry",
"type": "DA3_GEOMETRY",
"link": 12
},
{
"localized_name": "output",
"name": "output",
"type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
"name": "output"
},
"link": 24
},
{
"localized_name": "output.normalization",
"name": "output.normalization",
"type": "COMBO",
"widget": {
"name": "output.normalization"
},
"link": 25
},
{
"localized_name": "output.apply_sky_clip",
"name": "output.apply_sky_clip",
"type": "BOOLEAN",
"widget": {
"name": "output.apply_sky_clip"
},
"link": 26
},
{
"name": "geometry",
"type": "DA3_GEOMETRY",
"link": null
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"slot_index": 0,
"links": [
7
]
}
],
"properties": {
"Node name for S&R": "DA3Render",
"cnr_id": "comfy-core",
"ver": "0.19.0"
},
"widgets_values": [
"depth",
"v2_style",
false
]
},
{
"id": 87,
"type": "DA3Inference",
"pos": [
800,
50
],
"size": [
390,
130
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"localized_name": "da3_model",
"name": "da3_model",
"type": "DA3_MODEL",
"link": 107
},
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 19
},
{
"localized_name": "resolution",
"name": "resolution",
"type": "INT",
"widget": {
"name": "resolution"
},
"link": 22
},
{
"localized_name": "resize_method",
"name": "resize_method",
"type": "COMBO",
"widget": {
"name": "resize_method"
},
"link": 23
},
{
"localized_name": "mode",
"name": "mode",
"type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
"name": "mode"
},
"link": null
}
],
"outputs": [
{
"localized_name": "da3_geometry",
"name": "da3_geometry",
"type": "DA3_GEOMETRY",
"slot_index": 0,
"links": [
12
]
}
],
"properties": {
"Node name for S&R": "DA3Inference",
"cnr_id": "comfy-core",
"ver": "0.19.0"
},
"widgets_values": [
504,
"upper_bound_resize",
"mono"
]
},
{
"id": 88,
"type": "LoadDA3Model",
"pos": [
810,
-160
],
"size": [
400,
140
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "model_name",
"name": "model_name",
"type": "COMBO",
"widget": {
"name": "model_name"
},
"link": 106
},
{
"localized_name": "weight_dtype",
"name": "weight_dtype",
"type": "COMBO",
"widget": {
"name": "weight_dtype"
},
"link": null
}
],
"outputs": [
{
"localized_name": "DA3_MODEL",
"name": "DA3_MODEL",
"type": "DA3_MODEL",
"links": [
107
]
}
],
"properties": {
"Node name for S&R": "LoadDA3Model",
"cnr_id": "comfy-core",
"ver": "0.24.0",
"models": [
{
"name": "depth_anything_3_mono_large.safetensors",
"url": "https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_mono_large.safetensors",
"directory": "geometry_estimation"
}
]
},
"widgets_values": [
"depth_anything_3_mono_large.safetensors",
"default"
]
}
],
"groups": [],
"links": [
{
"id": 12,
"origin_id": 87,
"origin_slot": 0,
"target_id": 86,
"target_slot": 0,
"type": "DA3_GEOMETRY"
},
{
"id": 19,
"origin_id": -10,
"origin_slot": 0,
"target_id": 87,
"target_slot": 1,
"type": "IMAGE"
},
{
"id": 7,
"origin_id": 86,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 22,
"origin_id": -10,
"origin_slot": 1,
"target_id": 87,
"target_slot": 2,
"type": "INT"
},
{
"id": 23,
"origin_id": -10,
"origin_slot": 2,
"target_id": 87,
"target_slot": 3,
"type": "COMBO"
},
{
"id": 24,
"origin_id": -10,
"origin_slot": 3,
"target_id": 86,
"target_slot": 1,
"type": "COMFY_DYNAMICCOMBO_V3"
},
{
"id": 25,
"origin_id": -10,
"origin_slot": 4,
"target_id": 86,
"target_slot": 2,
"type": "COMBO"
},
{
"id": 26,
"origin_id": -10,
"origin_slot": 5,
"target_id": 86,
"target_slot": 3,
"type": "BOOLEAN"
},
{
"id": 106,
"origin_id": -10,
"origin_slot": 6,
"target_id": 88,
"target_slot": 0,
"type": "COMBO"
},
{
"id": 107,
"origin_id": 88,
"origin_slot": 0,
"target_id": 87,
"target_slot": 0,
"type": "DA3_MODEL"
}
],
"extra": {},
"category": "Conditioning & Preprocessors/Depth",
"description": "This subgraph takes an input image and produces a depth map using the Depth Anything 3 model, which recovers spatially consistent geometry from any number of views. It is ideal for single or multi-view images, videos, and 3D scenes where accurate depth estimation is needed for tasks like SLAM, novel view synthesis, or spatial perception. The model uses a plain transformer backbone and supports both monocular and multi-view inputs without requiring known camera poses."
}
]
},
"extra": {
"BlueprintDescription": "This subgraph takes an input image and produces a depth map using the Depth Anything 3 model, which recovers spatially consistent geometry from any number of views. It is ideal for single or multi-view images, videos, and 3D scenes where accurate depth estimation is needed for tasks like SLAM, novel view synthesis, or spatial perception. The model uses a plain transformer backbone and supports both monocular and multi-view inputs without requiring known camera poses."
}
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1077,9 +1077,12 @@
}
],
"extra": {},
"category": "Image generation and editing/Text to image"
"category": "Image generation and editing/Text to image",
"description": "This subgraph converts text prompts into non-photorealistic illustrations using a 2-billion-parameter model optimized for anime and artistic styles. It is ideal for generating concept art, character designs, or stylized illustrations where photorealism is not required. The model excels with anime and artistic content but performs poorly on realistic subjects."
}
]
},
"extra": {}
"extra": {
"BlueprintDescription": "This subgraph converts text prompts into non-photorealistic illustrations using a 2-billion-parameter model optimized for anime and artistic styles. It is ideal for generating concept art, character designs, or stylized illustrations where photorealism is not required. The model excels with anime and artistic content but performs poorly on realistic subjects."
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,825 @@
{
"revision": 0,
"last_node_id": 97,
"last_link_id": 0,
"nodes": [
{
"id": 97,
"type": "253ec5ca-8333-4ddf-a036-9fc0923651b9",
"pos": [
410,
500
],
"size": [
400,
400
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "video",
"type": "VIDEO",
"link": null
},
{
"name": "start_time",
"type": "FLOAT",
"widget": {
"name": "start_time"
},
"link": null
},
{
"name": "duration",
"type": "FLOAT",
"widget": {
"name": "duration"
},
"link": null
},
{
"name": "resolution",
"type": "INT",
"widget": {
"name": "resolution"
},
"link": null
},
{
"name": "resize_method",
"type": "COMBO",
"widget": {
"name": "resize_method"
},
"link": null
},
{
"label": "output_type",
"name": "output",
"type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
"name": "output"
},
"link": null
},
{
"label": "normalization",
"name": "output.normalization",
"type": "COMBO",
"widget": {
"name": "output.normalization"
},
"link": null
},
{
"name": "output.apply_sky_clip",
"type": "BOOLEAN",
"widget": {
"name": "output.apply_sky_clip"
},
"link": null
},
{
"name": "model_name",
"type": "COMBO",
"widget": {
"name": "model_name"
},
"link": null
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"links": []
},
{
"name": "audio",
"type": "AUDIO",
"links": []
},
{
"name": "fps",
"type": "FLOAT",
"links": []
}
],
"properties": {
"proxyWidgets": [
[
"96",
"start_time"
],
[
"96",
"duration"
],
[
"93",
"resolution"
],
[
"93",
"resize_method"
],
[
"92",
"output"
],
[
"92",
"output.normalization"
],
[
"92",
"output.apply_sky_clip"
],
[
"94",
"model_name"
]
],
"cnr_id": "comfy-core",
"ver": "0.24.0"
},
"widgets_values": [],
"title": "Video Depth Estimation (Depth Anything 3)"
}
],
"links": [],
"version": 0.4,
"definitions": {
"subgraphs": [
{
"id": "253ec5ca-8333-4ddf-a036-9fc0923651b9",
"version": 1,
"state": {
"lastGroupId": 4,
"lastNodeId": 97,
"lastLinkId": 129,
"lastRerouteId": 0
},
"revision": 2,
"config": {},
"name": "Video Depth Estimation (Depth Anything 3)",
"inputNode": {
"id": -10,
"bounding": [
-230,
130,
167.912109375,
228
]
},
"outputNode": {
"id": -20,
"bounding": [
1520,
140,
128,
108
]
},
"inputs": [
{
"id": "698c28c6-cf92-4039-8b39-f3062868ea7c",
"name": "video",
"type": "VIDEO",
"linkIds": [
119
],
"pos": [
-86.087890625,
154
]
},
{
"id": "97a1f63e-1585-4a40-9dec-e2700120d84a",
"name": "start_time",
"type": "FLOAT",
"linkIds": [
121
],
"pos": [
-86.087890625,
174
]
},
{
"id": "4dbbd3b3-c5ee-4a56-a0d3-3268d3b2fd64",
"name": "duration",
"type": "FLOAT",
"linkIds": [
122
],
"pos": [
-86.087890625,
194
]
},
{
"id": "16f55101-f99d-4c0c-bebf-c3b31c54f13e",
"name": "resolution",
"type": "INT",
"linkIds": [
124
],
"pos": [
-86.087890625,
214
]
},
{
"id": "d9cd7693-4bb3-4ed7-9a75-276b997abcd9",
"name": "resize_method",
"type": "COMBO",
"linkIds": [
125
],
"pos": [
-86.087890625,
234
]
},
{
"id": "a6e90532-323b-462e-ba9c-1672384d5b31",
"name": "output",
"type": "COMFY_DYNAMICCOMBO_V3",
"linkIds": [
126
],
"label": "output_type",
"pos": [
-86.087890625,
254
]
},
{
"id": "69e6aeef-437d-4fde-b2fc-d5ab9369238d",
"name": "output.normalization",
"type": "COMBO",
"linkIds": [
127
],
"label": "normalization",
"pos": [
-86.087890625,
274
]
},
{
"id": "73206f72-f89a-4698-885e-5d9277df2998",
"name": "output.apply_sky_clip",
"type": "BOOLEAN",
"linkIds": [
128
],
"pos": [
-86.087890625,
294
]
},
{
"id": "dddbc7fc-9431-448a-9ed3-9aa62404288b",
"name": "model_name",
"type": "COMBO",
"linkIds": [
129
],
"pos": [
-86.087890625,
314
]
}
],
"outputs": [
{
"id": "478ab537-63bc-4d74-a9f0-c975f550880f",
"name": "IMAGE",
"type": "IMAGE",
"linkIds": [
7
],
"localized_name": "IMAGE",
"pos": [
1544,
164
]
},
{
"id": "cdaf037e-79bc-4a94-b06c-0fd32e76f615",
"name": "audio",
"type": "AUDIO",
"linkIds": [
112
],
"pos": [
1544,
184
]
},
{
"id": "4c0e5484-d193-49c7-b107-92619628880a",
"name": "fps",
"type": "FLOAT",
"linkIds": [
113
],
"pos": [
1544,
204
]
}
],
"widgets": [],
"nodes": [
{
"id": 92,
"type": "DA3Render",
"pos": [
740,
230
],
"size": [
380,
130
],
"flags": {},
"order": 0,
"mode": 0,
"inputs": [
{
"localized_name": "da3_geometry",
"name": "da3_geometry",
"type": "DA3_GEOMETRY",
"link": 12
},
{
"localized_name": "output",
"name": "output",
"type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
"name": "output"
},
"link": 126
},
{
"localized_name": "output.normalization",
"name": "output.normalization",
"type": "COMBO",
"widget": {
"name": "output.normalization"
},
"link": 127
},
{
"localized_name": "output.apply_sky_clip",
"name": "output.apply_sky_clip",
"type": "BOOLEAN",
"widget": {
"name": "output.apply_sky_clip"
},
"link": 128
},
{
"name": "geometry",
"type": "DA3_GEOMETRY",
"link": null
}
],
"outputs": [
{
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
"slot_index": 0,
"links": [
7
]
}
],
"properties": {
"Node name for S&R": "DA3Render",
"cnr_id": "comfy-core",
"ver": "0.19.0"
},
"widgets_values": [
"depth",
"v2_style",
false
]
},
{
"id": 93,
"type": "DA3Inference",
"pos": [
740,
-30
],
"size": [
390,
130
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"localized_name": "da3_model",
"name": "da3_model",
"type": "DA3_MODEL",
"link": 107
},
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
"link": 111
},
{
"localized_name": "resolution",
"name": "resolution",
"type": "INT",
"widget": {
"name": "resolution"
},
"link": 124
},
{
"localized_name": "resize_method",
"name": "resize_method",
"type": "COMBO",
"widget": {
"name": "resize_method"
},
"link": 125
},
{
"localized_name": "mode",
"name": "mode",
"type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
"name": "mode"
},
"link": null
}
],
"outputs": [
{
"localized_name": "da3_geometry",
"name": "da3_geometry",
"type": "DA3_GEOMETRY",
"slot_index": 0,
"links": [
12
]
}
],
"properties": {
"Node name for S&R": "DA3Inference",
"cnr_id": "comfy-core",
"ver": "0.19.0"
},
"widgets_values": [
504,
"lower_bound_resize",
"mono"
]
},
{
"id": 94,
"type": "LoadDA3Model",
"pos": [
50,
410
],
"size": [
400,
140
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"localized_name": "model_name",
"name": "model_name",
"type": "COMBO",
"widget": {
"name": "model_name"
},
"link": 129
},
{
"localized_name": "weight_dtype",
"name": "weight_dtype",
"type": "COMBO",
"widget": {
"name": "weight_dtype"
},
"link": null
}
],
"outputs": [
{
"localized_name": "DA3_MODEL",
"name": "DA3_MODEL",
"type": "DA3_MODEL",
"links": [
107
]
}
],
"properties": {
"Node name for S&R": "LoadDA3Model",
"cnr_id": "comfy-core",
"ver": "0.24.0",
"models": [
{
"name": "depth_anything_3_mono_large.safetensors",
"url": "https://huggingface.co/Comfy-Org/Depth-Anything-3/resolve/main/geometry_estimation/depth_anything_3_mono_large.safetensors",
"directory": "geometry_estimation"
}
]
},
"widgets_values": [
"depth_anything_3_mono_large.safetensors",
"default"
]
},
{
"id": 95,
"type": "GetVideoComponents",
"pos": [
70,
-140
],
"size": [
260,
120
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"localized_name": "video",
"name": "video",
"type": "VIDEO",
"link": 120
}
],
"outputs": [
{
"localized_name": "images",
"name": "images",
"type": "IMAGE",
"links": [
111
]
},
{
"localized_name": "audio",
"name": "audio",
"type": "AUDIO",
"links": [
112
]
},
{
"localized_name": "fps",
"name": "fps",
"type": "FLOAT",
"links": [
113
]
},
{
"localized_name": "bit_depth",
"name": "bit_depth",
"type": "INT",
"links": null
}
],
"properties": {
"Node name for S&R": "GetVideoComponents",
"cnr_id": "comfy-core",
"ver": "0.24.0"
}
},
{
"id": 96,
"type": "Video Slice",
"pos": [
70,
-360
],
"size": [
270,
170
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"localized_name": "video",
"name": "video",
"type": "VIDEO",
"link": 119
},
{
"localized_name": "start_time",
"name": "start_time",
"type": "FLOAT",
"widget": {
"name": "start_time"
},
"link": 121
},
{
"localized_name": "duration",
"name": "duration",
"type": "FLOAT",
"widget": {
"name": "duration"
},
"link": 122
},
{
"localized_name": "strict_duration",
"name": "strict_duration",
"type": "BOOLEAN",
"widget": {
"name": "strict_duration"
},
"link": null
}
],
"outputs": [
{
"localized_name": "VIDEO",
"name": "VIDEO",
"type": "VIDEO",
"links": [
120
]
}
],
"properties": {
"Node name for S&R": "Video Slice",
"cnr_id": "comfy-core",
"ver": "0.24.0"
},
"widgets_values": [
0,
5,
false
]
}
],
"groups": [],
"links": [
{
"id": 12,
"origin_id": 93,
"origin_slot": 0,
"target_id": 92,
"target_slot": 0,
"type": "DA3_GEOMETRY"
},
{
"id": 7,
"origin_id": 92,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
"type": "IMAGE"
},
{
"id": 107,
"origin_id": 94,
"origin_slot": 0,
"target_id": 93,
"target_slot": 0,
"type": "DA3_MODEL"
},
{
"id": 111,
"origin_id": 95,
"origin_slot": 0,
"target_id": 93,
"target_slot": 1,
"type": "IMAGE"
},
{
"id": 112,
"origin_id": 95,
"origin_slot": 1,
"target_id": -20,
"target_slot": 1,
"type": "AUDIO"
},
{
"id": 113,
"origin_id": 95,
"origin_slot": 2,
"target_id": -20,
"target_slot": 2,
"type": "FLOAT"
},
{
"id": 119,
"origin_id": -10,
"origin_slot": 0,
"target_id": 96,
"target_slot": 0,
"type": "VIDEO"
},
{
"id": 120,
"origin_id": 96,
"origin_slot": 0,
"target_id": 95,
"target_slot": 0,
"type": "VIDEO"
},
{
"id": 121,
"origin_id": -10,
"origin_slot": 1,
"target_id": 96,
"target_slot": 1,
"type": "FLOAT"
},
{
"id": 122,
"origin_id": -10,
"origin_slot": 2,
"target_id": 96,
"target_slot": 2,
"type": "FLOAT"
},
{
"id": 124,
"origin_id": -10,
"origin_slot": 3,
"target_id": 93,
"target_slot": 2,
"type": "INT"
},
{
"id": 125,
"origin_id": -10,
"origin_slot": 4,
"target_id": 93,
"target_slot": 3,
"type": "COMBO"
},
{
"id": 126,
"origin_id": -10,
"origin_slot": 5,
"target_id": 92,
"target_slot": 1,
"type": "COMFY_DYNAMICCOMBO_V3"
},
{
"id": 127,
"origin_id": -10,
"origin_slot": 6,
"target_id": 92,
"target_slot": 2,
"type": "COMBO"
},
{
"id": 128,
"origin_id": -10,
"origin_slot": 7,
"target_id": 92,
"target_slot": 3,
"type": "BOOLEAN"
},
{
"id": 129,
"origin_id": -10,
"origin_slot": 8,
"target_id": 94,
"target_slot": 0,
"type": "COMBO"
}
],
"extra": {},
"category": "Conditioning & Preprocessors/Depth",
"description": "This subgraph processes a video input through Depth Anything 3 to produce temporally consistent depth maps for each frame, outputting a depth video. It is ideal for video content requiring spatial geometry estimation, such as 3D reconstruction, SLAM, or novel view synthesis from moving cameras. The model uses a plain transformer backbone trained with a depth-ray representation, supporting any number of views without requiring known camera poses."
}
]
},
"extra": {
"BlueprintDescription": "This subgraph processes a video input through Depth Anything 3 to produce temporally consistent depth maps for each frame, outputting a depth video. It is ideal for video content requiring spatial geometry estimation, such as 3D reconstruction, SLAM, or novel view synthesis from moving cameras. The model uses a plain transformer backbone trained with a depth-ray representation, supporting any number of views without requiring known camera poses."
}
}

File diff suppressed because it is too large Load Diff

View File

@ -115,6 +115,7 @@ cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metav
cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
cache_group.add_argument("--high-ram", action="store_true", help="Can improve performance slightly on high RAM or on systems where pagefile use is preferred over model loading.")
attn_group = parser.add_mutually_exclusive_group()
attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@ -133,7 +134,7 @@ upcast.add_argument("--dont-upcast-attention", action="store_true", help="Disabl
parser.add_argument("--enable-manager", action="store_true", help="Enable the ComfyUI-Manager feature.")
manager_group = parser.add_mutually_exclusive_group()
manager_group.add_argument("--disable-manager-ui", action="store_true", help="Disables only the ComfyUI-Manager UI and endpoints. Scheduled installations and similar background tasks will still operate.")
manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", help="Enables the legacy UI of ComfyUI-Manager")
manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", help="Enables the legacy UI of ComfyUI-Manager. Implies --enable-manager.")
vram_group = parser.add_mutually_exclusive_group()
@ -144,6 +145,7 @@ vram_group.add_argument("--novram", action="store_true", help="When lowvram isn'
vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
parser.add_argument("--reserve-vram", type=float, default=None, help="Set the amount of vram in GB you want to reserve for use by your OS/other software. By default some amount is reserved depending on your OS.")
parser.add_argument("--vram-headroom", type=float, default=0, help="Set the amount of vram in GB for DynamicVRAM to maintain as extra headroom above default. ComfyUI will try and keep this much VRAM completely free and unused, even counting VRAM from other apps.")
parser.add_argument("--async-offload", nargs='?', const=2, type=int, default=None, metavar="NUM_STREAMS", help="Use async weight offloading. An optional argument controls the amount of offload streams. Default is 2. Enabled by default on Nvidia.")
parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
@ -249,6 +251,9 @@ else:
if args.cache_ram is not None and len(args.cache_ram) > 2:
parser.error("--cache-ram accepts at most two values: active GB and inactive GB")
if args.high_ram:
args.cache_classic = True
if args.windows_standalone_build:
args.auto_launch = True
@ -258,6 +263,10 @@ if args.disable_auto_launch:
if args.force_fp16:
args.fp16_unet = True
# '--enable-manager-legacy-ui' is meaningless unless the manager is enabled, so imply '--enable-manager'.
if args.enable_manager_legacy_ui:
args.enable_manager = True
# '--fast' is not provided, use an empty set
if args.fast is None:

321
comfy/ldm/boogu/model.py Normal file
View File

@ -0,0 +1,321 @@
# Boogu-Image-0.1 transformer
# Architecture is an OmniGen2 derivative (see comfy/ldm/omnigen/omnigen2.py) with an
# added dual-stream ("double_stream") stage before the single-stream layers, conditioned
# by a Qwen3-VL multimodal LLM. Reuses the OmniGen2/Lumina building blocks and the Flux
# RoPE core; the only new components are the double-stream block and the hybrid forward order.
from typing import Optional, Tuple
import torch
import torch.nn as nn
from einops import rearrange
import comfy.ldm.common_dit
import comfy.ldm.omnigen.omnigen2
from comfy.ldm.modules.attention import optimized_attention_masked
from comfy.ldm.omnigen.omnigen2 import (
OmniGen2RotaryPosEmbed,
Lumina2CombinedTimestepCaptionEmbedding,
LuminaRMSNormZero,
LuminaLayerNormContinuous,
LuminaFeedForward,
Attention,
OmniGen2TransformerBlock,
apply_rotary_emb,
)
class BooguDoubleStreamProcessor(nn.Module):
    # Attention processor for the dual-stream stage: the instruction and image streams each
    # own their q/k/v and output projections, but attend jointly over [instruct ; img].
    def __init__(self, dim, head_dim, heads, kv_heads, dtype=None, device=None, operations=None):
        """Create the per-stream, bias-free projection layers.

        Every projection takes ``head_dim * heads`` input features. The q and output
        layers keep that width; the k and v layers emit ``head_dim * kv_heads`` features.
        ``dim`` is accepted but not read by this constructor.
        """
        super().__init__()
        q_width = head_dim * heads
        kv_width = head_dim * kv_heads

        def make_proj(out_features):
            return operations.Linear(q_width, out_features, bias=False, dtype=dtype, device=device)

        # Creation order is kept stable so submodule registration order does not change.
        self.img_to_q = make_proj(q_width)
        self.img_to_k = make_proj(kv_width)
        self.img_to_v = make_proj(kv_width)

        self.instruct_to_q = make_proj(q_width)
        self.instruct_to_k = make_proj(kv_width)
        self.instruct_to_v = make_proj(kv_width)

        self.instruct_out = make_proj(q_width)
        self.img_out = make_proj(q_width)

    def forward(self, attn, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
        """Run joint attention over the concatenated instruction and image tokens.

        ``attn`` supplies ``heads``, ``kv_heads``, ``dim_head``, ``norm_q``, ``norm_k`` and
        ``to_out``. Rotary embeddings are applied only when ``rotary_emb`` is not None.
        Returns the recombined [instruct ; img] sequence after the per-stream output
        projections followed by ``attn.to_out[0]``.
        """
        bsz = img_hidden_states.shape[0]
        n_instruct = instruct_hidden_states.shape[1]

        img_q, img_k, img_v = (
            proj(img_hidden_states) for proj in (self.img_to_q, self.img_to_k, self.img_to_v)
        )
        ins_q, ins_k, ins_v = (
            proj(instruct_hidden_states) for proj in (self.instruct_to_q, self.instruct_to_k, self.instruct_to_v)
        )

        # Instruction tokens come first, image tokens second (matches reference processor order),
        # then the feature axis is split into (heads, dim_head).
        q = torch.cat([ins_q, img_q], dim=1).view(bsz, -1, attn.heads, attn.dim_head)
        k = torch.cat([ins_k, img_k], dim=1).view(bsz, -1, attn.kv_heads, attn.dim_head)
        v = torch.cat([ins_v, img_v], dim=1).view(bsz, -1, attn.kv_heads, attn.dim_head)

        q = attn.norm_q(q)
        k = attn.norm_k(k)

        if rotary_emb is not None:
            q = apply_rotary_emb(q, rotary_emb)
            k = apply_rotary_emb(k, rotary_emb)

        # (batch, seq, heads, dim_head) -> (batch, heads, seq, dim_head)
        q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)

        # Grouped-query attention: repeat each kv head so key/value match the query head count.
        if attn.kv_heads < attn.heads:
            group = attn.heads // attn.kv_heads
            k = k.repeat_interleave(group, dim=1)
            v = v.repeat_interleave(group, dim=1)

        joint = optimized_attention_masked(q, k, v, attn.heads, attention_mask, skip_reshape=True, transformer_options=transformer_options)

        # Split along dim 1 back into instruction/image tokens, project each stream, recombine.
        # NOTE(review): assumes the attention output keeps the token axis at dim 1 - confirm.
        instruct_part = self.instruct_out(joint[:, :n_instruct])
        img_part = self.img_out(joint[:, n_instruct:])
        return attn.to_out[0](torch.cat([instruct_part, img_part], dim=1))
class BooguJointAttention(nn.Module):
    # Owns what both streams share (q/k RMSNorm and the final output projection) and
    # delegates the attention computation itself to a BooguDoubleStreamProcessor.
    def __init__(self, dim, head_dim, heads, kv_heads, eps=1e-5, dtype=None, device=None, operations=None):
        """Build the shared norms, the output projection and the double-stream processor."""
        super().__init__()
        self.heads, self.kv_heads, self.dim_head = heads, kv_heads, head_dim
        self.scale = head_dim ** -0.5

        factory_kwargs = {"dtype": dtype, "device": device}

        # q and k are normalised per head, over the head_dim axis.
        self.norm_q = operations.RMSNorm(head_dim, eps=eps, **factory_kwargs)
        self.norm_k = operations.RMSNorm(head_dim, eps=eps, **factory_kwargs)

        # Index 0 holds the projection; the processor only invokes to_out[0].
        out_proj = operations.Linear(heads * head_dim, dim, bias=False, **factory_kwargs)
        self.to_out = nn.Sequential(out_proj, nn.Dropout(0.0))

        self.processor = BooguDoubleStreamProcessor(dim, head_dim, heads, kv_heads, operations=operations, **factory_kwargs)

    def forward(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask=None, transformer_options={}):
        """Forward all inputs to the processor, passing this module as its ``attn`` argument."""
        joint_states = self.processor(self, img_hidden_states, instruct_hidden_states, rotary_emb, attention_mask, transformer_options=transformer_options)
        return joint_states
class BooguDoubleStreamBlock(nn.Module):
    # Dual-stream transformer block. The instruction and image streams attend jointly
    # over [instruct ; img]; the image stream additionally runs its own self-attention.
    # Each stream keeps separate modulation norms and a separate feed-forward.
    def __init__(self, dim, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=None, device=None, operations=None):
        super().__init__()
        factory = {"dtype": dtype, "device": device}
        head_dim = dim // num_attention_heads

        def make_ffn():
            return LuminaFeedForward(dim=dim, inner_dim=4 * dim, multiple_of=multiple_of, operations=operations, **factory)

        def make_mod_norm():
            return LuminaRMSNormZero(embedding_dim=dim, norm_eps=norm_eps, operations=operations, **factory)

        def make_rms():
            return operations.RMSNorm(dim, eps=norm_eps, **factory)

        # Attention modules (q/k norm eps is fixed at 1e-5, independent of norm_eps).
        self.img_instruct_attn = BooguJointAttention(dim, head_dim, num_attention_heads, num_kv_heads, eps=1e-5, operations=operations, **factory)
        self.img_self_attn = Attention(
            query_dim=dim,
            dim_head=head_dim,
            heads=num_attention_heads,
            kv_heads=num_kv_heads,
            eps=1e-5,
            bias=False,
            operations=operations,
            **factory,
        )

        # One feed-forward per stream.
        self.img_feed_forward = make_ffn()
        self.instruct_feed_forward = make_ffn()

        # temb-conditioned norms that also emit the gate/scale values used in forward().
        self.img_norm1 = make_mod_norm()
        self.img_norm2 = make_mod_norm()
        self.img_norm3 = make_mod_norm()
        self.instruct_norm1 = make_mod_norm()
        self.instruct_norm2 = make_mod_norm()

        # Plain RMSNorms applied to branch outputs / MLP inputs.
        self.img_attn_norm = make_rms()
        self.img_self_attn_norm = make_rms()
        self.img_ffn_norm1 = make_rms()
        self.img_ffn_norm2 = make_rms()
        self.instruct_attn_norm = make_rms()
        self.instruct_ffn_norm1 = make_rms()
        self.instruct_ffn_norm2 = make_rms()

    def forward(self, img_hidden_states, instruct_hidden_states, joint_rotary_emb, img_rotary_emb, temb, joint_attention_mask=None, img_attention_mask=None, transformer_options={}):
        def gated(gate, branch):
            # tanh-squashed per-sample gate, broadcast over the sequence axis.
            return gate.unsqueeze(1).tanh() * branch

        def modulated(scale, shift, normed):
            return (1 + scale.unsqueeze(1)) * normed + shift.unsqueeze(1)

        n_instruct = instruct_hidden_states.shape[1]

        # Modulation: every norm call is fed the *incoming* stream state.
        # The second return value of the *_norm2 modules is used as the MLP shift.
        img_joint_in, img_gate_msa, img_scale_mlp, img_gate_mlp = self.img_norm1(img_hidden_states, temb)
        img_mlp_base, img_shift_mlp, _, _ = self.img_norm2(img_hidden_states, temb)
        img_self_in, img_gate_self, _, _ = self.img_norm3(img_hidden_states, temb)
        txt_joint_in, txt_gate_msa, txt_scale_mlp, txt_gate_mlp = self.instruct_norm1(instruct_hidden_states, temb)
        txt_mlp_base, txt_shift_mlp, _, _ = self.instruct_norm2(instruct_hidden_states, temb)

        # Joint attention returns [instruct ; img]; split it back by instruction length.
        joint_out = self.img_instruct_attn(img_joint_in, txt_joint_in, joint_rotary_emb, joint_attention_mask, transformer_options=transformer_options)
        txt_attn = joint_out[:, :n_instruct]
        img_attn = joint_out[:, n_instruct:]
        img_self = self.img_self_attn(img_self_in, img_self_in, img_attention_mask, img_rotary_emb, transformer_options=transformer_options)

        # Image stream: joint-attention residual, self-attention residual, then MLP residual.
        img = img_hidden_states + gated(img_gate_msa, self.img_attn_norm(img_attn))
        img = img + gated(img_gate_self, self.img_self_attn_norm(img_self))
        img_mlp = self.img_feed_forward(self.img_ffn_norm1(modulated(img_scale_mlp, img_shift_mlp, img_mlp_base)))
        img = img + gated(img_gate_mlp, self.img_ffn_norm2(img_mlp))

        # Instruction stream: joint-attention residual, then MLP residual.
        txt = instruct_hidden_states + gated(txt_gate_msa, self.instruct_attn_norm(txt_attn))
        txt_mlp = self.instruct_feed_forward(self.instruct_ffn_norm1(modulated(txt_scale_mlp, txt_shift_mlp, txt_mlp_base)))
        txt = txt + gated(txt_gate_mlp, self.instruct_ffn_norm2(txt_mlp))

        return img, txt
class BooguTransformer2DModel(nn.Module):
    # Boogu image transformer. Pipeline as implemented in forward():
    #   context refiner (text) -> patch embed + refine (noise / reference images)
    #   -> double-stream blocks (text and image kept separate)
    #   -> single-stream blocks over the concatenated [text ; image] sequence
    #   -> output norm/projection -> un-patchify.
    # Patchify/refine helpers are borrowed from the OmniGen2 model class (see below).
    def __init__(
        self,
        patch_size: int = 2,
        in_channels: int = 16,
        out_channels: Optional[int] = None,
        hidden_size: int = 3360,
        num_layers: int = 32,
        num_double_stream_layers: int = 8,
        num_refiner_layers: int = 2,
        num_attention_heads: int = 28,
        num_kv_heads: int = 7,
        multiple_of: int = 256,
        ffn_dim_multiplier: Optional[float] = None,
        norm_eps: float = 1e-5,
        axes_dim_rope: Tuple[int, int, int] = (40, 40, 40),
        axes_lens: Tuple[int, int, int] = (2048, 1664, 1664),
        instruction_feat_dim: int = 4096,
        timestep_scale: float = 1000.0,
        image_model=None,
        device=None, dtype=None, operations=None,
    ):
        # image_model is accepted but not read anywhere in this constructor.
        # num_layers counts the single-stream blocks; the double-stream and refiner
        # stacks have their own counts.
        super().__init__()
        self.patch_size = patch_size
        # Falls back to in_channels when out_channels is None (or otherwise falsy).
        self.out_channels = out_channels or in_channels
        self.hidden_size = hidden_size
        self.dtype = dtype
        self.rope_embedder = OmniGen2RotaryPosEmbed(
            theta=10000,
            axes_dim=axes_dim_rope,
            axes_lens=axes_lens,
            patch_size=patch_size,
        )
        # Separate patch embedders for the noisy latent and for reference images;
        # both map one flattened patch (p * p * in_channels) to hidden_size.
        self.x_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
        self.ref_image_patch_embedder = operations.Linear(patch_size * patch_size * in_channels, hidden_size, dtype=dtype, device=device)
        self.time_caption_embed = Lumina2CombinedTimestepCaptionEmbedding(
            hidden_size=hidden_size,
            text_feat_dim=instruction_feat_dim,
            norm_eps=norm_eps,
            timestep_scale=timestep_scale, dtype=dtype, device=device, operations=operations
        )
        # Three refiner stacks of equal depth. Only the context (text) refiner is
        # built with modulation=False; forward() calls it without temb.
        self.noise_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
            for _ in range(num_refiner_layers)
        ])
        self.ref_image_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
            for _ in range(num_refiner_layers)
        ])
        self.context_refiner = nn.ModuleList([
            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=False, dtype=dtype, device=device, operations=operations)
            for _ in range(num_refiner_layers)
        ])
        self.double_stream_layers = nn.ModuleList([
            BooguDoubleStreamBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, dtype=dtype, device=device, operations=operations)
            for _ in range(num_double_stream_layers)
        ])
        self.single_stream_layers = nn.ModuleList([
            OmniGen2TransformerBlock(hidden_size, num_attention_heads, num_kv_heads, multiple_of, ffn_dim_multiplier, norm_eps, modulation=True, dtype=dtype, device=device, operations=operations)
            for _ in range(num_layers)
        ])
        # Final conditioned norm + projection back to one flattened patch per token.
        self.norm_out = LuminaLayerNormContinuous(
            embedding_dim=hidden_size,
            conditioning_embedding_dim=min(hidden_size, 1024),
            elementwise_affine=False,
            eps=1e-6,
            out_dim=patch_size * patch_size * self.out_channels, dtype=dtype, device=device, operations=operations
        )
        # Allocated uninitialised (torch.empty) and not read in this class body.
        # NOTE(review): presumably loaded from the checkpoint and consumed by the
        # borrowed img_patch_embed_and_refine helper -- confirm.
        self.image_index_embedding = nn.Parameter(torch.empty(5, hidden_size, device=device, dtype=dtype))

    # Patchify/refine helpers are identical to OmniGen2; reuse via bound methods.
    # Assigning the plain functions as class attributes makes them bind to this
    # class's instances, so they operate on this model's own submodules.
    flat_and_pad_to_seq = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.flat_and_pad_to_seq
    img_patch_embed_and_refine = comfy.ldm.omnigen.omnigen2.OmniGen2Transformer2DModel.img_patch_embed_and_refine

    def forward(self, x, timesteps, context, num_tokens, ref_latents=None, attention_mask=None, transformer_options={}, **kwargs):
        # x: 4-D latent (unpacked as B, C, H, W below). context: text-encoder features.
        # attention_mask is applied to the text tokens in the context refiner only.
        # Returns a tensor cropped back to the input's H x W, negated (see the end).
        B, C, H, W = x.shape
        # Pad so the spatial size is compatible with the patch size; the original
        # H/W are kept to crop the output back at the end.
        hidden_states = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))
        _, _, H_padded, W_padded = hidden_states.shape
        # The model is fed the reversed timestep (1 - t).
        # NOTE(review): presumably converts between the sampler's and the reference
        # model's time convention, together with the sign flip on return -- confirm.
        timestep = 1.0 - timesteps
        text_hidden_states = context
        text_attention_mask = attention_mask
        ref_image_hidden_states = ref_latents
        device = hidden_states.device
        temb, text_hidden_states = self.time_caption_embed(timestep, text_hidden_states, hidden_states[0].dtype)
        # Turn the noise latent and optional reference latents into token sequences
        # (helper borrowed from OmniGen2).
        (
            hidden_states, ref_image_hidden_states,
            img_mask, ref_img_mask,
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes,
        ) = self.flat_and_pad_to_seq(hidden_states, ref_image_hidden_states)
        # Rotary embeddings per segment plus one for the whole joint sequence (rotary_emb).
        # The same num_tokens value is repeated for every batch entry.
        (
            context_rotary_emb, ref_img_rotary_emb, noise_rotary_emb,
            rotary_emb, encoder_seq_lengths, seq_lengths,
        ) = self.rope_embedder(
            hidden_states.shape[0], text_hidden_states.shape[1], [num_tokens] * text_hidden_states.shape[0],
            l_effective_ref_img_len, l_effective_img_len,
            ref_img_sizes, img_sizes, device,
        )
        for layer in self.context_refiner:
            text_hidden_states = layer(text_hidden_states, text_attention_mask, context_rotary_emb, transformer_options=transformer_options)
        # Noise-token count, captured before refinement; used at the end to slice the
        # trailing noise tokens out of the joint sequence.
        img_len = hidden_states.shape[1]
        combined_img_hidden_states = self.img_patch_embed_and_refine(
            hidden_states, ref_image_hidden_states,
            img_mask, ref_img_mask,
            noise_rotary_emb, ref_img_rotary_emb,
            l_effective_ref_img_len, l_effective_img_len,
            temb,
            transformer_options=transformer_options,
        )
        # Double-stream stage: the image self-attention only sees the [ref ; noise] tokens,
        # which sit after the instruction tokens in the joint rope.
        L_instruct = text_hidden_states.shape[1]
        combined_img_rotary_emb = rotary_emb[:, L_instruct:]
        # No attention masks are passed to the double-stream blocks.
        for layer in self.double_stream_layers:
            combined_img_hidden_states, text_hidden_states = layer(
                combined_img_hidden_states, text_hidden_states,
                rotary_emb, combined_img_rotary_emb, temb,
                joint_attention_mask=None, img_attention_mask=None,
                transformer_options=transformer_options,
            )
        # Single-stream stage runs on [text ; image] with no attention mask.
        hidden_states = torch.cat([text_hidden_states, combined_img_hidden_states], dim=1)
        for layer in self.single_stream_layers:
            hidden_states = layer(hidden_states, None, rotary_emb, temb, transformer_options=transformer_options)
        hidden_states = self.norm_out(hidden_states, temb)
        p = self.patch_size
        # Keep the last img_len tokens, fold patches back into an image and crop the padding.
        output = rearrange(hidden_states[:, -img_len:], 'b (h w) (p1 p2 c) -> b c (h p1) (w p2)', h=H_padded // p, w=W_padded // p, p1=p, p2=p)[:, :, :H, :W]
        # Sign flip pairs with the reversed timestep above.
        return -output

View File

@ -106,11 +106,11 @@ class Ideogram4EmbedScalar(nn.Module):
self.mlp_in = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device)
self.mlp_out = operations.Linear(dim, dim, bias=True, dtype=dtype, device=device)
def forward(self, x):
def forward(self, x, dtype):
x = x.to(torch.float32)
scaled = 1e4 * (x - self.range_min) / (self.range_max - self.range_min)
emb = _sinusoidal_embedding(scaled, self.dim)
emb = emb.to(self.mlp_in.weight.dtype)
emb = emb.to(dtype)
emb = F.silu(self.mlp_in(emb))
return self.mlp_out(emb)
@ -161,7 +161,7 @@ class Ideogram4Transformer(nn.Module):
x = x * output_image_mask
h = self.input_proj(x) * output_image_mask
t_cond = self.t_embedding(t)
t_cond = self.t_embedding(t, dtype=x.dtype)
if t.dim() == 1:
t_cond = t_cond.unsqueeze(1)
adaln_input = F.silu(self.adaln_proj(t_cond))

View File

@ -8,6 +8,7 @@ import torch.nn.functional as F
from einops import rearrange, repeat
from comfy.ldm.lightricks.model import Timesteps
from comfy.ldm.flux.layers import EmbedND
from comfy.ldm.flux.math import apply_rope1
from comfy.ldm.modules.attention import optimized_attention_masked
import comfy.model_management
import comfy.ldm.common_dit
@ -17,13 +18,11 @@ def apply_rotary_emb(x, freqs_cis):
if x.shape[1] == 0:
return x
t_ = x.reshape(*x.shape[:-1], -1, 1, 2)
t_out = freqs_cis[..., 0] * t_[..., 0] + freqs_cis[..., 1] * t_[..., 1]
return t_out.reshape(*x.shape).to(dtype=x.dtype)
return apply_rope1(x, freqs_cis)
def swiglu(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
return F.silu(x) * y
return F.silu(x, inplace=True).mul_(y)
class TimestepEmbedding(nn.Module):

View File

@ -1665,7 +1665,7 @@ class SCAILWanModel(WanModel):
# embeddings
x = self.patch_embedding(x.float()).to(x.dtype)
if ref_mask_latents is not None: # SCAIL-2 additive mask stream
if ref_mask_latents is not None: # SCAIL-2 additive mask stream (one identity mask frame per reference, then video)
x = x + self.patch_embedding_mask(ref_mask_latents.float()).to(x.dtype)
grid_sizes = x.shape[2:]
transformer_options["grid_sizes"] = grid_sizes
@ -1728,22 +1728,25 @@ class SCAILWanModel(WanModel):
# ref_mask_flag is a scalar bool (CONDConstant, SCAIL-2 only). False => replacement mode,
# which places ref/pose via H/W rope shifts instead of the animation-mode temporal offset.
# reference_latent may stack several frames: the last is the primary reference adjacent to the video, the earlier frames are additional references.
def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, device=None, dtype=None, pose_latents=None, reference_latent=None, ref_mask_flag=None, transformer_options={}):
ref_t_patches = 0
if reference_latent is not None:
ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
if ref_mask_flag is not None and not bool(ref_mask_flag):
REF_ROPE_H = 120.0
POSE_ROPE_W = 120.0
ref_t_patches = 0
if reference_latent is not None:
ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
main_t_patches = t - ref_t_patches
video_t_start = max(ref_t_patches - 1, 0)
parts = []
if ref_t_patches > 0:
ref_tf = {"rope_options": {"shift_y": REF_ROPE_H, "shift_x": 0.0, "scale_y": 1.0, "scale_x": 1.0}}
parts.append(super().rope_encode(ref_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=ref_tf))
if main_t_patches > 0:
parts.append(super().rope_encode(main_t_patches, h, w, t_start=0, device=device, dtype=dtype, transformer_options=transformer_options))
parts.append(super().rope_encode(main_t_patches, h, w, t_start=video_t_start, device=device, dtype=dtype, transformer_options=transformer_options))
if pose_latents is not None:
F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]
@ -1752,7 +1755,7 @@ class SCAILWanModel(WanModel):
h_shift = (h_scale - 1) / 2
w_shift = (w_scale - 1) / 2
pose_tf = {"rope_options": {"shift_y": h_shift, "shift_x": POSE_ROPE_W + w_shift, "scale_y": h_scale, "scale_x": w_scale}}
parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=0, device=device, dtype=dtype, transformer_options=pose_tf))
parts.append(super().rope_encode(F_pose, H_pose, W_pose, t_start=video_t_start, device=device, dtype=dtype, transformer_options=pose_tf))
return torch.cat(parts, dim=1)
@ -1761,10 +1764,6 @@ class SCAILWanModel(WanModel):
if pose_latents is None:
return main_freqs
ref_t_patches = 0
if reference_latent is not None:
ref_t_patches = (reference_latent.shape[2] + (self.patch_size[0] // 2)) // self.patch_size[0]
F_pose, H_pose, W_pose = pose_latents.shape[-3], pose_latents.shape[-2], pose_latents.shape[-1]
# if pose is at half resolution, scale_y/scale_x=2 stretches the position range to cover the same RoPE extent as the main frames

View File

@ -55,6 +55,7 @@ import comfy.ldm.pixeldit.pid
import comfy.ldm.ace.model
import comfy.ldm.omnigen.omnigen2
import comfy.ldm.seedvr.model
import comfy.ldm.boogu.model
import comfy.ldm.qwen_image.model
import comfy.ldm.ideogram4.model
import comfy.ldm.kandinsky5.model
@ -1758,10 +1759,14 @@ class WAN21_SCAIL(WAN21):
reference_latents = kwargs.get("reference_latents", None)
if reference_latents is not None:
ref_latent = self.process_latent_in(reference_latents[-1])
ref_mask = torch.ones_like(ref_latent[:, :4])
ref_latent = torch.cat([ref_latent, ref_mask], dim=1)
out['reference_latent'] = comfy.conds.CONDRegular(ref_latent)
# SCAIL-2 multi-reference: reference_latents[0] is the primary ref, [1:] are additional
# references. Stack as [additional..., primary] so the primary stays adjacent to the video.
ordered = list(reference_latents[1:]) + list(reference_latents[:1])
stacked = []
for lat in ordered:
lat = self.process_latent_in(lat)
stacked.append(torch.cat([lat, torch.ones_like(lat[:, :4])], dim=1))
out['reference_latent'] = comfy.conds.CONDRegular(torch.cat(stacked, dim=2))
pose_latents = kwargs.get("pose_video_latent", None)
if pose_latents is not None:
@ -1803,6 +1808,7 @@ class WAN21_SCAIL2(WAN21_SCAIL):
if driving_mask_28ch is not None:
out['sam_latents'] = comfy.conds.CONDRegular(driving_mask_28ch.movedim(1, 2).contiguous())
# ref_mask_28ch holds one identity mask per stacked reference frame (additional refs first, then the primary ref), followed by zeros over the video frames.
ref_mask_28ch = kwargs.get("ref_mask_28ch", None)
if ref_mask_28ch is not None:
out['ref_mask_latents'] = comfy.conds.CONDRegular(ref_mask_28ch.movedim(1, 2).contiguous())
@ -1827,7 +1833,25 @@ class WAN21_SCAIL2(WAN21_SCAIL):
def resize_cond_for_context_window(self, cond_key, cond_value, window, x_in, device, retain_index_list=[]):
if cond_key in ("sam_latents", "pose_latents"):
return comfy.context_windows.slice_cond(cond_value, window, x_in, device, temporal_dim=2, temporal_offset=1)
# Return sliced view omitting retain_index_list
return comfy.context_windows.slice_cond(cond_value, window, x_in, device, temporal_dim=2, temporal_offset=0)
if cond_key == "ref_mask_latents" and hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
# The ref mask is N leading ref frames padded with frames of zeros, so just grab the first frames for all windows
full_ref_mask = cond_value.cond
video_frame_count = x_in.shape[2]
ref_frame_count = full_ref_mask.shape[2] - video_frame_count
if ref_frame_count < 1:
return None
window_length = len(window.index_list)
# Account for the causal anchor frame if it exists
anchor_index = getattr(window, "causal_anchor_index", None)
if anchor_index is not None and anchor_index >= 0:
window_length += 1
window_ref_mask = full_ref_mask[:, :, :window_length + ref_frame_count].to(device)
return cond_value._copy_with(window_ref_mask)
return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list)
def concat_cond(self, **kwargs):
@ -2091,6 +2115,11 @@ class Omnigen2(BaseModel):
out['ref_latents'] = list([1, 16, sum(map(lambda a: math.prod(a.size()), ref_latents)) // 16])
return out
class Boogu(Omnigen2):
    # Model wrapper for Boogu-Image. Subclasses Omnigen2 to inherit its methods but
    # instantiates the Boogu transformer as the diffusion model.
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        # super(Omnigen2, self) starts the MRO lookup *after* Omnigen2, so
        # Omnigen2.__init__ is deliberately skipped and the next class's __init__
        # receives the Boogu unet_model.
        super(Omnigen2, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.boogu.model.BooguTransformer2DModel)
        # NOTE(review): presumably re-applies what the skipped Omnigen2.__init__
        # would have set -- confirm against that constructor.
        self.memory_usage_factor_conds = ("ref_latents",)
class QwenImage(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.qwen_image.model.QwenImageTransformer2DModel)

View File

@ -808,6 +808,16 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
return dit_config
if '{}double_stream_layers.0.img_instruct_attn.processor.img_to_q.weight'.format(key_prefix) in state_dict_keys: # Boogu-Image (OmniGen2 derivative + dual-stream stage)
dit_config = {}
dit_config["image_model"] = "boogu"
dit_config["hidden_size"] = state_dict['{}x_embedder.weight'.format(key_prefix)].shape[0]
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}single_stream_layers.'.format(key_prefix) + '{}.')
dit_config["num_double_stream_layers"] = count_blocks(state_dict_keys, '{}double_stream_layers.'.format(key_prefix) + '{}.')
dit_config["num_refiner_layers"] = count_blocks(state_dict_keys, '{}noise_refiner.'.format(key_prefix) + '{}.')
dit_config["instruction_feat_dim"] = state_dict['{}time_caption_embed.caption_embedder.0.weight'.format(key_prefix)].shape[0]
return dit_config
if '{}time_caption_embed.timestep_embedder.linear_1.bias'.format(key_prefix) in state_dict_keys: # Omnigen2
dit_config = {}
dit_config["image_model"] = "omnigen2"

View File

@ -643,6 +643,8 @@ def free_pins(size, evict_active=False):
return freed_total
def ensure_pin_budget(size, evict_active=False):
if args.high_ram:
return True
if args.fast_disk:
shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
else:
@ -1496,6 +1498,8 @@ if not args.disable_pinned_memory:
PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])
def pinned_hostbuf_size(size):
    # Twice the requested size, never negative. Without --high-ram the request is
    # first capped at MAX_PINNED_MEMORY; with it the cap is bypassed.
    capped = size if args.high_ram else min(size, MAX_PINNED_MEMORY)
    return max(0, int(capped * 2))
def discard_cuda_async_error():

View File

@ -180,7 +180,7 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
if pin is not None:
cast_maybe_lowvram_patch([pin], dest, offload_stream)
return
if signature is None:
if signature is None or args.high_ram:
comfy.pinned_memory.pin_memory(m, subset=subset, size=size)
pin = comfy.pinned_memory.get_pin(m, subset=subset)
cast_maybe_lowvram_patch(source, pin, offload_stream, xfer_dest2=dest)
@ -299,21 +299,21 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
non_blocking = comfy.model_management.device_supports_non_blocking(device)
if hasattr(s, "_v"):
if hasattr(s, "_v") and comfy.model_management.is_device_cpu(device):
#vbar doesn't support CPU weights, but some custom nodes have weird paths
#that might switch the layer to the CPU and expect it to work. We have to take
#a clone conservatively as we are mmapped and some SFT files are packed misaligned
#If you are a custom node author reading this, please move your layer to the GPU
#or declare your ModelPatcher as CPU in the first place.
if comfy.model_management.is_device_cpu(device):
materialize_meta_param(s, ["weight", "bias"])
weight = s.weight.to(dtype=dtype, copy=True)
if isinstance(weight, QuantizedTensor):
weight = weight.dequantize()
bias = s.bias.to(dtype=bias_dtype, copy=True) if s.bias is not None else None
return format_return((weight, bias, (None, None, None)), offloadable)
materialize_meta_param(s, ["weight", "bias"])
weight = s.weight.to(dtype=dtype, copy=True)
if isinstance(weight, QuantizedTensor):
weight = weight.dequantize()
bias = s.bias.to(dtype=bias_dtype, copy=True) if s.bias is not None else None
return format_return((weight, bias, (None, None, None)), offloadable)
elif hasattr(s, "_v") and s.weight.device != device:
prefetched = hasattr(s, "_prefetch")
offload_stream = None
offload_device = None

View File

@ -69,6 +69,8 @@ import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image
import comfy.text_encoders.qwen35
import comfy.text_encoders.qwen3vl
import comfy.text_encoders.boogu
import comfy.text_encoders.ernie
import comfy.text_encoders.gemma4
import comfy.text_encoders.cogvideo
@ -1374,6 +1376,7 @@ class CLIPType(Enum):
LENS = 28
PIXELDIT = 29
IDEOGRAM4 = 30
BOOGU = 31
@ -1427,6 +1430,8 @@ class TEModel(Enum):
GEMMA_4_31B = 31
T5_GEMMA = 32
GPT_OSS_20B = 33
QWEN3VL_4B = 34
QWEN3VL_8B = 35
def detect_te_model(sd):
@ -1488,6 +1493,8 @@ def detect_te_model(sd):
if weight.shape[0] == 5120:
return TEModel.QWEN35_27B
return TEModel.QWEN35_2B
if "model.visual.deepstack_merger_list.0.norm.weight" in sd: # DeepStack is unique to Qwen3-VL
return TEModel.QWEN3VL_4B if sd["model.visual.merger.linear_fc2.weight"].shape[0] == 2560 else TEModel.QWEN3VL_8B
if "model.layers.0.post_attention_layernorm.weight" in sd:
weight = sd['model.layers.0.post_attention_layernorm.weight']
if 'model.layers.0.self_attn.q_norm.weight' in sd:
@ -1686,6 +1693,24 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
qwen35_type = {TEModel.QWEN35_08B: "qwen35_08b", TEModel.QWEN35_2B: "qwen35_2b", TEModel.QWEN35_4B: "qwen35_4b", TEModel.QWEN35_9B: "qwen35_9b", TEModel.QWEN35_27B: "qwen35_27b"}[te_model]
clip_target.clip = comfy.text_encoders.qwen35.te(**llama_detect(clip_data), model_type=qwen35_type)
clip_target.tokenizer = comfy.text_encoders.qwen35.tokenizer(model_type=qwen35_type)
elif te_model in (TEModel.QWEN3VL_4B, TEModel.QWEN3VL_8B):
if clip_type == CLIPType.IDEOGRAM4 and te_model == TEModel.QWEN3VL_8B: # Ideogram4 reuses the full Qwen3-VL-8B (13-layer tap for conditioning + multimodal generate).
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
clip_target.clip = comfy.text_encoders.ideogram4.te_qwen3vl(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.ideogram4.Ideogram4Qwen3VLTokenizer
elif clip_type == CLIPType.BOOGU and te_model == TEModel.QWEN3VL_8B: # Boogu-Image: full Qwen3-VL-8B, last hidden state, no-think template.
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
clip_target.clip = comfy.text_encoders.boogu.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.boogu.BooguTokenizer
elif clip_type in (CLIPType.FLUX, CLIPType.FLUX2): # Flux2 Klein reuses the Qwen3-VL LM (3-layer tap -> 12288); visual unused.
klein_model_type = "qwen3_8b" if te_model == TEModel.QWEN3VL_8B else "qwen3_4b"
clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type=klein_model_type)
clip_target.tokenizer = comfy.text_encoders.flux.KleinTokenizer8B if te_model == TEModel.QWEN3VL_8B else comfy.text_encoders.flux.KleinTokenizer
else:
clip_data[0] = comfy.utils.state_dict_prefix_replace(clip_data[0], {"model.language_model.": "model.", "model.visual.": "visual.", "lm_head.": "model.lm_head."})
qwen3vl_type = {TEModel.QWEN3VL_4B: "qwen3vl_4b", TEModel.QWEN3VL_8B: "qwen3vl_8b"}[te_model]
clip_target.clip = comfy.text_encoders.qwen3vl.te(**llama_detect(clip_data), model_type=qwen3vl_type)
clip_target.tokenizer = comfy.text_encoders.qwen3vl.tokenizer(model_type=qwen3vl_type)
elif te_model == TEModel.QWEN3_06B:
clip_target.clip = comfy.text_encoders.anima.te(**llama_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.anima.AnimaTokenizer

View File

@ -25,6 +25,7 @@ import comfy.text_encoders.hunyuan_image
import comfy.text_encoders.kandinsky5
import comfy.text_encoders.z_image
import comfy.text_encoders.ideogram4
import comfy.text_encoders.boogu
import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image
@ -1787,6 +1788,27 @@ class Omnigen2(supported_models_base.BASE):
hunyuan_detect = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, "{}qwen25_3b.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.omnigen2.Omnigen2Tokenizer, comfy.text_encoders.omnigen2.te(**hunyuan_detect))
class Boogu(Omnigen2):
    # Supported-model entry for Boogu-Image. Matched on image_model == "boogu";
    # everything not overridden here comes from the Omnigen2 entry.
    unet_config = {"image_model": "boogu"}

    sampling_settings = {"multiplier": 1.0, "shift": 3.16}

    memory_usage_factor = 2.15

    def get_model(self, state_dict, prefix="", device=None):
        # state_dict and prefix are part of the common signature; only device is used.
        return model_base.Boogu(self, device=device)

    def clip_target(self, state_dict={}):
        # Detect dtype/quantization from the embedded Qwen3-VL-8B text-encoder weights.
        te_prefix = "{}qwen3vl_8b.transformer.".format(self.text_encoder_key_prefix[0])
        detected = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, te_prefix)
        tokenizer_cls = comfy.text_encoders.boogu.BooguTokenizer
        encoder_cls = comfy.text_encoders.boogu.te(**detected)
        return supported_models_base.ClipTarget(tokenizer_cls, encoder_cls)
class Ideogram4(supported_models_base.BASE):
unet_config = {
"image_model": "ideogram4",
@ -2330,6 +2352,7 @@ models = [
ACEStep,
ACEStep15,
Omnigen2,
Boogu,
QwenImage,
Ideogram4,
Flux2,

View File

@ -0,0 +1,58 @@
"""Boogu-Image text encoder: full Qwen3-VL-8B, last hidden state (4096-dim).
Boogu uses the final hidden state of Qwen3-VL as the per-token instruction feature
(num_instruction_feature_layers=1, reduce_type=mean -> just the last layer).
The model itself is the standard Qwen3-VL TE, only the chat template differs
(a fixed system prompt and no <think> block).
"""
import comfy.text_encoders.qwen3vl
from comfy import sd1_clip
# System prompts from the reference pipeline (pipeline_boogu.py).
# T2I (non-empty instruction, no image) uses the helpful-assistant prompt;
# everything else (the CFG negative / "drop" condition, and any image case) uses the TI2I "describe" prompt.
BOOGU_T2I_SYSTEM = "You are a helpful assistant that generates high-quality images based on user instructions. The instructions are as follows."
BOOGU_DROP_SYSTEM = "Describe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
class BooguTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
    # Qwen3-VL-8B tokenizer with Boogu's chat templates: one for text-only prompts,
    # one for prompts with images, and one "drop" template for empty text-only prompts.
    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            model_type="qwen3vl_8b",
        )
        # apply_chat_template without add_generation_prompt
        self.llama_template = "<|im_start|>system\n" + BOOGU_T2I_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n"
        self.llama_template_images = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n"
        # Reference SYSTEM_PROMPT_DROP: used for the empty negative/uncond instruction.
        self.llama_template_drop = "<|im_start|>system\n" + BOOGU_DROP_SYSTEM + "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n"

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
        # Only substitute the drop template when the caller gave no template, no
        # images and a blank prompt; an explicit template always wins.
        use_drop_template = llama_template is None and len(images) == 0 and text.strip() == ""
        template = self.llama_template_drop if use_drop_template else llama_template
        # Boogu conditions on the no-think template; thinking=True drops the empty <think> block qwen3vl adds by default.
        return super().tokenize_with_weights(
            text,
            return_word_ids=return_word_ids,
            llama_template=template,
            images=images,
            prevent_empty_text=prevent_empty_text,
            thinking=thinking,
            **kwargs,
        )
class BooguQwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
    # Qwen3-VL clip model configured for Boogu: same constructor arguments as the
    # parent, plus the final-norm flag switched on.
    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"):
        super().__init__(device=device, dtype=dtype, attention_mask=attention_mask, model_options=model_options, model_type=model_type)
        # apply the final RMSNorm to the tapped last layer
        # Set after super().__init__() so it overrides whatever the parent assigned.
        self.layer_norm_hidden_state = True
class BooguTEModel(sd1_clip.SD1ClipModel):
    # SD1ClipModel wrapper that registers the Boogu Qwen3-VL encoder under the
    # name "qwen3vl_8b".
    def __init__(self, device="cpu", dtype=None, model_options={}):
        def build_clip_model(**kw):
            # Pin the model type regardless of what the base class forwards.
            return BooguQwen3VLClipModel(**kw, model_type="qwen3vl_8b")

        super().__init__(
            device=device,
            dtype=dtype,
            name="qwen3vl_8b",
            clip_model=build_clip_model,
            model_options=model_options,
        )
def te(dtype_llama=None, llama_quantization_metadata=None):
    # Factory: returns a BooguTEModel subclass with the detected dtype and
    # quantization metadata captured in its constructor.
    class BooguTEModel_(BooguTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            # A detected checkpoint dtype takes precedence over the requested one.
            chosen_dtype = dtype if dtype_llama is None else dtype_llama
            options = model_options
            if llama_quantization_metadata is not None:
                # Copy first so the caller's dict (or the shared default) is not mutated.
                options = model_options.copy()
                options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=chosen_dtype, model_options=options)

    return BooguTEModel_

View File

@ -9,6 +9,7 @@ import os
from transformers import Qwen2Tokenizer
import comfy.text_encoders.llama
import comfy.text_encoders.qwen3vl
from comfy import sd1_clip
# Reference taps outputs of layers (0,3,...,35); comfy captures layer inputs, offset by +1.
@ -77,3 +78,43 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
model_options["quantization_metadata"] = llama_quantization_metadata
super().__init__(device=device, dtype=dtype, model_options=model_options)
return Ideogram4TEModel_
# Full Qwen3-VL-8B variant with vision
class Ideogram4Qwen3VLClipModel(comfy.text_encoders.qwen3vl.Qwen3VLClipModel):
    # Qwen3-VL-8B clip model that taps the layers listed in IDEOGRAM4_TAP_LAYERS
    # instead of a single layer index (layer_idx is passed as None).
    def __init__(self, device="cpu", dtype=None, attention_mask=True, model_options={}):
        super().__init__(device=device, layer=IDEOGRAM4_TAP_LAYERS, layer_idx=None, dtype=dtype,
                         attention_mask=attention_mask, model_options=model_options, model_type="qwen3vl_8b")
class Ideogram4Qwen3VLTEModel(sd1_clip.SD1ClipModel):
    # SD1ClipModel wrapper around the multi-tap Qwen3-VL encoder; flattens the
    # per-layer taps into a single feature axis.
    def __init__(self, device="cpu", dtype=None, model_options={}):
        super().__init__(
            device=device,
            dtype=dtype,
            name="qwen3vl_8b",
            clip_model=Ideogram4Qwen3VLClipModel,
            model_options=model_options,
        )

    def encode_token_weights(self, token_weight_pairs):
        hidden, pooled, extra = super().encode_token_weights(token_weight_pairs)
        # (B, n_taps=13, seq, 4096), ascending layer order.
        batch, taps, seq_len, width = hidden.shape
        # Move the tap axis last, then merge it into the feature axis:
        # (B, seq, 4096*13 = 53248).
        merged = hidden.permute(0, 2, 3, 1).reshape(batch, seq_len, width * taps)
        return merged, pooled, extra
class Ideogram4Qwen3VLTokenizer(comfy.text_encoders.qwen3vl.Qwen3VLTokenizer):
    """Qwen3-VL tokenizer fixed to the 8B variant, with `thinking` defaulting to True."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            model_type="qwen3vl_8b",
        )

    def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=True, **kwargs):
        # Ideogram 4 conditions on the no-think template; default thinking=True drops the empty think block qwen3vl adds.
        forwarded = dict(
            return_word_ids=return_word_ids,
            llama_template=llama_template,
            images=images,
            prevent_empty_text=prevent_empty_text,
            thinking=thinking,
        )
        return super().tokenize_with_weights(text, **forwarded, **kwargs)
def te_qwen3vl(dtype_llama=None, llama_quantization_metadata=None):
    """Build an Ideogram4Qwen3VLTEModel subclass with a dtype / quantization metadata baked in.

    dtype_llama: when not None, replaces the dtype passed at construction time.
    llama_quantization_metadata: when not None, stored under
        model_options["quantization_metadata"].
    Returns the subclass (not an instance).
    """

    class Ideogram4Qwen3VLTEModel_(Ideogram4Qwen3VLTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            effective_dtype = dtype_llama if dtype_llama is not None else dtype
            effective_options = model_options
            if llama_quantization_metadata is not None:
                # Work on a copy: the incoming dict may be shared (including the default).
                effective_options = model_options.copy()
                effective_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(device=device, dtype=effective_dtype, model_options=effective_options)

    return Ideogram4Qwen3VLTEModel_

View File

@ -251,6 +251,19 @@ class Qwen3_8BConfig:
lm_head: bool = True
stop_tokens = [151643, 151645]
@dataclass
class Qwen3VL_8BConfig(Qwen3_8BConfig):
    # Qwen3-VL 8B text backbone: Qwen3_8BConfig with these position/RoPE overrides.
    max_position_embeddings: int = 262_144
    rope_theta: float = 5_000_000.0
    # Un-annotated, so these are plain class attributes rather than dataclass fields.
    rope_dims = [24, 20, 20]
    interleaved_mrope = True
@dataclass
class Qwen3VL_4BConfig(Qwen3VL_8BConfig):
    # Qwen3-VL 4B text backbone: the 8B config with narrower hidden / MLP widths.
    hidden_size: int = 2_560
    intermediate_size: int = 9_728
    lm_head: bool = False  # 4B ties word embeddings
@dataclass
class Ovis25_2BConfig:
vocab_size: int = 151936
@ -703,7 +716,8 @@ class Llama2_(nn.Module):
interleaved_mrope=getattr(self.config, "interleaved_mrope", False),
device=device)
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None):
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True,
dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None,deepstack_embeds=None, visual_pos_masks=None):
if embeds is not None:
x = embeds
else:
@ -767,6 +781,10 @@ class Llama2_(nn.Module):
if current_kv is not None:
next_key_values.append(current_kv)
# DeepStack: add per-layer visual features into the first len(deepstack_embeds) decoder layers at image positions (Qwen3-VL)
if deepstack_embeds is not None and i < len(deepstack_embeds):
x[visual_pos_masks] = x[visual_pos_masks] + deepstack_embeds[i].to(x)
if i == intermediate_output:
intermediate = x.clone()
@ -860,7 +878,7 @@ class BaseGenerate:
torch.empty([batch, model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype), 0))
return past_key_values
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0, initial_input_ids=None):
def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0, initial_input_ids=None, position_ids=None, deepstack_embeds=None, visual_pos_masks=None):
device = embeds.device
if stop_tokens is None:
@ -884,10 +902,18 @@ class BaseGenerate:
generated_token_ids = []
pbar = comfy.utils.ProgressBar(max_length)
# MRoPE: prefill uses explicit 3D position_ids, decode continues from the last position
next_pos = int(position_ids[:, -1].max()) + 1 if position_ids is not None else None
# Generation loop
current_input_ids = initial_input_ids
for step in tqdm(range(max_length), desc="Generating tokens"):
x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values, input_ids=current_input_ids)
# DeepStack visual features are injected on the prefill only; gemma4's forward lacks these kwargs.
extra = {}
if step == 0 and deepstack_embeds is not None:
extra["deepstack_embeds"] = deepstack_embeds
extra["visual_pos_masks"] = visual_pos_masks
x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values, input_ids=current_input_ids, position_ids=position_ids, **extra)
logits = self.logits(x)[:, -1]
next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample, presence_penalty=presence_penalty)
token_id = next_token[0].item()
@ -895,6 +921,9 @@ class BaseGenerate:
embeds = self.model.embed_tokens(next_token).to(execution_dtype)
current_input_ids = next_token if initial_input_ids is not None else None
if next_pos is not None: # advance MRoPE position for the next (decode) step
position_ids = torch.tensor([[next_pos]], device=device)
next_pos += 1
pbar.update(1)
if token_id in stop_tokens:

View File

@ -3,7 +3,6 @@ import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass, field
import os
import math
import comfy.model_management
from comfy.ldm.modules.attention import optimized_attention_for_device
@ -563,6 +562,8 @@ class Qwen35VisionModel(nn.Module):
for _ in range(config["depth"])
])
self.merger = Qwen35VisionPatchMerger(self.hidden_size, self.spatial_merge_size, config["out_hidden_size"], device=device, dtype=dtype, ops=ops)
self.deepstack_visual_indexes = [] # DeepStack, per-layer visual features (Qwen3-VL)
self.deepstack_merger_list = None
def rot_pos_emb(self, grid_thw):
merge_size = self.spatial_merge_size
@ -664,9 +665,14 @@ class Qwen35VisionModel(nn.Module):
).cumsum(dim=0, dtype=torch.int32)
cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
optimized_attention = optimized_attention_for_device(x.device, mask=False, small_input=True)
for blk in self.blocks:
deepstack_features = []
for layer_num, blk in enumerate(self.blocks):
x = blk(x, cu_seqlens=cu_seqlens, position_embeddings=position_embeddings, optimized_attention=optimized_attention)
if self.deepstack_merger_list is not None and layer_num in self.deepstack_visual_indexes:
deepstack_features.append(self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)](x))
merged = self.merger(x)
if self.deepstack_merger_list is not None:
return merged, deepstack_features
return merged
# Model Wrapper
@ -690,30 +696,7 @@ class Qwen35(BaseLlama, BaseGenerate, torch.nn.Module):
return None, None
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, embeds_info=[], past_key_values=None):
grid = None
position_ids = None
offset = 0
for e in embeds_info:
if e.get("type") == "image":
grid = e.get("extra", None)
start = e.get("index")
if position_ids is None:
position_ids = torch.zeros((3, embeds.shape[1]), device=embeds.device)
position_ids[:, :start] = torch.arange(0, start, device=embeds.device)
end = e.get("size") + start
len_max = int(grid.max()) // 2
start_next = len_max + start
position_ids[:, end:] = torch.arange(start_next + offset, start_next + (embeds.shape[1] - end) + offset, device=embeds.device)
position_ids[0, start:end] = start + offset
max_d = int(grid[0][1]) // 2
position_ids[1, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(1).repeat(1, math.ceil((end - start) / max_d)).flatten(0)[:end - start]
max_d = int(grid[0][2]) // 2
position_ids[2, start:end] = torch.arange(start + offset, start + max_d + offset, device=embeds.device).unsqueeze(0).repeat(math.ceil((end - start) / max_d), 1).flatten(0)[:end - start]
offset += len_max - (end - start)
if grid is None:
position_ids = None
position_ids = comfy.text_encoders.qwen_vl.qwen2vl_mrope_position_ids(embeds_info, embeds.shape[1], embeds.device)
return super().forward(x, attention_mask=attention_mask, embeds=embeds, num_tokens=num_tokens, intermediate_output=intermediate_output, final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, position_ids=position_ids, past_key_values=past_key_values)
def init_kv_cache(self, batch, max_cache_len, device, execution_dtype):

View File

@ -0,0 +1,193 @@
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import Qwen2Tokenizer
from comfy import sd1_clip
import comfy.text_encoders.qwen_vl
from .qwen35 import Qwen35VisionModel
from .llama import BaseLlama, BaseQwen3, BaseGenerate, Llama2_, Qwen3VL_4BConfig, Qwen3VL_8BConfig
# Per-variant vision-tower hyperparameters, keyed by model_type.
# deepstack_visual_indexes lists the vision blocks whose outputs are tapped for DeepStack.
QWEN3VL_VISION = {
"qwen3vl_4b": dict(hidden_size=1024, intermediate_size=4096, depth=24, deepstack_visual_indexes=[5, 11, 17]),
"qwen3vl_8b": dict(hidden_size=1152, intermediate_size=4304, depth=27, deepstack_visual_indexes=[8, 16, 24]),
}
# Vision-tower settings that are the same for every variant; merged with the per-variant
# entry above when the vision config is built.
QWEN3VL_VISION_COMMON = dict(num_heads=16, patch_size=16, temporal_patch_size=2, in_channels=3,
spatial_merge_size=2, num_position_embeddings=2304)
# model_type -> text-backbone config class.
QWEN3VL_CONFIGS = {"qwen3vl_4b": Qwen3VL_4BConfig, "qwen3vl_8b": Qwen3VL_8BConfig}
class Qwen3VLDeepstackMerger(nn.Module):
    # DeepStack merger: postshuffle LayerNorm (applied after spatial merge), unlike the main merger.
    def __init__(self, hidden_size, spatial_merge_size, out_hidden_size, device=None, dtype=None, ops=None):
        """Create norm -> fc1 -> GELU -> fc2 over groups of spatial_merge_size**2 patch features.

        ops: namespace providing LayerNorm and Linear constructors.
        """
        super().__init__()
        factory = dict(device=device, dtype=dtype)
        merged_width = hidden_size * (spatial_merge_size ** 2)
        self.merge_dim = merged_width
        self.norm = ops.LayerNorm(merged_width, eps=1e-6, **factory)
        self.linear_fc1 = ops.Linear(merged_width, merged_width, **factory)
        self.linear_fc2 = ops.Linear(merged_width, out_hidden_size, **factory)

    def forward(self, x):
        """Regroup x into rows of merge_dim features and project them to out_hidden_size."""
        grouped = x.view(-1, self.merge_dim)
        hidden = self.linear_fc1(self.norm(grouped))
        return self.linear_fc2(F.gelu(hidden))
class Qwen3VLVisionModel(Qwen35VisionModel):
    # Qwen3.5 vision + DeepStack
    def __init__(self, config, device=None, dtype=None, ops=None):
        """Build the base vision model, then one DeepStack merger per tapped block index."""
        super().__init__(config, device=device, dtype=dtype, ops=ops)
        tap_layers = config["deepstack_visual_indexes"]
        self.deepstack_visual_indexes = tap_layers
        mergers = []
        for _ in tap_layers:
            mergers.append(
                Qwen3VLDeepstackMerger(
                    self.hidden_size,
                    self.spatial_merge_size,
                    config["out_hidden_size"],
                    device=device,
                    dtype=dtype,
                    ops=ops,
                )
            )
        self.deepstack_merger_list = nn.ModuleList(mergers)
class Qwen3VL(BaseLlama, BaseQwen3, BaseGenerate, torch.nn.Module):
    """Qwen3-VL: Llama2_ text backbone plus a DeepStack-enabled vision tower.

    `model_type` selects both the text config (QWEN3VL_CONFIGS) and the vision
    hyperparameters (QWEN3VL_VISION).
    """

    model_type = "qwen3vl_8b"

    def __init__(self, config_dict, dtype, device, operations):
        super().__init__()
        config = QWEN3VL_CONFIGS[self.model_type](**config_dict)
        self.num_layers = config.num_hidden_layers

        self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)

        # Common settings first, then variant overrides, then the projection width.
        vision_config = dict(QWEN3VL_VISION_COMMON)
        vision_config.update(QWEN3VL_VISION[self.model_type])
        vision_config["out_hidden_size"] = config.hidden_size
        self.visual = Qwen3VLVisionModel(vision_config, device=device, dtype=dtype, ops=operations)
        self.dtype = dtype

    def preprocess_embed(self, embed, device):
        """Run the vision tower on an image embed.

        Returns (merged_features, {"grid": ..., "deepstack": ...}) for images and
        (None, None) for every other embed type.
        """
        if embed["type"] != "image":
            return None, None
        # Qwen3-VL normalizes to [-1, 1] (mean/std 0.5), unlike Qwen2.5-VL's CLIP normalization.
        image, grid = comfy.text_encoders.qwen_vl.process_qwen2vl_images(
            embed["data"],
            patch_size=16,
            image_mean=[0.5, 0.5, 0.5],
            image_std=[0.5, 0.5, 0.5],
        )
        pixels = image.to(device, dtype=torch.float32)
        merged, deepstack = self.visual(pixels, grid)
        return merged, {"grid": grid, "deepstack": deepstack}

    def build_image_inputs(self, embeds, embeds_info):
        """Return (position_ids, visual_pos_masks, deepstack) for the prompt.

        All three are None when embeds_info contains no image entries.
        """
        images = [info for info in embeds_info if info.get("type") == "image"]
        images.sort(key=lambda info: info["index"])
        if not images:
            return None, None, None

        device = embeds.device
        seq = embeds.shape[1]
        position_ids = comfy.text_encoders.qwen_vl.qwen2vl_mrope_position_ids(embeds_info, seq, device)

        # DeepStack: mask of image positions + per-vision-layer features to inject there.
        visual_pos_masks = torch.zeros((1, seq), dtype=torch.bool, device=device)
        deepstack = None
        for info in images:
            first = info["index"]
            last = info["size"] + first
            visual_pos_masks[0, first:last] = True
            levels = info["extra"]["deepstack"]
            if deepstack is None:
                deepstack = list(levels)
            else:
                # Append this image's features to each level, keeping image order.
                deepstack = [torch.cat([deepstack[i], levels[i]], dim=0) for i in range(len(levels))]
        return position_ids, visual_pos_masks, deepstack
def _make_qwen3vl_model(model_type):
    """Return a fresh Qwen3VL subclass whose class-level model_type is `model_type`."""
    variant = model_type

    class Qwen3VL_(Qwen3VL):
        # `variant` (not `model_type`) so the class body reads the enclosing function's value.
        model_type = variant

    return Qwen3VL_
class Qwen3VLClipModel(sd1_clip.SDClipModel):
    """SDClipModel wrapper around the Qwen3-VL transformer for the chosen model_type."""

    def __init__(self, device="cpu", layer="hidden", layer_idx=-1, dtype=None, attention_mask=True, model_options={}, model_type="qwen3vl_8b"):
        model_class = _make_qwen3vl_model(model_type)
        super().__init__(
            device=device,
            layer=layer,
            layer_idx=layer_idx,
            textmodel_json_config={},
            dtype=dtype,
            special_tokens={"pad": 151643},
            layer_norm_hidden_state=False,
            model_class=model_class,
            enable_attention_masks=attention_mask,
            return_attention_masks=attention_mask,
            model_options=model_options,
        )

    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=0.0):
        """Generate from tokenized input, wiring in MRoPE positions and DeepStack features when images are present."""
        # A dict input carries the token rows under its first value.
        rows = next(iter(tokens.values())) if isinstance(tokens, dict) else tokens
        # Keep only the first element of each token entry.
        tokens_only = [[entry[0] for entry in row] for row in rows]
        embeds, _, _, embeds_info = self.process_tokens(tokens_only, self.execution_device)
        position_ids, visual_pos_masks, deepstack = self.transformer.build_image_inputs(embeds, embeds_info)
        sampling_args = (do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed)
        return self.transformer.generate(
            embeds,
            *sampling_args,
            presence_penalty=presence_penalty,
            position_ids=position_ids,
            visual_pos_masks=visual_pos_masks,
            deepstack_embeds=deepstack,
        )
class Qwen3VLTEModel(sd1_clip.SD1ClipModel):
    """SD1-style text-encoder wrapper that registers Qwen3VLClipModel under the model_type name."""

    def __init__(self, device="cpu", dtype=None, model_options={}, model_type="qwen3vl_8b"):
        def build_clip(**clip_kwargs):
            # Forward everything and pin the requested variant.
            return Qwen3VLClipModel(**clip_kwargs, model_type=model_type)

        super().__init__(
            device=device,
            dtype=dtype,
            name=model_type,
            clip_model=build_clip,
            model_options=model_options,
        )
class Qwen3VLSDTokenizer(sd1_clip.SDTokenizer):
    """SDTokenizer over the bundled "qwen25_tokenizer" files, with no start/end tokens and no length padding."""

    def __init__(self, embedding_directory=None, tokenizer_data={}, embedding_size=4096, embedding_key="qwen3vl_8b"):
        # The tokenizer files live in a directory next to this module.
        module_dir = os.path.dirname(os.path.realpath(__file__))
        tokenizer_path = os.path.join(module_dir, "qwen25_tokenizer")
        super().__init__(
            tokenizer_path,
            pad_with_end=False,
            embedding_directory=embedding_directory,
            embedding_size=embedding_size,
            embedding_key=embedding_key,
            tokenizer_class=Qwen2Tokenizer,
            has_start_token=False,
            has_end_token=False,
            pad_to_max_length=False,
            max_length=99999999,
            min_length=1,
            pad_token=151643,
            tokenizer_data=tokenizer_data,
        )
# Chat-template tokenizer for Qwen3-VL: wraps the prompt in a user/assistant template and
# replaces <|image_pad|> tokens with image embed descriptors.
# NOTE(review): indentation is not visible in this view; confirm the nesting of the
# `if len(images) > 1:` and `if not thinking:` statements before restructuring this method.
class Qwen3VLTokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None, tokenizer_data={}, model_type="qwen3vl_8b"):
# Embedding width: 2560 for the 4B variant, 4096 otherwise.
embedding_size = 2560 if model_type == "qwen3vl_4b" else 4096
tokenizer = lambda *a, **kw: Qwen3VLSDTokenizer(*a, **kw, embedding_size=embedding_size, embedding_key=model_type)
super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name=model_type, tokenizer=tokenizer)
# Default templates; the image variant carries one vision block before the user text.
self.llama_template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
self.llama_template_images = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
# A batched `image` kwarg is used only when `images` is empty; it is split into
# one single-entry slice per index of its first dimension.
image = kwargs.get("image", None)
if image is not None and len(images) == 0:
images = [image[i:i + 1] for i in range(image.shape[0])]
# Text that already starts with <|im_start|> is used as-is instead of being templated.
skip_template = text.startswith('<|im_start|>')
if prevent_empty_text and text == '':
text = ' '
if skip_template:
llama_text = text
else:
# Template priority: explicit llama_template, then the text-only or image default.
if llama_template is not None:
template = llama_template
elif len(images) == 0:
template = self.llama_template
else:
template = self.llama_template_images
if len(images) > 1:
# Repeat the first vision block so there is one <|image_pad|> per image.
vision_block = "<|vision_start|><|image_pad|><|vision_end|>"
template = template.replace(vision_block, vision_block * len(images), 1)
llama_text = template.format(text)
if not thinking: # Qwen3 convention: empty think block suppresses reasoning
llama_text += "<think>\n\n</think>\n\n"
tokens = super().tokenize_with_weights(llama_text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
key_name = next(iter(tokens))
# Images are consumed in order; <|image_pad|> tokens beyond len(images) are left unchanged.
embed_count = 0
for r in tokens[key_name]:
for i in range(len(r)):
if r[i][0] == 151655: # <|image_pad|>
if len(images) > embed_count:
r[i] = ({"type": "image", "data": images[embed_count], "original_type": "image"},) + r[i][1:]
embed_count += 1
return tokens
def tokenizer(model_type="qwen3vl_8b"):
    """Return a Qwen3VLTokenizer subclass whose constructor is fixed to `model_type`."""

    class Qwen3VLTokenizer_(Qwen3VLTokenizer):
        def __init__(self, embedding_directory=None, tokenizer_data={}):
            super().__init__(
                embedding_directory=embedding_directory,
                tokenizer_data=tokenizer_data,
                model_type=model_type,
            )

    return Qwen3VLTokenizer_
def te(dtype_llama=None, llama_quantization_metadata=None, model_type="qwen3vl_8b"):
    """Build a Qwen3VLTEModel subclass with dtype, quantization metadata and model_type baked in.

    dtype_llama: when not None, replaces the dtype passed at construction time.
    llama_quantization_metadata: when not None, stored under
        model_options["quantization_metadata"].
    model_type: forwarded to Qwen3VLTEModel.
    Returns the subclass (not an instance).
    """

    class Qwen3VLTEModel_(Qwen3VLTEModel):
        def __init__(self, device="cpu", dtype=None, model_options={}):
            resolved_dtype = dtype if dtype_llama is None else dtype_llama
            resolved_options = model_options
            if llama_quantization_metadata is not None:
                # Copy before writing so the caller's dict is never modified.
                resolved_options = model_options.copy()
                resolved_options["quantization_metadata"] = llama_quantization_metadata
            super().__init__(
                device=device,
                dtype=resolved_dtype,
                model_options=resolved_options,
                model_type=model_type,
            )

    return Qwen3VLTEModel_

View File

@ -88,6 +88,32 @@ def process_qwen2vl_images(
return flatten_patches, image_grid_thw
def qwen2vl_mrope_position_ids(embeds_info, seq_len, device):
    """Build the (3, seq_len) T/H/W MRoPE position ids for a prompt.

    Text positions run sequentially; each image span gets positions derived from its grid.
    `extra` on an image entry is the image grid_thw, or a dict carrying it under "grid".
    Returns None when embeds_info has no image entries.
    """
    position_ids = None
    offset = 0
    for info in embeds_info:
        if info.get("type") != "image":
            continue
        extra = info.get("extra", None)
        grid = extra["grid"] if isinstance(extra, dict) else extra
        start = info.get("index")
        if position_ids is None:
            # First image: allocate and fill the leading text positions.
            position_ids = torch.zeros((3, seq_len), device=device)
            position_ids[:, :start] = torch.arange(0, start, device=device)
        end = info.get("size") + start
        span = end - start

        # Everything after the image continues from the image's largest half-grid extent.
        len_max = int(grid.max()) // 2
        tail_first = len_max + start + offset
        position_ids[:, end:] = torch.arange(tail_first, tail_first + (seq_len - end), device=device)

        base = start + offset
        position_ids[0, start:end] = base

        # Axis 1: each value repeated consecutively; axis 2: the whole range tiled.
        extent_1 = int(grid[0][1]) // 2
        axis_1 = torch.arange(base, base + extent_1, device=device)
        position_ids[1, start:end] = axis_1.unsqueeze(1).repeat(1, math.ceil(span / extent_1)).flatten(0)[:span]

        extent_2 = int(grid[0][2]) // 2
        axis_2 = torch.arange(base, base + extent_2, device=device)
        position_ids[2, start:end] = axis_2.unsqueeze(0).repeat(math.ceil(span / extent_2), 1).flatten(0)[:span]

        # The image occupies `span` tokens but advances positions by only len_max.
        offset += len_max - span
    return position_ids
class VisionPatchEmbed(nn.Module):
def __init__(
self,

View File

@ -25,6 +25,11 @@ CLI_FEATURE_FLAG_REGISTRY: dict[str, FeatureFlagInfo] = {
"default": False,
"description": "Show the sign-in button in the frontend even when not signed in",
},
"enable_telemetry": {
"type": "bool",
"default": False,
"description": "Signal the frontend that telemetry collection is enabled",
},
}

View File

@ -27,10 +27,13 @@ class VideoInput(ABC):
path: Union[str, IO[bytes]],
format: VideoContainer = VideoContainer.AUTO,
codec: VideoCodec = VideoCodec.AUTO,
metadata: Optional[dict] = None
metadata: Optional[dict] = None,
bit_depth: int | None = None,
):
"""
Abstract method to save the video input to a file.
bit_depth selects the encoded bit depth; None keeps the video's native depth.
"""
pass
@ -83,6 +86,14 @@ class VideoInput(ABC):
components = self.get_components()
return components.images.shape[2], components.images.shape[1]
def get_bit_depth(self) -> int:
"""
Returns the bit depth of the video (e.g. 8 or 10).
Default implementation returns 8; subclasses report their real depth.
"""
return 8
def get_duration(self) -> float:
"""
Returns the duration of the video in seconds.

View File

@ -52,6 +52,12 @@ def get_open_write_kwargs(
return open_kwargs
def video_stream_bit_depth(stream) -> int:
    """Return the largest per-component bit count of a video stream's pixel format.

    Falls back to 8 when the stream, its format, or the format's components are missing or empty.
    """
    pixel_format = None if stream is None else stream.format
    components = None if pixel_format is None else pixel_format.components
    if not components:
        return 8
    return max(part.bits for part in components)
class VideoFromFile(VideoInput):
"""
Class representing video input from a file.
@ -97,6 +103,13 @@ class VideoFromFile(VideoInput):
return stream.width, stream.height
raise ValueError(f"No video stream found in file '{self.__file}'")
def get_bit_depth(self) -> int:
if isinstance(self.__file, io.BytesIO):
self.__file.seek(0) # Reset the BytesIO object to the beginning
with av.open(self.__file, mode="r") as container:
video_stream = container.streams.video[0] if len(container.streams.video) > 0 else None
return video_stream_bit_depth(video_stream)
def get_duration(self) -> float:
"""
Returns the duration of the video in seconds.
@ -257,6 +270,7 @@ class VideoFromFile(VideoInput):
image_format = 'gbrpf32le'
process_image_format = lambda a: a
align_graph = None
audio = None
streams = [video_stream]
@ -310,7 +324,28 @@ class VideoFromFile(VideoInput):
checked_alpha = True
img = frame.to_ndarray(format=image_format) # shape: (H, W, 4)
# Fix non-deterministic video decode when the video width is not a multiple of 32
# For non-yuvj pixel formats: most H.264/H.265 video and static images (e.g. lossy WebP via LoadImage)
# Pad both axes to a multiple of 32 and smear the border so the alignment padding never bleeds into the cropped edges
if image_format in ('gbrpf32le', 'gbrapf32le') and frame.width % 32 != 0:
if align_graph is None:
pad_w = ((frame.width + 31) // 32) * 32
pad_h = ((frame.height + 31) // 32) * 32
g = av.filter.Graph()
g_src = g.add_buffer(width=frame.width, height=frame.height,
format=frame.format.name, time_base=video_stream.time_base)
g_pad = g.add('pad', f'{pad_w}:{pad_h}:0:0')
g_fill = g.add('fillborders', f'left=0:right={pad_w - frame.width}:top=0:bottom={pad_h - frame.height}:mode=smear')
g_sink = g.add('buffersink')
g_src.link_to(g_pad)
g_pad.link_to(g_fill)
g_fill.link_to(g_sink)
g.configure()
align_graph = (g, g_src, g_sink)
align_graph[1].push(frame)
img = np.ascontiguousarray(align_graph[2].pull().to_ndarray(format=image_format)[:frame.height, :frame.width])
else:
img = frame.to_ndarray(format=image_format)
if frame.rotation != 0:
k = int(round(frame.rotation // 90))
img = np.rot90(img, k=k, axes=(0, 1)).copy()
@ -377,25 +412,32 @@ class VideoFromFile(VideoInput):
format: VideoContainer = VideoContainer.AUTO,
codec: VideoCodec = VideoCodec.AUTO,
metadata: Optional[dict] = None,
bit_depth: int | None = None,
):
if isinstance(self.__file, io.BytesIO):
self.__file.seek(0) # Reset the BytesIO object to the beginning
with av.open(self.__file, mode='r') as container:
container_format = container.format.name
video_encoding = container.streams.video[0].codec.name if len(container.streams.video) > 0 else None
video_stream = container.streams.video[0] if len(container.streams.video) > 0 else None
video_encoding = video_stream.codec.name if video_stream is not None else None
source_bit_depth = video_stream_bit_depth(video_stream)
reuse_streams = True
if format != VideoContainer.AUTO and format not in container_format.split(","):
reuse_streams = False
if codec != VideoCodec.AUTO and codec != video_encoding and video_encoding is not None:
reuse_streams = False
if bit_depth is not None and video_encoding is not None and bit_depth != source_bit_depth:
reuse_streams = False
if self.__start_time or self.__duration:
reuse_streams = False
if not reuse_streams:
if bit_depth is None:
bit_depth = source_bit_depth
components = self.get_components_internal(container)
video = VideoFromComponents(components)
return video.save_to(
path, format=format, codec=codec, metadata=metadata
path, format=format, codec=codec, metadata=metadata, bit_depth=bit_depth,
)
streams = container.streams
@ -451,8 +493,10 @@ class VideoFromComponents(VideoInput):
Class representing video input from tensors.
"""
def __init__(self, components: VideoComponents):
def __init__(self, components: VideoComponents, bit_depth: int = 8):
self.__components = components
# Tensor components have no inherent bit depth; this is the depth used when encoding.
self.__bit_depth = bit_depth
def get_components(self) -> VideoComponents:
return VideoComponents(
@ -461,18 +505,26 @@ class VideoFromComponents(VideoInput):
frame_rate=self.__components.frame_rate,
)
def get_bit_depth(self) -> int:
return self.__bit_depth
def save_to(
self,
path: str,
format: VideoContainer = VideoContainer.AUTO,
codec: VideoCodec = VideoCodec.AUTO,
metadata: Optional[dict] = None,
bit_depth: int | None = None,
):
"""Save the video to a file path or BytesIO buffer."""
if format != VideoContainer.AUTO and format != VideoContainer.MP4:
raise ValueError("Only MP4 format is supported for now")
if codec != VideoCodec.AUTO and codec != VideoCodec.H264:
raise ValueError("Only H264 codec is supported for now")
# None means "use the depth this video was created with" (CreateVideo's choice).
if bit_depth is None:
bit_depth = self.__bit_depth
is_10bit = bit_depth >= 10
extra_kwargs = {}
if isinstance(format, VideoContainer) and format != VideoContainer.AUTO:
extra_kwargs["format"] = format.value
@ -488,10 +540,11 @@ class VideoFromComponents(VideoInput):
frame_rate = Fraction(round(self.__components.frame_rate * 1000), 1000)
# Create a video stream
pix_fmt = "yuv420p10le" if is_10bit else "yuv420p"
video_stream = output.add_stream('h264', rate=frame_rate)
video_stream.width = self.__components.images.shape[2]
video_stream.height = self.__components.images.shape[1]
video_stream.pix_fmt = 'yuv420p'
video_stream.pix_fmt = pix_fmt
# Create an audio stream
audio_sample_rate = 1
@ -505,9 +558,14 @@ class VideoFromComponents(VideoInput):
# Encode video
for i, frame in enumerate(self.__components.images):
img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3)
frame = av.VideoFrame.from_ndarray(img, format='rgb24')
frame = frame.reformat(format='yuv420p') # Convert to YUV420P as required by h264
if is_10bit:
# 16-bit RGB keeps float precision through the conversion to 10-bit YUV.
img = (frame.float() * 65535).clamp(0, 65535).cpu().numpy().astype(np.uint16) # shape: (H, W, 3)
frame = av.VideoFrame.from_ndarray(img, format="rgb48le")
else:
img = (frame * 255).clamp(0, 255).byte().cpu().numpy() # shape: (H, W, 3)
frame = av.VideoFrame.from_ndarray(img, format='rgb24')
frame = frame.reformat(format=pix_fmt)
packet = video_stream.encode(frame)
output.mux(packet)

View File

@ -1400,7 +1400,8 @@ class V3Data(TypedDict):
class HiddenHolder:
def __init__(self, unique_id: str, prompt: Any,
extra_pnginfo: Any, dynprompt: Any,
auth_token_comfy_org: str, api_key_comfy_org: str, **kwargs):
auth_token_comfy_org: str, api_key_comfy_org: str,
comfy_usage_source: str = None, **kwargs):
self.unique_id = unique_id
"""UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
self.prompt = prompt
@ -1413,6 +1414,8 @@ class HiddenHolder:
"""AUTH_TOKEN_COMFY_ORG is a token acquired from signing into a ComfyOrg account on frontend."""
self.api_key_comfy_org = api_key_comfy_org
"""API_KEY_COMFY_ORG is an API Key generated by ComfyOrg that allows skipping signing into a ComfyOrg account on frontend."""
self.comfy_usage_source = comfy_usage_source
"""COMFY_USAGE_SOURCE identifies the client that submitted the prompt (e.g. comfyui-frontend, comfy-cli, comfyui-mcp); forwarded to API nodes' upstream requests via the Comfy-Usage-Source header."""
def __getattr__(self, key: str):
'''If hidden variable not found, return None.'''
@ -1429,6 +1432,7 @@ class HiddenHolder:
dynprompt=d.get(Hidden.dynprompt, None),
auth_token_comfy_org=d.get(Hidden.auth_token_comfy_org, None),
api_key_comfy_org=d.get(Hidden.api_key_comfy_org, None),
comfy_usage_source=d.get(Hidden.comfy_usage_source, None),
)
@classmethod
@ -1451,6 +1455,8 @@ class Hidden(str, Enum):
"""AUTH_TOKEN_COMFY_ORG is a token acquired from signing into a ComfyOrg account on frontend."""
api_key_comfy_org = "API_KEY_COMFY_ORG"
"""API_KEY_COMFY_ORG is an API Key generated by ComfyOrg that allows skipping signing into a ComfyOrg account on frontend."""
comfy_usage_source = "COMFY_USAGE_SOURCE"
"""COMFY_USAGE_SOURCE identifies the client that submitted the prompt (e.g. comfyui-frontend, comfy-cli, comfyui-mcp); forwarded to API nodes' upstream requests via the Comfy-Usage-Source header."""
@dataclass
@ -1654,6 +1660,8 @@ class Schema:
self.hidden.append(Hidden.auth_token_comfy_org)
if Hidden.api_key_comfy_org not in self.hidden:
self.hidden.append(Hidden.api_key_comfy_org)
if Hidden.comfy_usage_source not in self.hidden:
self.hidden.append(Hidden.comfy_usage_source)
# if is an output_node, will need prompt and extra_pnginfo
if self.is_output_node:
if Hidden.prompt not in self.hidden:

View File

@ -1310,13 +1310,6 @@ class KlingTaskStatus(str, Enum):
failed = 'failed'
class KlingTextToVideoModelName(str, Enum):
kling_v1 = 'kling-v1'
kling_v1_6 = 'kling-v1-6'
kling_v2_1_master = 'kling-v2-1-master'
kling_v2_5_turbo = 'kling-v2-5-turbo'
class KlingVideoGenAspectRatio(str, Enum):
field_16_9 = '16:9'
field_9_16 = '9:16'
@ -5179,7 +5172,7 @@ class KlingText2VideoRequest(BaseModel):
duration: Optional[KlingVideoGenDuration] = '5'
external_task_id: Optional[str] = Field(None, description='Customized Task ID')
mode: Optional[KlingVideoGenMode] = 'std'
model_name: Optional[KlingTextToVideoModelName] = 'kling-v1'
model_name: Optional[str] = 'kling-v1'
negative_prompt: Optional[str] = Field(
None, description='Negative text prompt', max_length=2500
)

View File

@ -149,3 +149,59 @@ class MotionControlRequest(BaseModel):
character_orientation: str = Field(...)
mode: str = Field(..., description="'pro' or 'std'")
model_name: str = Field(...)
class Kling3TurboSettings(BaseModel):
    # Optional generation settings carried by the Kling 3 Turbo video requests.
    resolution: str = Field(default="720p", description="'720p' or '1080p'")
    aspect_ratio: str | None = Field(default=None, description="'16:9'/'9:16'/'1:1'; text-to-video only")
    duration: int = Field(default=5, description="3-15 second")
class Kling3TurboText2VideoRequest(BaseModel):
    # Request body for Kling 3 Turbo text-to-video: a required prompt plus optional settings.
    prompt: str = Field(..., description="<=3072 chars; may use multi-shot 'shot n, m, words; ...'")
    settings: Kling3TurboSettings | None = None
class Kling3TurboContent(BaseModel):
    # One content item of an image-to-video request; `type` says which of text/url applies.
    type: str = Field(..., description="'prompt' or 'first_frame'")
    text: str | None = Field(default=None, description="for type=prompt; <=2500 chars")
    url: str | None = Field(default=None, description="for type=first_frame")
class Kling3TurboImage2VideoRequest(BaseModel):
    # Request body for Kling 3 Turbo image-to-video: required content items plus optional settings.
    contents: list[Kling3TurboContent] = Field(..., description="prompt + first_frame materials")
    settings: Kling3TurboSettings | None = None
class Kling3TurboCreateData(BaseModel):
    # `data` payload of a task-creation response; every field is optional.
    id: str | None = Field(default=None, description="Task ID")
    status: str | None = None
    message: str | None = None
class Kling3TurboCreateResponse(BaseModel):
    # Envelope of a task-creation response; every field is optional.
    code: int | None = None
    message: str | None = None
    request_id: str | None = None
    data: Kling3TurboCreateData | None = None
class Kling3TurboOutput(BaseModel):
    # One produced artifact listed in a task's outputs; every field is optional.
    type: str | None = Field(default=None, description="'video', 'image', 'audio', ...")
    id: str | None = None
    url: str | None = None
    duration: str | None = None
class Kling3TurboTaskData(BaseModel):
    # State of one task as reported by the query endpoint; every field is optional.
    id: str | None = None
    status: str | None = Field(default=None, description="submitted | processing | succeeded | failed")
    message: str | None = None
    outputs: list[Kling3TurboOutput] | None = None
class Kling3TurboQueryResponse(BaseModel):
    # Envelope of a task-query response; `data` is a list of task records.
    code: int | None = None
    message: str | None = None
    request_id: str | None = None
    data: list[Kling3TurboTaskData] | None = None

View File

@ -67,15 +67,6 @@ class RunwayImageToVideoResponse(BaseModel):
id: Optional[str] = Field(None, description='Task ID')
class RunwayTaskStatusEnum(str, Enum):
SUCCEEDED = 'SUCCEEDED'
RUNNING = 'RUNNING'
FAILED = 'FAILED'
PENDING = 'PENDING'
CANCELLED = 'CANCELLED'
THROTTLED = 'THROTTLED'
class RunwayTaskStatusResponse(BaseModel):
createdAt: datetime = Field(..., description='Task creation timestamp')
id: str = Field(..., description='Task ID')
@ -86,7 +77,7 @@ class RunwayTaskStatusResponse(BaseModel):
ge=0.0,
le=1.0,
)
status: RunwayTaskStatusEnum
status: str = Field(..., description="SUCCEEDED, RUNNING, FAILED, PENDING, CANCELLED or THROTTLED")
class Model4(str, Enum):
@ -125,3 +116,144 @@ class RunwayTextToImageRequest(BaseModel):
class RunwayTextToImageResponse(BaseModel):
id: Optional[str] = Field(None, description='Task ID')
class RunwayAleph2IO:
    """Custom socket types for chaining Aleph2 guidance images."""

    # Socket type name for keyframe chains.
    KEYFRAME = "RUNWAY_ALEPH2_KEYFRAME"
    # Socket type name for prompt-image chains.
    PROMPT_IMAGE = "RUNWAY_ALEPH2_PROMPT_IMAGE"
# Keyframe timing modes (anchored to the INPUT video). Stored on the chain item and used to
# choose the request model below. The values match the Aleph2 keyframe union field names.
KEYFRAME_MODE_SECONDS = "seconds"  # absolute time, in seconds, from the start of the input video
KEYFRAME_MODE_AT = "at"  # fraction [0.0, 1.0] of the input video duration
# Prompt-image position modes (anchored to the OUTPUT video). Values match the Aleph2 position `type`.
PROMPT_IMAGE_MODE_TIMESTAMP = "timestamp"  # absolute time, in seconds, from the start of the output video
PROMPT_IMAGE_MODE_POSITION = "position"  # fraction [0.0, 1.0] of the output video duration
class RunwayAleph2KeyframeItem:
    """A guidance image anchored to a point of the INPUT video (one Aleph2 ``keyframe``).

    Attributes:
        image: The guidance image as supplied by the node input.
        mode: Either KEYFRAME_MODE_SECONDS or KEYFRAME_MODE_AT; decides how ``value`` is read.
        value: Seconds from the start of the input video, or a fraction of its duration.
    """

    def __init__(self, image, mode: str, value: float) -> None:
        self.image = image
        self.mode = mode  # KEYFRAME_MODE_SECONDS | KEYFRAME_MODE_AT
        self.value = value
class RunwayAleph2KeyframeChain:
    """Ordered keyframe items accumulated by chaining Runway Aleph2 Keyframe nodes."""

    def __init__(self):
        self.items: list[RunwayAleph2KeyframeItem] = []

    def add(self, item: RunwayAleph2KeyframeItem) -> None:
        """Append ``item`` to the end of this chain, mutating it in place."""
        self.items += [item]

    def clone(self) -> "RunwayAleph2KeyframeChain":
        """Return a new chain with a shallow copy of the item list.

        The items themselves are shared; only the list is duplicated, so adding
        to the clone does not affect this chain.
        """
        duplicate = RunwayAleph2KeyframeChain()
        duplicate.items = [*self.items]
        return duplicate
class RunwayAleph2PromptImageItem:
    """A guidance image anchored to a point of the OUTPUT video (one Aleph2 ``promptImage``).

    Attributes:
        image: The guidance image as supplied by the node input.
        mode: Either PROMPT_IMAGE_MODE_TIMESTAMP or PROMPT_IMAGE_MODE_POSITION.
        value: Seconds from the start of the output video, or a fraction of its duration.
    """

    def __init__(self, image, mode: str, value: float) -> None:
        self.image = image
        self.mode = mode  # PROMPT_IMAGE_MODE_TIMESTAMP | PROMPT_IMAGE_MODE_POSITION
        self.value = value
class RunwayAleph2PromptImageChain:
    """Ordered prompt-image items accumulated by chaining Runway Aleph2 Prompt Image nodes."""

    def __init__(self):
        self.items: list[RunwayAleph2PromptImageItem] = []

    def add(self, item: RunwayAleph2PromptImageItem) -> None:
        """Append ``item`` to the end of this chain, mutating it in place."""
        self.items += [item]

    def clone(self) -> "RunwayAleph2PromptImageChain":
        """Return a new chain with a shallow copy of the item list.

        The items themselves are shared; only the list is duplicated, so adding
        to the clone does not affect this chain.
        """
        duplicate = RunwayAleph2PromptImageChain()
        duplicate.items = [*self.items]
        return duplicate
class RunwayAleph2KeyframeSeconds(BaseModel):
    """Keyframe variant positioned by an absolute, non-negative time on the input video."""

    seconds: float = Field(
        ...,
        description="Absolute timestamp in seconds from the start of the input video when this guidance image should apply.",
        ge=0.0,
    )
    # Location of the guidance image.
    uri: str = Field(...)
class RunwayAleph2KeyframeAt(BaseModel):
    """Keyframe variant positioned by a fraction (0.0 to 1.0) of the input video duration."""

    at: float = Field(
        ...,
        description="Position as a fraction [0.0, 1.0] of the input video duration.",
        ge=0.0,
        le=1.0,
    )
    # Location of the guidance image.
    uri: str = Field(...)
class RunwayAleph2TimestampPosition(BaseModel):
    """Prompt-image position given as an absolute, non-negative time on the output video."""

    # Discriminator value; defaults to the tag for this variant.
    type: str = Field(default="timestamp")
    timestampSeconds: float = Field(
        ...,
        description="Absolute timestamp in seconds from the start of the output video.",
        ge=0.0,
    )
class RunwayAleph2RelativePosition(BaseModel):
    """Prompt-image position given as a fraction (0.0 to 1.0) of the output video duration."""

    # Discriminator value; defaults to the tag for this variant.
    type: str = Field(default="position")
    # Despite the name, the accepted range is the fraction 0.0-1.0, not 0-100.
    positionPercentage: float = Field(
        ...,
        description="Position as a fraction [0.0, 1.0] of the total output video duration.",
        ge=0.0,
        le=1.0,
    )
class RunwayAleph2PromptImage(BaseModel):
    """A guidance image together with its position on the output video."""

    # Either an absolute timestamp or a relative (fractional) position.
    position: RunwayAleph2TimestampPosition | RunwayAleph2RelativePosition
    # Location of the guidance image.
    uri: str = Field(...)
class RunwayAleph2ContentModeration(BaseModel):
    """Content moderation options sent with an Aleph2 request."""

    # Typed as a plain string; the allowed values are listed only in the description.
    publicFigureThreshold: str = Field(
        ...,
        description='When set to "low", the content moderation system is less strict about '
        'recognizable public figures. One of "auto" or "low".',
    )
class RunwayAleph2Request(BaseModel):
    """Request body for an Aleph2 video-to-video edit.

    ``keyframes`` and ``promptImage`` are both optional. The "up to 5" limits in
    their descriptions are not enforced by this model.
    """

    model: str = Field(default="aleph2")
    promptText: str = Field(
        ...,
        description="A non-empty string describing what should appear in the output.",
        min_length=1,
        max_length=1000,
    )
    # Location of the input video to edit.
    videoUri: str = Field(...)
    # Upper bound is 2**32 - 1.
    seed: int = Field(..., description="Random seed for generation", ge=0, le=4294967295)
    contentModeration: RunwayAleph2ContentModeration = Field(...)
    keyframes: list[RunwayAleph2KeyframeSeconds | RunwayAleph2KeyframeAt] | None = Field(
        None,
        description="Timed guidance images placed at specific points in the input video. Up to 5.",
    )
    promptImage: list[RunwayAleph2PromptImage] | None = Field(
        None,
        description="Up to 5 image keyframes for guiding the edit at specific points in the output video.",
    )
class RunwayAleph2Response(BaseModel):
    """Response to an Aleph2 create request; ``id`` is optional and may be absent."""

    id: str | None = Field(None, description="Task ID")

View File

@ -208,6 +208,10 @@ class TripoMultiviewToModelRequest(BaseModel):
quad: bool | None = Field(False, description="Whether to apply quad to the generated model")
class TripoTexturePrompt(BaseModel):
    """Prompt object used to guide texture generation."""

    text: str | None = Field(None, description="Text guidance for texture generation")
class TripoTextureModelRequest(BaseModel):
type: TripoTaskType = Field(TripoTaskType.TEXTURE_MODEL, description="Type of task")
original_model_task_id: str = Field(..., description="The task ID of the original model")
@ -219,6 +223,11 @@ class TripoTextureModelRequest(BaseModel):
texture_alignment: TripoTextureAlignment | None = Field(
TripoTextureAlignment.ORIGINAL_IMAGE, description="The texture alignment method"
)
texture_prompt: TripoTexturePrompt | None = Field(
None,
description="Optional guidance for texturing. Required in practice for imported models, "
"which carry no source image to infer texture from.",
)
class TripoRefineModelRequest(BaseModel):
@ -307,6 +316,17 @@ class TripoP1MultiviewToModelRequest(TripoP1CommonRequest):
orientation: str | None = None
class TripoImportModelRequest(BaseModel):
    """Request for the comfy-api composite import endpoint (/proxy/tripo/v2/openapi/import).

    The model file is uploaded to ComfyUI API storage first; the backend downloads it from
    `url`, re-uploads it to Tripo's storage and creates the import_model task server-side.
    """

    url: str = Field(..., description="ComfyUI API storage download URL of the model file")
    # Typed as a plain string; the accepted formats are listed only in the description.
    format: str = Field(..., description='File format: "glb", "fbx", "obj" or "stl"')
class TripoTaskOutput(BaseModel):
model: str | None = Field(None, description="URL to the model")
base_model: str | None = Field(None, description="URL to the base model")

View File

@ -289,7 +289,7 @@ class BriaRemoveVideoBackground(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
expr="""{"type":"usd","usd":0.0042,"format":{"suffix":"/second"}}""",
),
)
@ -357,7 +357,7 @@ class BriaVideoGreenScreen(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
expr="""{"type":"usd","usd":0.0042,"format":{"suffix":"/second"}}""",
),
)
@ -433,7 +433,7 @@ class BriaVideoReplaceBackground(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
expr="""{"type":"usd","usd":0.0042,"format":{"suffix":"/second"}}""",
),
)
@ -452,7 +452,10 @@ class BriaVideoReplaceBackground(IO.ComfyNode):
validate_video_duration(background_video, max_duration=60.0)
background_url = await upload_video_to_comfyapi(cls, background_video, wait_label="Uploading background")
else:
background_url = await upload_image_to_comfyapi(cls, background_image, wait_label="Uploading background")
# Bria's replace_background 500s on RGBA, so drop the alpha channel before upload.
background_url = await upload_image_to_comfyapi(
cls, background_image[:, :, :, :3], wait_label="Uploading background"
)
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/bria/v2/video/edit/replace_background", method="POST"),
@ -530,7 +533,7 @@ class BriaTransparentVideoBackground(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
expr="""{"type":"usd","usd":0.14,"format":{"suffix":"/second"}}""",
expr="""{"type":"usd","usd":0.0042,"format":{"suffix":"/second"}}""",
),
)
@ -571,7 +574,7 @@ class BriaExtension(ComfyExtension):
BriaRemoveImageBackground,
BriaRemoveVideoBackground,
BriaVideoGreenScreen,
# BriaVideoReplaceBackground, # server returns Status 500 when we pass background video
BriaVideoReplaceBackground,
BriaTransparentVideoBackground,
]

View File

@ -60,6 +60,12 @@ from comfy_api_nodes.apis.kling import (
OmniProImageRequest,
OmniProReferences2VideoRequest,
OmniProText2VideoRequest,
Kling3TurboSettings,
Kling3TurboText2VideoRequest,
Kling3TurboContent,
Kling3TurboImage2VideoRequest,
Kling3TurboCreateResponse,
Kling3TurboQueryResponse,
TaskStatusResponse,
TextToVideoWithAudioRequest,
)
@ -436,7 +442,7 @@ async def execute_text2video(
negative_prompt=negative_prompt if negative_prompt else None,
duration=KlingVideoGenDuration(duration),
mode=KlingVideoGenMode(model_mode),
model_name=KlingVideoGenModelName(model_name),
model_name=model_name,
cfg_scale=cfg_scale,
aspect_ratio=KlingVideoGenAspectRatio(aspect_ratio),
camera_control=camera_control,
@ -2847,6 +2853,67 @@ class MotionControl(IO.ComfyNode):
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
def build_turbo_shot_prompt(multi_prompt: list[MultiPromptEntry]) -> str:
    """Serialize storyboard entries into the Turbo multi-shot prompt syntax.

    Each entry becomes ``shot <index>, <duration>, <prompt>`` with 1-based
    indices and the duration truncated to an integer. Segments are joined with
    "; " and the result always ends with a trailing ";".
    """
    segments = []
    for shot_number, entry in enumerate(multi_prompt, start=1):
        seconds = int(entry.duration)
        segments.append(f"shot {shot_number}, {seconds}, {entry.prompt}")
    return "; ".join(segments) + ";"
def _turbo_video_url(response: Kling3TurboQueryResponse) -> str:
    """Return the URL of the first video output of the first task in a /tasks response.

    Only ``response.data[0]`` is inspected. Outputs whose type is not 'video',
    or whose URL is empty, are skipped.

    Raises:
        RuntimeError: If there is no task, no outputs, or no video output with a URL.
    """
    first_task = response.data[0] if response.data else None
    candidates = first_task.outputs if first_task and first_task.outputs else ()
    video_url = next((item.url for item in candidates if item.type == "video" and item.url), None)
    if video_url is None:
        raise RuntimeError(f"Kling 3.0 Turbo task finished without a video output: {response.model_dump()}")
    return video_url
async def execute_kling_turbo(
    cls: type[IO.ComfyNode],
    *,
    prompt: str,
    resolution: str,
    aspect_ratio: str,
    duration: int,
    start_frame: torch.Tensor | None,
) -> IO.NodeOutput:
    """Create a Kling 3.0 Turbo task, poll it, and return the resulting video.

    With ``start_frame`` the image-to-video endpoint is used: the frame is
    validated and sent inline as a base64 string, the prompt is included only
    when non-empty, and ``aspect_ratio`` is not sent. Without ``start_frame``
    the text-to-video endpoint is used.

    Raises:
        RuntimeError: If the create response has no task ID, or the finished
            task has no video output.
    """
    if start_frame is None:
        create_path = "/proxy/kling/text-to-video/kling-3.0-turbo"
        payload = Kling3TurboText2VideoRequest(
            prompt=prompt,
            settings=Kling3TurboSettings(resolution=resolution, aspect_ratio=aspect_ratio, duration=duration),
        )
    else:
        validate_image_dimensions(start_frame, min_width=300, min_height=300)
        validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))
        first_frame = Kling3TurboContent(type="first_frame", url=tensor_to_base64_string(start_frame))
        # The prompt entry, when present, goes ahead of the first frame.
        leading = [Kling3TurboContent(type="prompt", text=prompt)] if prompt else []
        create_path = "/proxy/kling/image-to-video/kling-3.0-turbo"
        payload = Kling3TurboImage2VideoRequest(
            contents=[*leading, first_frame],
            settings=Kling3TurboSettings(resolution=resolution, duration=duration),  # i2v: no aspect_ratio
        )

    create = await sync_op(
        cls,
        ApiEndpoint(path=create_path, method="POST"),
        response_model=Kling3TurboCreateResponse,
        data=payload,
    )
    task_id = create.data.id if create.data else None
    if not task_id:
        raise RuntimeError(f"Kling 3.0 Turbo create failed. Code: {create.code}, Message: {create.message}")

    def _first_task_status(result):
        # The query endpoint returns a list of tasks; only the first is relevant.
        return result.data[0].status if result.data else None

    final_response = await poll_op(
        cls,
        ApiEndpoint(path="/proxy/kling/tasks", query_params={"task_ids": task_id}),
        response_model=Kling3TurboQueryResponse,
        status_extractor=_first_task_status,
    )
    video_url = _turbo_video_url(final_response)
    return IO.NodeOutput(await download_url_to_video_output(video_url))
class KlingVideoNode(IO.ComfyNode):
@classmethod
@ -2884,7 +2951,11 @@ class KlingVideoNode(IO.ComfyNode):
],
tooltip="Generate a series of video segments with individual prompts and durations.",
),
IO.Boolean.Input("generate_audio", default=True),
IO.Boolean.Input(
"generate_audio",
default=True,
tooltip="'kling-3.0-turbo' always generates native audio, so the audio toggle is ignored.",
),
IO.DynamicCombo.Input(
"model",
options=[
@ -2899,6 +2970,17 @@ class KlingVideoNode(IO.ComfyNode):
),
],
),
IO.DynamicCombo.Option(
"kling-3.0-turbo",
[
IO.Combo.Input("resolution", options=["1080p", "720p"], default="720p"),
IO.Combo.Input(
"aspect_ratio",
options=["16:9", "9:16", "1:1"],
tooltip="Ignored in image-to-video mode.",
),
],
),
],
tooltip="Model and generation settings.",
),
@ -2930,6 +3012,7 @@ class KlingVideoNode(IO.ComfyNode):
price_badge=IO.PriceBadge(
depends_on=IO.PriceBadgeDepends(
widgets=[
"model",
"model.resolution",
"generate_audio",
"multi_shot",
@ -2944,14 +3027,7 @@ class KlingVideoNode(IO.ComfyNode):
),
expr="""
(
$rates := {
"4k": {"off": 0.42, "on": 0.42},
"1080p": {"off": 0.112, "on": 0.168},
"720p": {"off": 0.084, "on": 0.126}
};
$res := $lookup(widgets, "model.resolution");
$audio := widgets.generate_audio ? "on" : "off";
$rate := $lookup($lookup($rates, $res), $audio);
$ms := widgets.multi_shot;
$isSb := $ms != "disabled";
$n := $isSb ? $number($substring($ms, 0, 1)) : 0;
@ -2962,7 +3038,18 @@ class KlingVideoNode(IO.ComfyNode):
$d5 := $n >= 5 ? $lookup(widgets, "multi_shot.storyboard_5_duration") : 0;
$d6 := $n >= 6 ? $lookup(widgets, "multi_shot.storyboard_6_duration") : 0;
$dur := $isSb ? $d1 + $d2 + $d3 + $d4 + $d5 + $d6 : $lookup(widgets, "multi_shot.duration");
{"type":"usd","usd": $rate * $dur}
widgets.model = "kling-3.0-turbo"
? {"type":"usd","usd": ($res = "1080p" ? 0.14 : 0.112) * $dur}
: (
$rates := {
"4k": {"off": 0.42, "on": 0.42},
"1080p": {"off": 0.112, "on": 0.168},
"720p": {"off": 0.084, "on": 0.126}
};
$audio := widgets.generate_audio ? "on" : "off";
$rate := $lookup($lookup($rates, $res), $audio);
{"type":"usd","usd": $rate * $dur}
)
)
""",
),
@ -3015,6 +3102,17 @@ class KlingVideoNode(IO.ComfyNode):
duration = multi_shot["duration"]
validate_string(multi_shot["prompt"], min_length=1, max_length=2500)
if model["model"] == "kling-3.0-turbo":
turbo_prompt = build_turbo_shot_prompt(multi_prompt_list) if custom_multi_shot else multi_shot["prompt"]
return await execute_kling_turbo(
cls,
prompt=turbo_prompt,
resolution=model["resolution"],
aspect_ratio=model["aspect_ratio"],
duration=duration,
start_frame=start_frame,
)
if start_frame is not None:
validate_image_dimensions(start_frame, min_width=300, min_height=300)
validate_image_aspect_ratio(start_frame, (1, 2.5), (2.5, 1))

View File

@ -9,6 +9,7 @@ from PIL import Image
from typing_extensions import override
import folder_paths
from comfy.utils import common_upscale
from comfy_api.latest import IO, ComfyExtension, Input
from comfy_api_nodes.apis.openai import (
InputFileContent,
@ -62,7 +63,8 @@ async def validate_and_cast_response(response, timeout: int = None) -> torch.Ten
timeout: Request timeout in seconds. Defaults to None (no timeout).
Returns:
A torch.Tensor representing the image (1, H, W, C).
A torch.Tensor of shape (N, H, W, C) with all returned images; images whose
dimensions differ from the first image's are resized to match it.
Raises:
ValueError: If the response is not valid.
@ -89,6 +91,14 @@ async def validate_and_cast_response(response, timeout: int = None) -> torch.Ten
arr = np.asarray(pil_img).astype(np.float32) / 255.0
image_tensors.append(torch.from_numpy(arr))
# With size="auto" the API can return images whose dimensions differ by a few pixels within a single response;
# resize them to the first image's dimensions so they can be stacked into one batch.
ref_h, ref_w = image_tensors[0].shape[:2]
for i, t in enumerate(image_tensors):
if t.shape[:2] != (ref_h, ref_w):
samples = t.unsqueeze(0).movedim(-1, 1)
samples = common_upscale(samples, ref_w, ref_h, "bilinear", "center")
image_tensors[i] = samples.movedim(1, -1).squeeze(0)
return torch.stack(image_tensors, dim=0)

View File

@ -30,13 +30,33 @@ from comfy_api_nodes.apis.runway import (
Model4,
ReferenceImage,
RunwayTextToImageAspectRatioEnum,
RunwayAleph2IO,
RunwayAleph2KeyframeChain,
RunwayAleph2KeyframeItem,
RunwayAleph2PromptImageChain,
RunwayAleph2PromptImageItem,
RunwayAleph2Request,
RunwayAleph2Response,
RunwayAleph2KeyframeSeconds,
RunwayAleph2KeyframeAt,
RunwayAleph2PromptImage,
RunwayAleph2TimestampPosition,
RunwayAleph2RelativePosition,
RunwayAleph2ContentModeration,
KEYFRAME_MODE_SECONDS,
KEYFRAME_MODE_AT,
PROMPT_IMAGE_MODE_TIMESTAMP,
PROMPT_IMAGE_MODE_POSITION,
)
from comfy_api_nodes.util import (
image_tensor_pair_to_batch,
validate_string,
validate_image_dimensions,
validate_image_aspect_ratio,
validate_video_duration,
upload_images_to_comfyapi,
upload_image_to_comfyapi,
upload_video_to_comfyapi,
download_url_to_video_output,
download_url_to_image_tensor,
ApiEndpoint,
@ -45,6 +65,7 @@ from comfy_api_nodes.util import (
)
PATH_IMAGE_TO_VIDEO = "/proxy/runway/image_to_video"
PATH_VIDEO_TO_VIDEO = "/proxy/runway/video_to_video"
PATH_TEXT_TO_IMAGE = "/proxy/runway/text_to_image"
PATH_GET_TASK_STATUS = "/proxy/runway/tasks"
@ -53,12 +74,6 @@ AVERAGE_DURATION_FLF_SECONDS = 256
AVERAGE_DURATION_T2I_SECONDS = 41
class RunwayApiError(Exception):
"""Base exception for Runway API errors."""
pass
class RunwayGen4TurboAspectRatio(str, Enum):
"""Aspect ratios supported for Image to Video API when using gen4_turbo model."""
@ -84,14 +99,6 @@ def get_video_url_from_task_status(response: TaskStatusResponse) -> str | None:
return None
def extract_progress_from_task_status(
response: TaskStatusResponse,
) -> float | None:
if hasattr(response, "progress") and response.progress is not None:
return response.progress * 100
return None
def get_image_url_from_task_status(response: TaskStatusResponse) -> str | None:
"""Returns the image URL from the task status response if it exists."""
if hasattr(response, "output") and len(response.output) > 0:
@ -102,14 +109,13 @@ def get_image_url_from_task_status(response: TaskStatusResponse) -> str | None:
async def get_response(
cls: type[IO.ComfyNode], task_id: str, estimated_duration: int | None = None
) -> TaskStatusResponse:
"""Poll the task status until it is finished then get the response."""
return await poll_op(
cls,
ApiEndpoint(path=f"{PATH_GET_TASK_STATUS}/{task_id}"),
response_model=TaskStatusResponse,
status_extractor=lambda r: r.status.value,
status_extractor=lambda r: r.status,
estimated_duration=estimated_duration,
progress_extractor=extract_progress_from_task_status,
progress_extractor=lambda r: r.progress * 100 if r.progress is not None else None,
)
@ -127,7 +133,7 @@ async def generate_video(
final_response = await get_response(cls, initial_response.id, estimated_duration)
if not final_response.output:
raise RunwayApiError("Runway task succeeded but no video data found in response.")
raise ValueError("Runway task succeeded but no video data found in response.")
video_url = get_video_url_from_task_status(final_response)
return await download_url_to_video_output(video_url)
@ -410,7 +416,7 @@ class RunwayFirstLastFrameNode(IO.ComfyNode):
mime_type="image/png",
)
if len(download_urls) != 2:
raise RunwayApiError("Failed to upload one or more images to comfy api.")
raise ValueError("Failed to upload one or more images to comfy api.")
return IO.NodeOutput(
await generate_video(
@ -514,11 +520,321 @@ class RunwayTextToImageNode(IO.ComfyNode):
estimated_duration=AVERAGE_DURATION_T2I_SECONDS,
)
if not final_response.output:
raise RunwayApiError("Runway task succeeded but no image data found in response.")
raise ValueError("Runway task succeeded but no image data found in response.")
return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_task_status(final_response)))
# Option labels for the timing/position DynamicCombo widgets. These strings are also the values
# compared against in the nodes' execute() methods, so changing them changes the selected branch.
_TIMING_ABSOLUTE = "Absolute time (seconds)"
_TIMING_FRACTION = "Fraction of duration (0.0-1.0)"
class RunwayAleph2KeyframeNode(IO.ComfyNode):
    """Builds a chain of Aleph2 keyframes (guidance images anchored to the input video).

    The node makes no API call: it copies the incoming chain, if any, and appends one item.
    """

    @classmethod
    def define_schema(cls):
        """Declare the image, timing selector, and optional upstream chain inputs."""
        return IO.Schema(
            node_id="RunwayAleph2KeyframeNode",
            display_name="Runway Aleph2 Keyframe",
            category="partner/video/Runway",
            description="Anchor a guidance image to a moment of the input (source) video, so Aleph2 "
            "steers the edit at that point of your footage. Connect this to the 'keyframes' input of "
            "the Runway Aleph2 Video to Video node; chain several together (up to 5) via the optional "
            "'keyframes' input below.",
            inputs=[
                IO.Image.Input(
                    "image",
                    tooltip="The guidance image to apply at the chosen moment of the input video.",
                ),
                IO.DynamicCombo.Input(
                    "timing",
                    options=[
                        IO.DynamicCombo.Option(
                            _TIMING_ABSOLUTE,
                            [
                                IO.Float.Input(
                                    "seconds",
                                    default=0.0,
                                    min=0.0,
                                    max=30.0,
                                    step=0.1,
                                    display_mode=IO.NumberDisplay.number,
                                    tooltip="Time in seconds from start of the input video where this image applies.",
                                ),
                            ],
                        ),
                        IO.DynamicCombo.Option(
                            _TIMING_FRACTION,
                            [
                                IO.Float.Input(
                                    "fraction",
                                    default=0.0,
                                    min=0.0,
                                    max=1.0,
                                    step=0.01,
                                    display_mode=IO.NumberDisplay.number,
                                    tooltip="Where in the input video this image applies, "
                                    "as a fraction of its duration (0.0 = start, 1.0 = end).",
                                ),
                            ],
                        ),
                    ],
                    tooltip="How to place this image on the input video's timeline.",
                ),
                IO.Custom(RunwayAleph2IO.KEYFRAME).Input(
                    "keyframes",
                    optional=True,
                    tooltip="Optional earlier keyframes to chain with this one.",
                ),
            ],
            outputs=[IO.Custom(RunwayAleph2IO.KEYFRAME).Output(display_name="keyframes")],
        )

    @classmethod
    def execute(
        cls,
        image: Input.Image,
        timing: dict,
        keyframes: RunwayAleph2KeyframeChain | None = None,
    ) -> IO.NodeOutput:
        """Append one keyframe to a copy of the incoming chain and return the new chain.

        Args:
            image: Guidance image for this keyframe.
            timing: Selected timing option under key "timing", plus that option's value
                under "seconds" or "fraction".
            keyframes: Upstream chain to extend; a new empty chain is used when None.
        """
        # Clone so the upstream chain object is never mutated.
        chain = keyframes.clone() if keyframes is not None else RunwayAleph2KeyframeChain()
        if timing["timing"] == _TIMING_ABSOLUTE:
            mode, value = KEYFRAME_MODE_SECONDS, float(timing["seconds"])
        else:
            # Any selection other than the absolute label is treated as the fraction option.
            mode, value = KEYFRAME_MODE_AT, float(timing["fraction"])
        chain.add(RunwayAleph2KeyframeItem(image=image, mode=mode, value=value))
        return IO.NodeOutput(chain)
class RunwayAleph2PromptImageNode(IO.ComfyNode):
    """Builds a chain of Aleph2 prompt images (guidance images anchored to the output video).

    The node makes no API call: it copies the incoming chain, if any, and appends one item.
    """

    @classmethod
    def define_schema(cls):
        """Declare the image, position selector, and optional upstream chain inputs."""
        return IO.Schema(
            node_id="RunwayAleph2PromptImageNode",
            display_name="Runway Aleph2 Prompt Image",
            category="partner/video/Runway",
            description="Anchor a guidance image to a moment of the output (result) video, to guide what "
            "the edited video looks like at that point. Connect this to the 'prompt_images' input of the "
            "Runway Aleph2 Video to Video node; chain several together (up to 5) via the optional "
            "'prompt_images' input below.",
            inputs=[
                IO.Image.Input(
                    "image",
                    tooltip="The guidance image to place at the chosen moment of the output video.",
                ),
                IO.DynamicCombo.Input(
                    "position",
                    options=[
                        IO.DynamicCombo.Option(
                            _TIMING_ABSOLUTE,
                            [
                                IO.Float.Input(
                                    "seconds",
                                    default=0.0,
                                    min=0.0,
                                    max=30.0,
                                    step=0.1,
                                    display_mode=IO.NumberDisplay.number,
                                    tooltip="Time in seconds from start of the output video where this image applies.",
                                ),
                            ],
                        ),
                        IO.DynamicCombo.Option(
                            _TIMING_FRACTION,
                            [
                                IO.Float.Input(
                                    "fraction",
                                    default=0.0,
                                    min=0.0,
                                    max=1.0,
                                    step=0.01,
                                    display_mode=IO.NumberDisplay.number,
                                    tooltip="Where in the output video this image applies, "
                                    "as a fraction of its duration (0.0 = start, 1.0 = end).",
                                ),
                            ],
                        ),
                    ],
                    tooltip="How to place this image on the output video's timeline.",
                ),
                IO.Custom(RunwayAleph2IO.PROMPT_IMAGE).Input(
                    "prompt_images",
                    optional=True,
                    tooltip="Optional earlier prompt images to chain with this one.",
                ),
            ],
            outputs=[IO.Custom(RunwayAleph2IO.PROMPT_IMAGE).Output(display_name="prompt_images")],
        )

    @classmethod
    def execute(
        cls,
        image: Input.Image,
        position: dict,
        prompt_images: RunwayAleph2PromptImageChain | None = None,
    ) -> IO.NodeOutput:
        """Append one prompt image to a copy of the incoming chain and return the new chain.

        Args:
            image: Guidance image for this entry.
            position: Selected option under key "position", plus that option's value
                under "seconds" or "fraction".
            prompt_images: Upstream chain to extend; a new empty chain is used when None.
        """
        # Clone so the upstream chain object is never mutated.
        chain = prompt_images.clone() if prompt_images is not None else RunwayAleph2PromptImageChain()
        if position["position"] == _TIMING_ABSOLUTE:
            mode, value = PROMPT_IMAGE_MODE_TIMESTAMP, float(position["seconds"])
        else:
            # Any selection other than the absolute label is treated as the fraction option.
            mode, value = PROMPT_IMAGE_MODE_POSITION, float(position["fraction"])
        chain.add(RunwayAleph2PromptImageItem(image=image, mode=mode, value=value))
        return IO.NodeOutput(chain)
class RunwayAleph2VideoToVideoNode(IO.ComfyNode):
    """Edits a video with Runway's Aleph2 model, optionally guided by keyframes or prompt images."""

    @classmethod
    def define_schema(cls):
        """Declare the prompt, video, seed, moderation, and optional guidance-chain inputs."""
        return IO.Schema(
            node_id="RunwayAleph2VideoToVideoNode",
            display_name="Runway Aleph2 Video to Video",
            category="partner/video/Runway",
            description="Edit a video with a text prompt using Runway's Aleph2 model. Aleph2 transforms "
            "your footage (restyle, relight, add or remove elements, change the viewpoint) while keeping "
            "the original motion and timing; the output resolution matches the input video, which must be "
            "2-30 seconds at 30 fps or lower. Optionally steer the edit with either keyframes (anchored to "
            "the input video) or prompt images (anchored to the output video) - use one or the other, not both.",
            inputs=[
                IO.String.Input(
                    "prompt",
                    multiline=True,
                    default="",
                    tooltip="Describes what should appear in the output (1-1000 characters).",
                ),
                IO.Video.Input(
                    "video",
                    tooltip="Input video to edit. Must be 2-30 seconds at 30 fps or lower.",
                ),
                IO.Int.Input(
                    "seed",
                    default=0,
                    min=0,
                    max=4294967295,
                    step=1,
                    control_after_generate=True,
                    display_mode=IO.NumberDisplay.number,
                    tooltip="Random seed for generation",
                ),
                IO.Combo.Input(
                    "public_figure_threshold",
                    options=["auto", "low"],
                    default="low",
                    tooltip="Content moderation for recognizable public figures.",
                ),
                IO.Custom(RunwayAleph2IO.KEYFRAME).Input(
                    "keyframes",
                    optional=True,
                    tooltip="Guidance images anchored to the input video, from Aleph2 Keyframe nodes (up to 5). "
                    "Use keyframes or prompt images, not both.",
                ),
                IO.Custom(RunwayAleph2IO.PROMPT_IMAGE).Input(
                    "prompt_images",
                    optional=True,
                    tooltip="Guidance images anchored to the output video, from Aleph2 Prompt Image nodes (up to 5). "
                    "Use keyframes or prompt images, not both.",
                ),
            ],
            outputs=[
                IO.Video.Output(),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=IO.PriceBadge(
                expr="""{"type":"usd","usd": 0.4004, "format":{"suffix":"/second"}}""",
            ),
        )

    @classmethod
    async def execute(
        cls,
        prompt: str,
        video: Input.Video,
        seed: int,
        public_figure_threshold: str = "low",
        keyframes: RunwayAleph2KeyframeChain | None = None,
        prompt_images: RunwayAleph2PromptImageChain | None = None,
    ) -> IO.NodeOutput:
        """Validate inputs, upload media, submit the Aleph2 task, and return the edited video.

        All local validation (prompt length, duration, frame rate, guidance counts and
        timestamps) runs before anything is uploaded, so invalid input fails fast
        without spending time or bandwidth on uploads.

        Args:
            prompt: Edit description, 1-1000 characters.
            video: Input video, 2-30 seconds at 30 fps or lower.
            seed: Random seed forwarded to the API.
            public_figure_threshold: Content-moderation setting, "auto" or "low".
            keyframes: Optional guidance images anchored to the input video (at most 5).
            prompt_images: Optional guidance images anchored to the output video (at most 5).

        Raises:
            ValueError: On invalid input, a missing task ID, or a finished task without output.
        """
        validate_string(prompt, min_length=1, max_length=1000)
        validate_video_duration(
            video,
            min_duration=2.0,
            max_duration=30.0,
        )
        # Best effort: if the frame rate cannot be read, the fps check is skipped.
        try:
            fps = float(video.get_frame_rate())
        except Exception:
            fps = None
        # Small tolerance so that nominal 30 fps sources with rounding error still pass.
        if fps is not None and fps > 30.0 + 0.01:
            raise ValueError(f"Input video frame rate ({fps:.2f} fps) exceeds Aleph2's maximum of 30 fps.")

        keyframe_items = keyframes.items if keyframes is not None else []
        prompt_image_items = prompt_images.items if prompt_images is not None else []
        if keyframe_items and prompt_image_items:
            raise ValueError("Aleph2 accepts either keyframes or prompt images, not both.")
        if len(keyframe_items) > 5:
            raise ValueError("Aleph2 supports at most 5 keyframes.")
        if len(prompt_image_items) > 5:
            raise ValueError("Aleph2 supports at most 5 prompt images.")

        # Best effort: if the duration cannot be read, absolute timestamps are not range-checked.
        video_duration: float | None = None
        try:
            video_duration = video.get_duration()
        except Exception:
            video_duration = None

        def _check_seconds(value: float, label: str) -> None:
            if video_duration is not None and value > video_duration + 0.0001:
                raise ValueError(f"{label} {value:.2f}s exceeds the input video duration ({video_duration:.2f}s).")

        # Range-check every absolute timestamp up front, before any upload happens.
        for item in keyframe_items:
            if item.mode == KEYFRAME_MODE_SECONDS:
                _check_seconds(item.value, "Keyframe timestamp")
        for item in prompt_image_items:
            if item.mode == PROMPT_IMAGE_MODE_TIMESTAMP:
                _check_seconds(item.value, "Prompt image timestamp")

        video_url = await upload_video_to_comfyapi(cls, video)

        keyframe_models: list[RunwayAleph2KeyframeSeconds | RunwayAleph2KeyframeAt] = []
        for item in keyframe_items:
            image_url = await upload_image_to_comfyapi(cls, item.image, mime_type="image/png")
            if item.mode == KEYFRAME_MODE_SECONDS:
                keyframe_models.append(RunwayAleph2KeyframeSeconds(seconds=item.value, uri=image_url))
            else:
                keyframe_models.append(RunwayAleph2KeyframeAt(at=item.value, uri=image_url))

        prompt_image_models: list[RunwayAleph2PromptImage] = []
        for item in prompt_image_items:
            image_url = await upload_image_to_comfyapi(cls, item.image, mime_type="image/png")
            position: RunwayAleph2TimestampPosition | RunwayAleph2RelativePosition
            if item.mode == PROMPT_IMAGE_MODE_TIMESTAMP:
                position = RunwayAleph2TimestampPosition(timestampSeconds=item.value)
            else:
                position = RunwayAleph2RelativePosition(positionPercentage=item.value)
            prompt_image_models.append(RunwayAleph2PromptImage(position=position, uri=image_url))

        initial_response = await sync_op(
            cls,
            endpoint=ApiEndpoint(path=PATH_VIDEO_TO_VIDEO, method="POST"),
            response_model=RunwayAleph2Response,
            data=RunwayAleph2Request(
                promptText=prompt,
                videoUri=video_url,
                seed=seed,
                contentModeration=RunwayAleph2ContentModeration(publicFigureThreshold=public_figure_threshold),
                # Empty lists are sent as None so the fields are omitted rather than empty.
                keyframes=keyframe_models or None,
                promptImage=prompt_image_models or None,
            ),
        )
        # The response model allows a missing id; polling "/tasks/None" would fail obscurely.
        if not initial_response.id:
            raise ValueError("Runway Aleph2 request did not return a task ID.")
        final_response = await get_response(cls, initial_response.id)
        if not final_response.output:
            raise ValueError("Runway task succeeded but no video data found in response.")
        return IO.NodeOutput(await download_url_to_video_output(get_video_url_from_task_status(final_response)))
class RunwayExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@ -527,6 +843,9 @@ class RunwayExtension(ComfyExtension):
RunwayImageToVideoNodeGen3a,
RunwayImageToVideoNodeGen4,
RunwayTextToImageNode,
RunwayAleph2VideoToVideoNode,
RunwayAleph2KeyframeNode,
RunwayAleph2PromptImageNode,
]

View File

@ -16,7 +16,7 @@ from comfy_api_nodes.util import (
)
from comfy_api_nodes.util._helpers import (
default_base_url,
get_auth_header,
get_comfy_api_headers,
get_node_id,
is_processing_interrupted,
)
@ -100,8 +100,7 @@ class SoniloTextToMusic(IO.ComfyNode):
node_id="SoniloTextToMusic",
display_name="Sonilo Text to Music",
category="partner/audio/Sonilo",
description="Generate music from a text prompt using Sonilo's AI model. "
"Leave duration at 0 to let the model infer it from the prompt.",
description="Generate music from a text prompt using Sonilo's AI model.",
inputs=[
IO.String.Input(
"prompt",
@ -111,11 +110,10 @@ class SoniloTextToMusic(IO.ComfyNode):
),
IO.Int.Input(
"duration",
default=0,
min=0,
default=30,
min=1,
max=360,
tooltip="Target duration in seconds. Set to 0 to let the model "
"infer the duration from the prompt. Maximum: 6 minutes.",
tooltip="Target duration in seconds. Maximum: 6 minutes.",
),
IO.Int.Input(
"seed",
@ -136,13 +134,7 @@ class SoniloTextToMusic(IO.ComfyNode):
is_api_node=True,
price_badge=IO.PriceBadge(
depends_on=IO.PriceBadgeDepends(widgets=["duration"]),
expr="""
(
widgets.duration > 0
? {"type":"usd","usd": 0.005 * widgets.duration}
: {"type":"usd","usd": 0.005, "format":{"suffix":"/second"}}
)
""",
expr='{"type":"usd","usd": 0.0025 * widgets.duration}',
),
)
@ -150,14 +142,13 @@ class SoniloTextToMusic(IO.ComfyNode):
async def execute(
cls,
prompt: str,
duration: int = 0,
duration: int = 1,
seed: int = 0,
) -> IO.NodeOutput:
validate_string(prompt, strip_whitespace=True, min_length=1)
validate_string(prompt, strip_whitespace=True, min_length=1, max_length=1000)
form = aiohttp.FormData()
form.add_field("prompt", prompt)
if duration > 0:
form.add_field("duration", str(duration))
form.add_field("duration", str(duration))
audio_bytes = await _stream_sonilo_music(
cls,
ApiEndpoint(path="/proxy/sonilo/t2m/generate", method="POST"),
@ -174,8 +165,7 @@ async def _stream_sonilo_music(
"""POST ``form`` to Sonilo, read the NDJSON stream, and return the first stream's audio bytes."""
url = urljoin(default_base_url().rstrip("/") + "/", endpoint.path.lstrip("/"))
headers: dict[str, str] = {}
headers.update(get_auth_header(cls))
headers = get_comfy_api_headers(cls)
headers.update(endpoint.headers)
node_id = get_node_id(cls)

View File

@ -1,6 +1,6 @@
from typing_extensions import override
from comfy_api.latest import IO, ComfyExtension, Input
from comfy_api.latest import IO, ComfyExtension, Input, Types
from comfy_api_nodes.apis.tripo import (
TripoAnimateRetargetRequest,
TripoAnimateRigRequest,
@ -8,6 +8,7 @@ from comfy_api_nodes.apis.tripo import (
TripoFileEmptyReference,
TripoFileReference,
TripoImageToModelRequest,
TripoImportModelRequest,
TripoModelVersion,
TripoMultiviewToModelRequest,
TripoOrientation,
@ -21,6 +22,7 @@ from comfy_api_nodes.apis.tripo import (
TripoTaskType,
TripoTextToModelRequest,
TripoTextureModelRequest,
TripoTexturePrompt,
TripoUrlReference,
)
from comfy_api_nodes.util import (
@ -28,6 +30,7 @@ from comfy_api_nodes.util import (
download_url_to_file_3d,
poll_op,
sync_op,
upload_3d_model_to_comfyapi,
upload_images_to_comfyapi,
)
@ -538,6 +541,14 @@ class TripoTextureNode(IO.ComfyNode):
optional=True,
advanced=True,
),
IO.String.Input(
"texture_prompt",
default="",
multiline=True,
optional=True,
tooltip="Optional text guidance for texturing. Required in practice for imported "
"models (Tripo: Import Model), which carry no source image to infer colors from.",
),
],
outputs=[
IO.String.Output(display_name="model_file"), # for backward compatibility only
@ -571,6 +582,7 @@ class TripoTextureNode(IO.ComfyNode):
texture_seed: int | None = None,
texture_quality: str | None = None,
texture_alignment: str | None = None,
texture_prompt: str = "",
) -> IO.NodeOutput:
response = await sync_op(
cls,
@ -583,6 +595,7 @@ class TripoTextureNode(IO.ComfyNode):
texture_seed=texture_seed,
texture_quality=texture_quality,
texture_alignment=texture_alignment,
texture_prompt=TripoTexturePrompt(text=texture_prompt.strip()) if texture_prompt.strip() else None,
),
)
return await poll_until_finished(cls, response, average_duration=80)
@ -915,6 +928,90 @@ class TripoConversionNode(IO.ComfyNode):
return await poll_until_finished(cls, response, average_duration=30)
class TripoImportModelNode(IO.ComfyNode):
    """Upload an external 3D model to Tripo and expose the resulting task id as MODEL_TASK_ID."""

    # Lower-case extensions (without the dot) accepted by execute().
    SUPPORTED_FORMATS = ("glb", "fbx", "obj", "stl")

    @classmethod
    def define_schema(cls):
        """Build the node schema: one 3D model input and one MODEL_TASK_ID output."""
        model_input = IO.MultiType.Input(
            "model_3d",
            types=[IO.File3DGLB, IO.File3DFBX, IO.File3DOBJ, IO.File3DSTL, IO.File3DAny],
            tooltip="3D model to import (GLB / FBX / OBJ / STL, up to 150 MB). "
            "OBJ and STL files carry no embedded textures.",
        )
        task_output = IO.Custom("MODEL_TASK_ID").Output(display_name="model task_id")
        return IO.Schema(
            node_id="TripoImportModelNode",
            display_name="Tripo: Import Model",
            category="partner/3d/Tripo",
            description="Import an external 3D model (e.g. from Rodin, Hunyuan3D or a local file) into Tripo "
            "to use it with Tripo's post-processing nodes: Texture, Rig, Convert. "
            "GLB is recommended: textures survive import only when embedded in the file. "
            "Note that texturing an imported model requires a texture prompt.",
            inputs=[model_input],
            outputs=[task_output],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=IO.PriceBadge(
                expr='{"type":"text","text":"Free"}',
            ),
        )

    @classmethod
    async def execute(cls, model_3d: Types.File3D) -> IO.NodeOutput:
        """Validate, upload and import ``model_3d``, then wait for the import task.

        Raises:
            ValueError: for ``.gltf`` input, for a format outside
                ``SUPPORTED_FORMATS``, or for a file larger than 150 MB.
            RuntimeError: when the import request returns a non-zero code or
                the polled task does not end in ``SUCCESS``.
        """
        # Normalise the reported format: tolerate a missing value, a leading dot and mixed case.
        file_format = (model_3d.format or "").lstrip(".").lower()
        if file_format == "gltf":
            raise ValueError(
                "GLTF (.gltf) references external files and cannot be imported. Export a single-file GLB instead."
            )
        if file_format not in cls.SUPPORTED_FORMATS:
            supported = ", ".join(f.upper() for f in cls.SUPPORTED_FORMATS)
            raise ValueError(
                f"Unsupported 3D format '{file_format or 'unknown'}'. "
                f"Tripo import supports: {supported}."
            )

        # Enforce the size limit before uploading anything.
        mebibyte = 1024 * 1024
        size = len(model_3d.get_bytes())
        if size > 150 * mebibyte:
            raise ValueError(f"Model file is {size / mebibyte:.1f} MB; Tripo import allows up to 150 MB.")

        url = await upload_3d_model_to_comfyapi(cls, model_3d, file_format)
        import_request = TripoImportModelRequest(url=url, format=file_format)
        response = await sync_op(
            cls,
            endpoint=ApiEndpoint(path="/proxy/tripo/v2/openapi/import", method="POST"),
            response_model=TripoTaskResponse,
            data=import_request,
        )
        if response.code != 0:
            raise RuntimeError(f"Failed to import model: {response.error}")

        task_id = response.data.task_id
        terminal_failures = [
            TripoTaskStatus.FAILED,
            TripoTaskStatus.CANCELLED,
            TripoTaskStatus.UNKNOWN,
            TripoTaskStatus.BANNED,
            TripoTaskStatus.EXPIRED,
        ]
        response_poll = await poll_op(
            cls,
            poll_endpoint=ApiEndpoint(path=f"/proxy/tripo/v2/openapi/task/{task_id}"),
            response_model=TripoTaskResponse,
            failed_statuses=terminal_failures,
            status_extractor=lambda task: task.data.status,
            progress_extractor=lambda task: task.data.progress,
            estimated_duration=10,
        )
        if response_poll.data.status != TripoTaskStatus.SUCCESS:
            raise RuntimeError(f"Failed to import model: {response_poll}")
        return IO.NodeOutput(task_id)
def _p1_price_expr(*, geometry_credits: int, textured_credits: int, detailed_credits: int) -> str:
return (
"("
@ -1292,6 +1389,7 @@ class TripoExtension(ComfyExtension):
TripoP1TextToModelNode,
TripoP1ImageToModelNode,
TripoP1MultiviewToModelNode,
TripoImportModelNode,
TripoTextureNode,
TripoRefineNode,
TripoRigNode,

View File

@ -9,6 +9,7 @@ from io import BytesIO
from yarl import URL
from comfy.cli_args import args
from comfy.deploy_environment import get_deploy_environment
from comfy.model_management import processing_interrupted
from comfy_api.latest import IO
@ -35,6 +36,30 @@ def get_auth_header(node_cls: type[IO.ComfyNode]) -> dict[str, str]:
return {}
def get_usage_source(node_cls: type[IO.ComfyNode]) -> str:
    """Return the usage source recorded for the prompt that triggered this API node.

    Reads ``node_cls.hidden.comfy_usage_source``. When that value is falsy
    (the submitting client did not identify itself, i.e. a direct API call to
    this server) the fallback ``"comfyui-api"`` is returned instead.
    """
    declared_source = node_cls.hidden.comfy_usage_source
    if declared_source:
        return declared_source
    return "comfyui-api"
def get_comfy_api_headers(node_cls: type[IO.ComfyNode]) -> dict[str, str]:
    """Build the header set shared by Comfy API requests.

    The result holds the auth header(s) from ``get_auth_header``, plus
    ``Comfy-Env`` (deploy environment) and ``Comfy-Usage-Source`` (usage
    source). Keeping the shared set here means new common headers are added
    in one place.

    Intended for relative/cloud URLs resolved against ``default_base_url()``.
    Because the result includes auth, callers must not attach it to arbitrary
    absolute/presigned URLs.
    """
    headers = dict(get_auth_header(node_cls))
    headers["Comfy-Env"] = get_deploy_environment()
    headers["Comfy-Usage-Source"] = get_usage_source(node_cls)
    return headers
def default_base_url() -> str:
    """Return ``args.comfy_api_base`` when set, otherwise the public Comfy API URL."""
    fallback_url = "https://api.comfy.org"
    return getattr(args, "comfy_api_base", fallback_url)

View File

@ -19,12 +19,10 @@ from comfy import utils
from comfy_api.latest import IO
from server import PromptServer
from comfy.deploy_environment import get_deploy_environment
from . import request_logger
from ._helpers import (
default_base_url,
get_auth_header,
get_comfy_api_headers,
get_node_id,
is_processing_interrupted,
sleep_with_interrupt,
@ -645,8 +643,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
payload_headers = {"Accept": "*/*"} if expect_binary else {"Accept": "application/json"}
if not parsed_url.scheme and not parsed_url.netloc: # is URL relative?
payload_headers.update(get_auth_header(cfg.node_cls))
payload_headers["Comfy-Env"] = get_deploy_environment()
payload_headers.update(get_comfy_api_headers(cfg.node_cls))
if cfg.endpoint.headers:
payload_headers.update(cfg.endpoint.headers)

View File

@ -17,7 +17,7 @@ from folder_paths import get_output_directory
from . import request_logger
from ._helpers import (
default_base_url,
get_auth_header,
get_comfy_api_headers,
is_processing_interrupted,
sleep_with_interrupt,
to_aiohttp_url,
@ -64,7 +64,7 @@ async def download_url_to_bytesio(
if cls is None:
raise ValueError("For relative 'cloud' paths, the `cls` parameter is required.")
url = urljoin(default_base_url().rstrip("/") + "/", url.lstrip("/"))
headers = get_auth_header(cls)
headers = get_comfy_api_headers(cls)
while True:
attempt += 1

View File

@ -0,0 +1,66 @@
"""Enrich executed-node output entries with asset id."""
import logging
import os
def enrich_output_with_assets(output_ui: dict) -> dict:
    """Register file-type output entries as assets and attach their ``id``.

    When ``--enable-assets`` is off, ``output_ui`` is returned as-is.
    Otherwise a new dict is built: non-list values are carried over
    unchanged, and each list is rebuilt entry by entry. An entry is enriched
    only when it is a dict with ``filename`` and ``type`` whose resolved path
    stays inside the type's base directory and points at an existing file.
    Failures are logged per entry and never propagate, so one bad entry
    cannot block execution or the remaining entries.
    """
    from comfy.cli_args import args

    if not args.enable_assets:
        return output_ui

    import folder_paths
    from app.assets.services.ingest import register_file_in_place, DependencyMissingError

    def _enrich_entry(item):
        """Return ``item`` with an asset id when it can be registered, else ``item`` as given."""
        if not isinstance(item, dict) or "filename" not in item or "type" not in item:
            return item
        try:
            root = folder_paths.get_directory_by_type(item["type"])
            if root is None:
                return item

            root_abs = os.path.abspath(root)
            target = os.path.abspath(os.path.join(root_abs, item.get("subfolder") or "", item["filename"]))

            # commonpath raises ValueError for unrelated paths (e.g. different
            # drives); treat that the same as a path outside the base directory.
            try:
                inside_base = os.path.commonpath([root_abs, target]) == root_abs
            except ValueError:
                inside_base = False
            if not inside_base:
                logging.warning("Asset enrichment skipped (path escapes base): %s", item.get("filename"))
                return item

            if not os.path.isfile(target):
                return item

            # Register unconditionally: the file was just produced, and
            # register_file_in_place re-hashes so an overwritten path can
            # never carry a stale id.
            registered = register_file_in_place(
                abs_path=target,
                name=item["filename"],
                tags=[item["type"]],
            )
            # Copy before adding the id so the caller's entry is not mutated.
            item = dict(item)
            item["id"] = registered.ref.id
        except DependencyMissingError:
            logging.warning("Asset enrichment skipped (blake3 not available): %s", item.get("filename"))
        except Exception:
            logging.warning("Failed to enrich output entry with asset id: %s", item.get("filename"), exc_info=True)
        return item

    enriched = {}
    for key, entries in output_ui.items():
        if isinstance(entries, list):
            enriched[key] = [_enrich_entry(item) for item in entries]
        else:
            enriched[key] = entries
    return enriched

View File

@ -3,6 +3,7 @@ Job utilities for the /api/jobs endpoint.
Provides normalization and helper functions for job status tracking.
"""
import uuid
from typing import Optional
from comfy_api.internal import prune_dict
@ -19,6 +20,25 @@ class JobStatus:
ALL = [PENDING, IN_PROGRESS, COMPLETED, FAILED, CANCELLED]
def validate_job_id(value) -> str:
    """Validate a client-supplied job (prompt) id.

    Job ids must be UUIDs in the canonical lowercase hyphenated form. The id
    is stored and compared verbatim everywhere downstream (history keys,
    websocket events, and /interrupt matching), so accepting another spelling
    would silently rewrite the client's id and then miss every exact-match
    lookup. Rejecting loudly beats that.

    Returns the id unchanged. Raises ValueError when the value is not a
    string in canonical UUID form.
    """
    if isinstance(value, str):
        # uuid.UUID itself raises ValueError for text it cannot parse;
        # round-tripping through str() then rejects non-canonical spellings.
        canonical = str(uuid.UUID(value))
        if canonical == value:
            return value
        raise ValueError("job id must be a UUID in canonical lowercase hyphenated form")
    raise ValueError(f"job id must be a string, got {type(value).__name__}")
# Media types that can be previewed in the frontend
PREVIEWABLE_MEDIA_TYPES = frozenset({'images', 'video', 'audio', '3d', 'text'})

View File

@ -11,7 +11,7 @@ class TextEncodeAceStepAudio(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="TextEncodeAceStepAudio",
category="model/conditioning",
category="model/conditioning/ace",
inputs=[
IO.Clip.Input("clip"),
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
@ -33,7 +33,7 @@ class TextEncodeAceStepAudio15(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="TextEncodeAceStepAudio1.5",
category="model/conditioning",
category="model/conditioning/ace",
inputs=[
IO.Clip.Input("clip"),
IO.String.Input("tags", multiline=True, dynamic_prompts=True),
@ -67,7 +67,7 @@ class EmptyAceStepLatentAudio(IO.ComfyNode):
return IO.Schema(
node_id="EmptyAceStepLatentAudio",
display_name="Empty Ace Step 1.0 Latent Audio",
category="model/latent/audio",
category="model/latent/ace",
inputs=[
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
IO.Int.Input(
@ -90,7 +90,7 @@ class EmptyAceStep15LatentAudio(IO.ComfyNode):
return IO.Schema(
node_id="EmptyAceStep1.5LatentAudio",
display_name="Empty Ace Step 1.5 Latent Audio",
category="model/latent/audio",
category="model/latent/ace",
inputs=[
IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
IO.Int.Input(
@ -111,8 +111,8 @@ class ReferenceAudio(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="ReferenceTimbreAudio",
display_name="Reference Audio",
category="advanced/conditioning/audio",
display_name="Set Reference Audio",
category="model/conditioning",
is_experimental=True,
description="This node sets the reference audio for ace step 1.5",
inputs=[

View File

@ -16,7 +16,7 @@ class APG(io.ComfyNode):
return io.Schema(
node_id="APG",
display_name="Adaptive Projected Guidance",
category="model/sampling/custom_sampling",
category="model/sampling/custom",
inputs=[
io.Model.Input("model"),
io.Float.Input(

View File

@ -19,7 +19,7 @@ class EmptyARVideoLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="EmptyARVideoLatent",
category="model/latent/video",
category="model/latent/autoregressive",
inputs=[
io.Int.Input("width", default=832, min=16, max=8192, step=16),
io.Int.Input("height", default=480, min=16, max=8192, step=16),
@ -85,7 +85,7 @@ class ARVideoI2V(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ARVideoI2V",
category="model/conditioning/video_models",
category="model/conditioning/autoregressive",
inputs=[
io.Model.Input("model"),
io.Vae.Input("vae"),

View File

@ -16,7 +16,7 @@ class EmptyLatentAudio(IO.ComfyNode):
return IO.Schema(
node_id="EmptyLatentAudio",
display_name="Empty Latent Audio",
category="model/latent/audio",
category="model/latent",
essentials_category="Audio",
inputs=[
IO.Float.Input("seconds", default=47.6, min=1.0, max=1000.0, step=0.1),
@ -41,7 +41,7 @@ class ConditioningStableAudio(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="ConditioningStableAudio",
category="model/conditioning",
category="model/conditioning/stable audio",
inputs=[
IO.Conditioning.Input("positive"),
IO.Conditioning.Input("negative"),
@ -70,7 +70,7 @@ class VAEEncodeAudio(IO.ComfyNode):
node_id="VAEEncodeAudio",
search_aliases=["audio to latent"],
display_name="VAE Encode Audio",
category="model/latent/audio",
category="model/latent",
inputs=[
IO.Audio.Input("audio"),
IO.Vae.Input("vae"),
@ -115,7 +115,7 @@ class VAEDecodeAudio(IO.ComfyNode):
node_id="VAEDecodeAudio",
search_aliases=["latent to audio"],
display_name="VAE Decode Audio",
category="model/latent/audio",
category="model/latent",
inputs=[
IO.Latent.Input("samples"),
IO.Vae.Input("vae"),
@ -137,7 +137,7 @@ class VAEDecodeAudioTiled(IO.ComfyNode):
node_id="VAEDecodeAudioTiled",
search_aliases=["latent to audio"],
display_name="VAE Decode Audio (Tiled)",
category="model/latent/audio",
category="model/latent",
inputs=[
IO.Latent.Input("samples"),
IO.Vae.Input("vae"),

View File

@ -39,9 +39,9 @@ class BerniniConditioning(io.ComfyNode):
return io.Schema(
node_id="BerniniConditioning",
display_name="Bernini Conditioning",
category="conditioning/video_models",
category="model/conditioning/bernini",
description="Conditioning node for Bernini in-context video/image conditioning. It can be used for the following tasks: t2v (text-to-video), v2v (video-to-video), rv2v (reference-guided video editing), r2v (reference-to-video), ads2v (insert image/video into video)."
"Reference images injected as in-context tokens (r2v, rv2v) are encoded independently at their own native aspect ratio (long edge capped at ref_max_size)",
"Reference images injected as in-context tokens (r2v, rv2v) are encoded independently at their own native aspect ratio (long edge capped at ref_max_size)",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -50,14 +50,11 @@ class BerniniConditioning(io.ComfyNode):
io.Int.Input("height", default=480, min=16, max=8192, step=16),
io.Int.Input("length", default=81, min=1, max=8192, step=4),
io.Int.Input("batch_size", default=1, min=1, max=4096),
io.Image.Input("source_video", optional=True, tooltip=(
"Source video to edit or restyle (v2v, rv2v). Resized to width/height and trimmed to length.")),
io.Image.Input("reference_video", optional=True, tooltip=(
"Video to insert into the source video (ads2v).")),
io.Image.Input("source_video", optional=True, tooltip=("Source video to edit or restyle (v2v, rv2v). Resized to width/height and trimmed to length.")),
io.Image.Input("reference_video", optional=True, tooltip=("Video to insert into the source video (ads2v).")),
io.Autogrow.Input("reference_images", optional=True,
template=io.Autogrow.TemplatePrefix(
input=io.Image.Input("reference_image", tooltip=(
"Reference image injected as an in-context token (r2v, rv2v).")),
input=io.Image.Input("reference_image", tooltip=("Reference image injected as an in-context token (r2v, rv2v).")),
prefix="reference_image_", min=0, max=8)),
io.Int.Input("ref_max_size", default=848, min=16, max=8192, step=16, optional=True, tooltip=(
"Max size for the long edge of reference_video and reference_images. Resized with preserved aspect ratio and snapped to 16px.")),
@ -70,10 +67,8 @@ class BerniniConditioning(io.ComfyNode):
)
@classmethod
def execute(cls, positive, negative, vae, width, height, length, batch_size,
source_video=None, reference_video=None, reference_images=None, ref_max_size=848) -> io.NodeOutput:
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8],
device=comfy.model_management.intermediate_device())
def execute(cls, positive, negative, vae, width, height, length, batch_size, source_video=None, reference_video=None, reference_images=None, ref_max_size=848) -> io.NodeOutput:
latent = torch.zeros([batch_size, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
# source_video (1), reference_video (2), reference_images (3, 4, ...).
context = []
@ -106,9 +101,7 @@ class BerniniConditioning(io.ComfyNode):
class BerniniExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [
BerniniConditioning,
]
return [BerniniConditioning,]
async def comfy_entrypoint() -> BerniniExtension:

View File

@ -0,0 +1,97 @@
import math
import node_helpers
import comfy.utils
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
class TextEncodeBooguEdit(io.ComfyNode):
    """Conditioning node for Boogu-Image Edit.

    Each edit image is used in two ways, following the reference pipeline:
      - as vision input passed to the tokenizer (positive conditioning only);
      - as a VAE-encoded reference latent (positive and negative conditioning).

    Because the reference latent is on both sides it cancels under CFG, which
    preserves identity; the vision tokens are only on the positive side, so
    CFG amplifies the instruction.

    Per the original author's note, the tokenizer picks the right system
    prompt on its own (image -> TI2I, empty negative -> DROP), so no template
    plumbing is needed here.
    """

    @classmethod
    def define_schema(cls):
        """Declare inputs (clip, prompts, vae, up to 16 images) and the two conditioning outputs."""
        image_slots = io.Autogrow.Input(
            "images",
            template=io.Autogrow.TemplateNames(
                io.Image.Input("image"),
                names=[f"image_{i}" for i in range(1, 17)],
                min=0,
            ),
            tooltip="Reference image(s) to edit. Boogu focuses on one reference per sample; more are allowed.",
        )
        return io.Schema(
            node_id="TextEncodeBooguEdit",
            category="model/conditioning/boogu",
            inputs=[
                io.Clip.Input("clip"),
                io.String.Input("prompt", multiline=True, dynamic_prompts=True),
                io.String.Input("negative_prompt", multiline=True, dynamic_prompts=True, advanced=True),
                io.Vae.Input("vae"),
                image_slots,
            ],
            outputs=[
                io.Conditioning.Output(display_name="positive"),
                io.Conditioning.Output(display_name="negative"),
            ],
        )

    @classmethod
    def execute(cls, clip, prompt, negative_prompt, vae=None, images: io.Autogrow.Type = None) -> io.NodeOutput:
        """Encode both prompts and attach reference latents built from ``images``."""
        vlm_area = int(384 * 384)
        latent_area = int(1024 * 1024)

        image_map = images or {}
        vision_inputs = []
        reference_latents = []

        # Visit the slots in numeric order of their "image_<n>" suffix.
        for slot in sorted(image_map, key=lambda n: int(n.rsplit("_", 1)[-1])):
            picture = image_map[slot]
            if picture is None:
                continue
            # Move the last axis to position 1 for common_upscale; moved back after resizing.
            pixels = picture.movedim(-1, 1)
            src_h, src_w = pixels.shape[2], pixels.shape[3]

            # Vision tower input: the reference caps the VLM image at 384x384
            # (max_vlm_input_pil_pixels in pipeline_boogu.py).
            vl_scale = math.sqrt(vlm_area / (src_w * src_h))
            vl_w = round(src_w * vl_scale)
            vl_h = round(src_h * vl_scale)
            vl_pixels = comfy.utils.common_upscale(pixels, vl_w, vl_h, "area", "disabled")
            vision_inputs.append(vl_pixels.movedim(1, -1)[:, :, :, :3])

            if vae is None:
                continue
            # Reference latent: align to 16 px (VAE /8 * patch_size 2).
            ref_scale = math.sqrt(latent_area / (src_w * src_h))
            ref_w = round(src_w * ref_scale / 16.0) * 16
            ref_h = round(src_h * ref_scale / 16.0) * 16
            ref_pixels = comfy.utils.common_upscale(pixels, ref_w, ref_h, "area", "disabled")
            reference_latents.append(vae.encode(ref_pixels.movedim(1, -1)[:, :, :, :3]))

        # positive: instruction + vision tokens; negative: text only (no vision).
        positive = clip.encode_from_tokens_scheduled(clip.tokenize(prompt, images=vision_inputs))
        negative = clip.encode_from_tokens_scheduled(clip.tokenize(negative_prompt))

        # The reference latents go on both sides.
        if reference_latents:
            positive = node_helpers.conditioning_set_values(
                positive, {"reference_latents": reference_latents}, append=True
            )
            negative = node_helpers.conditioning_set_values(
                negative, {"reference_latents": reference_latents}, append=True
            )
        return io.NodeOutput(positive, negative)
class BooguExtension(ComfyExtension):
    """Extension that contributes the Boogu edit conditioning node."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """Return the node classes provided by this extension."""
        node_classes = [TextEncodeBooguEdit]
        return node_classes
async def comfy_entrypoint() -> BooguExtension:
    """Create and return the Boogu extension instance."""
    extension = BooguExtension()
    return extension

View File

@ -153,7 +153,7 @@ class WanCameraEmbedding(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanCameraEmbedding",
category="model/conditioning/video_models",
category="model/conditioning/wan/camera",
inputs=[
io.Combo.Input(
"camera_pose",

View File

@ -13,7 +13,7 @@ class EmptyChromaRadianceLatentImage(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="EmptyChromaRadianceLatentImage",
category="model/latent/chroma_radiance",
category="model/latent/chroma radiance",
inputs=[
io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
@ -33,7 +33,7 @@ class ChromaRadianceOptions(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="ChromaRadianceOptions",
category="model/patch/chroma_radiance",
category="model/patch/chroma radiance",
description="Allows setting advanced options for the Chroma Radiance model.",
inputs=[
io.Model.Input(id="model"),

View File

@ -9,7 +9,8 @@ class CLIPTextEncodeSDXLRefiner(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CLIPTextEncodeSDXLRefiner",
category="advanced/conditioning",
display_name="CLIP Text Encode (SDXL Refiner)",
category="model/conditioning/stable diffusion",
inputs=[
io.Float.Input("ascore", default=6.0, min=0.0, max=1000.0, step=0.01),
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),
@ -30,7 +31,8 @@ class CLIPTextEncodeSDXL(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CLIPTextEncodeSDXL",
category="advanced/conditioning",
display_name="CLIP Text Encode (SDXL)",
category="model/conditioning/stable diffusion",
inputs=[
io.Clip.Input("clip"),
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),

View File

@ -66,6 +66,7 @@ class WanContextWindowsManualNode(ContextWindowsManualNode):
schema.node_id = "WanContextWindowsManual"
schema.display_name = "WAN Context Windows (Manual)"
schema.description = "Manually set context windows for WAN-like models (dim=2)."
schema.category="model/patch/wan"
schema.inputs = [
io.Model.Input("model", tooltip="The model to apply context windows to during sampling."),
io.Int.Input("context_length", min=1, max=nodes.MAX_RESOLUTION, step=4, default=81, tooltip="The length of the context window.", advanced=True),

View File

@ -9,6 +9,8 @@ class SetUnionControlNetType(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SetUnionControlNetType",
search_aliases=["set controlnet type", "union controlnet type"],
display_name="Set Union ControlNet Type",
category="model/conditioning/controlnet",
inputs=[
io.ControlNet.Input("control_net"),
@ -39,6 +41,7 @@ class ControlNetInpaintingAliMamaApply(io.ComfyNode):
return io.Schema(
node_id="ControlNetInpaintingAliMamaApply",
search_aliases=["masked controlnet"],
display_name="Apply ControlNet Inpainting (AliMama)",
category="model/conditioning/controlnet",
inputs=[
io.Conditioning.Input("positive"),

View File

@ -13,7 +13,7 @@ class EmptyCosmosLatentVideo(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="EmptyCosmosLatentVideo",
category="model/latent/video",
category="model/latent/cosmos",
inputs=[
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
@ -45,7 +45,7 @@ class CosmosImageToVideoLatent(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="CosmosImageToVideoLatent",
category="model/conditioning/inpaint",
category="model/conditioning/cosmos",
inputs=[
io.Vae.Input("vae"),
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
@ -88,7 +88,7 @@ class CosmosPredict2ImageToVideoLatent(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="CosmosPredict2ImageToVideoLatent",
category="model/conditioning/inpaint",
category="model/conditioning/cosmos",
inputs=[
io.Vae.Input("vae"),
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),

View File

@ -729,7 +729,7 @@ class SamplerCustom(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerCustom",
category="model/sampling/custom_sampling",
category="model/sampling/custom",
inputs=[
io.Model.Input("model"),
io.Boolean.Input("add_noise", default=True, advanced=True),
@ -1015,7 +1015,7 @@ class SamplerCustomAdvanced(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerCustomAdvanced",
category="model/sampling/custom_sampling",
category="model/sampling/custom",
inputs=[
io.Noise.Input("noise"),
io.Guider.Input("guider"),
@ -1143,7 +1143,7 @@ class CFGOverride(io.ComfyNode):
display_name="CFG Override",
description="Override cfg to a fixed value over a [start, end] percent (sigma) range. "
"With multiple overrides, the one nearest the sampler wins on overlap.",
category="sampling/custom_sampling",
category="model/sampling/guiders",
inputs=[
io.Model.Input("model"),
io.Float.Input("cfg", default=1.0, min=0.0, max=100.0, step=0.1, round=0.01),

View File

@ -363,7 +363,7 @@ class EasyCacheNode(io.ComfyNode):
node_id="EasyCache",
display_name="EasyCache",
description="Native EasyCache implementation.",
category="advanced/debug/model",
category="advanced/debug",
is_experimental=True,
inputs=[
io.Model.Input("model", tooltip="The model to add EasyCache to."),
@ -496,7 +496,7 @@ class LazyCacheNode(io.ComfyNode):
node_id="LazyCache",
display_name="LazyCache",
description="A homebrew version of EasyCache - even 'easier' version of EasyCache to implement. Overall works worse than EasyCache, but better in some rare cases AND universal compatibility with everything in ComfyUI.",
category="advanced/debug/model",
category="advanced/debug",
is_experimental=True,
inputs=[
io.Model.Input("model", tooltip="The model to add LazyCache to."),

View File

@ -8,7 +8,8 @@ class ReferenceLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ReferenceLatent",
category="advanced/conditioning/edit_models",
display_name="Set Reference Latent",
category="model/conditioning",
description="This node sets the guiding latent for an edit model. If the model supports it you can chain multiple to set multiple reference images.",
inputs=[
io.Conditioning.Input("conditioning"),

View File

@ -13,7 +13,7 @@ class CLIPTextEncodeFlux(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CLIPTextEncodeFlux",
category="advanced/conditioning/flux",
category="model/conditioning/flux",
inputs=[
io.Clip.Input("clip"),
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),
@ -40,7 +40,7 @@ class EmptyFlux2LatentImage(io.ComfyNode):
return io.Schema(
node_id="EmptyFlux2LatentImage",
display_name="Empty Flux 2 Latent",
category="model/latent",
category="model/latent/flux",
inputs=[
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
@ -61,7 +61,7 @@ class FluxGuidance(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="FluxGuidance",
category="advanced/conditioning/flux",
category="model/conditioning/flux",
inputs=[
io.Conditioning.Input("conditioning"),
io.Float.Input("guidance", default=3.5, min=0.0, max=100.0, step=0.1),
@ -84,7 +84,7 @@ class FluxDisableGuidance(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="FluxDisableGuidance",
category="advanced/conditioning/flux",
category="model/conditioning/flux",
description="This node completely disables the guidance embed on Flux and Flux like models",
inputs=[
io.Conditioning.Input("conditioning"),
@ -128,7 +128,7 @@ class FluxKontextImageScale(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="FluxKontextImageScale",
category="advanced/conditioning/flux",
category="model/conditioning/flux",
description="This node resizes the image to one that is more optimal for flux kontext.",
inputs=[
io.Image.Input("image"),
@ -156,7 +156,7 @@ class FluxKontextMultiReferenceLatentMethod(io.ComfyNode):
return io.Schema(
node_id="FluxKontextMultiReferenceLatentMethod",
display_name="Edit Model Reference Method",
category="advanced/conditioning/flux",
category="model/conditioning/flux",
inputs=[
io.Conditioning.Input("conditioning"),
io.Combo.Input(
@ -245,6 +245,11 @@ class KV_Attn_Input:
cache_key = "{}_{}".format(extra_options["block_type"], extra_options["block_index"])
if cache_key in self.cache:
kk, vv = self.cache[cache_key]
# Fix batch size changing.
kk = comfy.utils.repeat_to_batch_size(kk, k.shape[0])
vv = comfy.utils.repeat_to_batch_size(vv, v.shape[0])
self.set_cache = False
return {"q": q, "k": torch.cat((k, kk), dim=2), "v": torch.cat((v, vv), dim=2)}

View File

@ -11,8 +11,9 @@ class QuadrupleCLIPLoader(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="QuadrupleCLIPLoader",
category="advanced/loaders",
description="[Recipes]\n\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct",
display_name="Load CLIP (Quadruple)",
category="model/loaders",
description="Recipes:\nhidream: long clip-l, long clip-g, t5xxl, llama_8b_3.1_instruct",
inputs=[
io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")),
io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")),
@ -38,8 +39,9 @@ class CLIPTextEncodeHiDream(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CLIPTextEncodeHiDream",
display_name="CLIP Text Encode (HiDream)",
search_aliases=["hidream prompt"],
category="advanced/conditioning",
category="model/conditioning/hidream",
inputs=[
io.Clip.Input("clip"),
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),

View File

@ -14,7 +14,7 @@ class EmptyHiDreamO1LatentImage(io.ComfyNode):
return io.Schema(
node_id="EmptyHiDreamO1LatentImage",
display_name="Empty HiDream-O1 Latent Image",
category="model/latent/image",
category="model/latent/hidream",
description=(
"Empty pixel-space latent for HiDream-O1-Image. The model was "
"trained at ~4 megapixels; lower resolutions go off-distribution "
@ -47,7 +47,7 @@ class HiDreamO1ReferenceImages(io.ComfyNode):
return io.Schema(
node_id="HiDreamO1ReferenceImages",
display_name="HiDream-O1 Reference Images",
category="model/conditioning/image",
category="model/conditioning/hidream",
description=(
"Attach 1-10 reference images to conditioning, one for edit instruction"
"or multiple for subject-driven personalization."
@ -117,7 +117,7 @@ class HiDreamO1PatchSeamSmoothing(io.ComfyNode):
return io.Schema(
node_id="HiDreamO1PatchSeamSmoothing",
display_name="HiDream-O1 Patch Seam Smoothing",
category="advanced/model",
category="model/patch/hidream",
is_experimental=True,
description=(
"Average the model output across multiple shifted patch-grid "

View File

@ -14,7 +14,8 @@ class CLIPTextEncodeHunyuanDiT(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CLIPTextEncodeHunyuanDiT",
category="advanced/conditioning",
display_name="CLIP Text Encode (Hunyuan Image)",
category="model/conditioning/hunyuan image",
inputs=[
io.Clip.Input("clip"),
io.String.Input("bert", multiline=True, dynamic_prompts=True),
@ -41,7 +42,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode):
return io.Schema(
node_id="EmptyHunyuanLatentVideo",
display_name="Empty HunyuanVideo 1.0 Latent",
category="model/latent/video",
category="model/latent/hunyuan video",
inputs=[
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
@ -67,6 +68,7 @@ class EmptyHunyuanVideo15Latent(EmptyHunyuanLatentVideo):
schema = super().define_schema()
schema.node_id = "EmptyHunyuanVideo15Latent"
schema.display_name = "Empty HunyuanVideo 1.5 Latent"
schema.category = "model/latent/hunyuan video"
return schema
@classmethod
@ -81,7 +83,7 @@ class HunyuanVideo15ImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="HunyuanVideo15ImageToVideo",
category="model/conditioning/video_models",
category="model/conditioning/hunyuan video",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -132,7 +134,7 @@ class HunyuanVideo15SuperResolution(io.ComfyNode):
return io.Schema(
node_id="HunyuanVideo15SuperResolution",
display_name="Hunyuan Video 1.5 Super Resolution",
category="model/conditioning/video_models",
category="model/conditioning/hunyuan video",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -227,7 +229,7 @@ class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode):
return io.Schema(
node_id="HunyuanVideo15LatentUpscaleWithModel",
display_name="Hunyuan Video 15 Latent Upscale With Model",
category="model/latent",
category="model/latent/hunyuan video",
inputs=[
io.LatentUpscaleModel.Input("model"),
io.Latent.Input("samples"),
@ -276,7 +278,7 @@ class TextEncodeHunyuanVideo_ImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="TextEncodeHunyuanVideo_ImageToVideo",
category="advanced/conditioning",
category="model/conditioning/hunyuan video",
inputs=[
io.Clip.Input("clip"),
io.ClipVisionOutput.Input("clip_vision_output"),
@ -308,7 +310,7 @@ class HunyuanImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="HunyuanImageToVideo",
category="model/conditioning/video_models",
category="model/conditioning/hunyuan video",
inputs=[
io.Conditioning.Input("positive"),
io.Vae.Input("vae"),
@ -359,7 +361,7 @@ class EmptyHunyuanImageLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="EmptyHunyuanImageLatent",
category="model/latent",
category="model/latent/hunyuan image",
inputs=[
io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
@ -384,7 +386,7 @@ class HunyuanRefinerLatent(io.ComfyNode):
return io.Schema(
node_id="HunyuanRefinerLatent",
display_name="Hunyuan Latent Refiner",
category="model/conditioning/video_models",
category="model/conditioning/hunyuan video",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),

View File

@ -12,7 +12,7 @@ class EmptyLatentHunyuan3Dv2(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="EmptyLatentHunyuan3Dv2",
category="model/latent/3d",
category="model/latent/hunyuan 3d",
inputs=[
IO.Int.Input("resolution", default=3072, min=1, max=8192),
IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
@ -35,7 +35,7 @@ class Hunyuan3Dv2Conditioning(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="Hunyuan3Dv2Conditioning",
category="model/conditioning/3d_models",
category="model/conditioning/hunyuan 3d",
inputs=[
IO.ClipVisionOutput.Input("clip_vision_output"),
],
@ -60,7 +60,7 @@ class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="Hunyuan3Dv2ConditioningMultiView",
category="model/conditioning/3d_models",
category="model/conditioning/hunyuan 3d",
inputs=[
IO.ClipVisionOutput.Input("front", optional=True),
IO.ClipVisionOutput.Input("left", optional=True),
@ -97,7 +97,7 @@ class VAEDecodeHunyuan3D(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="VAEDecodeHunyuan3D",
category="model/latent/3d",
category="model/latent/hunyuan 3d",
inputs=[
IO.Latent.Input("samples"),
IO.Vae.Input("vae"),

View File

@ -38,7 +38,7 @@ class Ideogram4Scheduler(io.ComfyNode):
return io.Schema(
node_id="Ideogram4Scheduler",
display_name="Ideogram 4 Scheduler",
category="sampling/custom_sampling/schedulers",
category="model/sampling/schedulers",
inputs=[
io.Int.Input("steps", default=20, min=1, max=200),
io.Int.Input("width", default=1024, min=256, max=8192, step=16),

View File

@ -13,7 +13,7 @@ class Kandinsky5ImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Kandinsky5ImageToVideo",
category="model/conditioning/video_models",
category="model/conditioning/kandinsky",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -71,7 +71,7 @@ class NormalizeVideoLatentStart(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="NormalizeVideoLatentStart",
category="model/conditioning/video_models",
category="model/conditioning",
description="Normalizes the initial frames of a video latent to match the mean and standard deviation of subsequent reference frames. Helps reduce differences between the starting frames and the rest of the video.",
inputs=[
io.Latent.Input("latent"),
@ -104,8 +104,9 @@ class CLIPTextEncodeKandinsky5(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CLIPTextEncodeKandinsky5",
display_name="CLIP Text Encode (Kandinsky 5)",
search_aliases=["kandinsky prompt"],
category="advanced/conditioning/kandinsky5",
category="model/conditioning/kandinsky",
inputs=[
io.Clip.Input("clip"),
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),

View File

@ -262,6 +262,7 @@ class LatentBatch(io.ComfyNode):
return io.Schema(
node_id="LatentBatch",
search_aliases=["combine latents", "merge latents", "join latents"],
display_name="Batch Latents (DEPRECATED)",
category="model/latent/batch",
is_deprecated=True,
inputs=[
@ -447,6 +448,7 @@ class ReplaceVideoLatentFrames(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ReplaceVideoLatentFrames",
display_name="Replace Video Latent Frames",
category="model/latent/batch",
inputs=[
io.Latent.Input("destination", tooltip="The destination latent where frames will be replaced."),

View File

@ -25,7 +25,7 @@ class GetICLoRAParameters(io.ComfyNode):
display_name="Get IC-LoRA Parameters",
description="Extracts IC-LoRA parameters from the safetensors metadata of a LoRA-loaded "
"model and outputs them for LTXVAddGuide (eg. reference_downscale_factor).",
category="model/conditioning/video_models",
category="model/conditioning/ltxv",
search_aliases=["ic-lora", "ic lora", "iclora", "downscale factor", "reference downscale"],
inputs=[
io.Model.Input(
@ -62,7 +62,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="EmptyLTXVLatentVideo",
category="model/latent/video/ltxv",
category="model/latent/ltxv",
inputs=[
io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32),
io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32),
@ -86,7 +86,7 @@ class LTXVImgToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVImgToVideo",
category="model/conditioning/video_models",
category="model/conditioning/ltxv",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -131,7 +131,7 @@ class LTXVImgToVideoInplace(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVImgToVideoInplace",
category="model/conditioning/video_models",
category="model/conditioning/ltxv",
inputs=[
io.Vae.Input("vae"),
io.Image.Input("image"),
@ -251,7 +251,7 @@ class LTXVAddGuide(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVAddGuide",
category="model/conditioning/video_models",
category="model/conditioning/ltxv",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -498,7 +498,7 @@ class LTXVCropGuides(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVCropGuides",
category="model/conditioning/video_models",
category="model/conditioning/ltxv",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -542,7 +542,7 @@ class LTXVConditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVConditioning",
category="model/conditioning/video_models",
category="model/conditioning/ltxv",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -566,7 +566,7 @@ class ModelSamplingLTXV(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ModelSamplingLTXV",
category="advanced/model",
category="model/patch/ltxv",
inputs=[
io.Model.Input("model"),
io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01),
@ -746,7 +746,7 @@ class LTXVConcatAVLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVConcatAVLatent",
category="model/latent/video/ltxv",
category="model/latent/ltxv",
inputs=[
io.Latent.Input("video_latent"),
io.Latent.Input("audio_latent"),
@ -781,7 +781,7 @@ class LTXVSeparateAVLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVSeparateAVLatent",
category="model/latent/video/ltxv",
category="model/latent/ltxv",
description="LTXV Separate AV Latent",
inputs=[
io.Latent.Input("av_latent"),
@ -814,7 +814,7 @@ class LTXVReferenceAudio(io.ComfyNode):
return io.Schema(
node_id="LTXVReferenceAudio",
display_name="LTXV Reference Audio (ID-LoRA)",
category="model/conditioning/audio",
category="model/conditioning/ltxv",
description="Set reference audio for ID-LoRA speaker identity transfer. Encodes a reference audio clip into the conditioning and optionally patches the model with identity guidance (extra forward pass without reference, amplifying the speaker identity effect).",
inputs=[
io.Model.Input("model"),

View File

@ -40,7 +40,7 @@ class LTXVAudioVAEEncode(VAEEncodeAudio):
return io.Schema(
node_id="LTXVAudioVAEEncode",
display_name="LTXV Audio VAE Encode",
category="model/latent/audio",
category="model/latent/ltxv",
inputs=[
io.Audio.Input("audio", tooltip="The audio to be encoded."),
io.Vae.Input(
@ -63,7 +63,7 @@ class LTXVAudioVAEDecode(io.ComfyNode):
return io.Schema(
node_id="LTXVAudioVAEDecode",
display_name="LTXV Audio VAE Decode",
category="model/latent/audio",
category="model/latent/ltxv",
inputs=[
io.Latent.Input("samples", tooltip="The latent to be decoded."),
io.Vae.Input(
@ -96,7 +96,7 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
return io.Schema(
node_id="LTXVEmptyLatentAudio",
display_name="LTXV Empty Latent Audio",
category="model/latent/audio",
category="model/latent/ltxv",
inputs=[
io.Int.Input(
"frames_number",
@ -168,9 +168,9 @@ class LTXAVTextEncoderLoader(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="LTXAVTextEncoderLoader",
display_name="LTXV Audio Text Encoder Loader",
category="advanced/loaders",
description="[Recipes]\n\nltxav: gemma 3 12B",
display_name="Load LTXV Audio Text Encoder",
category="model/loaders",
description="Recipes:\nltxav: gemma 3 12B",
inputs=[
io.Combo.Input(
"text_encoder",

View File

@ -13,7 +13,7 @@ class LTXVLatentUpsampler(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="LTXVLatentUpsampler",
category="model/latent/video",
category="model/latent/ltxv",
is_experimental=True,
inputs=[
IO.Latent.Input("samples"),

View File

@ -9,7 +9,7 @@ class RenormCFG(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="RenormCFG",
category="advanced/model",
category="model/patch",
inputs=[
io.Model.Input("model"),
io.Float.Input("cfg_trunc", default=100, min=0.0, max=100.0, step=0.01, advanced=True),
@ -80,8 +80,8 @@ class CLIPTextEncodeLumina2(io.ComfyNode):
return io.Schema(
node_id="CLIPTextEncodeLumina2",
search_aliases=["lumina prompt"],
display_name="CLIP Text Encode for Lumina2",
category="model/conditioning",
display_name="CLIP Text Encode (Lumina 2)",
category="model/conditioning/lumina",
description="Encodes a system prompt and a user prompt using a CLIP model into an embedding "
"that can be used to guide the diffusion model towards generating specific images.",
inputs=[

View File

@ -53,6 +53,7 @@ class LatentCompositeMasked(IO.ComfyNode):
return IO.Schema(
node_id="LatentCompositeMasked",
search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"],
display_name="Latent Composite Masked",
category="model/latent",
inputs=[
IO.Latent.Input("destination"),

View File

@ -10,7 +10,7 @@ class EmptyMochiLatentVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="EmptyMochiLatentVideo",
category="model/latent/video",
category="model/latent/mochi",
inputs=[
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),

View File

@ -59,7 +59,7 @@ class ModelSamplingDiscrete:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
CATEGORY = "model/patch"
def patch(self, model, sampling, zsnr):
m = model.clone()
@ -97,7 +97,7 @@ class ModelSamplingStableCascade:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
CATEGORY = "model/patch/stable cascade"
def patch(self, model, shift):
m = model.clone()
@ -123,7 +123,7 @@ class ModelSamplingSD3:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
CATEGORY = "model/patch/stable diffusion"
def patch(self, model, shift, multiplier=1000):
m = model.clone()
@ -150,6 +150,7 @@ class ModelSamplingAuraFlow(ModelSamplingSD3):
}}
FUNCTION = "patch_aura"
CATEGORY = "model/patch"
def patch_aura(self, model, shift):
return self.patch(model, shift, multiplier=1.0)
@ -167,7 +168,7 @@ class ModelSamplingFlux:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
CATEGORY = "model/patch/flux"
def patch(self, model, max_shift, base_shift, width, height):
m = model.clone()
@ -202,7 +203,7 @@ class ModelSamplingContinuousEDM:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
CATEGORY = "model/patch"
def patch(self, model, sampling, sigma_max, sigma_min):
m = model.clone()
@ -247,7 +248,7 @@ class ModelSamplingContinuousV:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
CATEGORY = "model/patch"
def patch(self, model, sampling, sigma_max, sigma_min):
m = model.clone()
@ -273,7 +274,7 @@ class RescaleCFG:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
CATEGORY = "model/patch"
def patch(self, model, multiplier):
def rescale_cfg(args):
@ -314,7 +315,7 @@ class ModelNoiseScale:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/model"
CATEGORY = "model/patch"
def patch(self, model, noise_scale):
m = model.clone()
@ -337,7 +338,7 @@ class ModelComputeDtype:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
CATEGORY = "advanced/debug/model"
CATEGORY = "advanced/debug"
def patch(self, model, dtype):
m = model.clone()

View File

@ -21,7 +21,7 @@ class ModelMergeSimple:
RETURN_TYPES = ("MODEL",)
FUNCTION = "merge"
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def merge(self, model1, model2, ratio):
m = model1.clone()
@ -40,7 +40,7 @@ class ModelSubtract:
RETURN_TYPES = ("MODEL",)
FUNCTION = "merge"
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def merge(self, model1, model2, multiplier):
m = model1.clone()
@ -58,7 +58,7 @@ class ModelAdd:
RETURN_TYPES = ("MODEL",)
FUNCTION = "merge"
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def merge(self, model1, model2):
m = model1.clone()
@ -78,7 +78,7 @@ class CLIPMergeSimple:
RETURN_TYPES = ("CLIP",)
FUNCTION = "merge"
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def merge(self, clip1, clip2, ratio):
m = clip1.clone()
@ -101,7 +101,7 @@ class CLIPSubtract:
RETURN_TYPES = ("CLIP",)
FUNCTION = "merge"
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def merge(self, clip1, clip2, multiplier):
m = clip1.clone()
@ -123,7 +123,7 @@ class CLIPAdd:
RETURN_TYPES = ("CLIP",)
FUNCTION = "merge"
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def merge(self, clip1, clip2):
m = clip1.clone()
@ -147,7 +147,7 @@ class ModelMergeBlocks:
RETURN_TYPES = ("MODEL",)
FUNCTION = "merge"
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def merge(self, model1, model2, **kwargs):
m = model1.clone()
@ -242,7 +242,7 @@ class CheckpointSave:
FUNCTION = "save"
OUTPUT_NODE = True
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def save(self, model, clip, vae, filename_prefix, prompt=None, extra_pnginfo=None):
save_checkpoint(model, clip=clip, vae=vae, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)
@ -261,7 +261,7 @@ class CLIPSave:
FUNCTION = "save"
OUTPUT_NODE = True
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def save(self, clip, filename_prefix, prompt=None, extra_pnginfo=None):
prompt_info = ""
@ -318,7 +318,7 @@ class VAESave:
FUNCTION = "save"
OUTPUT_NODE = True
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def save(self, vae, filename_prefix, prompt=None, extra_pnginfo=None):
full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
@ -353,7 +353,7 @@ class ModelSave:
FUNCTION = "save"
OUTPUT_NODE = True
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
def save(self, model, filename_prefix, prompt=None, extra_pnginfo=None):
save_checkpoint(model, filename_prefix=filename_prefix, output_dir=self.output_dir, prompt=prompt, extra_pnginfo=extra_pnginfo)

View File

@ -1,7 +1,7 @@
import comfy_extras.nodes_model_merging
class ModelMergeSD1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
arg_dict = { "model1": ("MODEL",),
@ -27,7 +27,7 @@ class ModelMergeSD1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
class ModelMergeSDXL(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -53,7 +53,7 @@ class ModelMergeSDXL(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeSD3_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -77,7 +77,7 @@ class ModelMergeSD3_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
class ModelMergeAuraflow(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -104,7 +104,7 @@ class ModelMergeAuraflow(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeFlux1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -130,7 +130,7 @@ class ModelMergeFlux1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeSD35_Large(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -153,7 +153,7 @@ class ModelMergeSD35_Large(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeMochiPreview(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -175,7 +175,7 @@ class ModelMergeMochiPreview(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeLTXV(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -197,7 +197,7 @@ class ModelMergeLTXV(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeCosmos7B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -221,7 +221,7 @@ class ModelMergeCosmos7B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeCosmos14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -245,7 +245,7 @@ class ModelMergeCosmos14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeWAN2_1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
DESCRIPTION = "1.3B model has 30 blocks, 14B model has 40 blocks. Image to video model has the extra img_emb."
@classmethod
@ -269,7 +269,7 @@ class ModelMergeWAN2_1(comfy_extras.nodes_model_merging.ModelMergeBlocks):
return {"required": arg_dict}
class ModelMergeCosmosPredict2_2B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -292,7 +292,7 @@ class ModelMergeCosmosPredict2_2B(comfy_extras.nodes_model_merging.ModelMergeBlo
return {"required": arg_dict}
class ModelMergeCosmosPredict2_14B(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):
@ -315,7 +315,7 @@ class ModelMergeCosmosPredict2_14B(comfy_extras.nodes_model_merging.ModelMergeBl
return {"required": arg_dict}
class ModelMergeQwenImage(comfy_extras.nodes_model_merging.ModelMergeBlocks):
CATEGORY = "advanced/model_merging/model_specific"
CATEGORY = "model/merging/model specific"
@classmethod
def INPUT_TYPES(s):

View File

@ -232,7 +232,7 @@ class ModelPatchLoader:
FUNCTION = "load_model_patch"
EXPERIMENTAL = True
CATEGORY = "advanced/loaders"
CATEGORY = "model/loaders"
def load_model_patch(self, name):
model_patch_path = folder_paths.get_full_path_or_raise("model_patches", name)
@ -479,7 +479,7 @@ class QwenImageDiffsynthControlnet:
FUNCTION = "diffsynth_controlnet"
EXPERIMENTAL = True
CATEGORY = "advanced/loaders/qwen"
CATEGORY = "model/patch/qwen"
def diffsynth_controlnet(self, model, model_patch, vae, image=None, strength=1.0, inpaint_image=None, mask=None):
model_patched = model.clone()
@ -512,7 +512,7 @@ class ZImageFunControlnet(QwenImageDiffsynthControlnet):
},
"optional": {"image": ("IMAGE",), "inpaint_image": ("IMAGE",), "mask": ("MASK",)}}
CATEGORY = "advanced/loaders/zimage"
CATEGORY = "model/patch/z-image"
class UsoStyleProjectorPatch:
def __init__(self, model_patch, encoded_image):
@ -675,3 +675,11 @@ NODE_CLASS_MAPPINGS = {
"USOStyleReference": USOStyleReference,
"SUPIRApply": SUPIRApply,
}
NODE_DISPLAY_NAME_MAPPINGS = {
"ModelPatchLoader": "Load Model Patch",
"QwenImageDiffsynthControlnet": "Apply Qwen Image DiffSynth ControlNet",
"ZImageFunControlnet": "Apply Z-Image Fun ControlNet",
"USOStyleReference": "Apply USO Style Reference",
"SUPIRApply": "Apply SUPIR Patch",
}

View File

@ -14,10 +14,8 @@ class PiDConditioning(io.ComfyNode):
return io.Schema(
node_id="PiDConditioning",
display_name="PiD Conditioning",
category="advanced/conditioning",
description=(
"Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling"
),
category="model/conditioning",
description=("Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling"),
inputs=[
io.Conditioning.Input("positive"),
io.Latent.Input("latent", tooltip="latent (from VAEEncode or a KSampler)."),

View File

@ -7,8 +7,9 @@ class CLIPTextEncodePixArtAlpha(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CLIPTextEncodePixArtAlpha",
display_name="CLIP Text Encode (PixArt Alpha)",
search_aliases=["pixart prompt"],
category="advanced/conditioning",
category="model/conditioning/pixart",
description="Encodes text and sets the resolution conditioning for PixArt Alpha. Does not apply to PixArt Sigma.",
inputs=[
io.Int.Input("width", default=1024, min=0, max=nodes.MAX_RESOLUTION),

View File

@ -616,7 +616,7 @@ class BatchLatentsNode(io.ComfyNode):
node_id="BatchLatentsNode",
search_aliases=["combine latents", "stack latents", "merge latents"],
display_name="Batch Latents",
category="model/latent",
category="model/latent/batch",
inputs=[
io.Autogrow.Input("latents", template=autogrow_template)
],

View File

@ -12,7 +12,7 @@ class TextEncodeQwenImageEdit(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="TextEncodeQwenImageEdit",
category="advanced/conditioning",
category="model/conditioning/qwen image",
inputs=[
io.Clip.Input("clip"),
io.String.Input("prompt", multiline=True, dynamic_prompts=True),
@ -55,7 +55,7 @@ class TextEncodeQwenImageEditPlus(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="TextEncodeQwenImageEditPlus",
category="advanced/conditioning",
category="model/conditioning/qwen image",
inputs=[
io.Clip.Input("clip"),
io.String.Input("prompt", multiline=True, dynamic_prompts=True),

View File

@ -14,7 +14,7 @@ class RTDETR_detect(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="RTDETR_detect",
display_name="RT-DETR Detect",
display_name="Run Real-Time Detection (RT-DETR)",
category="image/detection",
search_aliases=["bbox", "bounding box", "object detection", "coco"],
inputs=[

View File

@ -264,7 +264,7 @@ class SAM3_VideoTrack(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SAM3_VideoTrack",
display_name="SAM3 Video Track",
display_name="Run SAM3 Video Track",
category="image/detection",
search_aliases=["sam3", "video", "track", "propagate"],
inputs=[

View File

@ -34,14 +34,20 @@ def _unpack(track_data):
return unpack_masks(packed)
def _first_frame_cx_area(masks_bool):
first = masks_bool[0].float()
H, W = first.shape[-2], first.shape[-1]
n_pixels = H * W
grid_x = torch.arange(W, device=first.device, dtype=first.dtype).view(1, W)
area = first.sum(dim=(-1, -2)).clamp_(min=1)
cx = (first * grid_x).sum(dim=(-1, -2)) / area
return (cx / W).tolist(), (area / n_pixels).tolist()
def _first_appearance_cx_area(masks_bool):
"""Per object: first frame it appears in, plus centroid-x and area in that frame."""
m = masks_bool.float()
T, H, W = m.shape[0], m.shape[-2], m.shape[-1]
grid_x = torch.arange(W, device=m.device, dtype=m.dtype).view(1, 1, 1, W)
area_t = m.sum(dim=(-1, -2))
cx_t = (m * grid_x).sum(dim=(-1, -2)) / area_t.clamp(min=1)
present = area_t > 0
frame_idx = torch.arange(T, device=m.device).unsqueeze(1)
first_t = torch.where(present, frame_idx, T).amin(dim=0)
sel = first_t.clamp(max=T - 1).unsqueeze(0)
cx = cx_t.gather(0, sel).squeeze(0)
area = area_t.gather(0, sel).squeeze(0)
return first_t.tolist(), (cx / W).tolist(), (area / (H * W)).tolist()
def _subset_track_data(track_data, obj_indices):
@ -81,12 +87,26 @@ def _render_colored_masks(track_data, background="black"):
masks_full.view(T * N_obj, 1, Hm, Wm), size=(H, W), mode="nearest"
).view(T, N_obj, H, W) > 0.5
any_mask = masks_full.any(dim=1)
obj_idx_map = masks_full.to(torch.uint8).argmax(dim=1)
color_overlay = colors[obj_idx_map]
color_overlay = colors[masks_full.to(torch.uint8).argmax(dim=1)]
bg_tensor = torch.tensor(bg_rgb, device=device, dtype=color_overlay.dtype).view(1, 1, 1, 3)
return torch.where(any_mask.unsqueeze(-1), color_overlay, bg_tensor.expand_as(color_overlay))
def _render_mask_as_identity(mask, background="black"):
    """Render a plain comfy MASK as one colored identity on a flat background.

    ``mask`` is (B, H, W) or (H, W); the result is (B, H, W, 3). Pixels whose
    mask value exceeds 0.5 take the first palette color (``DEFAULT_PALETTE[0]``);
    all others take the background color, which is white when ``background``
    starts with "white" and black otherwise. A batch is treated as several
    views of that single subject.
    """
    target_device = comfy.model_management.intermediate_device()
    target_dtype = comfy.model_management.intermediate_dtype()

    # Promote a single (H, W) mask to a batch of one.
    batched = mask.unsqueeze(0) if mask.ndim == 2 else mask
    batched = batched.to(device=target_device, dtype=target_dtype)
    batch, height, width = batched.shape
    out_shape = (batch, height, width, 3)

    def _flat_color(rgb):
        # Broadcast a single RGB triple over the whole output image.
        swatch = torch.tensor(rgb, device=target_device, dtype=target_dtype).view(1, 1, 1, 3)
        return swatch.expand(*out_shape)

    if background.startswith("white"):
        backdrop_rgb = (1.0, 1.0, 1.0)
    else:
        backdrop_rgb = (0.0, 0.0, 0.0)

    inside = (batched > 0.5).unsqueeze(-1)
    return torch.where(inside, _flat_color(DEFAULT_PALETTE[0]), _flat_color(backdrop_rgb))
def _extract_mask_to_28ch(rgb_video):
"""Colored RGB mask (T, H, W, 3) in [0, 1] -> SCAIL-2 28-channel binary latent
(1, T_lat, 28, H_lat, W_lat). 7 per-color binary channels (white/r/g/b/y/m/c)
@ -123,7 +143,7 @@ class WanSCAILToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanSCAILToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/scail",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -138,8 +158,8 @@ class WanSCAILToVideo(io.ComfyNode):
io.Float.Input("pose_strength", default=1.0, min=0.0, max=10.0, step=0.01, tooltip="Strength of the pose latent."),
io.Float.Input("pose_start", default=0.0, min=0.0, max=1.0, step=0.01, tooltip="Start step of the pose conditioning."),
io.Float.Input("pose_end", default=1.0, min=0.0, max=1.0, step=0.01, tooltip="End step of the pose conditioning."),
io.Image.Input("reference_image", optional=True, tooltip="Reference image, for multiple references composite all on single image."),
io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask at the same resolution as reference_image."),
io.Image.Input("reference_image", optional=True, tooltip="Reference image. The first image is the primary reference (composite all identities onto it). SCAIL-2: extra batch images are used as additional views (back view, close-up, occluded background), each needing a matching reference_image_mask in that identity's color."),
io.Image.Input("reference_image_mask", optional=True, tooltip="SCAIL-2 only. Colored reference mask, batch matching reference_image (first = primary reference mask, rest = identity masks for the additional reference_image)."),
io.ClipVisionOutput.Input("clip_vision_output", optional=True, tooltip="CLIP vision features for conditioning. Model is trained with stretch resize to aspect ratio."),
io.Int.Input("video_frame_offset", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1, tooltip="Cumulative output frame this chunk begins at. Wire from the previous chunk's video_frame_offset output."),
io.Int.Input("previous_frame_count", default=5, min=1, max=nodes.MAX_RESOLUTION, step=4, tooltip="Tail frames of previous_frames to anchor. SCAIL-2 trained at 5 (81-frame chunks, 76-frame step)."),
@ -171,19 +191,21 @@ class WanSCAILToVideo(io.ComfyNode):
video_frame_offset -= prev_trimmed.shape[0]
video_frame_offset = max(0, video_frame_offset)
ref_latent = None
if reference_image is not None:
reference_image = comfy.utils.common_upscale(reference_image[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
# Replacement Mode: composite ref on black bg using reference_image_mask as alpha matte
if replacement_mode and reference_image_mask is not None:
rm = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(reference_image.dtype)
reference_image = reference_image * is_char
ref_latent = vae.encode(reference_image[:, :, :, :3])
ref_imgs = comfy.utils.common_upscale(reference_image.movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
n_ref = ref_imgs.shape[0]
# SCAIL-2 multi-reference: the first image is the primary ref, the rest are additional references.
if ref_latent is not None:
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": [ref_latent]}, append=True)
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": [ref_latent]}, append=True)
# Replacement Mode: composite each ref on black bg using its mask as alpha matte
if replacement_mode and reference_image_mask is not None:
rm = comfy.utils.common_upscale(reference_image_mask.movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
rm = rm[[min(i, rm.shape[0] - 1) for i in range(n_ref)]]
is_char = (rm[..., :3].max(dim=-1, keepdim=True).values > 0.1).to(ref_imgs.dtype)
ref_imgs = ref_imgs * is_char
# encode each ref individually so each stays a single latent frame (a batched encode would be treated as a video)
ref_latents = [vae.encode(ref_imgs[i:i + 1, :, :, :3]) for i in range(n_ref)]
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": ref_latents}, append=True)
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": ref_latents}, append=True)
if clip_vision_output is not None:
positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output})
@ -221,11 +243,16 @@ class WanSCAILToVideo(io.ComfyNode):
positive = node_helpers.conditioning_set_values(positive, {"driving_mask_28ch": driving_mask_28ch})
negative = node_helpers.conditioning_set_values(negative, {"driving_mask_28ch": driving_mask_28ch})
if reference_image_mask is not None:
ref_mask_hw = comfy.utils.common_upscale(reference_image_mask[:1].movedim(-1, 1), width, height, "bicubic", "center").movedim(1, -1)
ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw)
# The ref mask binds reference frames to identities, so it only applies when there's a reference image.
if reference_image_mask is not None and reference_image is not None:
ref_mask_hw = comfy.utils.common_upscale(reference_image_mask.movedim(-1, 1), width, height, "nearest-exact", "center").movedim(1, -1)
n_masks = ref_mask_hw.shape[0]
n_ref = reference_image.shape[0]
add_masks = [_extract_mask_to_28ch(ref_mask_hw[min(i, n_masks - 1)][None]) for i in range(1, n_ref)]
ref_mask_1f = _extract_mask_to_28ch(ref_mask_hw[:1])
zeros = torch.zeros((1, latent.shape[2], 28, ref_mask_1f.shape[-2], ref_mask_1f.shape[-1]), device=ref_mask_1f.device, dtype=ref_mask_1f.dtype)
ref_mask_28ch = torch.cat([ref_mask_1f, zeros], dim=1)
ref_mask_28ch = torch.cat(add_masks + [ref_mask_1f, zeros], dim=1)
positive = node_helpers.conditioning_set_values(positive, {"ref_mask_28ch": ref_mask_28ch})
negative = node_helpers.conditioning_set_values(negative, {"ref_mask_28ch": ref_mask_28ch})
@ -244,12 +271,9 @@ class WanSCAILToVideo(io.ComfyNode):
class SCAIL2ColoredMask(io.ComfyNode):
"""Render SAM3 tracks for the driving pose video and (optionally) the reference
image into the two colored masks WanSCAILToVideo consumes. Shared `sort_by`
across both outputs guarantees identity K maps to the same color on both
sides, for multi-person workflow consistency.
reference_image_mask is always rendered black-bg (model convention)
pose_video_mask bg follows replacement_mode: black = Animation Mode, white = Replacement Mode
"""Render SAM3 tracks for the driving pose video and reference image(s) into the
colored masks WanSCAILToVideo consumes. Shared `sort_by` keeps each identity on the
same color across both outputs.
"""
@classmethod
@ -257,17 +281,18 @@ class SCAIL2ColoredMask(io.ComfyNode):
return io.Schema(
node_id="SCAIL2ColoredMask",
display_name="Create SCAIL-2 Colored Mask",
category="conditioning/video_models/scail",
category="model/conditioning/wan/scail",
inputs=[
SAM3TrackData.Input("driving_track_data", tooltip="SAM3 track of the driving pose video. Will be rendered into the pose_video_mask output."),
SAM3TrackData.Input("ref_track_data", optional=True,
tooltip="SAM3 track of the reference image."),
io.MultiType.Input("ref_track_data", [SAM3TrackData, io.Mask], optional=True, display_name="reference_masks",
tooltip="SAM3 track of the reference image(s) (one identity per object, colored in batch order), or a plain MASK of the reference subject (rendered as a single identity)."),
io.String.Input("object_indices", default="",
tooltip="Comma-separated list of person indices to include (e.g. '0,2,3'). Applied to both reference and pose video masks. Empty = all."),
io.Combo.Input("sort_by", options=["none", "left_to_right", "area"], default="left_to_right",
tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). left_to_right = leftmost object (by first-frame centroid) gets the first color; area = biggest object (by first-frame mask area) gets the first color; none = keep SAM3's order."),
tooltip="Order in which palette colors are assigned to the tracked objects (applied to both reference and pose video so each identity keeps the same color). Objects that appear in earlier frames always come first; within a frame, left_to_right = leftmost object (by centroid at first appearance) gets the first color, area = biggest object (by mask area at first appearance) gets the first color; none = keep SAM3's order."),
io.Boolean.Input("replacement_mode", default=False,
tooltip="False = mask_video has black bg (Animation Mode). True = white bg (Replacement Mode). Set the matching replacement_mode on WanSCAILToVideo. reference_image_mask is always black-bg regardless."),
tooltip="False = Animation Mode (pose_video_mask has black background, reference_image_mask has white background). "
"True = Replacement Mode (pose_video_mask has white background, reference_image_mask has black background)."),
],
outputs=[
io.Image.Output("pose_video_mask"),
@ -281,11 +306,11 @@ class SCAIL2ColoredMask(io.ComfyNode):
def _prep(td):
masks_bool = _unpack(td)
if sort_by != "none" and masks_bool is not None:
cx, area = _first_frame_cx_area(masks_bool)
first_t, cx, area = _first_appearance_cx_area(masks_bool)
if sort_by == "left_to_right":
order = sorted(range(len(cx)), key=lambda i: cx[i])
order = sorted(range(len(cx)), key=lambda i: (first_t[i], cx[i]))
else: # "area"
order = sorted(range(len(area)), key=lambda i: -area[i])
order = sorted(range(len(area)), key=lambda i: (first_t[i], -area[i]))
td = _subset_track_data(td, order)
if object_indices.strip():
indices = [int(i.strip()) for i in object_indices.split(",") if i.strip().isdigit()]
@ -296,14 +321,19 @@ class SCAIL2ColoredMask(io.ComfyNode):
return td
drv = _prep(driving_track_data)
# Animation: driving=black, ref=white. Replacement: driving=white, ref=black.
mask_video = _render_colored_masks(drv, "white" if replacement_mode else "black")
ref_bg = "black" if replacement_mode else "white"
if ref_track_data is not None:
ref = _prep(ref_track_data)
reference_image_mask = _render_colored_masks(ref, "black")
if isinstance(ref_track_data, torch.Tensor): # plain comfy MASK
reference_image_mask = _render_mask_as_identity(ref_track_data, ref_bg)
else:
reference_image_mask = _render_colored_masks(_prep(ref_track_data), ref_bg)
else:
H, W = drv["orig_size"]
reference_image_mask = torch.zeros(1, H, W, 3, device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
fill_value = 1.0 if ref_bg == "white" else 0.0
reference_image_mask = torch.full((1, H, W, 3), fill_value, device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
return io.NodeOutput(mask_video, reference_image_mask)

View File

@ -13,8 +13,9 @@ class TripleCLIPLoader(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="TripleCLIPLoader",
category="advanced/loaders",
description="[Recipes]\n\nsd3: clip-l, clip-g, t5",
display_name="Load CLIP (Triple)",
category="model/loaders",
description="Recipes:\nsd3: clip-l, clip-g, t5",
inputs=[
io.Combo.Input("clip_name1", options=folder_paths.get_filename_list("text_encoders")),
io.Combo.Input("clip_name2", options=folder_paths.get_filename_list("text_encoders")),
@ -41,7 +42,7 @@ class EmptySD3LatentImage(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="EmptySD3LatentImage",
category="model/latent/sd3",
category="model/latent/stable diffusion",
inputs=[
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
@ -66,7 +67,8 @@ class CLIPTextEncodeSD3(io.ComfyNode):
return io.Schema(
node_id="CLIPTextEncodeSD3",
search_aliases=["sd3 prompt"],
category="advanced/conditioning",
display_name="CLIP Text Encode (SD3)",
category="model/conditioning/stable diffusion",
inputs=[
io.Clip.Input("clip"),
io.String.Input("clip_l", multiline=True, dynamic_prompts=True),

View File

@ -96,8 +96,12 @@ class KeypointDraw:
# Body connections - matching DWPose limbSeq (1-indexed, converted to 0-indexed)
self.body_limbSeq = [
[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
[10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
[1, 16], [16, 18]
[10, 11], [2, 12], [12, 13], [13, 14]
]
# Head connections (1-indexed, converted to 0-indexed)
self.head_edges = [
[2, 1], [1, 15], [15, 17], [1, 16], [16, 18]
]
# Colors matching DWPose
@ -215,7 +219,7 @@ class KeypointDraw:
return unique_pts if len(unique_pts) > 1 else [[center[0], center[1]], [center[0], center[1]]]
def draw_wholebody_keypoints(self, canvas, keypoints, scores=None, threshold=0.3,
draw_body=True, draw_feet=True, draw_face=True, draw_hands=True, stick_width=4, face_point_size=3):
draw_body=True, draw_head=True, draw_feet=True, draw_face=True, draw_hands=True, stick_width=4, face_point_size=3):
"""
Draw wholebody keypoints (134 keypoints after processing) in DWPose style.
@ -237,9 +241,17 @@ class KeypointDraw:
"""
H, W, C = canvas.shape
# Draw body limbs
if draw_body and len(keypoints) >= 18:
for i, limb in enumerate(self.body_limbSeq):
# Draw body limbs & head connections
if (draw_body or draw_head) and len(keypoints) >= 18:
colorIndexOffset = 0
edges = []
if draw_body:
edges += self.body_limbSeq
else:
colorIndexOffset += len(self.body_limbSeq)
if draw_head:
edges += self.head_edges
for i, limb in enumerate(edges):
# Convert from 1-indexed to 0-indexed
idx1, idx2 = limb[0] - 1, limb[1] - 1
@ -262,11 +274,17 @@ class KeypointDraw:
polygon = self.draw.ellipse2Poly((int(mY), int(mX)), (int(length / 2), stick_width), int(angle), 0, 360, 1)
self.draw.fillConvexPoly(canvas, polygon, self.colors[i % len(self.colors)])
self.draw.fillConvexPoly(canvas, polygon, self.colors[(i + colorIndexOffset) % len(self.colors)])
# Draw body keypoints
if draw_body and len(keypoints) >= 18:
# Draw body & head keypoints
if (draw_body or draw_head) and len(keypoints) >= 18:
head_keypoints = {0, 14, 15, 16, 17} # nose, eyes, ears
neck_point = 1
for i in range(18):
if not draw_head and i in head_keypoints:
continue
if not draw_body and i not in head_keypoints and i != neck_point:
continue
if scores is not None and scores[i] < threshold:
continue
x, y = int(keypoints[i][0]), int(keypoints[i][1])
@ -365,6 +383,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
io.Int.Input("stick_width", default=4, min=1, max=10, step=1),
io.Int.Input("face_point_size", default=3, min=1, max=10, step=1),
io.Float.Input("score_threshold", default=0.3, min=0.0, max=1.0, step=0.01),
io.Boolean.Input("draw_head", default=True),
],
outputs=[
io.Image.Output(),
@ -372,7 +391,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
)
@classmethod
def execute(cls, keypoints, draw_body, draw_hands, draw_face, draw_feet, stick_width, face_point_size, score_threshold) -> io.NodeOutput:
def execute(cls, keypoints, draw_body, draw_hands, draw_face, draw_feet, stick_width, face_point_size, score_threshold, draw_head) -> io.NodeOutput:
if not keypoints:
return io.NodeOutput(torch.zeros((1, 64, 64, 3), dtype=torch.float32))
height = keypoints[0]["canvas_height"]
@ -405,7 +424,7 @@ class SDPoseDrawKeypoints(io.ComfyNode):
canvas = drawer.draw_wholebody_keypoints(
canvas, kp, sc,
threshold=score_threshold,
draw_body=draw_body, draw_feet=draw_feet,
draw_body=draw_body, draw_head=draw_head, draw_feet=draw_feet,
draw_face=draw_face, draw_hands=draw_hands,
stick_width=stick_width, face_point_size=face_point_size,
)

View File

@ -9,7 +9,7 @@ class SD_4XUpscale_Conditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SD_4XUpscale_Conditioning",
category="model/conditioning/upscale_diffusion",
category="model/conditioning/stable diffusion upscaler",
inputs=[
io.Image.Input("images"),
io.Conditioning.Input("positive"),

View File

@ -27,7 +27,7 @@ class StableZero123_Conditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableZero123_Conditioning",
category="model/conditioning/3d_models",
category="model/conditioning/stable zero123",
inputs=[
io.ClipVision.Input("clip_vision"),
io.Image.Input("init_image"),
@ -65,7 +65,7 @@ class StableZero123_Conditioning_Batched(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableZero123_Conditioning_Batched",
category="model/conditioning/3d_models",
category="model/conditioning/stable zero123",
inputs=[
io.ClipVision.Input("clip_vision"),
io.Image.Input("init_image"),
@ -112,7 +112,7 @@ class SV3D_Conditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SV3D_Conditioning",
category="model/conditioning/3d_models",
category="model/conditioning/stable video 3d",
inputs=[
io.ClipVision.Input("clip_vision"),
io.Image.Input("init_image"),

View File

@ -29,7 +29,7 @@ class StableCascade_EmptyLatentImage(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableCascade_EmptyLatentImage",
category="model/latent/stable_cascade",
category="model/latent/stable cascade",
inputs=[
io.Int.Input("width", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8),
io.Int.Input("height", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8),
@ -58,7 +58,7 @@ class StableCascade_StageC_VAEEncode(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableCascade_StageC_VAEEncode",
category="model/latent/stable_cascade",
category="model/latent/stable cascade",
inputs=[
io.Image.Input("image"),
io.Vae.Input("vae"),
@ -93,7 +93,7 @@ class StableCascade_StageB_Conditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableCascade_StageB_Conditioning",
category="model/conditioning/stable_cascade",
category="model/conditioning/stable cascade",
inputs=[
io.Conditioning.Input("conditioning"),
io.Latent.Input("stage_c"),

View File

@ -35,7 +35,7 @@ class TextGenerate(io.ComfyNode):
io.Image.Input("image", optional=True),
io.Image.Input("video", optional=True, tooltip="Video frames as image batch. Assumed to be 24 FPS; subsampled to 1 FPS internally."),
io.Audio.Input("audio", optional=True),
io.Int.Input("max_length", default=256, min=1, max=2048),
io.Int.Input("max_length", default=512, min=1, max=32768),
io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."),
io.Boolean.Input("use_default_template", optional=True, default=True, tooltip="Use the built in system prompt/template if the model has one.", advanced=True),

View File

@ -1367,7 +1367,7 @@ class SaveLoRA(io.ComfyNode):
node_id="SaveLoRA",
search_aliases=["export lora"],
display_name="Save LoRA Weights",
category="advanced/model_merging",
category="model/merging",
is_experimental=True,
is_output_node=True,
inputs=[

View File

@ -65,7 +65,7 @@ class TripoSplatPreprocessImage(IO.ComfyNode):
return IO.Schema(
node_id="TripoSplatPreprocessImage",
display_name="TripoSplat Preprocess Image",
category="3d/conditioning",
category="model/conditioning/triposplat",
description="Crop center each image to a square canvas on a black background and add padding.",
inputs=[
IO.Image.Input("image"),
@ -95,7 +95,7 @@ class TripoSplatConditioning(IO.ComfyNode):
return IO.Schema(
node_id="TripoSplatConditioning",
display_name="TripoSplat Conditioning",
category="3d/conditioning",
category="model/conditioning/triposplat",
description="Encode the image with DINOv3 and the Flux2 VAE into TripoSplat positive/negative "
"conditioning, and create the fixed size noise target (latent + camera) for the KSampler",
inputs=[

View File

@ -134,6 +134,17 @@ class CreateVideo(io.ComfyNode):
io.Image.Input("images", tooltip="The images to create a video from."),
io.Float.Input("fps", default=30.0, min=1.0, max=120.0, step=1.0),
io.Audio.Input("audio", optional=True, tooltip="The audio to add to the video."),
io.Int.Input(
"bit_depth",
min=8,
max=10,
default=8,
step=2,
tooltip="Bit depth of the created video. 10-bit keeps smoother gradients with less"
" banding, but some players and downstream nodes may not support it.",
optional=True,
display_mode=io.NumberDisplay.number,
),
],
outputs=[
io.Video.Output(),
@ -141,9 +152,14 @@ class CreateVideo(io.ComfyNode):
)
@classmethod
def execute(cls, images: Input.Image, fps: float, audio: Optional[Input.Audio] = None) -> io.NodeOutput:
def execute(
cls, images: Input.Image, fps: float, audio: Optional[Input.Audio] = None, bit_depth: int = 8,
) -> io.NodeOutput:
return io.NodeOutput(
InputImpl.VideoFromComponents(Types.VideoComponents(images=images, audio=audio, frame_rate=Fraction(fps)))
InputImpl.VideoFromComponents(
Types.VideoComponents(images=images, audio=audio, frame_rate=Fraction(fps)),
bit_depth=bit_depth,
)
)
class GetVideoComponents(io.ComfyNode):
@ -154,7 +170,7 @@ class GetVideoComponents(io.ComfyNode):
search_aliases=["extract frames", "split video", "video to images", "demux"],
display_name="Get Video Components",
category="video",
description="Extracts all components from a video: frames, audio, and framerate.",
description="Extracts all components from a video: frames, audio, framerate, and bit depth.",
inputs=[
io.Video.Input("video", tooltip="The video to extract components from."),
],
@ -162,13 +178,14 @@ class GetVideoComponents(io.ComfyNode):
io.Image.Output(display_name="images"),
io.Audio.Output(display_name="audio"),
io.Float.Output(display_name="fps"),
io.Int.Output(display_name="bit_depth"),
],
)
@classmethod
def execute(cls, video: Input.Video) -> io.NodeOutput:
components = video.get_components()
return io.NodeOutput(components.images, components.audio, float(components.frame_rate))
return io.NodeOutput(components.images, components.audio, float(components.frame_rate), video.get_bit_depth())
class LoadVideo(io.ComfyNode):

View File

@ -41,7 +41,7 @@ class SVD_img2vid_Conditioning:
FUNCTION = "encode"
CATEGORY = "model/conditioning/video_models"
CATEGORY = "model/conditioning/stable video"
def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level):
output = clip_vision.encode_image(init_image)
@ -108,7 +108,7 @@ class VideoTriangleCFGGuidance:
return (m, )
class ImageOnlyCheckpointSave(comfy_extras.nodes_model_merging.CheckpointSave):
CATEGORY = "advanced/model_merging"
CATEGORY = "model/merging"
@classmethod
def INPUT_TYPES(s):
@ -138,7 +138,7 @@ class ConditioningSetAreaPercentageVideo:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
CATEGORY = "model/conditioning"
CATEGORY = "model/conditioning/transform"
def append(self, conditioning, width, height, temporal, x, y, z, strength):
c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", temporal, height, width, z, y, x),
@ -160,4 +160,5 @@ NODE_DISPLAY_NAME_MAPPINGS = {
"ImageOnlyCheckpointLoader": "Load Checkpoint Image Only (img2vid model)",
"VideoLinearCFGGuidance": "Video Linear CFG Guidance",
"VideoTriangleCFGGuidance": "Video Triangle CFG Guidance",
"ConditioningSetAreaPercentageVideo": "Conditioning (Set Area with Percentage for Video)",
}

View File

@ -175,7 +175,7 @@ class VOIDInpaintConditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="VOIDInpaintConditioning",
category="model/conditioning/video_models",
category="model/conditioning/void",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -288,7 +288,7 @@ class VOIDWarpedNoise(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="VOIDWarpedNoise",
category="model/latent/video",
category="model/latent/void",
inputs=[
OpticalFlow.Input(
"optical_flow",
@ -393,7 +393,7 @@ class VOIDWarpedNoiseSource(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="VOIDWarpedNoiseSource",
category="model/sampling/noise",
category="model/latent/void",
inputs=[
io.Latent.Input("warped_noise",
tooltip="Warped noise latent from VOIDWarpedNoise"),

View File

@ -18,7 +18,7 @@ class WanImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanImageToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -66,7 +66,7 @@ class WanFunControlToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanFunControlToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/fun control",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -119,7 +119,7 @@ class Wan22FunControlToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Wan22FunControlToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/fun control",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -184,7 +184,7 @@ class WanFirstLastFrameToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanFirstLastFrameToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -256,7 +256,7 @@ class WanFunInpaintToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanFunInpaintToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/fun inpaint",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -288,7 +288,7 @@ class WanVaceToVideo(io.ComfyNode):
return io.Schema(
node_id="WanVaceToVideo",
search_aliases=["video conditioning", "video control"],
category="model/conditioning/video_models",
category="model/conditioning/wan/vace",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -375,7 +375,8 @@ class TrimVideoLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="TrimVideoLatent",
category="model/latent/video",
display_name="Trim Video Latent",
category="model/latent",
inputs=[
io.Latent.Input("samples"),
io.Int.Input("trim_amount", default=0, min=0, max=99999),
@ -398,7 +399,7 @@ class WanCameraImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanCameraImageToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/camera",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -452,7 +453,7 @@ class WanPhantomSubjectToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanPhantomSubjectToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/phantom subject",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -707,7 +708,7 @@ class WanTrackToVideo(io.ComfyNode):
return io.Schema(
node_id="WanTrackToVideo",
search_aliases=["motion tracking", "trajectory video", "point tracking", "keypoint animation"],
category="model/conditioning/video_models",
category="model/conditioning/wan/move",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -951,7 +952,7 @@ class WanSoundImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanSoundImageToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/sound",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -984,7 +985,7 @@ class WanSoundImageToVideoExtend(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanSoundImageToVideoExtend",
category="model/conditioning/video_models",
category="model/conditioning/wan/sound",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -1046,7 +1047,7 @@ class WanHuMoImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanHuMoImageToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/humo",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -1112,7 +1113,7 @@ class WanAnimateToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanAnimateToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/animate",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@ -1252,7 +1253,7 @@ class Wan22ImageToVideoLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Wan22ImageToVideoLatent",
category="model/conditioning/inpaint",
category="model/conditioning/wan",
inputs=[
io.Vae.Input("vae"),
io.Int.Input("width", default=1280, min=32, max=nodes.MAX_RESOLUTION, step=32),
@ -1302,7 +1303,7 @@ class WanInfiniteTalkToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanInfiniteTalkToVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/infinite talk",
inputs=[
io.DynamicCombo.Input("mode", options=[
io.DynamicCombo.Option("single_speaker", []),

View File

@ -713,7 +713,7 @@ class WanDancerEncodeAudio(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanDancerEncodeAudio",
category="model/conditioning/video_models",
category="model/conditioning/wan/dancer",
inputs=[
io.Audio.Input("audio"),
io.Int.Input("video_frames", default=149, min=1, max=nodes.MAX_RESOLUTION, step=4),
@ -787,7 +787,7 @@ class WanDancerVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanDancerVideo",
category="model/conditioning/video_models",
category="model/conditioning/wan/dancer",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),

Some files were not shown because too many files have changed in this diff Show More