mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-06 22:37:43 +08:00
Merge branch 'master' into trellis2
This commit is contained in:
commit
47ce08b67d
@ -89,3 +89,12 @@ rules:
|
||||
then:
|
||||
field: description
|
||||
function: truthy
|
||||
|
||||
overrides:
|
||||
# /ws uses HTTP 101 (Switching Protocols) — a legitimate response for a
|
||||
# WebSocket upgrade, but not a 2xx, so operation-success-response fires
|
||||
# as a false positive. OpenAPI 3.x has no native WebSocket support.
|
||||
- files:
|
||||
- "openapi.yaml#/paths/~1ws"
|
||||
rules:
|
||||
operation-success-response: off
|
||||
|
||||
@ -431,9 +431,10 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Adjusts image brightness and contrast using a real-time GPU fragment shader."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
}
|
||||
@ -162,7 +162,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Canny to Image (Z-Image-Turbo)",
|
||||
"name": "Canny to Image (Z-Image-Turbo)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -1553,7 +1553,8 @@
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"category": "Image generation and editing/Canny to image"
|
||||
"category": "Image generation and editing/Canny to image",
|
||||
"description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1574,4 +1575,4 @@
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -192,7 +192,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Canny to Video (LTX 2.0)",
|
||||
"name": "Canny to Video (LTX 2.0)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -3600,7 +3600,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Canny to video"
|
||||
"category": "Video generation and editing/Canny to video",
|
||||
"description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -3616,4 +3617,4 @@
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -377,8 +377,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Adds lens-style chromatic aberration (color fringing) using a real-time GPU fragment shader."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -596,7 +596,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Adjusts saturation, temperature, tint, and vibrance using a real-time GPU fragment shader."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@ -1129,7 +1129,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Balances colors across shadows, midtones, and highlights using a real-time GPU fragment shader."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@ -608,7 +608,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Fine-tunes tone and color with per-channel curve adjustments using a real-time GPU fragment shader."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
1412
blueprints/ControlNet (Z-Image-Turbo).json
Normal file
1412
blueprints/ControlNet (Z-Image-Turbo).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1609,7 +1609,8 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image Tools/Crop"
|
||||
"category": "Image Tools/Crop",
|
||||
"description": "Splits an image into a 2×2 grid of four equal tiles."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -2946,7 +2946,8 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image Tools/Crop"
|
||||
"category": "Image Tools/Crop",
|
||||
"description": "Splits an image into a 3×3 grid of nine equal tiles."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -1579,7 +1579,8 @@
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"category": "Image generation and editing/Depth to image"
|
||||
"category": "Image generation and editing/Depth to image",
|
||||
"description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning."
|
||||
},
|
||||
{
|
||||
"id": "458bdf3c-4b58-421c-af50-c9c663a4d74c",
|
||||
@ -2461,7 +2462,8 @@
|
||||
]
|
||||
},
|
||||
"workflowRendererVersion": "LG"
|
||||
}
|
||||
},
|
||||
"description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -4233,7 +4233,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Depth to video"
|
||||
"category": "Video generation and editing/Depth to video",
|
||||
"description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio."
|
||||
},
|
||||
{
|
||||
"id": "38b60539-50a7-42f9-a5fe-bdeca26272e2",
|
||||
@ -5192,7 +5193,8 @@
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
}
|
||||
},
|
||||
"description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -450,9 +450,10 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Blur"
|
||||
"category": "Image Tools/Blur",
|
||||
"description": "Applies bilateral (edge-preserving) blur to soften images while retaining detail."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
}
|
||||
@ -580,8 +580,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Adds procedural film grain texture for a cinematic look via GPU fragment shader."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -3350,7 +3350,8 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video generation and editing/First-Last-Frame to Video"
|
||||
"category": "Video generation and editing/First-Last-Frame to Video",
|
||||
"description": "Generates a video interpolating between first and last keyframes using LTX-2.3."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
3361
blueprints/First-Last-Frame to Video.json
Normal file
3361
blueprints/First-Last-Frame to Video.json
Normal file
File diff suppressed because it is too large
Load Diff
858
blueprints/Frame Interpolation.json
Normal file
858
blueprints/Frame Interpolation.json
Normal file
@ -0,0 +1,858 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 16,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 16,
|
||||
"type": "022693be-2baa-4009-870a-28921508a7ef",
|
||||
"pos": [
|
||||
-2990,
|
||||
-3240
|
||||
],
|
||||
"size": [
|
||||
410,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "multiplier",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "enable_fps_multiplier",
|
||||
"name": "value_1",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "value_1"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "VIDEO",
|
||||
"name": "VIDEO_1",
|
||||
"type": "VIDEO",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"9",
|
||||
"value"
|
||||
],
|
||||
[
|
||||
"13",
|
||||
"value"
|
||||
],
|
||||
[
|
||||
"1",
|
||||
"model_name"
|
||||
]
|
||||
],
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Frame Interpolation"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "022693be-2baa-4009-870a-28921508a7ef",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 17,
|
||||
"lastLinkId": 28,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Frame Interpolation",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-2810,
|
||||
-3070,
|
||||
159.7421875,
|
||||
120
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-1270,
|
||||
-3075,
|
||||
120,
|
||||
80
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "05e31c51-dcb6-4a1e-9651-1b9ad4f7a287",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
2
|
||||
],
|
||||
"localized_name": "video",
|
||||
"pos": [
|
||||
-2670.2578125,
|
||||
-3050
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "feecb409-7d1c-4a99-9c63-50c5fecdd3c9",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
22
|
||||
],
|
||||
"label": "multiplier",
|
||||
"pos": [
|
||||
-2670.2578125,
|
||||
-3030
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "0b8a861b-b581-4068-9e8c-f8d15daf1ca6",
|
||||
"name": "value_1",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
23
|
||||
],
|
||||
"label": "enable_fps_multiplier",
|
||||
"pos": [
|
||||
-2670.2578125,
|
||||
-3010
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "a22b101e-8773-4e17-a297-7ee3aae09162",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
24
|
||||
],
|
||||
"pos": [
|
||||
-2670.2578125,
|
||||
-2990
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "ef2ada05-d5aa-492a-9394-6c3e71e39ebb",
|
||||
"name": "VIDEO_1",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
26
|
||||
],
|
||||
"label": "VIDEO",
|
||||
"pos": [
|
||||
-1250,
|
||||
-3055
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "5aacc622-2a07-4983-b31c-e04461f7f953",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
28
|
||||
],
|
||||
"pos": [
|
||||
-1250,
|
||||
-3035
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "FrameInterpolationModelLoader",
|
||||
"pos": [
|
||||
-2510,
|
||||
-3370
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
90
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model_name",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": 24
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INTERP_MODEL",
|
||||
"name": "INTERP_MODEL",
|
||||
"type": "INTERP_MODEL",
|
||||
"links": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "FrameInterpolationModelLoader",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"models": [
|
||||
{
|
||||
"name": "film_net_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/frame_interpolation/resolve/main/frame_interpolation/film_net_fp16.safetensors",
|
||||
"directory": "frame_interpolation"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"film_net_fp16.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "FrameInterpolate",
|
||||
"pos": [
|
||||
-2040,
|
||||
-3370
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "interp_model",
|
||||
"name": "interp_model",
|
||||
"type": "INTERP_MODEL",
|
||||
"link": 1
|
||||
},
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 3
|
||||
},
|
||||
{
|
||||
"localized_name": "multiplier",
|
||||
"name": "multiplier",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "multiplier"
|
||||
},
|
||||
"link": 8
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
4,
|
||||
28
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "FrameInterpolate",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "CreateVideo",
|
||||
"pos": [
|
||||
-1600,
|
||||
-3370
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 4
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 5
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "fps"
|
||||
},
|
||||
"link": 12
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": [
|
||||
26
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CreateVideo",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
30
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
-2500,
|
||||
-2970
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
90
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 22
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
8,
|
||||
19
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Int (Multiplier)",
|
||||
"properties": {
|
||||
"Node name for S&R": "PrimitiveInt",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
2,
|
||||
"fixed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "ComfySwitchNode",
|
||||
"pos": [
|
||||
-1610,
|
||||
-3120
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
130
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "on_false",
|
||||
"name": "on_false",
|
||||
"type": "*",
|
||||
"link": 11
|
||||
},
|
||||
{
|
||||
"localized_name": "on_true",
|
||||
"name": "on_true",
|
||||
"type": "*",
|
||||
"link": 13
|
||||
},
|
||||
{
|
||||
"localized_name": "switch",
|
||||
"name": "switch",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "switch"
|
||||
},
|
||||
"link": 15
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "output",
|
||||
"name": "output",
|
||||
"type": "*",
|
||||
"links": [
|
||||
12
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfySwitchNode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
true
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "PrimitiveBoolean",
|
||||
"pos": [
|
||||
-2500,
|
||||
-2770
|
||||
],
|
||||
"size": [
|
||||
310,
|
||||
90
|
||||
],
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 23
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "BOOLEAN",
|
||||
"name": "BOOLEAN",
|
||||
"type": "BOOLEAN",
|
||||
"links": [
|
||||
15
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Boolean (Apply multiplier to FPS?)",
|
||||
"properties": {
|
||||
"Node name for S&R": "PrimitiveBoolean",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
true
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
-2500,
|
||||
-3170
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
100
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 2
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
5
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
11,
|
||||
18
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-2090,
|
||||
-3070
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
210
|
||||
],
|
||||
"flags": {
|
||||
"collapsed": false
|
||||
},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT",
|
||||
"link": 18
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": 19
|
||||
},
|
||||
{
|
||||
"label": "c",
|
||||
"localized_name": "values.c",
|
||||
"name": "values.c",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
"min(abs(b), 16) * a"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "INTERP_MODEL"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"origin_id": 9,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": 5,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 1,
|
||||
"target_id": 5,
|
||||
"target_slot": 1,
|
||||
"type": "AUDIO"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"origin_id": 10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 5,
|
||||
"target_slot": 2,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 2,
|
||||
"target_id": 10,
|
||||
"target_slot": 0,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 0,
|
||||
"target_id": 10,
|
||||
"target_slot": 1,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"origin_id": 13,
|
||||
"origin_slot": 0,
|
||||
"target_id": 10,
|
||||
"target_slot": 2,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 2,
|
||||
"target_id": 11,
|
||||
"target_slot": 0,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"origin_id": 9,
|
||||
"origin_slot": 0,
|
||||
"target_id": 11,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 3,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 9,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 13,
|
||||
"target_slot": 0,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 1,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"origin_id": 5,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video Tools",
|
||||
"description": "Increases video frame rate by synthesizing intermediate frames with a frame interpolation model."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
485
blueprints/Get Any Video Frame.json
Normal file
485
blueprints/Get Any Video Frame.json
Normal file
@ -0,0 +1,485 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 98,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 98,
|
||||
"type": "dca6e78d-fb06-421e-97f7-6ce17a665260",
|
||||
"pos": [
|
||||
-410,
|
||||
-2230
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
104
|
||||
],
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "frame_index",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Get Any Video Frame",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"100",
|
||||
"value"
|
||||
]
|
||||
]
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "dca6e78d-fb06-421e-97f7-6ce17a665260",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 1,
|
||||
"lastNodeId": 136,
|
||||
"lastLinkId": 302,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Get Any Video Frame",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
380,
|
||||
-57,
|
||||
120,
|
||||
80
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
1460,
|
||||
-57,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "2ceec378-8dcf-4340-8570-155967f59a93",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
4
|
||||
],
|
||||
"pos": [
|
||||
480,
|
||||
-37
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "819955f6-c686-4896-8032-ff2d0059109a",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
283
|
||||
],
|
||||
"label": "frame_index",
|
||||
"pos": [
|
||||
480,
|
||||
-17
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "1ab0684d-6a44-45b6-8aa4-a0b971a1d41e",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
5
|
||||
],
|
||||
"pos": [
|
||||
1480,
|
||||
-37
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
560,
|
||||
-150
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 4
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "GetImageSize",
|
||||
"pos": [
|
||||
560,
|
||||
50
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 1
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "width",
|
||||
"name": "width",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "height",
|
||||
"name": "height",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_size",
|
||||
"name": "batch_size",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
285
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSize"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "ImageFromBatch",
|
||||
"pos": [
|
||||
1130,
|
||||
-150
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 2
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_index",
|
||||
"name": "batch_index",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "batch_index"
|
||||
},
|
||||
"link": 286
|
||||
},
|
||||
{
|
||||
"localized_name": "length",
|
||||
"name": "length",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "length"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
5
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ImageFromBatch"
|
||||
},
|
||||
"widgets_values": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 99,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
910,
|
||||
100
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT",
|
||||
"link": 284
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": 285
|
||||
},
|
||||
{
|
||||
"label": "c",
|
||||
"localized_name": "values.c",
|
||||
"name": "values.c",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
286
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression"
|
||||
},
|
||||
"widgets_values": [
|
||||
"min(max(int(a if a >= 0 else b + a), 0), b - 1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 100,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
560,
|
||||
250
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 283
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
284
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "PrimitiveInt"
|
||||
},
|
||||
"widgets_values": [
|
||||
0,
|
||||
"fixed"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 3,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 1,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 283,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 100,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 284,
|
||||
"origin_id": 100,
|
||||
"origin_slot": 0,
|
||||
"target_id": 99,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 285,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 2,
|
||||
"target_id": 99,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 286,
|
||||
"origin_id": 99,
|
||||
"origin_slot": 1,
|
||||
"target_id": 3,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video Tools",
|
||||
"description": "Extracts one image frame from a video at a chosen index, with optional trim and FPS control."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 1.197015527856339,
|
||||
"offset": [
|
||||
-168.76833554248222,
|
||||
540.6638955283997
|
||||
]
|
||||
},
|
||||
"frontendVersion": "1.42.8"
|
||||
}
|
||||
}
|
||||
@ -575,8 +575,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Adds a glow/bloom effect around bright image areas via GPU fragment shader."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -752,8 +752,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Adjusts hue, saturation, and lightness of an image using a real-time GPU fragment shader."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -374,7 +374,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Blur"
|
||||
"category": "Image Tools/Blur",
|
||||
"description": "Applies Gaussian, Box, or Radial blur to soften images and create stylized depth or motion effects."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@ -310,7 +310,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Text generation/Image Captioning"
|
||||
"category": "Text generation/Image Captioning",
|
||||
"description": "Generates descriptive captions for images using Google's Gemini multimodal LLM."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@ -315,8 +315,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Manipulates individual RGBA channels for masking, compositing, and channel effects."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
2050
blueprints/Image Edit (Flux.2 Dev).json
Normal file
2050
blueprints/Image Edit (Flux.2 Dev).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1472,7 +1472,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Edit image"
|
||||
"category": "Image generation and editing/Edit image",
|
||||
"description": "Edits an input image via text instructions using FLUX.2 [klein] 4B."
|
||||
},
|
||||
{
|
||||
"id": "6007e698-2ebd-4917-84d8-299b35d7b7ab",
|
||||
@ -1821,7 +1822,8 @@
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
}
|
||||
},
|
||||
"description": "Applies reference image conditioning for style/identity transfer (Flux.2 Klein 4B)."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1837,4 +1839,4 @@
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
|
||||
@ -1417,7 +1417,8 @@
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image generation and editing/Edit image"
|
||||
"category": "Image generation and editing/Edit image",
|
||||
"description": "Edits images via text instructions using LongCat Image Edit, an instruction-following image editing diffusion model."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
1947
blueprints/Image Edit (Qwen 2509).json
Normal file
1947
blueprints/Image Edit (Qwen 2509).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -132,7 +132,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Image Edit (Qwen 2511)",
|
||||
"name": "Image Edit (Qwen 2511)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -1468,7 +1468,8 @@
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"category": "Image generation and editing/Edit image"
|
||||
"category": "Image generation and editing/Edit image",
|
||||
"description": "Edits images via text instructions using Qwen-Image-Edit-2511 with improved character consistency and integrated LoRA."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1489,4 +1490,4 @@
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -1188,7 +1188,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Inpaint image"
|
||||
"category": "Image generation and editing/Inpaint image",
|
||||
"description": "Inpaints masked image regions using Flux.1 fill [dev], Black Forest Labs' inpainting/outpainting model."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1202,4 +1203,4 @@
|
||||
},
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1548,7 +1548,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Inpaint image"
|
||||
"category": "Image generation and editing/Inpaint image",
|
||||
"description": "Inpaints masked regions using Qwen-Image, extending its multilingual text rendering to inpainting tasks."
|
||||
},
|
||||
{
|
||||
"id": "56a1f603-fbd2-40ed-94ef-c9ecbd96aca8",
|
||||
@ -1907,7 +1908,8 @@
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
}
|
||||
},
|
||||
"description": "Expands and softens mask edges to reduce visible seams after image processing."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -742,9 +742,10 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Color adjust"
|
||||
"category": "Image Tools/Color adjust",
|
||||
"description": "Adjusts black point, white point, and gamma for tonal range control via GPU shader."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
}
|
||||
@ -1919,7 +1919,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Outpaint image"
|
||||
"category": "Image generation and editing/Outpaint image",
|
||||
"description": "Outpaints beyond image boundaries using Qwen-Image's outpainting capabilities."
|
||||
},
|
||||
{
|
||||
"id": "f93c215e-c393-460e-9534-ed2c3d8a652e",
|
||||
@ -2278,7 +2279,8 @@
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
}
|
||||
},
|
||||
"description": "Expands and softens mask edges to reduce visible seams after image processing."
|
||||
},
|
||||
{
|
||||
"id": "2a4b2cc0-db37-4302-a067-da392f38f06b",
|
||||
@ -2733,7 +2735,8 @@
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
}
|
||||
},
|
||||
"description": "Scales both image and mask together while preserving alignment for editing workflows."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
714
blueprints/Image Segmentation (SAM3).json
Normal file
714
blueprints/Image Segmentation (SAM3).json
Normal file
@ -0,0 +1,714 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 99,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 99,
|
||||
"type": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
|
||||
"pos": [
|
||||
-1630,
|
||||
-3270
|
||||
],
|
||||
"size": [
|
||||
290,
|
||||
370
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "object",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "positive_coords",
|
||||
"type": "STRING",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "negative_coords",
|
||||
"type": "STRING",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "threshold"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"78",
|
||||
"text"
|
||||
],
|
||||
[
|
||||
"75",
|
||||
"threshold"
|
||||
],
|
||||
[
|
||||
"75",
|
||||
"refine_iterations"
|
||||
],
|
||||
[
|
||||
"75",
|
||||
"individual_masks"
|
||||
],
|
||||
[
|
||||
"77",
|
||||
"ckpt_name"
|
||||
]
|
||||
],
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"text": true
|
||||
},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Image Segmentation (SAM3)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 113,
|
||||
"lastLinkId": 283,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Image Segmentation (SAM3)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-2260,
|
||||
-3450,
|
||||
136.369140625,
|
||||
220
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-1130,
|
||||
-3305,
|
||||
120,
|
||||
80
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
264
|
||||
],
|
||||
"localized_name": "image",
|
||||
"label": "image",
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3430
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
265
|
||||
],
|
||||
"label": "object",
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3410
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
266
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3390
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899",
|
||||
"name": "positive_coords",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
267
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3370
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "c65f8b87-9bd7-48be-9fc2-823431e95019",
|
||||
"name": "negative_coords",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
268
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3350
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb",
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
269
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3330
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "b1439668-b050-490b-a5dc-fc4052c55666",
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
270
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3310
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "86e239e5-c098-4302-b54d-d42a38bc0f89",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
271
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3290
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "f9e0b9d4-b2f1-4907-a4a5-305656576706",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
272
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3270
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"linkIds": [
|
||||
231
|
||||
],
|
||||
"localized_name": "masks",
|
||||
"pos": [
|
||||
-1110,
|
||||
-3285
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "8f622e40-8528-4078-b7d3-147e9f872194",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
232
|
||||
],
|
||||
"localized_name": "bboxes",
|
||||
"pos": [
|
||||
-1110,
|
||||
-3265
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 75,
|
||||
"type": "SAM3_Detect",
|
||||
"pos": [
|
||||
-1470,
|
||||
-3460
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
260
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "model",
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 237
|
||||
},
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 264
|
||||
},
|
||||
{
|
||||
"label": "conditioning",
|
||||
"localized_name": "conditioning",
|
||||
"name": "conditioning",
|
||||
"shape": 7,
|
||||
"type": "CONDITIONING",
|
||||
"link": 200
|
||||
},
|
||||
{
|
||||
"label": "bboxes",
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"shape": 7,
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": 266
|
||||
},
|
||||
{
|
||||
"label": "positive_coords",
|
||||
"localized_name": "positive_coords",
|
||||
"name": "positive_coords",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"link": 267
|
||||
},
|
||||
{
|
||||
"label": "negative_coords",
|
||||
"localized_name": "negative_coords",
|
||||
"name": "negative_coords",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"link": 268
|
||||
},
|
||||
{
|
||||
"localized_name": "threshold",
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "threshold"
|
||||
},
|
||||
"link": 269
|
||||
},
|
||||
{
|
||||
"localized_name": "refine_iterations",
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": 270
|
||||
},
|
||||
{
|
||||
"localized_name": "individual_masks",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": 271
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
231
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": [
|
||||
232
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "SAM3_Detect",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
0.5,
|
||||
2,
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 77,
|
||||
"type": "CheckpointLoaderSimple",
|
||||
"pos": [
|
||||
-1970,
|
||||
-3200
|
||||
],
|
||||
"size": [
|
||||
330,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "ckpt_name",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": 272
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"links": [
|
||||
237
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "CLIP",
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
240
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "VAE",
|
||||
"name": "VAE",
|
||||
"type": "VAE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "CheckpointLoaderSimple",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"models": [
|
||||
{
|
||||
"name": "sam3.1_multiplex_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
|
||||
"directory": "checkpoints"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"sam3.1_multiplex_fp16.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 78,
|
||||
"type": "CLIPTextEncode",
|
||||
"pos": [
|
||||
-2000,
|
||||
-3000
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "clip",
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 240
|
||||
},
|
||||
{
|
||||
"localized_name": "text",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": 265
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CONDITIONING",
|
||||
"name": "CONDITIONING",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
200
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
""
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 237,
|
||||
"origin_id": 77,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 0,
|
||||
"type": "MODEL"
|
||||
},
|
||||
{
|
||||
"id": 200,
|
||||
"origin_id": 78,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 2,
|
||||
"type": "CONDITIONING"
|
||||
},
|
||||
{
|
||||
"id": 240,
|
||||
"origin_id": 77,
|
||||
"origin_slot": 1,
|
||||
"target_id": 78,
|
||||
"target_slot": 0,
|
||||
"type": "CLIP"
|
||||
},
|
||||
{
|
||||
"id": 231,
|
||||
"origin_id": 75,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 232,
|
||||
"origin_id": 75,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 265,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 78,
|
||||
"target_slot": 1,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 266,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 75,
|
||||
"target_slot": 3,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 267,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 75,
|
||||
"target_slot": 4,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 268,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 75,
|
||||
"target_slot": 5,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 269,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 75,
|
||||
"target_slot": 6,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 270,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 75,
|
||||
"target_slot": 7,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 271,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 75,
|
||||
"target_slot": 8,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 272,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 77,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image Tools/Image Segmentation",
|
||||
"description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
@ -141,7 +141,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Image Upscale(Z-image-Turbo)",
|
||||
"name": "Image Upscale (Z-image-Turbo)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -1302,7 +1302,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Enhance"
|
||||
"category": "Image generation and editing/Enhance",
|
||||
"description": "Upscales images to higher resolution using Z-Image-Turbo."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -99,7 +99,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Image to Depth Map (Lotus)",
|
||||
"name": "Image to Depth Map (Lotus)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -948,7 +948,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Depth to image"
|
||||
"category": "Image generation and editing/Depth to image",
|
||||
"description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -964,4 +965,4 @@
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -1586,7 +1586,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Image to layers"
|
||||
"category": "Image generation and editing/Image to layers",
|
||||
"description": "Decomposes an image into variable-resolution RGBA layers for independent editing using Qwen-Image-Layered."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -72,7 +72,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Image to Model (Hunyuan3d 2.1)",
|
||||
"name": "Image to 3D Model (Hunyuan3d 2.1)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -765,7 +765,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "3D/Image to 3D Model"
|
||||
"category": "3D/Image to 3D Model",
|
||||
"description": "Generates 3D mesh models from a single input image using Hunyuan3D 2.0/2.1."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -4223,7 +4223,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "Vue-corrected"
|
||||
},
|
||||
"category": "Video generation and editing/Image to video"
|
||||
"category": "Video generation and editing/Image to video",
|
||||
"description": "Generates video from a single input image using LTX-2.3."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -206,7 +206,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Image to Video (Wan 2.2)",
|
||||
"name": "Image to Video (Wan 2.2)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -2027,7 +2027,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Image to video"
|
||||
"category": "Video generation and editing/Image to video",
|
||||
"description": "Image-to-video with Wan 2.2 using a start image plus text prompt to extend motion from the still frame."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -134,7 +134,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Pose to Image (Z-Image-Turbo)",
|
||||
"name": "Pose to Image (Z-Image-Turbo)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -1298,7 +1298,8 @@
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"category": "Image generation and editing/Pose to image"
|
||||
"category": "Image generation and editing/Pose to image",
|
||||
"description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1319,4 +1320,4 @@
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -3870,7 +3870,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Pose to video"
|
||||
"category": "Video generation and editing/Pose to video",
|
||||
"description": "Generates video from pose reference frames using LTX-2, with optional synchronized audio."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -270,9 +270,10 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Text generation/Prompt enhance"
|
||||
"category": "Text generation/Prompt enhance",
|
||||
"description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
}
|
||||
397
blueprints/Remove Background (BiRefNet).json
Normal file
397
blueprints/Remove Background (BiRefNet).json
Normal file
@ -0,0 +1,397 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 19,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 19,
|
||||
"type": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
|
||||
"pos": [
|
||||
-6411.330578108367,
|
||||
1940.2638932730042
|
||||
],
|
||||
"size": [
|
||||
349.609375,
|
||||
145.9375
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "bg_removal_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "bg_removal_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "mask",
|
||||
"type": "MASK",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"14",
|
||||
"bg_removal_name"
|
||||
]
|
||||
]
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Remove Background (BiRefNet)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 21,
|
||||
"lastLinkId": 16,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Remove Background (BiRefNet)",
|
||||
"description": "Removes or replaces image backgrounds using BiRefNet segmentation and alpha compositing.",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-6728.534070722246,
|
||||
1475.2619799128663,
|
||||
150.9140625,
|
||||
88
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-6169.049695722246,
|
||||
1475.2619799128663,
|
||||
128,
|
||||
88
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "7bc321cd-df31-4c39-aaf7-7f0d01326189",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
5,
|
||||
7
|
||||
],
|
||||
"localized_name": "image",
|
||||
"pos": [
|
||||
-6601.620008222246,
|
||||
1499.2619799128663
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "e89d2cd8-daa3-4e29-8a69-851db85072cb",
|
||||
"name": "bg_removal_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
12
|
||||
],
|
||||
"pos": [
|
||||
-6601.620008222246,
|
||||
1519.2619799128663
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "16e7863c-4c38-46c2-aa74-e82991fbfe8d",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
8
|
||||
],
|
||||
"localized_name": "IMAGE",
|
||||
"pos": [
|
||||
-6145.049695722246,
|
||||
1499.2619799128663
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "f7240c19-5b80-406e-a8e2-9b12440ee2d6",
|
||||
"name": "mask",
|
||||
"type": "MASK",
|
||||
"linkIds": [
|
||||
11
|
||||
],
|
||||
"pos": [
|
||||
-6145.049695722246,
|
||||
1519.2619799128663
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 13,
|
||||
"type": "RemoveBackground",
|
||||
"pos": [
|
||||
-6536.764823982709,
|
||||
1444.9963409012412
|
||||
],
|
||||
"size": [
|
||||
302.25,
|
||||
72
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 5
|
||||
},
|
||||
{
|
||||
"localized_name": "bg_removal_model",
|
||||
"name": "bg_removal_model",
|
||||
"type": "BACKGROUND_REMOVAL",
|
||||
"link": 3
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "mask",
|
||||
"name": "mask",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
4,
|
||||
11
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "RemoveBackground"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "LoadBackgroundRemovalModel",
|
||||
"pos": [
|
||||
-6540.534070722246,
|
||||
1302.223464635445
|
||||
],
|
||||
"size": [
|
||||
311.484375,
|
||||
85.515625
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "bg_removal_name",
|
||||
"name": "bg_removal_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "bg_removal_name"
|
||||
},
|
||||
"link": 12
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "bg_model",
|
||||
"name": "bg_model",
|
||||
"type": "BACKGROUND_REMOVAL",
|
||||
"links": [
|
||||
3
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadBackgroundRemovalModel",
|
||||
"models": [
|
||||
{
|
||||
"name": "birefnet.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/BiRefNet/resolve/main/background_removal/birefnet.safetensors",
|
||||
"directory": "background_removal"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"birefnet.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"type": "InvertMask",
|
||||
"pos": [
|
||||
-6532.446160529669,
|
||||
1571.1111286839914
|
||||
],
|
||||
"size": [
|
||||
285.984375,
|
||||
48
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "mask",
|
||||
"name": "mask",
|
||||
"type": "MASK",
|
||||
"link": 4
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MASK",
|
||||
"name": "MASK",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
6
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "InvertMask"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"type": "JoinImageWithAlpha",
|
||||
"pos": [
|
||||
-6527.4370171636665,
|
||||
1674.3004951902876
|
||||
],
|
||||
"size": [
|
||||
284.96875,
|
||||
72
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 7
|
||||
},
|
||||
{
|
||||
"localized_name": "alpha",
|
||||
"name": "alpha",
|
||||
"type": "MASK",
|
||||
"link": 6
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
8
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "JoinImageWithAlpha"
|
||||
}
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 3,
|
||||
"origin_id": 14,
|
||||
"origin_slot": 0,
|
||||
"target_id": 13,
|
||||
"target_slot": 1,
|
||||
"type": "BACKGROUND_REMOVAL"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"origin_id": 13,
|
||||
"origin_slot": 0,
|
||||
"target_id": 15,
|
||||
"target_slot": 0,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"origin_id": 15,
|
||||
"origin_slot": 0,
|
||||
"target_id": 16,
|
||||
"target_slot": 1,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 13,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 16,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"origin_id": 16,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"origin_id": 13,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 14,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image generation and editing/Background Removal"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
@ -302,8 +302,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Sharpen"
|
||||
"category": "Image Tools/Sharpen",
|
||||
"description": "Sharpens image details using a GPU fragment shader for enhanced clarity."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -222,7 +222,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Text to Audio (ACE-Step 1.5)",
|
||||
"name": "Text to Audio (ACE-Step 1.5)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -1502,7 +1502,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Audio/Music generation"
|
||||
"category": "Audio/Music generation",
|
||||
"description": "Generates audio/music from text prompts using ACE-Step 1.5, a diffusion-based audio generation model."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1518,4 +1519,4 @@
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
2112
blueprints/Text to Image (Ernie Image Turbo).json
Normal file
2112
blueprints/Text to Image (Ernie Image Turbo).json
Normal file
File diff suppressed because it is too large
Load Diff
2190
blueprints/Text to Image (Ernie Image).json
Normal file
2190
blueprints/Text to Image (Ernie Image).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1029,7 +1029,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Text to image"
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "Generates images from prompts using FLUX.1 [dev]: a 12B rectified-flow MMDiT with dual CLIP plus T5-XXL text encoders and guidance-distilled sampling for sharp prompt following versus classic DDPM diffusion."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1043,4 +1044,4 @@
|
||||
},
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1023,7 +1023,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Text to image"
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "FLUX.1 Krea [dev] (Black Forest Labs × Krea): open-weight 12B rectified-flow text-to-image drop-in alongside FLUX.1 [dev], tuned away from overcooked saturation toward more natural diversity in people, realism, and style while keeping ecosystem compatibility."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1037,4 +1038,4 @@
|
||||
},
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
1870
blueprints/Text to Image (Flux.2 Dev).json
Normal file
1870
blueprints/Text to Image (Flux.2 Dev).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1104,7 +1104,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Text to image"
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "Generates images from text prompts using NetaYume Lumina, fine-tuned from Neta Lumina for anime-style and illustration generation."
|
||||
},
|
||||
{
|
||||
"id": "a07fdf06-1bda-4dac-bdbd-63ee8ebca1c9",
|
||||
@ -1458,11 +1459,12 @@
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
}
|
||||
},
|
||||
"description": "Encodes a negative text prompt via CLIP for classifier-free guidance in anime-style generation (NetaYume Lumina)."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1941,7 +1941,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "Vue-corrected"
|
||||
},
|
||||
"category": "Image generation and editing/Text to image"
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "Generates images from text prompts using Qwen-Image-2512, with enhanced human realism and finer natural detail over the base version."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -1873,7 +1873,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Text to image"
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "Generates images from text prompts using Qwen-Image, Alibaba's 20B MMDiT model with excellent multilingual text rendering."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
1184
blueprints/Text to Image (Z-Image-Base).json
Normal file
1184
blueprints/Text to Image (Z-Image-Base).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,22 +1,21 @@
|
||||
{
|
||||
"id": "1c3eaa76-5cfa-4dc7-8571-97a570324e01",
|
||||
"revision": 0,
|
||||
"last_node_id": 34,
|
||||
"last_link_id": 40,
|
||||
"last_node_id": 57,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 5,
|
||||
"type": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
|
||||
"id": 57,
|
||||
"type": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
|
||||
"pos": [
|
||||
-2.5766491043910378e-05,
|
||||
1229.999928629805
|
||||
130,
|
||||
200
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
470
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -44,6 +43,22 @@
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "seed",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "seed"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "steps",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "steps"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "unet_name",
|
||||
"type": "COMBO",
|
||||
@ -80,15 +95,15 @@
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"27",
|
||||
"text"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"13",
|
||||
"width"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"13",
|
||||
"height"
|
||||
],
|
||||
[
|
||||
@ -97,19 +112,23 @@
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"control_after_generate"
|
||||
"steps"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"28",
|
||||
"unet_name"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"30",
|
||||
"clip_name"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"29",
|
||||
"vae_name"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"control_after_generate"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
@ -122,48 +141,40 @@
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
"",
|
||||
1024,
|
||||
1024,
|
||||
null,
|
||||
null,
|
||||
"z_image_turbo_bf16.safetensors",
|
||||
"qwen_3_4b.safetensors",
|
||||
"ae.safetensors"
|
||||
]
|
||||
"widgets_values": [],
|
||||
"title": "Text to Image (Z-Image-Turbo)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"groups": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
|
||||
"id": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 4,
|
||||
"lastNodeId": 34,
|
||||
"lastLinkId": 40,
|
||||
"lastNodeId": 61,
|
||||
"lastLinkId": 75,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Text to Image (Z-Image-Turbo)",
|
||||
"name": "Text to Image (Z-Image-Turbo)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-80,
|
||||
425,
|
||||
-560,
|
||||
480,
|
||||
120,
|
||||
160
|
||||
200
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
1490,
|
||||
415,
|
||||
1670,
|
||||
320,
|
||||
120,
|
||||
60
|
||||
]
|
||||
@ -178,8 +189,8 @@
|
||||
],
|
||||
"label": "prompt",
|
||||
"pos": [
|
||||
20,
|
||||
445
|
||||
-460,
|
||||
500
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -190,8 +201,8 @@
|
||||
35
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
465
|
||||
-460,
|
||||
520
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -202,44 +213,68 @@
|
||||
36
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
485
|
||||
-460,
|
||||
540
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "23087d15-8412-4fbd-b71e-9b6d7ef76de1",
|
||||
"id": "f77677f7-6bf6-4c19-a71f-c4a553d5981e",
|
||||
"name": "seed",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
71
|
||||
],
|
||||
"pos": [
|
||||
-460,
|
||||
560
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ef9a9fb1-5983-4bc9-a60b-cf5aec48bff1",
|
||||
"name": "steps",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
72
|
||||
],
|
||||
"pos": [
|
||||
-460,
|
||||
580
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "a20a1b30-785f-4a04-bb6d-3d61adab9764",
|
||||
"name": "unet_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
38
|
||||
73
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
505
|
||||
-460,
|
||||
600
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "0677f5c3-2a3f-43d4-98ac-a4c56d5efdc0",
|
||||
"id": "4af8fc2b-4655-4086-8240-45f8cb38c6f6",
|
||||
"name": "clip_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
39
|
||||
74
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
525
|
||||
-460,
|
||||
620
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "c85c0445-2641-48b1-bbca-95057edf2fcf",
|
||||
"id": "4d518693-2807-439c-9cb6-cffd23ccba2c",
|
||||
"name": "vae_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
40
|
||||
75
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
545
|
||||
-460,
|
||||
640
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -253,8 +288,8 @@
|
||||
],
|
||||
"localized_name": "IMAGE",
|
||||
"pos": [
|
||||
1510,
|
||||
435
|
||||
1690,
|
||||
340
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -264,15 +299,15 @@
|
||||
"id": 30,
|
||||
"type": "CLIPLoader",
|
||||
"pos": [
|
||||
109.99997264844609,
|
||||
329.99999029608756
|
||||
30,
|
||||
420
|
||||
],
|
||||
"size": [
|
||||
269.9869791666667,
|
||||
106
|
||||
270,
|
||||
150
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -282,7 +317,7 @@
|
||||
"widget": {
|
||||
"name": "clip_name"
|
||||
},
|
||||
"link": 39
|
||||
"link": 74
|
||||
},
|
||||
{
|
||||
"localized_name": "type",
|
||||
@ -315,9 +350,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPLoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "CLIPLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "qwen_3_4b.safetensors",
|
||||
@ -343,15 +378,15 @@
|
||||
"id": 29,
|
||||
"type": "VAELoader",
|
||||
"pos": [
|
||||
109.99997264844609,
|
||||
479.9999847172637
|
||||
30,
|
||||
650
|
||||
],
|
||||
"size": [
|
||||
269.9869791666667,
|
||||
58
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -361,7 +396,7 @@
|
||||
"widget": {
|
||||
"name": "vae_name"
|
||||
},
|
||||
"link": 40
|
||||
"link": 75
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -375,9 +410,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAELoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "VAELoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "ae.safetensors",
|
||||
@ -401,12 +436,12 @@
|
||||
"id": 33,
|
||||
"type": "ConditioningZeroOut",
|
||||
"pos": [
|
||||
639.9999103333332,
|
||||
620.0000271257795
|
||||
630,
|
||||
960
|
||||
],
|
||||
"size": [
|
||||
204.134765625,
|
||||
26
|
||||
230,
|
||||
80
|
||||
],
|
||||
"flags": {},
|
||||
"order": 8,
|
||||
@ -430,9 +465,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ConditioningZeroOut",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "ConditioningZeroOut",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -440,22 +475,21 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "VAEDecode",
|
||||
"pos": [
|
||||
1219.9999088104782,
|
||||
160.00009184959066
|
||||
1320,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
209.98697916666669,
|
||||
46
|
||||
230,
|
||||
100
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -483,9 +517,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAEDecode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"Node name for S&R": "VAEDecode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -493,22 +527,21 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
"type": "UNETLoader",
|
||||
"pos": [
|
||||
109.99997264844609,
|
||||
200.0000502647102
|
||||
30,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
269.9869791666667,
|
||||
82
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -518,7 +551,7 @@
|
||||
"widget": {
|
||||
"name": "unet_name"
|
||||
},
|
||||
"link": 38
|
||||
"link": 73
|
||||
},
|
||||
{
|
||||
"localized_name": "weight_dtype",
|
||||
@ -541,9 +574,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "UNETLoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "UNETLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "z_image_turbo_bf16.safetensors",
|
||||
@ -568,15 +601,15 @@
|
||||
"id": 27,
|
||||
"type": "CLIPTextEncode",
|
||||
"pos": [
|
||||
429.99997828947767,
|
||||
200.0000502647102
|
||||
400,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
409.9869791666667,
|
||||
319.9869791666667
|
||||
450,
|
||||
650
|
||||
],
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -607,9 +640,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -626,15 +659,15 @@
|
||||
"id": 13,
|
||||
"type": "EmptySD3LatentImage",
|
||||
"pos": [
|
||||
109.99997264844609,
|
||||
629.9999791384399
|
||||
40,
|
||||
890
|
||||
],
|
||||
"size": [
|
||||
259.9869791666667,
|
||||
106
|
||||
260,
|
||||
170
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -677,9 +710,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "EmptySD3LatentImage",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"Node name for S&R": "EmptySD3LatentImage",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -694,19 +727,77 @@
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "ModelSamplingAuraFlow",
|
||||
"pos": [
|
||||
950,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
310,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 26
|
||||
},
|
||||
{
|
||||
"localized_name": "shift",
|
||||
"name": "shift",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "shift"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"slot_index": 0,
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ModelSamplingAuraFlow",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "KSampler",
|
||||
"pos": [
|
||||
879.9999615530063,
|
||||
269.9999774911694
|
||||
950,
|
||||
400
|
||||
],
|
||||
"size": [
|
||||
314.9869791666667,
|
||||
262
|
||||
320,
|
||||
350
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -740,7 +831,7 @@
|
||||
"widget": {
|
||||
"name": "seed"
|
||||
},
|
||||
"link": null
|
||||
"link": 71
|
||||
},
|
||||
{
|
||||
"localized_name": "steps",
|
||||
@ -749,7 +840,7 @@
|
||||
"widget": {
|
||||
"name": "steps"
|
||||
},
|
||||
"link": null
|
||||
"link": 72
|
||||
},
|
||||
{
|
||||
"localized_name": "cfg",
|
||||
@ -800,9 +891,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "KSampler",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"Node name for S&R": "KSampler",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -814,81 +905,23 @@
|
||||
"widgets_values": [
|
||||
0,
|
||||
"randomize",
|
||||
4,
|
||||
8,
|
||||
1,
|
||||
"res_multistep",
|
||||
"simple",
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "ModelSamplingAuraFlow",
|
||||
"pos": [
|
||||
879.9999615530063,
|
||||
160.00009184959066
|
||||
],
|
||||
"size": [
|
||||
309.9869791666667,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 26
|
||||
},
|
||||
{
|
||||
"localized_name": "shift",
|
||||
"name": "shift",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "shift"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"slot_index": 0,
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"Node name for S&R": "ModelSamplingAuraFlow",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
3
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Image size",
|
||||
"title": "Step2 - Image size",
|
||||
"bounding": [
|
||||
100,
|
||||
560,
|
||||
290,
|
||||
200
|
||||
10,
|
||||
820,
|
||||
320,
|
||||
280
|
||||
],
|
||||
"color": "#3f789e",
|
||||
"font_size": 24,
|
||||
@ -896,12 +929,12 @@
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Prompt",
|
||||
"title": "Step3 - Prompt",
|
||||
"bounding": [
|
||||
410,
|
||||
360,
|
||||
130,
|
||||
450,
|
||||
540
|
||||
530,
|
||||
970
|
||||
],
|
||||
"color": "#3f789e",
|
||||
"font_size": 24,
|
||||
@ -909,12 +942,12 @@
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Models",
|
||||
"title": "Step1 - Load models",
|
||||
"bounding": [
|
||||
100,
|
||||
0,
|
||||
130,
|
||||
290,
|
||||
413.6
|
||||
330,
|
||||
660
|
||||
],
|
||||
"color": "#3f789e",
|
||||
"font_size": 24,
|
||||
@ -1027,25 +1060,41 @@
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 38,
|
||||
"id": 71,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 3,
|
||||
"target_slot": 4,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 72,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 3,
|
||||
"target_slot": 5,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 73,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 28,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 39,
|
||||
"id": 74,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"origin_slot": 6,
|
||||
"target_id": 30,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"id": 75,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"origin_slot": 7,
|
||||
"target_id": 29,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
@ -1054,25 +1103,10 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Text to image"
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "Generates images from text prompts using Z-Image-Turbo, Alibaba's distilled 6B DiT model."
|
||||
}
|
||||
]
|
||||
},
|
||||
"config": {},
|
||||
"extra": {
|
||||
"frontendVersion": "1.37.10",
|
||||
"workflowRendererVersion": "LG",
|
||||
"VHS_latentpreview": false,
|
||||
"VHS_latentpreviewrate": 0,
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true,
|
||||
"ds": {
|
||||
"scale": 0.8401370345180755,
|
||||
"offset": [
|
||||
940.0587067393087,
|
||||
-830.7121087564725
|
||||
]
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
"extra": {}
|
||||
}
|
||||
1132
blueprints/Text to Image.json
Normal file
1132
blueprints/Text to Image.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -4286,7 +4286,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "Vue-corrected"
|
||||
},
|
||||
"category": "Video generation and editing/Text to video"
|
||||
"category": "Video generation and editing/Text to video",
|
||||
"description": "Generates video from text prompts using LTX-2.3, Lightricks' video diffusion model."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -1572,7 +1572,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Text to video"
|
||||
"category": "Video generation and editing/Text to video",
|
||||
"description": "Generates video from text prompts using Wan2.2, Alibaba's diffusion video model."
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -1586,4 +1587,4 @@
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -434,8 +434,9 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image Tools/Sharpen"
|
||||
"category": "Image Tools/Sharpen",
|
||||
"description": "Enhances edge contrast via unsharp masking for a sharper image appearance."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -307,7 +307,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Text generation/Video Captioning"
|
||||
"category": "Text generation/Video Captioning",
|
||||
"description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM."
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
@ -165,7 +165,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Video Inpaint(Wan2.1 VACE)",
|
||||
"name": "Video Inpaint (Wan 2.1 VACE)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -2368,7 +2368,8 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Inpaint video"
|
||||
"category": "Video generation and editing/Inpaint video",
|
||||
"description": "Inpaints masked regions in video frames using Wan 2.1 VACE."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
827
blueprints/Video Segmentation (SAM3).json
Normal file
827
blueprints/Video Segmentation (SAM3).json
Normal file
@ -0,0 +1,827 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 130,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 130,
|
||||
"type": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
|
||||
"pos": [
|
||||
-1210,
|
||||
-2780
|
||||
],
|
||||
"size": [
|
||||
300,
|
||||
370
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "positive_coords",
|
||||
"type": "STRING",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "negative_coords",
|
||||
"type": "STRING",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "threshold"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"125",
|
||||
"text"
|
||||
],
|
||||
[
|
||||
"126",
|
||||
"threshold"
|
||||
],
|
||||
[
|
||||
"126",
|
||||
"refine_iterations"
|
||||
],
|
||||
[
|
||||
"126",
|
||||
"individual_masks"
|
||||
],
|
||||
[
|
||||
"127",
|
||||
"ckpt_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Video Segmentation (SAM3)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 130,
|
||||
"lastLinkId": 299,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Video Segmentation (SAM3)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-2260,
|
||||
-3450,
|
||||
136.369140625,
|
||||
220
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-1050,
|
||||
-3510,
|
||||
120,
|
||||
120
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "680ffd88-32fe-48be-88d6-91ea44d5eaee",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
252
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3430
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ceaf249c-32d7-4624-8bf6-e590e347ed90",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
254
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3410
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1ffbff36-da0c-4854-8cb4-88ad31e64f99",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
255
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3390
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "67b7f4c7-cec0-4e00-b154-23cc1abf880e",
|
||||
"name": "positive_coords",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
256
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3370
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "b090a498-2bde-46b9-9554-18501401d687",
|
||||
"name": "negative_coords",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
257
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3350
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1a76dfcf-ce95-46af-bba5-c42160c683dd",
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
261
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3330
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "999523fa-c476-4c53-80c3-0a2f554d18ab",
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
262
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3310
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "d2371011-7fe5-4a39-b0c1-df2e0bbd6ece",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
263
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3290
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "675a8b37-17db-48d1-853c-2fe5d6a74582",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
273
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3270
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"linkIds": [
|
||||
231
|
||||
],
|
||||
"localized_name": "masks",
|
||||
"pos": [
|
||||
-1030,
|
||||
-3490
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "8f622e40-8528-4078-b7d3-147e9f872194",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
232
|
||||
],
|
||||
"localized_name": "bboxes",
|
||||
"pos": [
|
||||
-1030,
|
||||
-3470
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "6c9924ec-f0fa-4509-83ea-8f97f5889bcc",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"linkIds": [
|
||||
259
|
||||
],
|
||||
"pos": [
|
||||
-1030,
|
||||
-3450
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "82c1cddc-ab11-44eb-9e2f-1a5c7ea5645b",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
260
|
||||
],
|
||||
"pos": [
|
||||
-1030,
|
||||
-3430
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 125,
|
||||
"type": "CLIPTextEncode",
|
||||
"pos": [
|
||||
-2010,
|
||||
-3040
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "clip",
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 240
|
||||
},
|
||||
{
|
||||
"localized_name": "text",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": 254
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CONDITIONING",
|
||||
"name": "CONDITIONING",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
200
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 126,
|
||||
"type": "SAM3_Detect",
|
||||
"pos": [
|
||||
-1520,
|
||||
-3520
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
290
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "model",
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 237
|
||||
},
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 253
|
||||
},
|
||||
{
|
||||
"label": "conditioning",
|
||||
"localized_name": "conditioning",
|
||||
"name": "conditioning",
|
||||
"shape": 7,
|
||||
"type": "CONDITIONING",
|
||||
"link": 200
|
||||
},
|
||||
{
|
||||
"label": "bboxes",
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"shape": 7,
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": 255
|
||||
},
|
||||
{
|
||||
"label": "positive_coords",
|
||||
"localized_name": "positive_coords",
|
||||
"name": "positive_coords",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"link": 256
|
||||
},
|
||||
{
|
||||
"label": "negative_coords",
|
||||
"localized_name": "negative_coords",
|
||||
"name": "negative_coords",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"link": 257
|
||||
},
|
||||
{
|
||||
"localized_name": "threshold",
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "threshold"
|
||||
},
|
||||
"link": 261
|
||||
},
|
||||
{
|
||||
"localized_name": "refine_iterations",
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": 262
|
||||
},
|
||||
{
|
||||
"localized_name": "individual_masks",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": 263
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
231
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": [
|
||||
232
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SAM3_Detect",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
0.5,
|
||||
2,
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 127,
|
||||
"type": "CheckpointLoaderSimple",
|
||||
"pos": [
|
||||
-1970,
|
||||
-3310
|
||||
],
|
||||
"size": [
|
||||
330,
|
||||
160
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "ckpt_name",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": 273
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"links": [
|
||||
237
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "CLIP",
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
240
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "VAE",
|
||||
"name": "VAE",
|
||||
"type": "VAE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CheckpointLoaderSimple",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"models": [
|
||||
{
|
||||
"name": "sam3.1_multiplex_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
|
||||
"directory": "checkpoints"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"sam3.1_multiplex_fp16.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 128,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
-1910,
|
||||
-3540
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 252
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
253
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
259
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
260
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 129,
|
||||
"type": "Note",
|
||||
"pos": [
|
||||
-1980,
|
||||
-2790
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
250
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [],
|
||||
"title": "Note: Prompt format",
|
||||
"properties": {},
|
||||
"widgets_values": [
|
||||
"Max tokens for this model is only 32, to separately prompt multiple subjects you can separate prompts with comma, and set the max amount of objects detected for each prompt with :N\n\nFor example above test prompt finds 2 cakes, one apron, 4 window panels"
|
||||
],
|
||||
"color": "#432",
|
||||
"bgcolor": "#653"
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 237,
|
||||
"origin_id": 127,
|
||||
"origin_slot": 0,
|
||||
"target_id": 126,
|
||||
"target_slot": 0,
|
||||
"type": "MODEL"
|
||||
},
|
||||
{
|
||||
"id": 200,
|
||||
"origin_id": 125,
|
||||
"origin_slot": 0,
|
||||
"target_id": 126,
|
||||
"target_slot": 2,
|
||||
"type": "CONDITIONING"
|
||||
},
|
||||
{
|
||||
"id": 240,
|
||||
"origin_id": 127,
|
||||
"origin_slot": 1,
|
||||
"target_id": 125,
|
||||
"target_slot": 0,
|
||||
"type": "CLIP"
|
||||
},
|
||||
{
|
||||
"id": 231,
|
||||
"origin_id": 126,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 232,
|
||||
"origin_id": 126,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 252,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 128,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 253,
|
||||
"origin_id": 128,
|
||||
"origin_slot": 0,
|
||||
"target_id": 126,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 254,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 125,
|
||||
"target_slot": 1,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 255,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 126,
|
||||
"target_slot": 3,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 256,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 126,
|
||||
"target_slot": 4,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 257,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 126,
|
||||
"target_slot": 5,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 259,
|
||||
"origin_id": 128,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 2,
|
||||
"type": "AUDIO"
|
||||
},
|
||||
{
|
||||
"id": 260,
|
||||
"origin_id": 128,
|
||||
"origin_slot": 2,
|
||||
"target_id": -20,
|
||||
"target_slot": 3,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 261,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 126,
|
||||
"target_slot": 6,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 262,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 126,
|
||||
"target_slot": 7,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 263,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 126,
|
||||
"target_slot": 8,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 273,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 127,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video Tools",
|
||||
"description": "Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
@ -1,21 +1,21 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 84,
|
||||
"last_node_id": 85,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 84,
|
||||
"type": "8e8aa94a-647e-436d-8440-8ee4691864de",
|
||||
"id": 85,
|
||||
"type": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
|
||||
"pos": [
|
||||
-6100,
|
||||
2620
|
||||
-880,
|
||||
-2260
|
||||
],
|
||||
"size": [
|
||||
290,
|
||||
160
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -76,31 +76,26 @@
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"79",
|
||||
"direction"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"79",
|
||||
"match_image_size"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"79",
|
||||
"spacing_width"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"79",
|
||||
"spacing_color"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
"right",
|
||||
true,
|
||||
0,
|
||||
"white"
|
||||
],
|
||||
"widgets_values": [],
|
||||
"title": "Video Stitch"
|
||||
}
|
||||
],
|
||||
@ -109,12 +104,12 @@
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "8e8aa94a-647e-436d-8440-8ee4691864de",
|
||||
"id": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 1,
|
||||
"lastNodeId": 84,
|
||||
"lastLinkId": 262,
|
||||
"lastNodeId": 97,
|
||||
"lastLinkId": 282,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
@ -123,8 +118,8 @@
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-6580,
|
||||
2649,
|
||||
-6810,
|
||||
2580,
|
||||
143.55859375,
|
||||
160
|
||||
]
|
||||
@ -132,8 +127,8 @@
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-5720,
|
||||
2659,
|
||||
-4770,
|
||||
2600,
|
||||
120,
|
||||
60
|
||||
]
|
||||
@ -149,8 +144,8 @@
|
||||
"localized_name": "video",
|
||||
"label": "Before Video",
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2669
|
||||
-6686.44140625,
|
||||
2600
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -163,8 +158,8 @@
|
||||
"localized_name": "video_1",
|
||||
"label": "After Video",
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2689
|
||||
-6686.44140625,
|
||||
2620
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -175,8 +170,8 @@
|
||||
259
|
||||
],
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2709
|
||||
-6686.44140625,
|
||||
2640
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -187,8 +182,8 @@
|
||||
260
|
||||
],
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2729
|
||||
-6686.44140625,
|
||||
2660
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -199,8 +194,8 @@
|
||||
261
|
||||
],
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2749
|
||||
-6686.44140625,
|
||||
2680
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -211,8 +206,8 @@
|
||||
262
|
||||
],
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2769
|
||||
-6686.44140625,
|
||||
2700
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -226,8 +221,8 @@
|
||||
],
|
||||
"localized_name": "VIDEO",
|
||||
"pos": [
|
||||
-5700,
|
||||
2679
|
||||
-4750,
|
||||
2620
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -238,11 +233,11 @@
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
-6390,
|
||||
2560
|
||||
2600
|
||||
],
|
||||
"size": [
|
||||
193.530859375,
|
||||
66
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
@ -278,9 +273,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0",
|
||||
"Node name for S&R": "GetVideoComponents"
|
||||
"ver": "0.13.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -291,8 +286,8 @@
|
||||
2420
|
||||
],
|
||||
"size": [
|
||||
193.530859375,
|
||||
66
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
@ -332,21 +327,254 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0",
|
||||
"Node name for S&R": "GetVideoComponents"
|
||||
"ver": "0.13.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 90,
|
||||
"type": "GetImageSize",
|
||||
"pos": [
|
||||
-6390,
|
||||
3030
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 266
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "width",
|
||||
"name": "width",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
274
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "height",
|
||||
"name": "height",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
276
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_size",
|
||||
"name": "batch_size",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSize"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 80,
|
||||
"type": "CreateVideo",
|
||||
"pos": [
|
||||
-5190,
|
||||
2420
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
130
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 282
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 251
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "fps"
|
||||
},
|
||||
"link": 252
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": [
|
||||
255
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CreateVideo",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
30
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 95,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-6040,
|
||||
3020
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT",
|
||||
"link": 274
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
279
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression"
|
||||
},
|
||||
"widgets_values": [
|
||||
"a & ~1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 96,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-6040,
|
||||
3290
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT",
|
||||
"link": 276
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
280
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression"
|
||||
},
|
||||
"widgets_values": [
|
||||
"a & ~1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 79,
|
||||
"type": "ImageStitch",
|
||||
"pos": [
|
||||
-6390,
|
||||
2700
|
||||
2780
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
150
|
||||
160
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
@ -408,14 +636,15 @@
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
250
|
||||
266,
|
||||
281
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ImageStitch",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0",
|
||||
"Node name for S&R": "ImageStitch"
|
||||
"ver": "0.13.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
"right",
|
||||
@ -425,60 +654,91 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 80,
|
||||
"type": "CreateVideo",
|
||||
"id": 97,
|
||||
"type": "ResizeImageMaskNode",
|
||||
"pos": [
|
||||
-6040,
|
||||
2610
|
||||
-5560,
|
||||
2790
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
78
|
||||
160
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 250
|
||||
"localized_name": "input",
|
||||
"name": "input",
|
||||
"type": "IMAGE,MASK",
|
||||
"link": 281
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 251
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"localized_name": "resize_type",
|
||||
"name": "resize_type",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "fps"
|
||||
"name": "resize_type"
|
||||
},
|
||||
"link": 252
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "width",
|
||||
"name": "resize_type.width",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resize_type.width"
|
||||
},
|
||||
"link": 279
|
||||
},
|
||||
{
|
||||
"localized_name": "height",
|
||||
"name": "resize_type.height",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resize_type.height"
|
||||
},
|
||||
"link": 280
|
||||
},
|
||||
{
|
||||
"localized_name": "crop",
|
||||
"name": "resize_type.crop",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "resize_type.crop"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "scale_method",
|
||||
"name": "scale_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "scale_method"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"localized_name": "resized",
|
||||
"name": "resized",
|
||||
"type": "*",
|
||||
"links": [
|
||||
255
|
||||
282
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0",
|
||||
"Node name for S&R": "CreateVideo"
|
||||
"Node name for S&R": "ResizeImageMaskNode"
|
||||
},
|
||||
"widgets_values": [
|
||||
30
|
||||
"scale dimensions",
|
||||
512,
|
||||
512,
|
||||
"center",
|
||||
"area"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -500,14 +760,6 @@
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 250,
|
||||
"origin_id": 79,
|
||||
"origin_slot": 0,
|
||||
"target_id": 80,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 251,
|
||||
"origin_id": 77,
|
||||
@ -579,13 +831,71 @@
|
||||
"target_id": 79,
|
||||
"target_slot": 5,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 266,
|
||||
"origin_id": 79,
|
||||
"origin_slot": 0,
|
||||
"target_id": 90,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 274,
|
||||
"origin_id": 90,
|
||||
"origin_slot": 0,
|
||||
"target_id": 95,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 276,
|
||||
"origin_id": 90,
|
||||
"origin_slot": 1,
|
||||
"target_id": 96,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"origin_id": 95,
|
||||
"origin_slot": 1,
|
||||
"target_id": 97,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 280,
|
||||
"origin_id": 96,
|
||||
"origin_slot": 1,
|
||||
"target_id": 97,
|
||||
"target_slot": 3,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 281,
|
||||
"origin_id": 79,
|
||||
"origin_slot": 0,
|
||||
"target_id": 97,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 282,
|
||||
"origin_id": 97,
|
||||
"origin_slot": 0,
|
||||
"target_id": 80,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video Tools/Stitch videos"
|
||||
"category": "Video Tools/Stitch videos",
|
||||
"description": "Stitches multiple video clips into a single sequential video file."
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
@ -412,9 +412,10 @@
|
||||
"extra": {
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Enhance video"
|
||||
"category": "Video generation and editing/Enhance video",
|
||||
"description": "Upscales video to 4× resolution using a GAN-based upscaling model."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
}
|
||||
@ -47,7 +47,7 @@ class BackgroundRemovalModel():
|
||||
out = self.model(pixel_values=pixel_values)
|
||||
out = torch.nn.functional.interpolate(out, size=(H, W), mode="bicubic", antialias=False)
|
||||
|
||||
mask = out.sigmoid()
|
||||
mask = out.sigmoid().to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
if mask.ndim == 3:
|
||||
mask = mask.unsqueeze(0)
|
||||
if mask.shape[1] != 1:
|
||||
|
||||
@ -242,6 +242,7 @@ def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
@ -373,6 +374,7 @@ def sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
@ -686,6 +688,7 @@ def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=Non
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
|
||||
lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
|
||||
@ -747,6 +750,7 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
|
||||
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
@ -832,6 +836,7 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
|
||||
old_denoised = None
|
||||
h, h_last = None, None
|
||||
@ -889,6 +894,7 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
|
||||
denoised_1, denoised_2 = None, None
|
||||
h, h_1, h_2 = None, None, None
|
||||
@ -1006,23 +1012,39 @@ def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None,
|
||||
return generic_step_sampler(model, x, sigmas, extra_args, callback, disable, noise_sampler, DDPMSampler_step)
|
||||
|
||||
@torch.no_grad()
|
||||
def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
|
||||
def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, s_noise=1.0, s_noise_end=None, noise_clip_std=0.0):
|
||||
|
||||
# s_noise / s_noise_end: per-step noise multiplier, linearly interpolated across steps
|
||||
# noise_clip_std: clamp injected noise to +/- N stddevs (0 disables).
|
||||
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
n_steps = max(1, len(sigmas) - 1)
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
|
||||
s_start = float(s_noise)
|
||||
s_end = s_start if s_noise_end is None else float(s_noise_end)
|
||||
for i in trange(n_steps, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
if callback is not None:
|
||||
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
|
||||
|
||||
x = denoised
|
||||
if sigmas[i + 1] > 0:
|
||||
x = model.inner_model.inner_model.model_sampling.noise_scaling(sigmas[i + 1], noise_sampler(sigmas[i], sigmas[i + 1]), x)
|
||||
noise = noise_sampler(sigmas[i], sigmas[i + 1])
|
||||
if noise_clip_std > 0:
|
||||
clip_val = noise_clip_std * noise.std()
|
||||
noise = noise.clamp(min=-clip_val, max=clip_val)
|
||||
t = (i / (n_steps - 1)) if n_steps > 1 else 0.0
|
||||
s_noise_i = s_start + (s_end - s_start) * t
|
||||
if s_noise_i != 1.0:
|
||||
noise = noise * s_noise_i
|
||||
x = model_sampling.noise_scaling(sigmas[i + 1], noise, x)
|
||||
return x
|
||||
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
|
||||
# From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
|
||||
@ -1249,6 +1271,7 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
|
||||
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
|
||||
uncond_denoised = None
|
||||
|
||||
@ -1296,6 +1319,7 @@ def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
|
||||
temp = [0]
|
||||
def post_cfg_function(args):
|
||||
@ -1371,6 +1395,7 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
sigma_fn = lambda t: t.neg().exp()
|
||||
t_fn = lambda sigma: sigma.log().neg()
|
||||
@ -1504,6 +1529,7 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
|
||||
def default_er_sde_noise_scaler(x):
|
||||
@ -1574,9 +1600,10 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
@ -1645,9 +1672,10 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
@ -1713,6 +1741,7 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)
|
||||
|
||||
|
||||
@ -794,6 +794,13 @@ class ZImagePixelSpace(ChromaRadiance):
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class HiDreamO1Pixel(ChromaRadiance):
|
||||
"""Pixel-space latent format for HiDream-O1.
|
||||
No VAE — model patches/unpatches raw RGB internally with patch_size=32.
|
||||
"""
|
||||
pass
|
||||
|
||||
class CogVideoX(LatentFormat):
|
||||
"""Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).
|
||||
|
||||
|
||||
41
comfy/ldm/hidream_o1/attention.py
Normal file
41
comfy/ldm/hidream_o1/attention.py
Normal file
@ -0,0 +1,41 @@
|
||||
"""HiDream-O1 two-pass attention: tokens [0, ar_len) are causal, [ar_len, T)
|
||||
attend full K/V. Splitting Q at the boundary avoids the (B, 1, T, T) additive
|
||||
mask the general-purpose path would build (~500 MB at T~16K) and lets the
|
||||
gen half hit the user's preferred backend via optimized_attention.
|
||||
"""
|
||||
|
||||
import torch
|
||||
|
||||
import comfy.ops
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
|
||||
|
||||
def make_two_pass_attention(ar_len: int, transformer_options=None):
|
||||
"""Build a two-pass attention callable. AR pass uses SDPA-causal directly, gen pass routes through optimized_attention.
|
||||
The AR pass goes through SDPA directand bypasses wrappers, it is only ~1% of T at typical edit sizes.
|
||||
"""
|
||||
|
||||
def two_pass_attention(q, k, v, heads, **kwargs):
|
||||
B, H, T, D = q.shape
|
||||
|
||||
if T < k.shape[2]: # KV-cache hot path: Q is shorter than K/V (cached AR prefix is in K/V only), all fresh Q positions are in the gen region, single full-attention call
|
||||
out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
|
||||
elif ar_len >= T:
|
||||
out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
|
||||
elif ar_len <= 0:
|
||||
out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
|
||||
else:
|
||||
out_ar = comfy.ops.scaled_dot_product_attention(
|
||||
q[:, :, :ar_len], k[:, :, :ar_len], v[:, :, :ar_len],
|
||||
attn_mask=None, dropout_p=0.0, is_causal=True,
|
||||
)
|
||||
out_gen = optimized_attention(
|
||||
q[:, :, ar_len:], k, v, heads,
|
||||
mask=None, skip_reshape=True, skip_output_reshape=True,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
out = torch.cat([out_ar, out_gen], dim=2)
|
||||
|
||||
return out.transpose(1, 2).reshape(B, T, H * D)
|
||||
|
||||
return two_pass_attention
|
||||
230
comfy/ldm/hidream_o1/conditioning.py
Normal file
230
comfy/ldm/hidream_o1/conditioning.py
Normal file
@ -0,0 +1,230 @@
|
||||
"""HiDream-O1 conditioning prep — ref-image dual path + extra_conds assembly.
|
||||
|
||||
Each ref image goes through two paths: a 32x32 patchified stream concatenated
|
||||
to the noised target, and a Qwen3-VL ViT path producing tokens that scatter
|
||||
into input_ids at <|image_pad|> positions.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
|
||||
import comfy.utils
|
||||
from comfy.text_encoders.qwen_vl import process_qwen2vl_images
|
||||
|
||||
from .utils import (PATCH_SIZE, calculate_dimensions, cond_image_size, ref_max_size, resize_tensor)
|
||||
|
||||
# Qwen3-VL ViT preprocessing constants (preprocessor_config.json).
|
||||
VIT_PATCH = 16
|
||||
VIT_MERGE = 2
|
||||
VIT_IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||
VIT_IMAGE_STD = [0.5, 0.5, 0.5]
|
||||
|
||||
|
||||
def prepare_ref_images(
|
||||
ref_images: List[torch.Tensor],
|
||||
target_h: int,
|
||||
target_w: int,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
"""Build the dual-path tensors for K reference images at (target_h, target_w).
|
||||
|
||||
Returns None for K=0, else a dict with ref_patches, ref_pixel_values,
|
||||
ref_image_grid_thw, per_ref_vit_tokens, per_ref_patch_grids.
|
||||
"""
|
||||
K = len(ref_images)
|
||||
if K == 0:
|
||||
return None
|
||||
max_size = ref_max_size(max(target_h, target_w), K)
|
||||
cis = cond_image_size(K)
|
||||
|
||||
refs_t = [img[0].clamp(0, 1).permute(2, 0, 1).unsqueeze(0).contiguous().float() for img in ref_images]
|
||||
refs_t = [resize_tensor(t, max_size, PATCH_SIZE) for t in refs_t]
|
||||
|
||||
# 32-patch path.
|
||||
ref_patches_per = []
|
||||
per_ref_patch_grids = []
|
||||
for t in refs_t:
|
||||
t_norm = (t.squeeze(0) - 0.5) / 0.5 # (3, H, W) in [-1, 1]
|
||||
h_p, w_p = t_norm.shape[-2] // PATCH_SIZE, t_norm.shape[-1] // PATCH_SIZE
|
||||
per_ref_patch_grids.append((h_p, w_p))
|
||||
patches = (
|
||||
t_norm.reshape(3, h_p, PATCH_SIZE, w_p, PATCH_SIZE)
|
||||
.permute(1, 3, 0, 2, 4)
|
||||
.reshape(h_p * w_p, 3 * PATCH_SIZE * PATCH_SIZE)
|
||||
)
|
||||
ref_patches_per.append(patches)
|
||||
ref_patches = torch.cat(ref_patches_per, dim=0).unsqueeze(0).to(device=device, dtype=dtype)
|
||||
|
||||
# ViT path.
|
||||
refs_vlm_t = []
|
||||
for t in refs_t:
|
||||
_, _, h, w = t.shape
|
||||
cond_w, cond_h = calculate_dimensions(cis, w / h)
|
||||
cond_w = max(cond_w, VIT_PATCH * VIT_MERGE)
|
||||
cond_h = max(cond_h, VIT_PATCH * VIT_MERGE)
|
||||
refs_vlm_t.append(comfy.utils.common_upscale(t, cond_w, cond_h, "lanczos", "disabled"))
|
||||
|
||||
pv_list, grid_list, per_ref_vit_tokens = [], [], []
|
||||
for t_v in refs_vlm_t:
|
||||
pv, grid_thw = process_qwen2vl_images(
|
||||
t_v.permute(0, 2, 3, 1),
|
||||
min_pixels=0, max_pixels=10**12,
|
||||
patch_size=VIT_PATCH, merge_size=VIT_MERGE,
|
||||
image_mean=VIT_IMAGE_MEAN, image_std=VIT_IMAGE_STD,
|
||||
)
|
||||
grid_thw = grid_thw[0]
|
||||
pv_list.append(pv.to(device=device, dtype=dtype))
|
||||
grid_list.append(grid_thw.to(device=device))
|
||||
# Post-merge token count = number of <|image_pad|> tokens this image expands to in input_ids.
|
||||
gh, gw = int(grid_thw[1].item()), int(grid_thw[2].item())
|
||||
per_ref_vit_tokens.append((gh // VIT_MERGE) * (gw // VIT_MERGE))
|
||||
|
||||
return {
|
||||
"ref_patches": ref_patches,
|
||||
"ref_pixel_values": torch.cat(pv_list, dim=0),
|
||||
"ref_image_grid_thw": torch.stack(grid_list, dim=0),
|
||||
"per_ref_vit_tokens": per_ref_vit_tokens,
|
||||
"per_ref_patch_grids": per_ref_patch_grids,
|
||||
}
|
||||
|
||||
|
||||
def build_ref_input_ids(
|
||||
text_input_ids: torch.Tensor,
|
||||
per_ref_vit_tokens: List[int],
|
||||
image_token_id: int,
|
||||
vision_start_id: int,
|
||||
vision_end_id: int,
|
||||
):
|
||||
"""Splice [vision_start, image_pad*N, vision_end] blocks into input_ids
|
||||
after the [im_start, user, \\n] prefix (matches original chat template).
|
||||
"""
|
||||
ids = text_input_ids[0].tolist()
|
||||
inserted = []
|
||||
for n_pad in per_ref_vit_tokens:
|
||||
inserted.extend([vision_start_id] + [image_token_id] * n_pad + [vision_end_id])
|
||||
new_ids = ids[:3] + inserted + ids[3:] # 3 = len([im_start, user, \n])
|
||||
return torch.tensor([new_ids], dtype=text_input_ids.dtype, device=text_input_ids.device)
|
||||
|
||||
|
||||
def build_extra_conds(
|
||||
text_input_ids: torch.Tensor,
|
||||
noise: torch.Tensor,
|
||||
ref_images: List[torch.Tensor] = None,
|
||||
target_patch_size: int = 32,
|
||||
):
|
||||
"""Assemble all conditioning tensors for HiDreamO1Transformer.forward:
|
||||
input_ids (with ref-vision tokens spliced in for the edit/IP path),
|
||||
position_ids (MRoPE), token_types, vinput_mask, plus the ref
|
||||
dual-path tensors when refs are provided.
|
||||
"""
|
||||
from .utils import get_rope_index_fix_point
|
||||
from comfy.text_encoders.hidream_o1 import (
|
||||
IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
|
||||
)
|
||||
|
||||
if text_input_ids.dim() == 1:
|
||||
text_input_ids = text_input_ids.unsqueeze(0)
|
||||
text_input_ids = text_input_ids.long().to(noise.device)
|
||||
B = noise.shape[0]
|
||||
if text_input_ids.shape[0] == 1 and B > 1:
|
||||
text_input_ids = text_input_ids.expand(B, -1)
|
||||
|
||||
H, W = noise.shape[-2], noise.shape[-1]
|
||||
h_p, w_p = H // target_patch_size, W // target_patch_size
|
||||
image_len = h_p * w_p
|
||||
image_grid_thw_tgt = torch.tensor(
|
||||
[[1, h_p, w_p]], dtype=torch.long, device=text_input_ids.device,
|
||||
)
|
||||
|
||||
out = {}
|
||||
if ref_images:
|
||||
ref = prepare_ref_images(ref_images, H, W, device=noise.device, dtype=noise.dtype)
|
||||
text_input_ids = build_ref_input_ids(
|
||||
text_input_ids, ref["per_ref_vit_tokens"],
|
||||
IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
|
||||
)
|
||||
new_txt_len = text_input_ids.shape[1]
|
||||
|
||||
# Each ref's patchified stream gets a [vision_start, image_pad*N-1]
|
||||
# block in the position-id stream after the noised target.
|
||||
ref_grid_lengths = [hp * wp for (hp, wp) in ref["per_ref_patch_grids"]]
|
||||
tgt_vision = torch.full((1, image_len), IMAGE_TOKEN_ID,
|
||||
dtype=text_input_ids.dtype, device=text_input_ids.device)
|
||||
tgt_vision[:, 0] = VISION_START_ID
|
||||
ref_vision_blocks = []
|
||||
for rl in ref_grid_lengths:
|
||||
blk = torch.full((1, rl), IMAGE_TOKEN_ID,
|
||||
dtype=text_input_ids.dtype, device=text_input_ids.device)
|
||||
blk[:, 0] = VISION_START_ID
|
||||
ref_vision_blocks.append(blk)
|
||||
ref_vision_cat = torch.cat([tgt_vision] + ref_vision_blocks, dim=1)
|
||||
input_ids_pad = torch.cat([text_input_ids, ref_vision_cat], dim=-1)
|
||||
total_ref_patches_len = sum(ref_grid_lengths)
|
||||
total_len = new_txt_len + image_len + total_ref_patches_len
|
||||
|
||||
# K (ViT, post-merge) + 1 (target) + K (ref-patches) image grids.
|
||||
K = len(ref_images)
|
||||
igthw_cond = ref["ref_image_grid_thw"].clone()
|
||||
igthw_cond[:, 1] //= 2
|
||||
igthw_cond[:, 2] //= 2
|
||||
image_grid_thw_ref = torch.tensor(
|
||||
[[1, hp, wp] for (hp, wp) in ref["per_ref_patch_grids"]],
|
||||
dtype=torch.long, device=text_input_ids.device,
|
||||
)
|
||||
igthw_all = torch.cat([
|
||||
igthw_cond.to(text_input_ids.device),
|
||||
image_grid_thw_tgt,
|
||||
image_grid_thw_ref,
|
||||
], dim=0)
|
||||
position_ids, _ = get_rope_index_fix_point(
|
||||
spatial_merge_size=1,
|
||||
image_token_id=IMAGE_TOKEN_ID,
|
||||
vision_start_token_id=VISION_START_ID,
|
||||
input_ids=input_ids_pad, image_grid_thw=igthw_all,
|
||||
attention_mask=None,
|
||||
skip_vision_start_token=[0] * K + [1] + [1] * K,
|
||||
fix_point=4096,
|
||||
)
|
||||
|
||||
# tms + target_image + ref_patches are all gen.
|
||||
tms_pos = new_txt_len - 1
|
||||
ar_len = tms_pos
|
||||
token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
|
||||
token_types[:, tms_pos:] = 1
|
||||
vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
|
||||
vinput_mask[:, new_txt_len:] = True
|
||||
|
||||
# Leading batch dim sidesteps CONDRegular.process_cond's repeat_to_batch_size truncation
|
||||
out["ref_pixel_values"] = ref["ref_pixel_values"].unsqueeze(0)
|
||||
out["ref_image_grid_thw"] = ref["ref_image_grid_thw"].unsqueeze(0)
|
||||
out["ref_patches"] = ref["ref_patches"]
|
||||
else:
|
||||
# T2I: text + noised target only, vision_start replaces the first image token
|
||||
txt_len = text_input_ids.shape[1]
|
||||
total_len = txt_len + image_len
|
||||
vision_tokens = torch.full((B, image_len), IMAGE_TOKEN_ID,
|
||||
dtype=text_input_ids.dtype, device=text_input_ids.device)
|
||||
vision_tokens[:, 0] = VISION_START_ID
|
||||
input_ids_pad = torch.cat([text_input_ids, vision_tokens], dim=-1)
|
||||
position_ids, _ = get_rope_index_fix_point(
|
||||
spatial_merge_size=1,
|
||||
image_token_id=IMAGE_TOKEN_ID,
|
||||
vision_start_token_id=VISION_START_ID,
|
||||
input_ids=input_ids_pad, image_grid_thw=image_grid_thw_tgt,
|
||||
attention_mask=None,
|
||||
skip_vision_start_token=[1],
|
||||
)
|
||||
ar_len = txt_len - 1
|
||||
token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
|
||||
token_types[:, ar_len:] = 1
|
||||
vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
|
||||
vinput_mask[:, txt_len:] = True
|
||||
|
||||
out["input_ids"] = text_input_ids
|
||||
out["position_ids"] = position_ids[:, 0].unsqueeze(0) # Collapse position_ids batch and add a leading dim so CONDRegular's batch-resize doesn't truncate the 3-axis MRoPE dim
|
||||
out["token_types"] = token_types
|
||||
out["vinput_mask"] = vinput_mask
|
||||
out["ar_len"] = ar_len
|
||||
return out
|
||||
306
comfy/ldm/hidream_o1/model.py
Normal file
306
comfy/ldm/hidream_o1/model.py
Normal file
@ -0,0 +1,306 @@
|
||||
"""HiDream-O1-Image transformer.
|
||||
|
||||
Pixel-space DiT built on Qwen3-VL: the vision tower (Qwen35VisionModel)
|
||||
encodes ref images, the Qwen3-VL-8B decoder (Llama2_ with interleaved MRoPE)
|
||||
processes a unified text+image sequence, and 32x32 patch embed/unembed
|
||||
shims map raw RGB in and out of LLM hidden space. The Qwen3-VL deepstack
|
||||
mergers go unused — their weights are dropped at load.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
|
||||
import einops
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
import comfy.patcher_extension
|
||||
from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
|
||||
from comfy.text_encoders.llama import Llama2_
|
||||
from comfy.text_encoders.qwen35 import Qwen35VisionModel
|
||||
|
||||
from .attention import make_two_pass_attention
|
||||
|
||||
|
||||
IMAGE_TOKEN_ID = 151655 # Qwen3-VL <|image_pad|>
|
||||
TMS_TOKEN_ID = 151673 # HiDream-O1 <|tms_token|>
|
||||
PATCH_SIZE = 32
|
||||
|
||||
|
||||
@dataclass
|
||||
class HiDreamO1TextConfig:
|
||||
"""Qwen3-VL-8B text-decoder dims (matches public Qwen3-VL-8B-Instruct)."""
|
||||
vocab_size: int = 151936
|
||||
hidden_size: int = 4096
|
||||
intermediate_size: int = 12288
|
||||
num_hidden_layers: int = 36
|
||||
num_attention_heads: int = 32
|
||||
num_key_value_heads: int = 8
|
||||
head_dim: int = 128
|
||||
max_position_embeddings: int = 128000
|
||||
rms_norm_eps: float = 1e-6
|
||||
rope_theta: float = 5000000.0
|
||||
rope_scale: Optional[float] = None
|
||||
rope_dims: List[int] = field(default_factory=lambda: [24, 20, 20])
|
||||
interleaved_mrope: bool = True
|
||||
transformer_type: str = "llama"
|
||||
rms_norm_add: bool = False
|
||||
mlp_activation: str = "silu"
|
||||
qkv_bias: bool = False
|
||||
q_norm: str = "gemma3"
|
||||
k_norm: str = "gemma3"
|
||||
final_norm: bool = True
|
||||
lm_head: bool = False
|
||||
stop_tokens: List[int] = field(default_factory=lambda: [151643, 151645])
|
||||
|
||||
|
||||
QWEN3VL_VISION_DEFAULTS = dict(
|
||||
hidden_size=1152,
|
||||
num_heads=16,
|
||||
intermediate_size=4304,
|
||||
depth=27,
|
||||
patch_size=16,
|
||||
temporal_patch_size=2,
|
||||
in_channels=3,
|
||||
spatial_merge_size=2,
|
||||
num_position_embeddings=2304,
|
||||
deepstack_visual_indexes=(8, 16, 24),
|
||||
out_hidden_size=4096, # final merger projects directly into LLM hidden
|
||||
)
|
||||
|
||||
|
||||
class BottleneckPatchEmbed(nn.Module):
|
||||
# 3072 -> 1024 -> 4096 (raw 32x32 RGB patch -> bottleneck -> LLM hidden).
|
||||
def __init__(self, patch_size=32, in_chans=3, pca_dim=1024, embed_dim=4096, bias=True, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
self.proj1 = ops.Linear(patch_size * patch_size * in_chans, pca_dim, bias=False, device=device, dtype=dtype)
|
||||
self.proj2 = ops.Linear(pca_dim, embed_dim, bias=bias, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
return self.proj2(self.proj1(x))
|
||||
|
||||
|
||||
class FinalLayer(nn.Module):
|
||||
# 4096 -> 3072 (LLM hidden -> flat pixel patch).
|
||||
def __init__(self, hidden_size, patch_size=32, out_channels=3, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
self.linear = ops.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear(x)
|
||||
|
||||
|
||||
class HiDreamO1Transformer(nn.Module):
|
||||
"""HiDream-O1 unified pixel-level transformer."""
|
||||
|
||||
def __init__(self, image_model=None, dtype=None, device=None, operations=None,
|
||||
text_config_overrides=None, vision_config_overrides=None, **kwargs):
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
|
||||
text_cfg = HiDreamO1TextConfig(**(text_config_overrides or {}))
|
||||
vision_cfg = dict(QWEN3VL_VISION_DEFAULTS)
|
||||
if vision_config_overrides:
|
||||
vision_cfg.update(vision_config_overrides)
|
||||
vision_cfg["out_hidden_size"] = text_cfg.hidden_size
|
||||
|
||||
self.text_config = text_cfg
|
||||
self.vision_config = vision_cfg
|
||||
self.hidden_size = text_cfg.hidden_size
|
||||
self.patch_size = PATCH_SIZE
|
||||
self.in_channels = 3
|
||||
self.tms_token_id = TMS_TOKEN_ID
|
||||
|
||||
self.visual = Qwen35VisionModel(vision_cfg, device=device, dtype=dtype, ops=operations)
|
||||
self.language_model = Llama2_(text_cfg, device=device, dtype=dtype, ops=operations)
|
||||
self.t_embedder1 = TimestepEmbedder(
|
||||
text_cfg.hidden_size, device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
self.x_embedder = BottleneckPatchEmbed(
|
||||
patch_size=self.patch_size, in_chans=self.in_channels,
|
||||
pca_dim=text_cfg.hidden_size // 4, embed_dim=text_cfg.hidden_size,
|
||||
bias=True, device=device, dtype=dtype, ops=operations,
|
||||
)
|
||||
self.final_layer2 = FinalLayer(
|
||||
text_cfg.hidden_size, patch_size=self.patch_size,
|
||||
out_channels=self.in_channels, device=device, dtype=dtype, ops=operations,
|
||||
)
|
||||
|
||||
self._visual_cache = None
|
||||
self._kv_cache_entries = []
|
||||
|
||||
def clear_kv_cache(self):
|
||||
self._kv_cache_entries = []
|
||||
self._visual_cache = None
|
||||
|
||||
def forward(self, x, timesteps, context=None, transformer_options={}, **kwargs):
|
||||
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
|
||||
self._forward,
|
||||
self,
|
||||
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
|
||||
).execute(x, timesteps, context, transformer_options, **kwargs)
|
||||
|
||||
def _forward(self, x, timesteps, context=None, transformer_options={}, input_ids=None, attention_mask=None, position_ids=None,
|
||||
vinput_mask=None, ar_len=None, ref_pixel_values=None, ref_image_grid_thw=None, ref_patches=None, **kwargs):
|
||||
"""Returns flow-match velocity (x - x_pred) / sigma"""
|
||||
|
||||
if input_ids is None or position_ids is None:
|
||||
raise ValueError("HiDreamO1Transformer requires input_ids and position_ids in conditioning")
|
||||
|
||||
B, _, H, W = x.shape
|
||||
h_p, w_p = H // self.patch_size, W // self.patch_size
|
||||
tgt_image_len = h_p * w_p
|
||||
|
||||
z = einops.rearrange(
|
||||
x, 'B C (H p1) (W p2) -> B (H W) (C p1 p2)',
|
||||
p1=self.patch_size, p2=self.patch_size,
|
||||
)
|
||||
vinputs = torch.cat([z, ref_patches.to(z.dtype)], dim=1) if ref_patches is not None else z
|
||||
|
||||
inputs_embeds = self.language_model.embed_tokens(input_ids).to(x.dtype)
|
||||
|
||||
if ref_pixel_values is not None and ref_image_grid_thw is not None:
|
||||
# ViT output is constant across sampling steps within a generation
|
||||
# identity-key by the input tensor so refs don't recompute every step.
|
||||
cached = self._visual_cache
|
||||
if cached is not None and cached[0] is ref_pixel_values:
|
||||
image_embeds = cached[1]
|
||||
else:
|
||||
ref_pv = ref_pixel_values.to(inputs_embeds.device)
|
||||
ref_grid = ref_image_grid_thw.to(inputs_embeds.device).long()
|
||||
# extra_conds wraps with a leading batch dim; refs are model-level so [0] always recovers them.
|
||||
if ref_pv.dim() == 3:
|
||||
ref_pv = ref_pv[0]
|
||||
if ref_grid.dim() == 3:
|
||||
ref_grid = ref_grid[0]
|
||||
image_embeds = self.visual(ref_pv, ref_grid).to(inputs_embeds.dtype)
|
||||
self._visual_cache = (ref_pixel_values, image_embeds)
|
||||
# image_pad positions identical across batch (input_ids shared cond/uncond).
|
||||
image_idx = (input_ids[0] == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0]
|
||||
if image_idx.shape[0] != image_embeds.shape[0]:
|
||||
raise ValueError(
|
||||
f"Image-token count {image_idx.shape[0]} != ViT output count "
|
||||
f"{image_embeds.shape[0]}; check tokenizer/processor alignment."
|
||||
)
|
||||
inputs_embeds[:, image_idx] = image_embeds.unsqueeze(0).expand(B, -1, -1)
|
||||
|
||||
sigma = timesteps.float() / 1000.0
|
||||
t_pixeldit = 1.0 - sigma
|
||||
t_emb = self.t_embedder1(t_pixeldit * 1000, inputs_embeds.dtype)
|
||||
tms_mask_3d = (input_ids == self.tms_token_id).unsqueeze(-1).expand_as(inputs_embeds)
|
||||
inputs_embeds = torch.where(tms_mask_3d, t_emb.unsqueeze(1).expand_as(inputs_embeds), inputs_embeds)
|
||||
|
||||
vinputs_embedded = self.x_embedder(vinputs.to(inputs_embeds.dtype))
|
||||
inputs_embeds = torch.cat([inputs_embeds, vinputs_embedded], dim=1)
|
||||
|
||||
# extra_conds stores position_ids as (1, 3, T); process_cond repeats dim 0 to B. Take row 0.
|
||||
freqs_cis = self.language_model.compute_freqs_cis(position_ids[0].to(x.device), x.device)
|
||||
freqs_cis = tuple(t.to(x.dtype) for t in freqs_cis)
|
||||
|
||||
two_pass_attn = make_two_pass_attention(ar_len, transformer_options=transformer_options)
|
||||
patches_replace = transformer_options.get("patches_replace", {})
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
transformer_options["total_blocks"] = len(self.language_model.layers)
|
||||
transformer_options["block_type"] = "double"
|
||||
|
||||
# Cache prefix K/V across steps. Key includes input_ids (prompt), ref_id
|
||||
# (refs scatter into inputs_embeds), and position_ids (RoPE baked into cached K).
|
||||
can_cache = not blocks_replace and ar_len > 0
|
||||
cache_len = ar_len if can_cache else 0
|
||||
ref_id = id(ref_pixel_values) if ref_pixel_values is not None else None
|
||||
pos_ids_key = position_ids[..., :cache_len] if can_cache else position_ids
|
||||
cache_entries = self._kv_cache_entries
|
||||
# Drop stale entries from a previous device (model was unloaded and reloaded).
|
||||
if cache_entries and cache_entries[0]["input_ids"].device != input_ids.device:
|
||||
cache_entries = []
|
||||
self._kv_cache_entries = []
|
||||
kv_cache = None
|
||||
if can_cache:
|
||||
for entry in cache_entries:
|
||||
ck = entry["input_ids"]
|
||||
ep = entry["position_ids"]
|
||||
if (entry["cache_len"] == cache_len
|
||||
and ck.shape == input_ids.shape and torch.equal(ck, input_ids)
|
||||
and entry["ref_id"] == ref_id
|
||||
and ep.shape == pos_ids_key.shape and torch.equal(ep, pos_ids_key)):
|
||||
kv_cache = entry
|
||||
break
|
||||
|
||||
if kv_cache is not None:
|
||||
# Hot path: project Q/K/V only for fresh positions; past_key_value prepends cached AR K/V.
|
||||
hidden_states = inputs_embeds[:, cache_len:]
|
||||
sliced_freqs = tuple(t[..., cache_len:, :] for t in freqs_cis)
|
||||
for i, layer in enumerate(self.language_model.layers):
|
||||
transformer_options["block_index"] = i
|
||||
K_i, V_i = kv_cache["kv"][i]
|
||||
hidden_states, _ = layer(
|
||||
x=hidden_states, attention_mask=None, freqs_cis=sliced_freqs, optimized_attention=two_pass_attn,
|
||||
past_key_value=(K_i, V_i, cache_len),
|
||||
)
|
||||
else:
|
||||
# Cold path: run full sequence; if cacheable, snapshot K/V at AR positions.
|
||||
snapshots = [] if can_cache else None
|
||||
past_kv_cold = () if can_cache else None
|
||||
hidden_states = inputs_embeds
|
||||
for i, layer in enumerate(self.language_model.layers):
|
||||
transformer_options["block_index"] = i
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap(args, _layer=layer):
|
||||
out = {}
|
||||
out["x"], _ = _layer(
|
||||
x=args["x"], attention_mask=args.get("attention_mask"),
|
||||
freqs_cis=args["freqs_cis"], optimized_attention=args["optimized_attention"],
|
||||
past_key_value=None,
|
||||
)
|
||||
return out
|
||||
out = blocks_replace[("double_block", i)](
|
||||
{"x": hidden_states, "attention_mask": None,
|
||||
"freqs_cis": freqs_cis, "optimized_attention": two_pass_attn,
|
||||
"transformer_options": transformer_options},
|
||||
{"original_block": block_wrap},
|
||||
)
|
||||
hidden_states = out["x"]
|
||||
else:
|
||||
hidden_states, present_kv = layer(
|
||||
x=hidden_states, attention_mask=None,
|
||||
freqs_cis=freqs_cis, optimized_attention=two_pass_attn,
|
||||
past_key_value=past_kv_cold,
|
||||
)
|
||||
if snapshots is not None:
|
||||
K, V, _ = present_kv
|
||||
snapshots.append((K[:, :, :cache_len].contiguous(),
|
||||
V[:, :, :cache_len].contiguous()))
|
||||
if snapshots is not None:
|
||||
# Cap at 2 entries (cond + uncond). Multi-cond workflows LRU-evict.
|
||||
new_entry = {
|
||||
"input_ids": input_ids.clone(),
|
||||
"cache_len": cache_len,
|
||||
"kv": snapshots,
|
||||
"ref_id": ref_id,
|
||||
"position_ids": pos_ids_key.clone(),
|
||||
}
|
||||
self._kv_cache_entries = (cache_entries + [new_entry])[-2:]
|
||||
|
||||
if self.language_model.norm is not None:
|
||||
hidden_states = self.language_model.norm(hidden_states)
|
||||
|
||||
# Slice target-image positions before the final projection so the Linear only runs on tgt_image_len tokens.
|
||||
# In the hot path hidden_states starts at original position cache_len, so masks/indices shift by cache_len.
|
||||
sliced_offset = cache_len if kv_cache is not None else 0
|
||||
if vinput_mask is not None:
|
||||
vmask = vinput_mask.to(x.device).bool()
|
||||
if sliced_offset > 0:
|
||||
vmask = vmask[:, sliced_offset:]
|
||||
target_hidden = hidden_states[vmask].view(B, -1, hidden_states.shape[-1])[:, :tgt_image_len]
|
||||
else:
|
||||
txt_seq_len = input_ids.shape[1]
|
||||
start = txt_seq_len - sliced_offset
|
||||
target_hidden = hidden_states[:, start:start + tgt_image_len]
|
||||
x_pred_tgt = self.final_layer2(target_hidden)
|
||||
|
||||
# fp32 final subtraction, bf16 here noticeably degrades samples.
|
||||
x_pred_img = einops.rearrange(
|
||||
x_pred_tgt, 'B (H W) (C p1 p2) -> B C (H p1) (W p2)',
|
||||
H=h_p, W=w_p, p1=self.patch_size, p2=self.patch_size,
|
||||
)
|
||||
return (x.float() - x_pred_img.float()) / sigma.view(B, 1, 1, 1).clamp_min(1e-3)
|
||||
173
comfy/ldm/hidream_o1/utils.py
Normal file
173
comfy/ldm/hidream_o1/utils.py
Normal file
@ -0,0 +1,173 @@
|
||||
"""HiDream-O1 input-prep helpers: image/resolution math and unified-sequence
|
||||
RoPE position-id assembly. The fix_point offset in get_rope_index_fix_point
|
||||
lets the target image and patchified ref images share spatial RoPE positions
|
||||
despite living at different sequence indices — same 2D image plane.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
PATCH_SIZE = 32
|
||||
CONDITION_IMAGE_SIZE = 384 # ViT-side base size for ref images
|
||||
|
||||
|
||||
def resize_tensor(img_t, image_size, patch_size=16):
|
||||
"""img_t: (1, 3, H, W) float [0, 1]. Fit to image_size**2 area, patch-aligned, center-cropped."""
|
||||
|
||||
while min(img_t.shape[-2], img_t.shape[-1]) >= 2 * image_size: # Pre-halves with 2x2 box averaging while the image is still very large
|
||||
img_t = torch.nn.functional.avg_pool2d(img_t, kernel_size=2, stride=2)
|
||||
|
||||
_, _, height, width = img_t.shape
|
||||
m = patch_size
|
||||
s_max = image_size * image_size
|
||||
scale = math.sqrt(s_max / (width * height))
|
||||
|
||||
candidates = [
|
||||
(round(width * scale) // m * m, round(height * scale) // m * m),
|
||||
(round(width * scale) // m * m, math.floor(height * scale) // m * m),
|
||||
(math.floor(width * scale) // m * m, round(height * scale) // m * m),
|
||||
(math.floor(width * scale) // m * m, math.floor(height * scale) // m * m),
|
||||
]
|
||||
candidates = sorted(candidates, key=lambda x: x[0] * x[1], reverse=True)
|
||||
new_size = candidates[-1]
|
||||
for c in candidates:
|
||||
if c[0] * c[1] <= s_max:
|
||||
new_size = c
|
||||
break
|
||||
|
||||
new_w, new_h = new_size
|
||||
s1 = width / new_w
|
||||
s2 = height / new_h
|
||||
if s1 < s2:
|
||||
resize_w, resize_h = new_w, round(height / s1)
|
||||
else:
|
||||
resize_w, resize_h = round(width / s2), new_h
|
||||
img_t = torch.nn.functional.interpolate(img_t, size=(resize_h, resize_w), mode="bicubic")
|
||||
top = (resize_h - new_h) // 2
|
||||
left = (resize_w - new_w) // 2
|
||||
return img_t[..., top:top + new_h, left:left + new_w]
|
||||
|
||||
|
||||
def calculate_dimensions(max_size, ratio):
|
||||
"""(W, H) for an aspect ratio fitting in max_size**2 area, 32-aligned."""
|
||||
width = math.sqrt(max_size * max_size * ratio)
|
||||
height = width / ratio
|
||||
width = int(width / 32) * 32
|
||||
height = int(height / 32) * 32
|
||||
return width, height
|
||||
|
||||
|
||||
def ref_max_size(target_max_dim, k):
|
||||
"""K-dependent ref-image max dim before patchifying."""
|
||||
if k == 1:
|
||||
return target_max_dim
|
||||
if k == 2:
|
||||
return target_max_dim * 48 // 64
|
||||
if k <= 4:
|
||||
return target_max_dim // 2
|
||||
if k <= 8:
|
||||
return target_max_dim * 24 // 64
|
||||
return target_max_dim // 4
|
||||
|
||||
|
||||
def cond_image_size(k):
|
||||
"""K-dependent ViT-side image size."""
|
||||
if k <= 4:
|
||||
return CONDITION_IMAGE_SIZE
|
||||
if k <= 8:
|
||||
return CONDITION_IMAGE_SIZE * 48 // 64
|
||||
return CONDITION_IMAGE_SIZE // 2
|
||||
|
||||
|
||||
def get_rope_index_fix_point(
|
||||
spatial_merge_size: int,
|
||||
image_token_id: int,
|
||||
vision_start_token_id: int,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
image_grid_thw: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
skip_vision_start_token=None,
|
||||
fix_point: int = 4096,
|
||||
):
|
||||
mrope_position_deltas = []
|
||||
if input_ids is not None and image_grid_thw is not None:
|
||||
total_input_ids = input_ids
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones_like(total_input_ids)
|
||||
position_ids = torch.ones(
|
||||
3, input_ids.shape[0], input_ids.shape[1],
|
||||
dtype=input_ids.dtype, device=input_ids.device,
|
||||
)
|
||||
attention_mask = attention_mask.to(total_input_ids.device)
|
||||
for i, input_ids_b in enumerate(total_input_ids):
|
||||
fp = fix_point
|
||||
image_index = 0
|
||||
input_ids_b = input_ids_b[attention_mask[i] == 1]
|
||||
vision_start_indices = torch.argwhere(input_ids_b == vision_start_token_id).squeeze(1)
|
||||
vision_tokens = input_ids_b[vision_start_indices + 1]
|
||||
image_nums = (vision_tokens == image_token_id).sum()
|
||||
input_tokens = input_ids_b.tolist()
|
||||
llm_pos_ids_list = []
|
||||
st = 0
|
||||
remain_images = image_nums
|
||||
for _ in range(image_nums):
|
||||
if image_token_id in input_tokens and remain_images > 0:
|
||||
ed = input_tokens.index(image_token_id, st)
|
||||
else:
|
||||
ed = len(input_tokens) + 1
|
||||
t = image_grid_thw[image_index][0]
|
||||
h = image_grid_thw[image_index][1]
|
||||
w = image_grid_thw[image_index][2]
|
||||
image_index += 1
|
||||
remain_images -= 1
|
||||
llm_grid_t = t.item()
|
||||
llm_grid_h = h.item() // spatial_merge_size
|
||||
llm_grid_w = w.item() // spatial_merge_size
|
||||
text_len = ed - st
|
||||
text_len -= skip_vision_start_token[image_index - 1]
|
||||
text_len = max(0, text_len)
|
||||
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
|
||||
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
|
||||
|
||||
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
|
||||
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
|
||||
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
|
||||
|
||||
if skip_vision_start_token[image_index - 1]:
|
||||
if fp > 0:
|
||||
fp = fp - st_idx
|
||||
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + fp + st_idx)
|
||||
fp = 0
|
||||
else:
|
||||
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
|
||||
st = ed + llm_grid_t * llm_grid_h * llm_grid_w
|
||||
|
||||
if st < len(input_tokens):
|
||||
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
|
||||
text_len = len(input_tokens) - st
|
||||
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
|
||||
|
||||
llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
|
||||
position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
|
||||
mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
|
||||
mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
|
||||
return position_ids, mrope_position_deltas
|
||||
|
||||
if attention_mask is not None:
|
||||
position_ids = attention_mask.long().cumsum(-1) - 1
|
||||
position_ids.masked_fill_(attention_mask == 0, 1)
|
||||
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
|
||||
max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
|
||||
mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
|
||||
else:
|
||||
position_ids = (
|
||||
torch.arange(input_ids.shape[1], device=input_ids.device)
|
||||
.view(1, 1, -1).expand(3, input_ids.shape[0], -1)
|
||||
)
|
||||
mrope_position_deltas = torch.zeros(
|
||||
[input_ids.shape[0], 1], device=input_ids.device, dtype=input_ids.dtype,
|
||||
)
|
||||
return position_ids, mrope_position_deltas
|
||||
@ -1135,7 +1135,7 @@ class AudioInjector_WAN(nn.Module):
|
||||
self.injector_adain_output_layers = nn.ModuleList(
|
||||
[operations.Linear(dim, dim, dtype=dtype, device=device) for _ in range(audio_injector_id)])
|
||||
|
||||
def forward(self, x, block_id, audio_emb, audio_emb_global, seq_len):
|
||||
def forward(self, x, block_id, audio_emb, audio_emb_global, seq_len, scale=1.0):
|
||||
audio_attn_id = self.injected_block_id.get(block_id, None)
|
||||
if audio_attn_id is None:
|
||||
return x
|
||||
@ -1148,12 +1148,15 @@ class AudioInjector_WAN(nn.Module):
|
||||
attn_hidden_states = adain_hidden_states
|
||||
else:
|
||||
attn_hidden_states = self.injector_pre_norm_feat[audio_attn_id](input_hidden_states)
|
||||
audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames)
|
||||
attn_audio_emb = audio_emb
|
||||
|
||||
if audio_emb.dim() == 3: # WanDancer case
|
||||
attn_audio_emb = rearrange(audio_emb, "b t c -> (b t) 1 c", t=num_frames)
|
||||
else: # S2V case
|
||||
attn_audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames)
|
||||
|
||||
residual_out = self.injector[audio_attn_id](x=attn_hidden_states, context=attn_audio_emb)
|
||||
residual_out = rearrange(
|
||||
residual_out, "(b t) n c -> b (t n) c", t=num_frames)
|
||||
x[:, :seq_len] = x[:, :seq_len] + residual_out
|
||||
residual_out = rearrange(residual_out, "(b t) n c -> b (t n) c", t=num_frames)
|
||||
x[:, :seq_len] = x[:, :seq_len] + residual_out * scale
|
||||
return x
|
||||
|
||||
|
||||
|
||||
251
comfy/ldm/wan/model_wandancer.py
Normal file
251
comfy/ldm/wan/model_wandancer.py
Normal file
@ -0,0 +1,251 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import comfy
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
from comfy.ldm.flux.math import apply_rope1
|
||||
from comfy.ldm.flux.layers import EmbedND
|
||||
|
||||
from .model import AudioInjector_WAN, WanModel, MLPProj, Head, sinusoidal_embedding_1d
|
||||
|
||||
|
||||
class MusicSelfAttention(nn.Module):
|
||||
def __init__(self, dim, num_heads, device=None, dtype=None, operations=None):
|
||||
assert dim % num_heads == 0
|
||||
super().__init__()
|
||||
self.embed_dim = dim
|
||||
self.num_heads = num_heads
|
||||
self.head_dim = dim // num_heads
|
||||
|
||||
self.q_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
|
||||
self.k_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
|
||||
self.v_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
|
||||
self.out_proj = operations.Linear(dim, dim, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x, freqs):
|
||||
b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
|
||||
|
||||
q = self.q_proj(x).view(b, s, n, d)
|
||||
q = apply_rope1(q, freqs)
|
||||
|
||||
k = self.k_proj(x).view(b, s, n, d)
|
||||
k = apply_rope1(k, freqs)
|
||||
|
||||
x = optimized_attention(
|
||||
q.view(b, s, n * d),
|
||||
k.view(b, s, n * d),
|
||||
self.v_proj(x).view(b, s, n * d),
|
||||
heads=self.num_heads,
|
||||
)
|
||||
|
||||
return self.out_proj(x)
|
||||
|
||||
|
||||
class MusicEncoderLayer(nn.Module):
|
||||
def __init__(self, dim: int, num_heads: int, ffn_dim: int, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
self.self_attn = MusicSelfAttention(dim, num_heads, device=device, dtype=dtype, operations=operations)
|
||||
|
||||
self.linear1 = operations.Linear(dim, ffn_dim, device=device, dtype=dtype)
|
||||
self.linear2 = operations.Linear(ffn_dim, dim, device=device, dtype=dtype)
|
||||
|
||||
self.norm1 = operations.LayerNorm(dim, device=device, dtype=dtype)
|
||||
self.norm2 = operations.LayerNorm(dim, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
|
||||
x = x + self.self_attn(self.norm1(x), freqs=freqs)
|
||||
x = x + self.linear2(torch.nn.functional.gelu(self.linear1(self.norm2(x)))) # ffn
|
||||
return x
|
||||
|
||||
|
||||
class WanDancerModel(WanModel):
|
||||
def __init__(self,
|
||||
model_type='wandancer',
|
||||
patch_size=(1, 2, 2),
|
||||
text_len=512,
|
||||
in_dim=16,
|
||||
dim=5120,
|
||||
ffn_dim=8192,
|
||||
freq_dim=256,
|
||||
text_dim=4096,
|
||||
out_dim=16,
|
||||
num_heads=16,
|
||||
num_layers=40,
|
||||
window_size=(-1, -1),
|
||||
qk_norm=True,
|
||||
cross_attn_norm=True,
|
||||
eps=1e-6,
|
||||
in_dim_ref_conv=None,
|
||||
image_model=None,
|
||||
device=None, dtype=None, operations=None,
|
||||
audio_inject_layers=[0, 4, 8, 12, 16, 20, 24, 27],
|
||||
music_dim = 256,
|
||||
music_heads = 4,
|
||||
music_feature_dim = 35,
|
||||
music_latent_dim = 256
|
||||
):
|
||||
|
||||
super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim,
|
||||
num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, image_model=image_model, in_dim_ref_conv=in_dim_ref_conv,
|
||||
device=device, dtype=dtype, operations=operations)
|
||||
|
||||
self.dtype = dtype
|
||||
operation_settings = {"operations": operations, "device": device, "dtype": dtype}
|
||||
|
||||
self.patch_embedding_global = operations.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size, device=operation_settings.get("device"), dtype=torch.float32)
|
||||
self.img_emb_refimage = MLPProj(1280, dim, operation_settings=operation_settings)
|
||||
self.head_global = Head(dim, out_dim, patch_size, eps, operation_settings=operation_settings)
|
||||
|
||||
self.music_injector = AudioInjector_WAN(
|
||||
dim=self.dim,
|
||||
num_heads=self.num_heads,
|
||||
inject_layer=audio_inject_layers,
|
||||
root_net=self,
|
||||
enable_adain=False,
|
||||
dtype=dtype, device=device, operations=operations
|
||||
)
|
||||
|
||||
self.music_projection = operations.Linear(music_feature_dim, music_latent_dim, device=device, dtype=dtype)
|
||||
self.music_encoder = nn.ModuleList([MusicEncoderLayer(dim=music_dim, num_heads=music_heads, ffn_dim=1024, device=device, dtype=dtype, operations=operations) for _ in range(2)])
|
||||
music_head_dim = music_dim // music_heads
|
||||
self.music_rope_embedder = EmbedND(dim=music_head_dim, theta=10000.0, axes_dim=[music_head_dim])
|
||||
|
||||
def forward_orig(self, x, t, context, clip_fea=None, clip_fea_ref=None, freqs=None, audio_embed=None, fps=30, audio_inject_scale=1.0, transformer_options={}, **kwargs):
|
||||
# embeddings
|
||||
if int(fps + 0.5) != 30:
|
||||
x = self.patch_embedding_global(x.float()).to(x.dtype)
|
||||
else:
|
||||
x = self.patch_embedding(x.float()).to(x.dtype)
|
||||
|
||||
grid_sizes = x.shape[2:]
|
||||
latent_frames = grid_sizes[0]
|
||||
transformer_options["grid_sizes"] = grid_sizes
|
||||
x = x.flatten(2).transpose(1, 2)
|
||||
seq_len = x.size(1)
|
||||
|
||||
# time embeddings
|
||||
e = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype))
|
||||
e = e.reshape(t.shape[0], -1, e.shape[-1])
|
||||
e0 = self.time_projection(e).unflatten(2, (6, self.dim))
|
||||
|
||||
full_ref = None
|
||||
if self.ref_conv is not None: # model has the weight, but this wasn't used in the original pipeline
|
||||
full_ref = kwargs.get("reference_latent", None)
|
||||
if full_ref is not None:
|
||||
full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2)
|
||||
x = torch.concat((full_ref, x), dim=1)
|
||||
|
||||
# context
|
||||
context = self.text_embedding(context)
|
||||
|
||||
audio_emb = None
|
||||
if audio_embed is not None: # encode music feature,[1, frame_num, 35] -> [1, F*8, dim]
|
||||
music_feature = self.music_projection(audio_embed)
|
||||
|
||||
music_seq_len = music_feature.shape[1]
|
||||
music_ids = torch.arange(music_seq_len, device=music_feature.device, dtype=music_feature.dtype).reshape(1, -1, 1) # create 1D position IDs
|
||||
music_freqs = self.music_rope_embedder(music_ids).movedim(1, 2)
|
||||
|
||||
# apply encoder layers
|
||||
for layer in self.music_encoder:
|
||||
music_feature = layer(music_feature, music_freqs)
|
||||
|
||||
# interpolate
|
||||
audio_emb = torch.nn.functional.interpolate(music_feature.unsqueeze(1), size=(latent_frames * 8, self.dim), mode='bilinear').squeeze(1)
|
||||
|
||||
context_img_len = 0
|
||||
if self.img_emb is not None and clip_fea is not None:
|
||||
context_clip = self.img_emb(clip_fea) # bs x 257 x dim
|
||||
context = torch.cat([context_clip, context], dim=1)
|
||||
context_img_len += clip_fea.shape[-2]
|
||||
if self.img_emb_refimage is not None and clip_fea_ref is not None:
|
||||
context_clip_ref = self.img_emb_refimage(clip_fea_ref)
|
||||
context = torch.cat([context_clip_ref, context], dim=1)
|
||||
context_img_len += clip_fea_ref.shape[-2]
|
||||
|
||||
patches_replace = transformer_options.get("patches_replace", {})
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
transformer_options["total_blocks"] = len(self.blocks)
|
||||
transformer_options["block_type"] = "double"
|
||||
for i, block in enumerate(self.blocks):
|
||||
transformer_options["block_index"] = i
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap(args):
|
||||
out = {}
|
||||
out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"])
|
||||
return out
|
||||
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
|
||||
x = out["img"]
|
||||
else:
|
||||
x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options)
|
||||
if audio_emb is not None:
|
||||
x = self.music_injector(x, i, audio_emb, audio_emb_global=None, seq_len=seq_len, scale=audio_inject_scale)
|
||||
|
||||
# head
|
||||
if int(fps + 0.5) != 30:
|
||||
x = self.head_global(x, e)
|
||||
else:
|
||||
x = self.head(x, e)
|
||||
|
||||
if full_ref is not None:
|
||||
x = x[:, full_ref.shape[1]:]
|
||||
|
||||
# unpatchify
|
||||
x = self.unpatchify(x, grid_sizes)
|
||||
return x
|
||||
|
||||
def _forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, clip_fea_ref=None, fps=30, audio_inject_scale=1.0, **kwargs):
|
||||
bs, c, t, h, w = x.shape
|
||||
x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
|
||||
|
||||
t_len = t
|
||||
if time_dim_concat is not None:
|
||||
time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size)
|
||||
x = torch.cat([x, time_dim_concat], dim=2)
|
||||
t_len = x.shape[2]
|
||||
|
||||
freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, fps=fps, transformer_options=transformer_options)
|
||||
return self.forward_orig(x, timestep, context, clip_fea=clip_fea, clip_fea_ref=clip_fea_ref, freqs=freqs, fps=fps, audio_inject_scale=audio_inject_scale, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w]
|
||||
|
||||
def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, fps=30, device=None, dtype=None, transformer_options={}):
|
||||
patch_size = self.patch_size
|
||||
t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
|
||||
h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
|
||||
w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
|
||||
|
||||
if steps_t is None:
|
||||
steps_t = t_len
|
||||
if steps_h is None:
|
||||
steps_h = h_len
|
||||
if steps_w is None:
|
||||
steps_w = w_len
|
||||
|
||||
h_start = 0
|
||||
w_start = 0
|
||||
rope_options = transformer_options.get("rope_options", None)
|
||||
if rope_options is not None:
|
||||
t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0
|
||||
h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
|
||||
w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
|
||||
|
||||
t_start += rope_options.get("shift_t", 0.0)
|
||||
h_start += rope_options.get("shift_y", 0.0)
|
||||
w_start += rope_options.get("shift_x", 0.0)
|
||||
|
||||
img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype)
|
||||
|
||||
if int(fps + 0.5) != 30:
|
||||
time_scale = 30.0 / fps # how many time units each frame represents relative to 30fps
|
||||
positions_new = torch.arange(steps_t, device=device, dtype=dtype) * time_scale + t_start
|
||||
total_frames_at_30fps = int(time_scale * steps_t + 0.5)
|
||||
positions_new[-1] = t_start + (total_frames_at_30fps - 1)
|
||||
|
||||
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + positions_new.reshape(-1, 1, 1)
|
||||
else:
|
||||
img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1)
|
||||
|
||||
img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
|
||||
img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
|
||||
img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])
|
||||
|
||||
freqs = self.rope_embedder(img_ids).movedim(1, 2)
|
||||
return freqs
|
||||
@ -97,12 +97,14 @@ def load_lora(lora, to_load, log_missing=True):
|
||||
|
||||
def model_lora_keys_clip(model, key_map={}):
|
||||
sdk = model.state_dict().keys()
|
||||
prefix_set = set()
|
||||
for k in sdk:
|
||||
if k.endswith(".weight"):
|
||||
key_map["text_encoders.{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names
|
||||
tp = k.find(".transformer.") #also map without wrapper prefix for composite text encoder models
|
||||
if tp > 0 and not k.startswith("clip_"):
|
||||
key_map["text_encoders.{}".format(k[tp + 1:-len(".weight")])] = k
|
||||
prefix_set.add(k.split('.')[0])
|
||||
|
||||
text_model_lora_key = "lora_te_text_model_encoder_layers_{}_{}"
|
||||
clip_l_present = False
|
||||
@ -163,6 +165,13 @@ def model_lora_keys_clip(model, key_map={}):
|
||||
lora_key = "lora_te1_{}".format(l_key.replace(".", "_"))
|
||||
key_map[lora_key] = k
|
||||
|
||||
if len(prefix_set) == 1:
|
||||
full_prefix = "{}.transformer.model.".format(next(iter(prefix_set))) # kohya anima and maybe other single TE models that use a single llama arch based te
|
||||
for k in sdk:
|
||||
if k.endswith(".weight"):
|
||||
if k.startswith(full_prefix):
|
||||
l_key = k[len(full_prefix):-len(".weight")]
|
||||
key_map["lora_te_{}".format(l_key.replace(".", "_"))] = k
|
||||
|
||||
k = "clip_g.transformer.text_projection.weight"
|
||||
if k in sdk:
|
||||
|
||||
@ -43,6 +43,7 @@ import comfy.ldm.lumina.model
|
||||
import comfy.ldm.wan.model
|
||||
import comfy.ldm.wan.model_animate
|
||||
import comfy.ldm.wan.ar_model
|
||||
import comfy.ldm.wan.model_wandancer
|
||||
import comfy.ldm.hunyuan3d.model
|
||||
import comfy.ldm.hidream.model
|
||||
import comfy.ldm.chroma.model
|
||||
@ -58,6 +59,8 @@ import comfy.ldm.cogvideo.model
|
||||
import comfy.ldm.rt_detr.rtdetr_v4
|
||||
import comfy.ldm.ernie.model
|
||||
import comfy.ldm.sam3.detector
|
||||
import comfy.ldm.hidream_o1.model
|
||||
from comfy.ldm.hidream_o1.conditioning import build_extra_conds
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.patcher_extension
|
||||
@ -1609,6 +1612,30 @@ class WAN21_SCAIL(WAN21):
|
||||
out['pose_latents'] = [pose_latents.shape[0], 20, *pose_latents.shape[2:]]
|
||||
return out
|
||||
|
||||
class WAN22_WanDancer(WAN21):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=True, device=None):
|
||||
super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model_wandancer.WanDancerModel)
|
||||
self.image_to_video = image_to_video
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
audio_embed = kwargs.get("audio_embed", None)
|
||||
if audio_embed is not None:
|
||||
out['audio_embed'] = comfy.conds.CONDRegular(audio_embed)
|
||||
|
||||
clip_vision_output_ref = kwargs.get("clip_vision_output_ref", None)
|
||||
if clip_vision_output_ref is not None:
|
||||
out['clip_fea_ref'] = comfy.conds.CONDRegular(clip_vision_output_ref.penultimate_hidden_states)
|
||||
|
||||
fps = kwargs.get("fps", None)
|
||||
if fps is not None:
|
||||
out['fps'] = comfy.conds.CONDRegular(torch.FloatTensor([fps]))
|
||||
|
||||
audio_inject_scale = kwargs.get("audio_inject_scale", None)
|
||||
if audio_inject_scale is not None:
|
||||
out['audio_inject_scale'] = comfy.conds.CONDRegular(torch.FloatTensor([audio_inject_scale]))
|
||||
return out
|
||||
|
||||
class Hunyuan3Dv2(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2)
|
||||
@ -1659,6 +1686,32 @@ class HiDream(BaseModel):
|
||||
out['image_cond'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_cond))
|
||||
return out
|
||||
|
||||
class HiDreamO1(BaseModel):
|
||||
"""HiDream-O1-Image: pixel-space DiT (no VAE). Refs from HiDreamO1ReferenceImages and tokens from the stub TE flow through
|
||||
extra_conds; the heavy preprocessing lives in comfy.ldm.hidream_o1.conditioning."""
|
||||
PATCH_SIZE = 32
|
||||
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hidream_o1.model.HiDreamO1Transformer)
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
text_input_ids = kwargs.get("text_input_ids", None)
|
||||
noise = kwargs.get("noise", None)
|
||||
if text_input_ids is None or noise is None:
|
||||
return out
|
||||
|
||||
conds = build_extra_conds(
|
||||
text_input_ids, noise,
|
||||
ref_images=kwargs.get("reference_latents", None),
|
||||
target_patch_size=self.PATCH_SIZE,
|
||||
)
|
||||
for k, v in conds.items():
|
||||
# ar_len is a Python int (precomputed to avoid a GPU sync in forward).
|
||||
cls = comfy.conds.CONDConstant if k == "ar_len" else comfy.conds.CONDRegular
|
||||
out[k] = cls(v)
|
||||
return out
|
||||
|
||||
class Chroma(Flux):
|
||||
def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.chroma.model.Chroma):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=unet_model)
|
||||
|
||||
@ -588,6 +588,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["model_type"] = "animate"
|
||||
elif '{}patch_embedding_pose.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "scail"
|
||||
elif '{}patch_embedding_global.weight'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "wandancer"
|
||||
else:
|
||||
if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
|
||||
dit_config["model_type"] = "i2v"
|
||||
@ -634,6 +636,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["guidance_cond_proj_dim"] = None#f"{key_prefix}t_embedder.cond_proj.weight" in state_dict_keys
|
||||
return dit_config
|
||||
|
||||
if '{}t_embedder1.mlp.0.weight'.format(key_prefix) in state_dict_keys and '{}x_embedder.proj1.weight'.format(key_prefix) in state_dict_keys: # HiDream-O1
|
||||
return {"image_model": "hidream_o1"}
|
||||
|
||||
if '{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys: # HiDream
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "hidream"
|
||||
|
||||
@ -242,6 +242,37 @@ class LazyCastingParam(torch.nn.Parameter):
|
||||
return self.model.patch_weight_to_device(self.key, device_to=self.model.load_device, return_weight=True).to("cpu")
|
||||
|
||||
|
||||
class LazyCastingQuantizedParam:
|
||||
def __init__(self, model, key):
|
||||
self.model = model
|
||||
self.key = key
|
||||
self.cpu_state_dict = None
|
||||
|
||||
def state_dict_tensor(self, state_dict_key):
|
||||
if self.cpu_state_dict is None:
|
||||
weight = self.model.patch_weight_to_device(self.key, device_to=self.model.load_device, return_weight=True)
|
||||
self.cpu_state_dict = {k: v.to("cpu") for k, v in weight.state_dict(self.key).items()}
|
||||
return self.cpu_state_dict[state_dict_key]
|
||||
|
||||
|
||||
class LazyCastingParamPiece(torch.nn.Parameter):
|
||||
def __new__(cls, caster, state_dict_key, tensor):
|
||||
return super().__new__(cls, tensor)
|
||||
|
||||
def __init__(self, caster, state_dict_key, tensor):
|
||||
self.caster = caster
|
||||
self.state_dict_key = state_dict_key
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return CustomTorchDevice
|
||||
|
||||
def to(self, *args, **kwargs):
|
||||
caster = self.caster
|
||||
del self.caster
|
||||
return caster.state_dict_tensor(self.state_dict_key)
|
||||
|
||||
|
||||
class ModelPatcher:
|
||||
def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
|
||||
self.size = size
|
||||
@ -1463,20 +1494,37 @@ class ModelPatcher:
|
||||
self.clear_cached_hook_weights()
|
||||
|
||||
def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
|
||||
unet_state_dict = self.model.diffusion_model.state_dict()
|
||||
for k, v in unet_state_dict.items():
|
||||
original_state_dict = self.model.diffusion_model.state_dict()
|
||||
unet_state_dict = {}
|
||||
keys = list(original_state_dict)
|
||||
while len(keys) > 0:
|
||||
k = keys.pop(0)
|
||||
v = original_state_dict[k]
|
||||
op_keys = k.rsplit('.', 1)
|
||||
if (len(op_keys) < 2) or op_keys[1] not in ["weight", "bias"]:
|
||||
unet_state_dict[k] = v
|
||||
continue
|
||||
try:
|
||||
op = comfy.utils.get_attr(self.model.diffusion_model, op_keys[0])
|
||||
except:
|
||||
unet_state_dict[k] = v
|
||||
continue
|
||||
if not op or not hasattr(op, "comfy_cast_weights") or \
|
||||
(hasattr(op, "comfy_patched_weights") and op.comfy_patched_weights == True):
|
||||
unet_state_dict[k] = v
|
||||
continue
|
||||
key = "diffusion_model." + k
|
||||
unet_state_dict[k] = LazyCastingParam(self, key, comfy.utils.get_attr(self.model, key))
|
||||
weight = comfy.utils.get_attr(self.model, key)
|
||||
if isinstance(weight, QuantizedTensor) and k in original_state_dict:
|
||||
qt_state_dict = weight.state_dict(k)
|
||||
caster = LazyCastingQuantizedParam(self, key)
|
||||
for group_key in (x for x in qt_state_dict if x in original_state_dict):
|
||||
if group_key in keys:
|
||||
keys.remove(group_key)
|
||||
unet_state_dict.pop(group_key, "")
|
||||
unet_state_dict[group_key] = LazyCastingParamPiece(caster, "diffusion_model." + group_key, original_state_dict[group_key])
|
||||
continue
|
||||
unet_state_dict[k] = LazyCastingParam(self, key, weight)
|
||||
return self.model.state_dict_for_saving(unet_state_dict, clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)
|
||||
|
||||
def __del__(self):
|
||||
|
||||
@ -93,7 +93,8 @@ class CONST:
|
||||
|
||||
def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
|
||||
sigma = reshape_sigma(sigma, noise.ndim)
|
||||
return sigma * noise + (1.0 - sigma) * latent_image
|
||||
s = getattr(self, "noise_scale", 1.0)
|
||||
return sigma * (s * noise) + (1.0 - sigma) * latent_image
|
||||
|
||||
def inverse_noise_scaling(self, sigma, latent):
|
||||
sigma = reshape_sigma(sigma, latent.ndim)
|
||||
@ -288,7 +289,11 @@ class ModelSamplingDiscreteFlow(torch.nn.Module):
|
||||
else:
|
||||
sampling_settings = {}
|
||||
|
||||
self.set_parameters(shift=sampling_settings.get("shift", 1.0), multiplier=sampling_settings.get("multiplier", 1000))
|
||||
self.set_noise_scale(sampling_settings.get("noise_scale", 1.0))
|
||||
self.set_parameters(
|
||||
shift=sampling_settings.get("shift", 1.0),
|
||||
multiplier=sampling_settings.get("multiplier", 1000),
|
||||
)
|
||||
|
||||
def set_parameters(self, shift=1.0, timesteps=1000, multiplier=1000):
|
||||
self.shift = shift
|
||||
@ -296,6 +301,9 @@ class ModelSamplingDiscreteFlow(torch.nn.Module):
|
||||
ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps) * multiplier)
|
||||
self.register_buffer('sigmas', ts)
|
||||
|
||||
def set_noise_scale(self, noise_scale):
|
||||
self.noise_scale = float(noise_scale)
|
||||
|
||||
@property
|
||||
def sigma_min(self):
|
||||
return self.sigmas[0]
|
||||
|
||||
@ -1285,7 +1285,8 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
||||
if quant_format in ["float8_e4m3fn", "float8_e5m2"] and weight_key in state_dict:
|
||||
self.quant_format = quant_format
|
||||
qconfig = QUANT_ALGOS[quant_format]
|
||||
layout_cls = get_layout_class(qconfig["comfy_tensor_layout"])
|
||||
self.layout_type = qconfig["comfy_tensor_layout"]
|
||||
layout_cls = get_layout_class(self.layout_type)
|
||||
weight = state_dict.pop(weight_key)
|
||||
manually_loaded_keys = [weight_key]
|
||||
|
||||
|
||||
@ -240,7 +240,8 @@ class CLIP:
|
||||
model_management.archive_model_dtypes(self.cond_stage_model)
|
||||
|
||||
self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
ModelPatcher = comfy.model_patcher.ModelPatcher if disable_dynamic else comfy.model_patcher.CoreModelPatcher
|
||||
te_disable_dynamic = disable_dynamic or getattr(self.cond_stage_model, "disable_offload", False)
|
||||
ModelPatcher = comfy.model_patcher.ModelPatcher if te_disable_dynamic else comfy.model_patcher.CoreModelPatcher
|
||||
self.patcher = ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
|
||||
#Match torch.float32 hardcode upcast in TE implemention
|
||||
self.patcher.set_model_compute_dtype(torch.float32)
|
||||
@ -786,6 +787,7 @@ class VAE:
|
||||
self.latent_channels = 3
|
||||
self.latent_dim = 2
|
||||
self.output_channels = 3
|
||||
self.disable_offload = True
|
||||
elif "vocoder.activation_post.downsample.lowpass.filter" in sd: #MMAudio VAE
|
||||
sample_rate = 16000
|
||||
if sample_rate == 16000:
|
||||
|
||||
@ -28,6 +28,7 @@ import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.cogvideo
|
||||
import comfy.text_encoders.hidream_o1
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
@ -1336,6 +1337,36 @@ class WAN21_SCAIL(WAN21_T2V):
|
||||
out = model_base.WAN21_SCAIL(self, image_to_video=False, device=device)
|
||||
return out
|
||||
|
||||
class WAN22_WanDancer(WAN21_T2V):
|
||||
unet_config = {
|
||||
"image_model": "wan2.1",
|
||||
"model_type": "wandancer",
|
||||
"in_dim": 36,
|
||||
}
|
||||
|
||||
def __init__(self, unet_config):
|
||||
super().__init__(unet_config)
|
||||
self.memory_usage_factor = 1.8
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
out = model_base.WAN22_WanDancer(self, image_to_video=True, device=device)
|
||||
return out
|
||||
|
||||
def process_unet_state_dict(self, state_dict):
|
||||
out_sd = {}
|
||||
for k in list(state_dict.keys()):
|
||||
# split music_encoder in_proj into q_proj, k_proj, v_proj
|
||||
if "music_encoder" in k and "self_attn.in_proj" in k:
|
||||
suffix = "weight" if k.endswith("weight") else "bias"
|
||||
tensor = state_dict[k]
|
||||
d = tensor.shape[0] // 3
|
||||
prefix = k.replace(f"in_proj_{suffix}", "")
|
||||
out_sd[f"{prefix}q_proj.{suffix}"] = tensor[:d]
|
||||
out_sd[f"{prefix}k_proj.{suffix}"] = tensor[d:2*d]
|
||||
out_sd[f"{prefix}v_proj.{suffix}"] = tensor[2*d:]
|
||||
else:
|
||||
out_sd[k] = state_dict[k]
|
||||
return out_sd
|
||||
|
||||
class Hunyuan3Dv2(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
@ -1424,6 +1455,50 @@ class HiDream(supported_models_base.BASE):
|
||||
def clip_target(self, state_dict={}):
|
||||
return None # TODO
|
||||
|
||||
class HiDreamO1(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "hidream_o1",
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"shift": 3.0,
|
||||
"noise_scale": 8.0,
|
||||
}
|
||||
|
||||
latent_format = latent_formats.HiDreamO1Pixel
|
||||
memory_usage_factor = 0.033
|
||||
# fp16 not supported: LM MLP down_proj activations fp16 overflow, causing NaNs
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float32]
|
||||
|
||||
vae_key_prefix = ["vae."]
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
optimizations = {"fp8": False}
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
return model_base.HiDreamO1(self, device=device)
|
||||
|
||||
def process_unet_state_dict(self, state_dict):
|
||||
# Drop unused Qwen3-VL deepstack merger weights; upstream discards them at inference.
|
||||
for key in list(state_dict.keys()):
|
||||
if "visual.deepstack_merger_list" in key:
|
||||
del state_dict[key]
|
||||
return state_dict
|
||||
|
||||
def process_vae_state_dict(self, state_dict):
|
||||
# Pixel-space model: inject sentinel so VAE construction picks PixelspaceConversionVAE.
|
||||
return {"pixel_space_vae": torch.tensor(1.0)}
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
# Tokenizer-only TE: inject sentinel so load_state_dict_guess_config triggers CLIP init.
|
||||
return {"_hidream_o1_te_sentinel": torch.zeros(1)}
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
return supported_models_base.ClipTarget(
|
||||
comfy.text_encoders.hidream_o1.HiDreamO1Tokenizer,
|
||||
comfy.text_encoders.hidream_o1.HiDreamO1TE,
|
||||
)
|
||||
|
||||
class Chroma(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "chroma",
|
||||
@ -2006,10 +2081,12 @@ models = [
|
||||
WAN22_Animate,
|
||||
WAN21_FlowRVS,
|
||||
WAN21_SCAIL,
|
||||
WAN22_WanDancer,
|
||||
Hunyuan3Dv2mini,
|
||||
Hunyuan3Dv2,
|
||||
Hunyuan3Dv2_1,
|
||||
HiDream,
|
||||
HiDreamO1,
|
||||
Chroma,
|
||||
ChromaRadiance,
|
||||
ACEStep,
|
||||
|
||||
119
comfy/text_encoders/hidream_o1.py
Normal file
119
comfy/text_encoders/hidream_o1.py
Normal file
@ -0,0 +1,119 @@
|
||||
"""HiDream-O1-Image tokenizer-only text encoder.
|
||||
|
||||
The real Qwen3-VL backbone runs inside diffusion_model.* every step, so this
|
||||
module just tokenizes the prompt into text_input_ids and emits them as
|
||||
conditioning. Position ids / token_types / vinput_mask depend on target H/W
|
||||
and are built later in model_base.HiDreamO1.extra_conds.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import torch
|
||||
from transformers import Qwen2Tokenizer
|
||||
|
||||
from comfy import sd1_clip
|
||||
|
||||
|
||||
# Qwen3-VL special tokens
|
||||
IM_START_ID = 151644
|
||||
IM_END_ID = 151645
|
||||
ASSISTANT_ID = 77091
|
||||
USER_ID = 872
|
||||
NEWLINE_ID = 198
|
||||
VISION_START_ID = 151652
|
||||
VISION_END_ID = 151653
|
||||
IMAGE_TOKEN_ID = 151655
|
||||
VIDEO_TOKEN_ID = 151656
|
||||
# HiDream-O1-specific tokens
|
||||
BOI_TOKEN_ID = 151669
|
||||
BOR_TOKEN_ID = 151670
|
||||
EOR_TOKEN_ID = 151671
|
||||
BOT_TOKEN_ID = 151672
|
||||
TMS_TOKEN_ID = 151673
|
||||
|
||||
|
||||
class HiDreamO1QwenTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
tokenizer_path = os.path.join(
|
||||
os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer"
|
||||
)
|
||||
super().__init__(
|
||||
tokenizer_path,
|
||||
pad_with_end=False,
|
||||
embedding_size=4096,
|
||||
embedding_key="hidream_o1",
|
||||
tokenizer_class=Qwen2Tokenizer,
|
||||
has_start_token=False,
|
||||
has_end_token=False,
|
||||
pad_to_max_length=False,
|
||||
max_length=99999999,
|
||||
min_length=1,
|
||||
pad_token=151643,
|
||||
tokenizer_data=tokenizer_data,
|
||||
)
|
||||
|
||||
|
||||
class HiDreamO1Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
"""Wraps prompt in the upstream chat template ending with boi/tms markers.
|
||||
Image tokens get spliced in at sample time once target H/W is known.
|
||||
"""
|
||||
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(
|
||||
embedding_directory=embedding_directory,
|
||||
tokenizer_data=tokenizer_data,
|
||||
name="hidream_o1",
|
||||
tokenizer=HiDreamO1QwenTokenizer,
|
||||
)
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
|
||||
text_tokens_dict = super().tokenize_with_weights(
|
||||
text, return_word_ids=return_word_ids, disable_weights=True, **kwargs
|
||||
)
|
||||
text_tuples = text_tokens_dict["hidream_o1"][0]
|
||||
text_tuples = [t for t in text_tuples if int(t[0]) != 151643] # strip pad
|
||||
|
||||
# <|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n<|boi|><|tms|>
|
||||
def tok(tid):
|
||||
return (tid, 1.0) if not return_word_ids else (tid, 1.0, 0)
|
||||
|
||||
prefix = [tok(IM_START_ID), tok(USER_ID), tok(NEWLINE_ID)]
|
||||
suffix = [
|
||||
tok(IM_END_ID), tok(NEWLINE_ID),
|
||||
tok(IM_START_ID), tok(ASSISTANT_ID), tok(NEWLINE_ID),
|
||||
tok(BOI_TOKEN_ID), tok(TMS_TOKEN_ID),
|
||||
]
|
||||
full = prefix + list(text_tuples) + suffix
|
||||
return {"hidream_o1": [full]}
|
||||
|
||||
|
||||
class HiDreamO1TE(torch.nn.Module):
|
||||
"""Passthrough TE: emits int token ids; the Qwen3-VL backbone in diffusion_model does the actual encoding."""
|
||||
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
super().__init__()
|
||||
self.dtypes = {torch.float32}
|
||||
self.disable_offload = True # skips dynamic VRAM management for this zero-parameter module
|
||||
self.device = torch.device("cpu") if device is None else torch.device(device)
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs):
|
||||
tok_pairs = token_weight_pairs["hidream_o1"][0]
|
||||
ids = [int(t[0]) for t in tok_pairs]
|
||||
input_ids = torch.tensor([ids], dtype=torch.long)
|
||||
# Surrogate keeps the cross_attn slot non-empty for CONDITIONING
|
||||
# plumbing; the model reads text_input_ids out of `extra` instead.
|
||||
cross_attn = input_ids.unsqueeze(-1).to(torch.float32)
|
||||
extra = {"text_input_ids": input_ids}
|
||||
return cross_attn, None, extra
|
||||
|
||||
def load_sd(self, sd):
|
||||
return []
|
||||
|
||||
def get_sd(self):
|
||||
return {}
|
||||
|
||||
def reset_clip_options(self):
|
||||
pass
|
||||
|
||||
def set_clip_options(self, options):
|
||||
pass
|
||||
@ -397,7 +397,7 @@ class RMSNorm(nn.Module):
|
||||
|
||||
|
||||
|
||||
def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None):
|
||||
def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None, interleaved_mrope=False):
|
||||
if not isinstance(theta, list):
|
||||
theta = [theta]
|
||||
|
||||
@ -415,16 +415,27 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_di
|
||||
inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
|
||||
position_ids_expanded = position_ids[:, None, :].float()
|
||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
cos = emb.cos()
|
||||
sin = emb.sin()
|
||||
if rope_dims is not None and position_ids.shape[0] > 1:
|
||||
mrope_section = rope_dims * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
if rope_dims is not None and position_ids.shape[0] > 1 and interleaved_mrope:
|
||||
# Qwen3-VL interleaved MRoPE: T-freqs by default, H/W replace every 3rd dim.
|
||||
freqs_inter = freqs[0].clone()
|
||||
for axis_idx, offset in ((1, 1), (2, 2)):
|
||||
length = rope_dims[axis_idx] * 3
|
||||
idx = slice(offset, length, 3)
|
||||
freqs_inter[..., idx] = freqs[axis_idx, ..., idx]
|
||||
emb = torch.cat((freqs_inter, freqs_inter), dim=-1)
|
||||
cos = emb.cos().unsqueeze(0)
|
||||
sin = emb.sin().unsqueeze(0)
|
||||
else:
|
||||
cos = cos.unsqueeze(1)
|
||||
sin = sin.unsqueeze(1)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
cos = emb.cos()
|
||||
sin = emb.sin()
|
||||
if rope_dims is not None and position_ids.shape[0] > 1:
|
||||
mrope_section = rope_dims * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
else:
|
||||
cos = cos.unsqueeze(1)
|
||||
sin = sin.unsqueeze(1)
|
||||
sin_split = sin.shape[-1] // 2
|
||||
out.append((cos, sin[..., : sin_split], -sin[..., sin_split :]))
|
||||
|
||||
@ -689,6 +700,7 @@ class Llama2_(nn.Module):
|
||||
self.config.rope_theta,
|
||||
self.config.rope_scale,
|
||||
self.config.rope_dims,
|
||||
interleaved_mrope=getattr(self.config, "interleaved_mrope", False),
|
||||
device=device)
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None):
|
||||
|
||||
@ -451,9 +451,8 @@ class Qwen35VisionPatchEmbed(nn.Module):
|
||||
self.proj = ops.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
target_dtype = self.proj.weight.dtype
|
||||
x = x.view(-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size)
|
||||
return self.proj(x.to(target_dtype)).view(-1, self.embed_dim)
|
||||
return self.proj(x).view(-1, self.embed_dim)
|
||||
|
||||
|
||||
class Qwen35VisionMLP(nn.Module):
|
||||
@ -651,7 +650,7 @@ class Qwen35VisionModel(nn.Module):
|
||||
x = self.patch_embed(x)
|
||||
pos_embeds = self.fast_pos_embed_interpolate(grid_thw).to(x.device)
|
||||
x = x + pos_embeds
|
||||
rotary_pos_emb = self.rot_pos_emb(grid_thw)
|
||||
rotary_pos_emb = self.rot_pos_emb(grid_thw).to(x.device)
|
||||
seq_len = x.shape[0]
|
||||
x = x.reshape(seq_len, -1)
|
||||
rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
|
||||
|
||||
@ -1164,12 +1164,18 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am
|
||||
|
||||
o = out
|
||||
o_d = out_div
|
||||
ps_view = ps
|
||||
mask_view = mask
|
||||
for d in range(dims):
|
||||
o = o.narrow(d + 2, upscaled[d], mask.shape[d + 2])
|
||||
o_d = o_d.narrow(d + 2, upscaled[d], mask.shape[d + 2])
|
||||
l = min(ps_view.shape[d + 2], o.shape[d + 2] - upscaled[d])
|
||||
o = o.narrow(d + 2, upscaled[d], l)
|
||||
o_d = o_d.narrow(d + 2, upscaled[d], l)
|
||||
if l < ps_view.shape[d + 2]:
|
||||
ps_view = ps_view.narrow(d + 2, 0, l)
|
||||
mask_view = mask_view.narrow(d + 2, 0, l)
|
||||
|
||||
o.add_(ps * mask)
|
||||
o_d.add_(mask)
|
||||
o.add_(ps_view * mask_view)
|
||||
o_d.add_(mask_view)
|
||||
|
||||
if pbar is not None:
|
||||
pbar.update(1)
|
||||
@ -1196,7 +1202,7 @@ def model_trange(*args, **kwargs):
|
||||
pbar.i1_time = time.time()
|
||||
pbar.set_postfix_str(" Model Initialization complete! ")
|
||||
elif pbar._i == 2:
|
||||
#bring forward the effective start time based the the diff between first and second iteration
|
||||
#bring forward the effective start time based the diff between first and second iteration
|
||||
#to attempt to remove load overhead from the final step rate estimate.
|
||||
pbar.start_t = pbar.i1_time - (time.time() - pbar.i1_time)
|
||||
pbar.set_postfix_str("")
|
||||
@ -1390,7 +1396,7 @@ def convert_old_quants(state_dict, model_prefix="", metadata={}):
|
||||
k_out = "{}.weight_scale".format(layer)
|
||||
|
||||
if layer is not None:
|
||||
layer_conf = {"format": "float8_e4m3fn"} # TODO: check if anyone did some non e4m3fn scaled checkpoints
|
||||
layer_conf = {"format": "float8_e4m3fn"}
|
||||
if full_precision_matrix_mult:
|
||||
layer_conf["full_precision_matrix_mult"] = full_precision_matrix_mult
|
||||
layers[layer] = layer_conf
|
||||
|
||||
@ -12,9 +12,24 @@ class VOXEL:
|
||||
|
||||
|
||||
class MESH:
|
||||
def __init__(self, vertices: torch.Tensor, faces: torch.Tensor):
|
||||
self.vertices = vertices
|
||||
self.faces = faces
|
||||
def __init__(self, vertices: torch.Tensor, faces: torch.Tensor,
|
||||
uvs: torch.Tensor | None = None,
|
||||
vertex_colors: torch.Tensor | None = None,
|
||||
texture: torch.Tensor | None = None,
|
||||
vertex_counts: torch.Tensor | None = None,
|
||||
face_counts: torch.Tensor | None = None):
|
||||
|
||||
assert (vertex_counts is None) == (face_counts is None), \
|
||||
"vertex_counts and face_counts must be provided together (both or neither)"
|
||||
self.vertices = vertices # vertices: (B, N, 3)
|
||||
self.faces = faces # faces: (B, M, 3)
|
||||
self.uvs = uvs # uvs: (B, N, 2)
|
||||
self.vertex_colors = vertex_colors # vertex_colors: (B, N, 3 or 4)
|
||||
self.texture = texture # texture: (B, H, W, 3)
|
||||
# When vertices/faces are zero-padded to a common N/M across the batch (variable-size mesh batch),
|
||||
# these hold the real per-item lengths (B,). None means rows are uniform and no slicing is needed.
|
||||
self.vertex_counts = vertex_counts
|
||||
self.face_counts = face_counts
|
||||
|
||||
|
||||
class File3D:
|
||||
|
||||
75
comfy_api_nodes/apis/anthropic.py
Normal file
75
comfy_api_nodes/apis/anthropic.py
Normal file
@ -0,0 +1,75 @@
|
||||
from enum import Enum
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class AnthropicRole(str, Enum):
|
||||
user = "user"
|
||||
assistant = "assistant"
|
||||
|
||||
|
||||
class AnthropicTextContent(BaseModel):
|
||||
type: Literal["text"] = "text"
|
||||
text: str = Field(...)
|
||||
|
||||
|
||||
class AnthropicImageSourceBase64(BaseModel):
|
||||
type: Literal["base64"] = "base64"
|
||||
media_type: str = Field(..., description="MIME type of the image, e.g. image/png, image/jpeg")
|
||||
data: str = Field(..., description="Base64-encoded image data")
|
||||
|
||||
|
||||
class AnthropicImageSourceUrl(BaseModel):
|
||||
type: Literal["url"] = "url"
|
||||
url: str = Field(...)
|
||||
|
||||
|
||||
class AnthropicImageContent(BaseModel):
|
||||
type: Literal["image"] = "image"
|
||||
source: AnthropicImageSourceBase64 | AnthropicImageSourceUrl = Field(...)
|
||||
|
||||
|
||||
class AnthropicMessage(BaseModel):
|
||||
role: AnthropicRole = Field(...)
|
||||
content: list[AnthropicTextContent | AnthropicImageContent] = Field(...)
|
||||
|
||||
|
||||
class AnthropicMessagesRequest(BaseModel):
|
||||
model: str = Field(...)
|
||||
messages: list[AnthropicMessage] = Field(...)
|
||||
max_tokens: int = Field(..., ge=1)
|
||||
system: str | None = Field(None, description="Top-level system prompt")
|
||||
temperature: float | None = Field(None, ge=0.0, le=1.0)
|
||||
top_p: float | None = Field(None, ge=0.0, le=1.0)
|
||||
top_k: int | None = Field(None, ge=0)
|
||||
stop_sequences: list[str] | None = Field(None)
|
||||
|
||||
|
||||
class AnthropicResponseTextBlock(BaseModel):
|
||||
type: Literal["text"] = "text"
|
||||
text: str = Field(...)
|
||||
|
||||
|
||||
class AnthropicCacheCreationUsage(BaseModel):
|
||||
ephemeral_5m_input_tokens: int | None = Field(None)
|
||||
ephemeral_1h_input_tokens: int | None = Field(None)
|
||||
|
||||
|
||||
class AnthropicMessagesUsage(BaseModel):
|
||||
input_tokens: int | None = Field(None)
|
||||
output_tokens: int | None = Field(None)
|
||||
cache_creation_input_tokens: int | None = Field(None)
|
||||
cache_read_input_tokens: int | None = Field(None)
|
||||
cache_creation: AnthropicCacheCreationUsage | None = Field(None)
|
||||
|
||||
|
||||
class AnthropicMessagesResponse(BaseModel):
|
||||
id: str | None = Field(None)
|
||||
type: str | None = Field(None)
|
||||
role: str | None = Field(None)
|
||||
model: str | None = Field(None)
|
||||
content: list[AnthropicResponseTextBlock] | None = Field(None)
|
||||
stop_reason: str | None = Field(None)
|
||||
stop_sequence: str | None = Field(None)
|
||||
usage: AnthropicMessagesUsage | None = Field(None)
|
||||
@ -23,7 +23,7 @@ class BriaEditImageRequest(BaseModel):
|
||||
None,
|
||||
description="Mask image (black and white). Black areas will be preserved, white areas will be edited. "
|
||||
"If omitted, the edit applies to the entire image. "
|
||||
"The input image and the the input mask must be of the same size.",
|
||||
"The input image and the input mask must be of the same size.",
|
||||
)
|
||||
negative_prompt: str | None = Field(None)
|
||||
guidance_scale: float = Field(...)
|
||||
|
||||
@ -198,6 +198,62 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
|
||||
("Custom", None, None),
|
||||
]
|
||||
|
||||
_PRESETS_SEEDREAM_1K = [
|
||||
("(1K) 1024x1024 (1:1)", 1024, 1024),
|
||||
("(1K) 864x1152 (3:4)", 864, 1152),
|
||||
("(1K) 1152x864 (4:3)", 1152, 864),
|
||||
("(1K) 1312x736 (16:9)", 1312, 736),
|
||||
("(1K) 736x1312 (9:16)", 736, 1312),
|
||||
("(1K) 832x1248 (2:3)", 832, 1248),
|
||||
("(1K) 1248x832 (3:2)", 1248, 832),
|
||||
("(1K) 1568x672 (21:9)", 1568, 672),
|
||||
]
|
||||
|
||||
_PRESETS_SEEDREAM_2K = [
|
||||
("(2K) 2048x2048 (1:1)", 2048, 2048),
|
||||
("(2K) 1728x2304 (3:4)", 1728, 2304),
|
||||
("(2K) 2304x1728 (4:3)", 2304, 1728),
|
||||
("(2K) 2848x1600 (16:9)", 2848, 1600),
|
||||
("(2K) 1600x2848 (9:16)", 1600, 2848),
|
||||
("(2K) 1664x2496 (2:3)", 1664, 2496),
|
||||
("(2K) 2496x1664 (3:2)", 2496, 1664),
|
||||
("(2K) 3136x1344 (21:9)", 3136, 1344),
|
||||
]
|
||||
|
||||
_PRESETS_SEEDREAM_3K = [
|
||||
("(3K) 3072x3072 (1:1)", 3072, 3072),
|
||||
("(3K) 2592x3456 (3:4)", 2592, 3456),
|
||||
("(3K) 3456x2592 (4:3)", 3456, 2592),
|
||||
("(3K) 4096x2304 (16:9)", 4096, 2304),
|
||||
("(3K) 2304x4096 (9:16)", 2304, 4096),
|
||||
("(3K) 2496x3744 (2:3)", 2496, 3744),
|
||||
("(3K) 3744x2496 (3:2)", 3744, 2496),
|
||||
("(3K) 4704x2016 (21:9)", 4704, 2016),
|
||||
]
|
||||
|
||||
_PRESETS_SEEDREAM_4K = [
|
||||
("(4K) 4096x4096 (1:1)", 4096, 4096),
|
||||
("(4K) 3520x4704 (3:4)", 3520, 4704),
|
||||
("(4K) 4704x3520 (4:3)", 4704, 3520),
|
||||
("(4K) 5504x3040 (16:9)", 5504, 3040),
|
||||
("(4K) 3040x5504 (9:16)", 3040, 5504),
|
||||
("(4K) 3328x4992 (2:3)", 3328, 4992),
|
||||
("(4K) 4992x3328 (3:2)", 4992, 3328),
|
||||
("(4K) 6240x2656 (21:9)", 6240, 2656),
|
||||
]
|
||||
|
||||
_CUSTOM_PRESET = [("Custom", None, None)]
|
||||
|
||||
RECOMMENDED_PRESETS_SEEDREAM_5_LITE = (
|
||||
_PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_3K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
|
||||
)
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4_5 = (
|
||||
_PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
|
||||
)
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4_0 = (
|
||||
_PRESETS_SEEDREAM_1K + _PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
|
||||
)
|
||||
|
||||
# Seedance 2.0 reference video pixel count limits per model and output resolution.
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
|
||||
"dreamina-seedance-2-0-260128": {
|
||||
|
||||
@ -1,10 +1,11 @@
|
||||
from __future__ import annotations
|
||||
from enum import Enum
|
||||
from typing import Optional, List, Dict, Any, Union
|
||||
from typing import Optional, Any
|
||||
|
||||
from pydantic import BaseModel, Field, RootModel
|
||||
|
||||
|
||||
class TripoModelVersion(str, Enum):
|
||||
v3_1_20260211 = 'v3.1-20260211'
|
||||
v3_0_20250812 = 'v3.0-20250812'
|
||||
v2_5_20250123 = 'v2.5-20250123'
|
||||
v2_0_20240919 = 'v2.0-20240919'
|
||||
@ -142,7 +143,7 @@ class TripoFileEmptyReference(BaseModel):
|
||||
pass
|
||||
|
||||
class TripoFileReference(RootModel):
|
||||
root: Union[TripoFileTokenReference, TripoUrlReference, TripoObjectReference, TripoFileEmptyReference]
|
||||
root: TripoFileTokenReference | TripoUrlReference | TripoObjectReference | TripoFileEmptyReference
|
||||
|
||||
class TripoGetStsTokenRequest(BaseModel):
|
||||
format: str = Field(..., description='The format of the image')
|
||||
@ -183,7 +184,7 @@ class TripoImageToModelRequest(BaseModel):
|
||||
|
||||
class TripoMultiviewToModelRequest(BaseModel):
|
||||
type: TripoTaskType = TripoTaskType.MULTIVIEW_TO_MODEL
|
||||
files: List[TripoFileReference] = Field(..., description='The file references to convert to a model')
|
||||
files: list[TripoFileReference] = Field(..., description='The file references to convert to a model')
|
||||
model_version: Optional[TripoModelVersion] = Field(None, description='The model version to use for generation')
|
||||
orthographic_projection: Optional[bool] = Field(False, description='Whether to use orthographic projection')
|
||||
face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to')
|
||||
@ -251,27 +252,13 @@ class TripoConvertModelRequest(BaseModel):
|
||||
with_animation: Optional[bool] = Field(None, description='Whether to include animations')
|
||||
pack_uv: Optional[bool] = Field(None, description='Whether to pack the UVs')
|
||||
bake: Optional[bool] = Field(None, description='Whether to bake the model')
|
||||
part_names: Optional[List[str]] = Field(None, description='The names of the parts to include')
|
||||
part_names: Optional[list[str]] = Field(None, description='The names of the parts to include')
|
||||
fbx_preset: Optional[TripoFbxPreset] = Field(None, description='The preset for the FBX export')
|
||||
export_vertex_colors: Optional[bool] = Field(None, description='Whether to export the vertex colors')
|
||||
export_orientation: Optional[TripoOrientation] = Field(None, description='The orientation for the export')
|
||||
animate_in_place: Optional[bool] = Field(None, description='Whether to animate in place')
|
||||
|
||||
|
||||
class TripoTaskRequest(RootModel):
|
||||
root: Union[
|
||||
TripoTextToModelRequest,
|
||||
TripoImageToModelRequest,
|
||||
TripoMultiviewToModelRequest,
|
||||
TripoTextureModelRequest,
|
||||
TripoRefineModelRequest,
|
||||
TripoAnimatePrerigcheckRequest,
|
||||
TripoAnimateRigRequest,
|
||||
TripoAnimateRetargetRequest,
|
||||
TripoStylizeModelRequest,
|
||||
TripoConvertModelRequest
|
||||
]
|
||||
|
||||
class TripoTaskOutput(BaseModel):
|
||||
model: Optional[str] = Field(None, description='URL to the model')
|
||||
base_model: Optional[str] = Field(None, description='URL to the base model')
|
||||
@ -283,12 +270,13 @@ class TripoTask(BaseModel):
|
||||
task_id: str = Field(..., description='The task ID')
|
||||
type: Optional[str] = Field(None, description='The type of task')
|
||||
status: Optional[TripoTaskStatus] = Field(None, description='The status of the task')
|
||||
input: Optional[Dict[str, Any]] = Field(None, description='The input parameters for the task')
|
||||
input: Optional[dict[str, Any]] = Field(None, description='The input parameters for the task')
|
||||
output: Optional[TripoTaskOutput] = Field(None, description='The output of the task')
|
||||
progress: Optional[int] = Field(None, description='The progress of the task', ge=0, le=100)
|
||||
create_time: Optional[int] = Field(None, description='The creation time of the task')
|
||||
running_left_time: Optional[int] = Field(None, description='The estimated time left for the task')
|
||||
queue_position: Optional[int] = Field(None, description='The position in the queue')
|
||||
consumed_credit: int | None = Field(None)
|
||||
|
||||
class TripoTaskResponse(BaseModel):
|
||||
code: int = Field(0, description='The response code')
|
||||
@ -296,7 +284,7 @@ class TripoTaskResponse(BaseModel):
|
||||
|
||||
class TripoGeneralResponse(BaseModel):
|
||||
code: int = Field(0, description='The response code')
|
||||
data: Dict[str, str] = Field(..., description='The task ID data')
|
||||
data: dict[str, str] = Field(..., description='The task ID data')
|
||||
|
||||
class TripoBalanceData(BaseModel):
|
||||
balance: float = Field(..., description='The account balance')
|
||||
|
||||
245
comfy_api_nodes/nodes_anthropic.py
Normal file
245
comfy_api_nodes/nodes_anthropic.py
Normal file
@ -0,0 +1,245 @@
|
||||
"""API Nodes for Anthropic Claude (Messages API). See: https://docs.anthropic.com/en/api/messages"""
|
||||
|
||||
from typing_extensions import override
|
||||
|
||||
from comfy_api.latest import IO, ComfyExtension, Input
|
||||
from comfy_api_nodes.apis.anthropic import (
|
||||
AnthropicImageContent,
|
||||
AnthropicImageSourceUrl,
|
||||
AnthropicMessage,
|
||||
AnthropicMessagesRequest,
|
||||
AnthropicMessagesResponse,
|
||||
AnthropicRole,
|
||||
AnthropicTextContent,
|
||||
)
|
||||
from comfy_api_nodes.util import (
|
||||
ApiEndpoint,
|
||||
get_number_of_images,
|
||||
sync_op,
|
||||
upload_images_to_comfyapi,
|
||||
validate_string,
|
||||
)
|
||||
|
||||
ANTHROPIC_MESSAGES_ENDPOINT = "/proxy/anthropic/v1/messages"
|
||||
ANTHROPIC_IMAGE_MAX_PIXELS = 1568 * 1568
|
||||
CLAUDE_MAX_IMAGES = 20
|
||||
|
||||
CLAUDE_MODELS: dict[str, str] = {
|
||||
"Opus 4.7": "claude-opus-4-7",
|
||||
"Opus 4.6": "claude-opus-4-6",
|
||||
"Sonnet 4.6": "claude-sonnet-4-6",
|
||||
"Sonnet 4.5": "claude-sonnet-4-5-20250929",
|
||||
"Haiku 4.5": "claude-haiku-4-5-20251001",
|
||||
}
|
||||
|
||||
|
||||
def _claude_model_inputs():
|
||||
return [
|
||||
IO.Int.Input(
|
||||
"max_tokens",
|
||||
default=16000,
|
||||
min=32,
|
||||
max=32000,
|
||||
tooltip="Maximum number of tokens to generate before stopping.",
|
||||
advanced=True,
|
||||
),
|
||||
IO.Float.Input(
|
||||
"temperature",
|
||||
default=1.0,
|
||||
min=0.0,
|
||||
max=1.0,
|
||||
step=0.01,
|
||||
tooltip="Controls randomness. 0.0 is deterministic, 1.0 is most random.",
|
||||
advanced=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _model_price_per_million(model: str) -> tuple[float, float] | None:
|
||||
"""Return (input_per_1M, output_per_1M) USD for a Claude model, or None if unknown."""
|
||||
if "opus-4-7" in model or "opus-4-6" in model or "opus-4-5" in model:
|
||||
return 5.0, 25.0
|
||||
if "sonnet-4" in model:
|
||||
return 3.0, 15.0
|
||||
if "haiku-4-5" in model:
|
||||
return 1.0, 5.0
|
||||
return None
|
||||
|
||||
|
||||
def calculate_tokens_price(response: AnthropicMessagesResponse) -> float | None:
|
||||
"""Compute approximate USD price from response usage. Server-side billing is authoritative."""
|
||||
if not response.usage or not response.model:
|
||||
return None
|
||||
rates = _model_price_per_million(response.model)
|
||||
if rates is None:
|
||||
return None
|
||||
input_rate, output_rate = rates
|
||||
input_tokens = response.usage.input_tokens or 0
|
||||
output_tokens = response.usage.output_tokens or 0
|
||||
cache_read = response.usage.cache_read_input_tokens or 0
|
||||
cache_5m = 0
|
||||
cache_1h = 0
|
||||
if response.usage.cache_creation:
|
||||
cache_5m = response.usage.cache_creation.ephemeral_5m_input_tokens or 0
|
||||
cache_1h = response.usage.cache_creation.ephemeral_1h_input_tokens or 0
|
||||
total = (
|
||||
input_tokens * input_rate
|
||||
+ output_tokens * output_rate
|
||||
+ cache_read * input_rate * 0.1
|
||||
+ cache_5m * input_rate * 1.25
|
||||
+ cache_1h * input_rate * 2.0
|
||||
)
|
||||
return total / 1_000_000.0
|
||||
|
||||
|
||||
def _get_text_from_response(response: AnthropicMessagesResponse) -> str:
|
||||
if not response.content:
|
||||
return ""
|
||||
return "\n".join(block.text for block in response.content if block.text)
|
||||
|
||||
|
||||
async def _build_image_content_blocks(
|
||||
cls: type[IO.ComfyNode],
|
||||
image_tensors: list[Input.Image],
|
||||
) -> list[AnthropicImageContent]:
|
||||
urls = await upload_images_to_comfyapi(
|
||||
cls,
|
||||
image_tensors,
|
||||
max_images=CLAUDE_MAX_IMAGES,
|
||||
total_pixels=ANTHROPIC_IMAGE_MAX_PIXELS,
|
||||
wait_label="Uploading reference images",
|
||||
)
|
||||
return [AnthropicImageContent(source=AnthropicImageSourceUrl(url=url)) for url in urls]
|
||||
|
||||
|
||||
class ClaudeNode(IO.ComfyNode):
|
||||
"""Generate text responses from an Anthropic Claude model."""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="ClaudeNode",
|
||||
display_name="Anthropic Claude",
|
||||
category="api node/text/Anthropic",
|
||||
essentials_category="Text Generation",
|
||||
description="Generate text responses with Anthropic's Claude models. "
|
||||
"Provide a text prompt and optionally one or more images for multimodal context.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Text input to the model.",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[IO.DynamicCombo.Option(label, _claude_model_inputs()) for label in CLAUDE_MODELS],
|
||||
tooltip="The Claude model used to generate the response.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2147483647,
|
||||
control_after_generate=True,
|
||||
tooltip="Seed controls whether the node should re-run; "
|
||||
"results are non-deterministic regardless of seed.",
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, CLAUDE_MAX_IMAGES + 1)],
|
||||
min=0,
|
||||
),
|
||||
tooltip=f"Optional image(s) to use as context for the model. Up to {CLAUDE_MAX_IMAGES} images.",
|
||||
),
|
||||
IO.String.Input(
|
||||
"system_prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
optional=True,
|
||||
advanced=True,
|
||||
tooltip="Foundational instructions that dictate the model's behavior.",
|
||||
),
|
||||
],
|
||||
outputs=[IO.String.Output()],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model"]),
|
||||
expr="""
|
||||
(
|
||||
$m := widgets.model;
|
||||
$contains($m, "opus") ? {
|
||||
"type": "list_usd",
|
||||
"usd": [0.005, 0.025],
|
||||
"format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
|
||||
}
|
||||
: $contains($m, "sonnet") ? {
|
||||
"type": "list_usd",
|
||||
"usd": [0.003, 0.015],
|
||||
"format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
|
||||
}
|
||||
: $contains($m, "haiku") ? {
|
||||
"type": "list_usd",
|
||||
"usd": [0.001, 0.005],
|
||||
"format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
|
||||
}
|
||||
: {"type":"text", "text":"Token-based"}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
seed: int,
|
||||
images: dict | None = None,
|
||||
system_prompt: str = "",
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||
model_label = model["model"]
|
||||
max_tokens = model["max_tokens"]
|
||||
temperature = model["temperature"]
|
||||
|
||||
image_tensors: list[Input.Image] = [t for t in (images or {}).values() if t is not None]
|
||||
if sum(get_number_of_images(t) for t in image_tensors) > CLAUDE_MAX_IMAGES:
|
||||
raise ValueError(f"Up to {CLAUDE_MAX_IMAGES} images are supported per request.")
|
||||
|
||||
content: list[AnthropicTextContent | AnthropicImageContent] = []
|
||||
if image_tensors:
|
||||
content.extend(await _build_image_content_blocks(cls, image_tensors))
|
||||
content.append(AnthropicTextContent(text=prompt))
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path=ANTHROPIC_MESSAGES_ENDPOINT, method="POST"),
|
||||
response_model=AnthropicMessagesResponse,
|
||||
data=AnthropicMessagesRequest(
|
||||
model=CLAUDE_MODELS[model_label],
|
||||
max_tokens=max_tokens,
|
||||
messages=[AnthropicMessage(role=AnthropicRole.user, content=content)],
|
||||
system=system_prompt or None,
|
||||
temperature=temperature,
|
||||
),
|
||||
price_extractor=calculate_tokens_price,
|
||||
)
|
||||
return IO.NodeOutput(_get_text_from_response(response) or "Empty response from Claude model.")
|
||||
|
||||
|
||||
class AnthropicExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
return [ClaudeNode]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> AnthropicExtension:
|
||||
return AnthropicExtension()
|
||||
@ -596,6 +596,7 @@ class Flux2ProImageNode(IO.ComfyNode):
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["width", "height"], inputs=["images"]),
|
||||
expr=cls.PRICE_BADGE_EXPR,
|
||||
),
|
||||
is_deprecated=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -674,6 +675,175 @@ class Flux2MaxImageNode(Flux2ProImageNode):
|
||||
"""
|
||||
|
||||
|
||||
_FLUX2_MODEL_ENDPOINTS = {
|
||||
"Flux.2 [pro]": "/proxy/bfl/flux-2-pro/generate",
|
||||
"Flux.2 [max]": "/proxy/bfl/flux-2-max/generate",
|
||||
}
|
||||
|
||||
|
||||
def _flux2_model_inputs():
|
||||
return [
|
||||
IO.Int.Input(
|
||||
"width",
|
||||
default=1024,
|
||||
min=256,
|
||||
max=2048,
|
||||
step=32,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"height",
|
||||
default=768,
|
||||
min=256,
|
||||
max=2048,
|
||||
step=32,
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, 9)],
|
||||
min=0,
|
||||
),
|
||||
tooltip="Optional reference image(s) for image-to-image generation. Up to 8 images.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class Flux2ImageNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls) -> IO.Schema:
|
||||
return IO.Schema(
|
||||
node_id="Flux2ImageNode",
|
||||
display_name="Flux.2 Image",
|
||||
category="api node/image/BFL",
|
||||
description="Generate images via Flux.2 [pro] or Flux.2 [max] from a prompt and optional reference images.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Prompt for the image generation or edit",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option("Flux.2 [pro]", _flux2_model_inputs()),
|
||||
IO.DynamicCombo.Option("Flux.2 [max]", _flux2_model_inputs()),
|
||||
],
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=0xFFFFFFFFFFFFFFFF,
|
||||
control_after_generate=True,
|
||||
tooltip="The random seed used for creating the noise.",
|
||||
),
|
||||
],
|
||||
outputs=[IO.Image.Output()],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=["model", "model.width", "model.height"],
|
||||
input_groups=["model.images"],
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$isMax := widgets.model = "flux.2 [max]";
|
||||
$MP := 1024 * 1024;
|
||||
$w := $lookup(widgets, "model.width");
|
||||
$h := $lookup(widgets, "model.height");
|
||||
$outMP := $max([1, $floor((($w * $h) + $MP - 1) / $MP)]);
|
||||
$outputCost := $isMax
|
||||
? (0.07 + 0.03 * ($outMP - 1))
|
||||
: (0.03 + 0.015 * ($outMP - 1));
|
||||
$refMin := $isMax ? 0.03 : 0.015;
|
||||
$refMax := $isMax ? 0.24 : 0.12;
|
||||
$hasRefs := $lookup(inputGroups, "model.images") > 0;
|
||||
$hasRefs
|
||||
? {
|
||||
"type": "range_usd",
|
||||
"min_usd": $outputCost + $refMin,
|
||||
"max_usd": $outputCost + $refMax,
|
||||
"format": { "approximate": true }
|
||||
}
|
||||
: {"type": "usd", "usd": $outputCost}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
seed: int,
|
||||
) -> IO.NodeOutput:
|
||||
model_choice = model["model"]
|
||||
endpoint = _FLUX2_MODEL_ENDPOINTS[model_choice]
|
||||
width = model["width"]
|
||||
height = model["height"]
|
||||
images_dict = model.get("images") or {}
|
||||
|
||||
image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
|
||||
n_images = sum(get_number_of_images(t) for t in image_tensors)
|
||||
if n_images > 8:
|
||||
raise ValueError("The current maximum number of supported images is 8.")
|
||||
|
||||
flat_tensors: list[torch.Tensor] = []
|
||||
for tensor in image_tensors:
|
||||
if len(tensor.shape) == 4:
|
||||
flat_tensors.extend(tensor[i] for i in range(tensor.shape[0]))
|
||||
else:
|
||||
flat_tensors.append(tensor)
|
||||
|
||||
reference_images: dict[str, str] = {}
|
||||
for idx, tensor in enumerate(flat_tensors):
|
||||
key_name = f"input_image_{idx + 1}" if idx else "input_image"
|
||||
reference_images[key_name] = tensor_to_base64_string(tensor, total_pixels=2048 * 2048)
|
||||
|
||||
initial_response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path=endpoint, method="POST"),
|
||||
response_model=BFLFluxProGenerateResponse,
|
||||
data=Flux2ProGenerateRequest(
|
||||
prompt=prompt,
|
||||
width=width,
|
||||
height=height,
|
||||
seed=seed,
|
||||
**reference_images,
|
||||
),
|
||||
)
|
||||
|
||||
def price_extractor(_r: BaseModel) -> float | None:
|
||||
return None if initial_response.cost is None else initial_response.cost / 100
|
||||
|
||||
response = await poll_op(
|
||||
cls,
|
||||
ApiEndpoint(initial_response.polling_url),
|
||||
response_model=BFLFluxStatusResponse,
|
||||
status_extractor=lambda r: r.status,
|
||||
progress_extractor=lambda r: r.progress,
|
||||
price_extractor=price_extractor,
|
||||
completed_statuses=[BFLStatus.ready],
|
||||
failed_statuses=[
|
||||
BFLStatus.request_moderated,
|
||||
BFLStatus.content_moderated,
|
||||
BFLStatus.error,
|
||||
BFLStatus.task_not_found,
|
||||
],
|
||||
queued_statuses=[],
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"]))
|
||||
|
||||
|
||||
class BFLExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
@ -685,6 +855,7 @@ class BFLExtension(ComfyExtension):
|
||||
FluxProFillNode,
|
||||
Flux2ProImageNode,
|
||||
Flux2MaxImageNode,
|
||||
Flux2ImageNode,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -10,6 +10,9 @@ from comfy_api.latest import IO, ComfyExtension, Input
|
||||
from comfy_api_nodes.apis.bytedance import (
|
||||
RECOMMENDED_PRESETS,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4_0,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4_5,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_5_LITE,
|
||||
SEEDANCE2_PRICE_PER_1K_TOKENS,
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS,
|
||||
VIDEO_TASKS_EXECUTION_TIME,
|
||||
@ -68,6 +71,12 @@ SEEDREAM_MODELS = {
|
||||
"seedream-4-0-250828": "seedream-4-0-250828",
|
||||
}
|
||||
|
||||
SEEDREAM_PRESETS = {
|
||||
"seedream-5-0-260128": RECOMMENDED_PRESETS_SEEDREAM_5_LITE,
|
||||
"seedream-4-5-251128": RECOMMENDED_PRESETS_SEEDREAM_4_5,
|
||||
"seedream-4-0-250828": RECOMMENDED_PRESETS_SEEDREAM_4_0,
|
||||
}
|
||||
|
||||
# Long-running tasks endpoints(e.g., video)
|
||||
BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"
|
||||
BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks" # + /{task_id}
|
||||
@ -562,6 +571,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
|
||||
)
|
||||
""",
|
||||
),
|
||||
is_deprecated=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -651,6 +661,226 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
|
||||
return IO.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls]))
|
||||
|
||||
|
||||
def _seedream_model_inputs(*, max_ref_images: int, presets: list):
|
||||
return [
|
||||
IO.Combo.Input(
|
||||
"size_preset",
|
||||
options=[label for label, _, _ in presets],
|
||||
tooltip="Pick a recommended size. Select Custom to use the width and height below.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"width",
|
||||
default=2048,
|
||||
min=1024,
|
||||
max=6240,
|
||||
step=2,
|
||||
tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"height",
|
||||
default=2048,
|
||||
min=1024,
|
||||
max=4992,
|
||||
step=2,
|
||||
tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"max_images",
|
||||
default=1,
|
||||
min=1,
|
||||
max=max_ref_images,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
tooltip="Maximum number of images to generate. With 1, exactly one image is produced. "
|
||||
"With >1, the model generates between 1 and max_images related images "
|
||||
"(e.g., story scenes, character variations). "
|
||||
"Total images (input + generated) cannot exceed 15.",
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, max_ref_images + 1)],
|
||||
min=0,
|
||||
),
|
||||
tooltip=f"Optional reference image(s) for image-to-image or multi-reference generation. "
|
||||
f"Up to {max_ref_images} images.",
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
"fail_on_partial",
|
||||
default=False,
|
||||
tooltip="If enabled, abort execution if any requested images are missing or return an error.",
|
||||
advanced=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class ByteDanceSeedreamNodeV2(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="ByteDanceSeedreamNodeV2",
|
||||
display_name="ByteDance Seedream 4.5 & 5.0",
|
||||
category="api node/image/ByteDance",
|
||||
description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Text prompt for creating or editing an image.",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option(
|
||||
"seedream 5.0 lite",
|
||||
_seedream_model_inputs(max_ref_images=14, presets=RECOMMENDED_PRESETS_SEEDREAM_5_LITE),
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"seedream-4-5-251128",
|
||||
_seedream_model_inputs(max_ref_images=10, presets=RECOMMENDED_PRESETS_SEEDREAM_4_5),
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"seedream-4-0-250828",
|
||||
_seedream_model_inputs(max_ref_images=10, presets=RECOMMENDED_PRESETS_SEEDREAM_4_0),
|
||||
),
|
||||
],
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2147483647,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
control_after_generate=True,
|
||||
tooltip="Seed to use for generation.",
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
"watermark",
|
||||
default=False,
|
||||
tooltip='Whether to add an "AI generated" watermark to the image.',
|
||||
advanced=True,
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
IO.Image.Output(),
|
||||
],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model"]),
|
||||
expr="""
|
||||
(
|
||||
$price := $contains(widgets.model, "5.0 lite") ? 0.035 :
|
||||
$contains(widgets.model, "4-5") ? 0.04 : 0.03;
|
||||
{
|
||||
"type":"usd",
|
||||
"usd": $price,
|
||||
"format": { "suffix":" x images/Run", "approximate": true }
|
||||
}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
seed: int = 0,
|
||||
watermark: bool = False,
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||
model_id = SEEDREAM_MODELS[model["model"]]
|
||||
presets = SEEDREAM_PRESETS[model_id]
|
||||
|
||||
size_preset = model.get("size_preset", presets[0][0])
|
||||
width = model.get("width", 2048)
|
||||
height = model.get("height", 2048)
|
||||
max_images = model.get("max_images", 1)
|
||||
sequential_image_generation = "disabled" if max_images == 1 else "auto"
|
||||
images_dict = model.get("images") or {}
|
||||
fail_on_partial = model.get("fail_on_partial", False)
|
||||
|
||||
w = h = None
|
||||
for label, tw, th in presets:
|
||||
if label == size_preset:
|
||||
w, h = tw, th
|
||||
break
|
||||
if w is None or h is None:
|
||||
w, h = width, height
|
||||
|
||||
out_num_pixels = w * h
|
||||
mp_provided = out_num_pixels / 1_000_000.0
|
||||
if ("seedream-4-5" in model_id or "seedream-5-0" in model_id) and out_num_pixels < 3686400:
|
||||
raise ValueError(
|
||||
f"Minimum image resolution for the selected model is 3.68MP, but {mp_provided:.2f}MP provided."
|
||||
)
|
||||
if "seedream-4-0" in model_id and out_num_pixels < 921600:
|
||||
raise ValueError(
|
||||
f"Minimum image resolution that the selected model can generate is 0.92MP, "
|
||||
f"but {mp_provided:.2f}MP provided."
|
||||
)
|
||||
if out_num_pixels > 16_777_216:
|
||||
raise ValueError(
|
||||
f"Maximum image resolution for the selected model is 16.78MP, but {mp_provided:.2f}MP provided."
|
||||
)
|
||||
|
||||
image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
|
||||
n_input_images = sum(get_number_of_images(t) for t in image_tensors)
|
||||
max_num_of_images = 14 if model_id == "seedream-5-0-260128" else 10
|
||||
if n_input_images > max_num_of_images:
|
||||
raise ValueError(
|
||||
f"Maximum of {max_num_of_images} reference images are supported, but {n_input_images} received."
|
||||
)
|
||||
if sequential_image_generation == "auto" and n_input_images + max_images > 15:
|
||||
raise ValueError(
|
||||
"The maximum number of generated images plus the number of reference images cannot exceed 15."
|
||||
)
|
||||
|
||||
reference_images_urls: list[str] = []
|
||||
if image_tensors:
|
||||
for tensor in image_tensors:
|
||||
validate_image_aspect_ratio(tensor, (1, 3), (3, 1))
|
||||
reference_images_urls = await upload_images_to_comfyapi(
|
||||
cls,
|
||||
image_tensors,
|
||||
max_images=n_input_images,
|
||||
mime_type="image/png",
|
||||
wait_label="Uploading reference images",
|
||||
)
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"),
|
||||
response_model=ImageTaskCreationResponse,
|
||||
data=Seedream4TaskCreationRequest(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
image=reference_images_urls,
|
||||
size=f"{w}x{h}",
|
||||
seed=seed,
|
||||
sequential_image_generation=sequential_image_generation,
|
||||
sequential_image_generation_options=Seedream4Options(max_images=max_images),
|
||||
watermark=watermark,
|
||||
),
|
||||
)
|
||||
if len(response.data) == 1:
|
||||
return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response)))
|
||||
urls = [str(d["url"]) for d in response.data if isinstance(d, dict) and "url" in d]
|
||||
if fail_on_partial and len(urls) < len(response.data):
|
||||
raise RuntimeError(f"Only {len(urls)} of {len(response.data)} images were generated before error.")
|
||||
return IO.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls]))
|
||||
|
||||
|
||||
class ByteDanceTextToVideoNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -2105,6 +2335,7 @@ class ByteDanceExtension(ComfyExtension):
|
||||
return [
|
||||
ByteDanceImageNode,
|
||||
ByteDanceSeedreamNode,
|
||||
ByteDanceSeedreamNodeV2,
|
||||
ByteDanceTextToVideoNode,
|
||||
ByteDanceImageToVideoNode,
|
||||
ByteDanceFirstLastFrameNode,
|
||||
|
||||
@ -162,6 +162,61 @@ class GrokImageNode(IO.ComfyNode):
|
||||
)
|
||||
|
||||
|
||||
_GROK_IMAGE_EDIT_ASPECT_RATIO_OPTIONS = [
|
||||
"auto",
|
||||
"1:1",
|
||||
"2:3",
|
||||
"3:2",
|
||||
"3:4",
|
||||
"4:3",
|
||||
"9:16",
|
||||
"16:9",
|
||||
"9:19.5",
|
||||
"19.5:9",
|
||||
"9:20",
|
||||
"20:9",
|
||||
"1:2",
|
||||
"2:1",
|
||||
]
|
||||
|
||||
|
||||
def _grok_image_edit_model_inputs(*, max_ref_images: int, with_aspect_ratio: bool):
|
||||
inputs = [
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, max_ref_images + 1)],
|
||||
min=1,
|
||||
),
|
||||
tooltip=(
|
||||
"Reference image to edit."
|
||||
if max_ref_images == 1
|
||||
else f"Reference image(s) to edit. Up to {max_ref_images} images."
|
||||
),
|
||||
),
|
||||
IO.Combo.Input("resolution", options=["1K", "2K"]),
|
||||
IO.Int.Input(
|
||||
"number_of_images",
|
||||
default=1,
|
||||
min=1,
|
||||
max=10,
|
||||
step=1,
|
||||
tooltip="Number of edited images to generate",
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
),
|
||||
]
|
||||
if with_aspect_ratio:
|
||||
inputs.append(
|
||||
IO.Combo.Input(
|
||||
"aspect_ratio",
|
||||
options=_GROK_IMAGE_EDIT_ASPECT_RATIO_OPTIONS,
|
||||
tooltip="Only allowed when multiple images are connected.",
|
||||
)
|
||||
)
|
||||
return inputs
|
||||
|
||||
|
||||
class GrokImageEditNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -256,6 +311,7 @@ class GrokImageEditNode(IO.ComfyNode):
|
||||
)
|
||||
""",
|
||||
),
|
||||
is_deprecated=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -303,6 +359,143 @@ class GrokImageEditNode(IO.ComfyNode):
|
||||
)
|
||||
|
||||
|
||||
class GrokImageEditNodeV2(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="GrokImageEditNodeV2",
|
||||
display_name="Grok Image Edit",
|
||||
category="api node/image/Grok",
|
||||
description="Modify an existing image based on a text prompt",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="The text prompt used to generate the image",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option(
|
||||
"grok-imagine-image-quality",
|
||||
_grok_image_edit_model_inputs(max_ref_images=3, with_aspect_ratio=True),
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"grok-imagine-image-pro",
|
||||
_grok_image_edit_model_inputs(max_ref_images=1, with_aspect_ratio=False),
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"grok-imagine-image",
|
||||
_grok_image_edit_model_inputs(max_ref_images=3, with_aspect_ratio=True),
|
||||
),
|
||||
],
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2147483647,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
control_after_generate=True,
|
||||
tooltip="Seed to determine if node should re-run; "
|
||||
"actual results are nondeterministic regardless of seed.",
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
IO.Image.Output(),
|
||||
],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=["model", "model.resolution", "model.number_of_images"],
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$isQualityModel := widgets.model = "grok-imagine-image-quality";
|
||||
$isPro := $contains(widgets.model, "pro");
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$n := $lookup(widgets, "model.number_of_images");
|
||||
$rate := $isQualityModel
|
||||
? ($res = "1k" ? 0.05 : 0.07)
|
||||
: ($isPro ? 0.07 : 0.02);
|
||||
$base := $isQualityModel ? 0.01 : 0.002;
|
||||
$output := $rate * $n;
|
||||
$isPro
|
||||
? {"type":"usd","usd": $base + $output}
|
||||
: {"type":"range_usd","min_usd": $base + $output, "max_usd": 3 * $base + $output}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
seed: int,
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||
model_id = model["model"]
|
||||
resolution = model["resolution"]
|
||||
number_of_images = model["number_of_images"]
|
||||
images_dict = model.get("images") or {}
|
||||
aspect_ratio = model.get("aspect_ratio", "auto")
|
||||
|
||||
image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
|
||||
n_images = sum(get_number_of_images(t) for t in image_tensors)
|
||||
if n_images < 1:
|
||||
raise ValueError("At least one image is required for editing.")
|
||||
if model_id == "grok-imagine-image-pro" and n_images > 1:
|
||||
raise ValueError("The pro model supports only 1 input image.")
|
||||
if model_id != "grok-imagine-image-pro" and n_images > 3:
|
||||
raise ValueError("A maximum of 3 input images is supported.")
|
||||
if aspect_ratio != "auto" and n_images == 1:
|
||||
raise ValueError(
|
||||
"Custom aspect ratio is only allowed when multiple images are connected to the image input."
|
||||
)
|
||||
|
||||
flat_tensors: list[torch.Tensor] = []
|
||||
for tensor in image_tensors:
|
||||
if len(tensor.shape) == 4:
|
||||
flat_tensors.extend(tensor[i] for i in range(tensor.shape[0]))
|
||||
else:
|
||||
flat_tensors.append(tensor)
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/xai/v1/images/edits", method="POST"),
|
||||
data=ImageEditRequest(
|
||||
model=model_id,
|
||||
images=[
|
||||
InputUrlObject(url=f"data:image/png;base64,{tensor_to_base64_string(i)}") for i in flat_tensors
|
||||
],
|
||||
prompt=prompt,
|
||||
resolution=resolution.lower(),
|
||||
n=number_of_images,
|
||||
seed=seed,
|
||||
aspect_ratio=None if aspect_ratio == "auto" else aspect_ratio,
|
||||
),
|
||||
response_model=ImageGenerationResponse,
|
||||
price_extractor=_extract_grok_price,
|
||||
)
|
||||
if len(response.data) == 1:
|
||||
return IO.NodeOutput(await download_url_to_image_tensor(response.data[0].url))
|
||||
return IO.NodeOutput(
|
||||
torch.cat(
|
||||
[await download_url_to_image_tensor(i) for i in [str(d.url) for d in response.data if d.url]],
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class GrokVideoNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -737,6 +930,7 @@ class GrokExtension(ComfyExtension):
|
||||
return [
|
||||
GrokImageNode,
|
||||
GrokImageEditNode,
|
||||
GrokImageEditNodeV2,
|
||||
GrokVideoNode,
|
||||
GrokVideoReferenceNode,
|
||||
GrokVideoEditNode,
|
||||
|
||||
@ -27,6 +27,7 @@ from comfy_api_nodes.util import (
|
||||
ApiEndpoint,
|
||||
download_url_to_bytesio,
|
||||
downscale_image_tensor,
|
||||
get_number_of_images,
|
||||
poll_op,
|
||||
sync_op,
|
||||
tensor_to_base64_string,
|
||||
@ -372,6 +373,7 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
display_name="OpenAI GPT Image 2",
|
||||
category="api node/image/OpenAI",
|
||||
description="Generates images synchronously via OpenAI's GPT Image endpoint.",
|
||||
is_deprecated=True,
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
@ -640,6 +642,316 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
return IO.NodeOutput(await validate_and_cast_response(response))
|
||||
|
||||
|
||||
def _gpt_image_shared_inputs():
|
||||
"""Inputs shared by all GPT Image models (quality + reference images + mask)."""
|
||||
return [
|
||||
IO.Combo.Input(
|
||||
"quality",
|
||||
default="low",
|
||||
options=["low", "medium", "high"],
|
||||
tooltip="Image quality, affects cost and generation time.",
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, 17)],
|
||||
min=0,
|
||||
),
|
||||
tooltip="Optional reference image(s) for image editing. Up to 16 images.",
|
||||
),
|
||||
IO.Mask.Input(
|
||||
"mask",
|
||||
optional=True,
|
||||
tooltip="Optional mask for inpainting (white areas will be replaced). "
|
||||
"Requires exactly one reference image.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _gpt_image_legacy_model_inputs():
|
||||
"""Per-model widget set for legacy gpt-image-1 / gpt-image-1.5 (4 base sizes, transparent bg allowed)."""
|
||||
return [
|
||||
IO.Combo.Input(
|
||||
"size",
|
||||
default="auto",
|
||||
options=["auto", "1024x1024", "1024x1536", "1536x1024"],
|
||||
tooltip="Image size.",
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"background",
|
||||
default="auto",
|
||||
options=["auto", "opaque", "transparent"],
|
||||
tooltip="Return image with or without background.",
|
||||
),
|
||||
*_gpt_image_shared_inputs(),
|
||||
]
|
||||
|
||||
|
||||
class OpenAIGPTImageNodeV2(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="OpenAIGPTImageNodeV2",
|
||||
display_name="OpenAI GPT Image 2",
|
||||
category="api node/image/OpenAI",
|
||||
description="Generates images via OpenAI's GPT Image endpoint.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
default="",
|
||||
multiline=True,
|
||||
tooltip="Text prompt for GPT Image",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option(
|
||||
"gpt-image-2",
|
||||
[
|
||||
IO.Combo.Input(
|
||||
"size",
|
||||
default="auto",
|
||||
options=[
|
||||
"auto",
|
||||
"1024x1024",
|
||||
"1024x1536",
|
||||
"1536x1024",
|
||||
"2048x2048",
|
||||
"2048x1152",
|
||||
"1152x2048",
|
||||
"3840x2160",
|
||||
"2160x3840",
|
||||
"Custom",
|
||||
],
|
||||
tooltip="Image size. Select 'Custom' to use the custom width and height.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_width",
|
||||
default=1024,
|
||||
min=1024,
|
||||
max=3840,
|
||||
step=16,
|
||||
tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_height",
|
||||
default=1024,
|
||||
min=1024,
|
||||
max=3840,
|
||||
step=16,
|
||||
tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16.",
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"background",
|
||||
default="auto",
|
||||
options=["auto", "opaque"],
|
||||
tooltip="Return image with or without background.",
|
||||
),
|
||||
*_gpt_image_shared_inputs(),
|
||||
],
|
||||
),
|
||||
IO.DynamicCombo.Option("gpt-image-1.5", _gpt_image_legacy_model_inputs()),
|
||||
IO.DynamicCombo.Option("gpt-image-1", _gpt_image_legacy_model_inputs()),
|
||||
],
|
||||
),
|
||||
IO.Int.Input(
|
||||
"n",
|
||||
default=1,
|
||||
min=1,
|
||||
max=8,
|
||||
step=1,
|
||||
tooltip="How many images to generate",
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2147483647,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
control_after_generate=True,
|
||||
tooltip="not implemented yet in backend",
|
||||
),
|
||||
],
|
||||
outputs=[IO.Image.Output()],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model", "model.quality", "n"]),
|
||||
expr="""
|
||||
(
|
||||
$ranges := {
|
||||
"gpt-image-1": {
|
||||
"low": [0.011, 0.02],
|
||||
"medium": [0.042, 0.07],
|
||||
"high": [0.167, 0.25]
|
||||
},
|
||||
"gpt-image-1.5": {
|
||||
"low": [0.009, 0.02],
|
||||
"medium": [0.034, 0.062],
|
||||
"high": [0.133, 0.22]
|
||||
},
|
||||
"gpt-image-2": {
|
||||
"low": [0.0048, 0.019],
|
||||
"medium": [0.041, 0.168],
|
||||
"high": [0.165, 0.67]
|
||||
}
|
||||
};
|
||||
$range := $lookup($lookup($ranges, widgets.model), $lookup(widgets, "model.quality"));
|
||||
$nRaw := widgets.n;
|
||||
$n := ($nRaw != null and $nRaw != 0) ? $nRaw : 1;
|
||||
($n = 1)
|
||||
? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1], "format": {"approximate": true}}
|
||||
: {
|
||||
"type":"range_usd",
|
||||
"min_usd": $range[0] * $n,
|
||||
"max_usd": $range[1] * $n,
|
||||
"format": { "suffix": "/Run", "approximate": true }
|
||||
}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
n: int,
|
||||
seed: int,
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
|
||||
model_id = model["model"]
|
||||
size = model["size"]
|
||||
background = model["background"]
|
||||
quality = model["quality"]
|
||||
custom_width = model.get("custom_width", 1024)
|
||||
custom_height = model.get("custom_height", 1024)
|
||||
|
||||
images_dict = model.get("images") or {}
|
||||
image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
|
||||
n_images = sum(get_number_of_images(t) for t in image_tensors)
|
||||
mask = model.get("mask")
|
||||
|
||||
if mask is not None and n_images == 0:
|
||||
raise ValueError("Cannot use a mask without an input image")
|
||||
|
||||
if size == "Custom":
|
||||
if custom_width % 16 != 0 or custom_height % 16 != 0:
|
||||
raise ValueError(
|
||||
f"Custom width and height must be multiples of 16, got {custom_width}x{custom_height}"
|
||||
)
|
||||
if max(custom_width, custom_height) > 3840:
|
||||
raise ValueError(
|
||||
f"Custom resolution max edge must be <= 3840, got {custom_width}x{custom_height}"
|
||||
)
|
||||
ratio = max(custom_width, custom_height) / min(custom_width, custom_height)
|
||||
if ratio > 3:
|
||||
raise ValueError(
|
||||
f"Custom resolution aspect ratio must not exceed 3:1, got {custom_width}x{custom_height}"
|
||||
)
|
||||
total_pixels = custom_width * custom_height
|
||||
if not 655_360 <= total_pixels <= 8_294_400:
|
||||
raise ValueError(
|
||||
f"Custom resolution total pixels must be between 655,360 and 8,294,400, got {total_pixels}"
|
||||
)
|
||||
size = f"{custom_width}x{custom_height}"
|
||||
|
||||
if model_id == "gpt-image-1":
|
||||
price_extractor = calculate_tokens_price_image_1
|
||||
elif model_id == "gpt-image-1.5":
|
||||
price_extractor = calculate_tokens_price_image_1_5
|
||||
elif model_id == "gpt-image-2":
|
||||
price_extractor = calculate_tokens_price_image_2_0
|
||||
else:
|
||||
raise ValueError(f"Unknown model: {model_id}")
|
||||
|
||||
if image_tensors:
|
||||
flat: list[torch.Tensor] = []
|
||||
for tensor in image_tensors:
|
||||
if len(tensor.shape) == 4:
|
||||
flat.extend(tensor[i : i + 1] for i in range(tensor.shape[0]))
|
||||
else:
|
||||
flat.append(tensor.unsqueeze(0))
|
||||
|
||||
files = []
|
||||
for i, single_image in enumerate(flat):
|
||||
scaled_image = downscale_image_tensor(single_image, total_pixels=2048 * 2048).squeeze()
|
||||
image_np = (scaled_image.numpy() * 255).astype(np.uint8)
|
||||
img = Image.fromarray(image_np)
|
||||
img_byte_arr = BytesIO()
|
||||
img.save(img_byte_arr, format="PNG")
|
||||
img_byte_arr.seek(0)
|
||||
|
||||
if len(flat) == 1:
|
||||
files.append(("image", (f"image_{i}.png", img_byte_arr, "image/png")))
|
||||
else:
|
||||
files.append(("image[]", (f"image_{i}.png", img_byte_arr, "image/png")))
|
||||
|
||||
if mask is not None:
|
||||
if len(flat) != 1:
|
||||
raise Exception("Cannot use a mask with multiple image")
|
||||
ref_image = flat[0]
|
||||
if mask.shape[1:] != ref_image.shape[1:-1]:
|
||||
raise Exception("Mask and Image must be the same size")
|
||||
_, height, width = mask.shape
|
||||
rgba_mask = torch.zeros(height, width, 4, device="cpu")
|
||||
rgba_mask[:, :, 3] = 1 - mask.squeeze().cpu()
|
||||
scaled_mask = downscale_image_tensor(
|
||||
rgba_mask.unsqueeze(0), total_pixels=2048 * 2048
|
||||
).squeeze()
|
||||
mask_np = (scaled_mask.numpy() * 255).astype(np.uint8)
|
||||
mask_img = Image.fromarray(mask_np)
|
||||
mask_img_byte_arr = BytesIO()
|
||||
mask_img.save(mask_img_byte_arr, format="PNG")
|
||||
mask_img_byte_arr.seek(0)
|
||||
files.append(("mask", ("mask.png", mask_img_byte_arr, "image/png")))
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/openai/images/edits", method="POST"),
|
||||
response_model=OpenAIImageGenerationResponse,
|
||||
data=OpenAIImageEditRequest(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
quality=quality,
|
||||
background=background,
|
||||
n=n,
|
||||
size=size,
|
||||
moderation="low",
|
||||
),
|
||||
content_type="multipart/form-data",
|
||||
files=files,
|
||||
price_extractor=price_extractor,
|
||||
)
|
||||
else:
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/openai/images/generations", method="POST"),
|
||||
response_model=OpenAIImageGenerationResponse,
|
||||
data=OpenAIImageGenerationRequest(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
quality=quality,
|
||||
background=background,
|
||||
n=n,
|
||||
size=size,
|
||||
moderation="low",
|
||||
),
|
||||
price_extractor=price_extractor,
|
||||
)
|
||||
return IO.NodeOutput(await validate_and_cast_response(response))
|
||||
|
||||
|
||||
class OpenAIChatNode(IO.ComfyNode):
|
||||
"""
|
||||
Node to generate text responses from an OpenAI model.
|
||||
@ -999,6 +1311,7 @@ class OpenAIExtension(ComfyExtension):
|
||||
OpenAIDalle2,
|
||||
OpenAIDalle3,
|
||||
OpenAIGPTImage1,
|
||||
OpenAIGPTImageNodeV2,
|
||||
OpenAIChatNode,
|
||||
OpenAIInputFiles,
|
||||
OpenAIChatConfig,
|
||||
|
||||
@ -143,7 +143,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
|
||||
if reference_images:
|
||||
references = []
|
||||
for key in reference_images:
|
||||
url = await upload_image_to_comfyapi(cls, reference_images[key])
|
||||
url = await upload_image_to_comfyapi(cls, reference_images[key], mime_type="image/png")
|
||||
references.append(QuiverImageObject(url=url))
|
||||
if len(references) > 4:
|
||||
raise ValueError("Maximum 4 reference images are allowed.")
|
||||
@ -252,7 +252,7 @@ class QuiverImageToSVGNode(IO.ComfyNode):
|
||||
model: dict,
|
||||
seed: int,
|
||||
) -> IO.NodeOutput:
|
||||
image_url = await upload_image_to_comfyapi(cls, image)
|
||||
image_url = await upload_image_to_comfyapi(cls, image, mime_type="image/png")
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
|
||||
@ -60,6 +60,7 @@ async def poll_until_finished(
|
||||
],
|
||||
status_extractor=lambda x: x.data.status,
|
||||
progress_extractor=lambda x: x.data.progress,
|
||||
price_extractor=lambda x: x.data.consumed_credit * 0.01 if x.data.consumed_credit else None,
|
||||
estimated_duration=average_duration,
|
||||
)
|
||||
if response_poll.data.status == TripoTaskStatus.SUCCESS:
|
||||
@ -113,7 +114,6 @@ class TripoTextToModelNode(IO.ComfyNode):
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=[
|
||||
"model_version",
|
||||
"style",
|
||||
"texture",
|
||||
"pbr",
|
||||
"quad",
|
||||
@ -124,20 +124,17 @@ class TripoTextToModelNode(IO.ComfyNode):
|
||||
expr="""
|
||||
(
|
||||
$isV14 := $contains(widgets.model_version,"v1.4");
|
||||
$style := widgets.style;
|
||||
$hasStyle := ($style != "" and $style != "none");
|
||||
$isV3OrLater := $contains(widgets.model_version,"v3.");
|
||||
$withTexture := widgets.texture or widgets.pbr;
|
||||
$isHdTexture := (widgets.texture_quality = "detailed");
|
||||
$isDetailedGeometry := (widgets.geometry_quality = "detailed");
|
||||
$baseCredits :=
|
||||
$isV14 ? 20 : ($withTexture ? 20 : 10);
|
||||
$credits :=
|
||||
$baseCredits
|
||||
+ ($hasStyle ? 5 : 0)
|
||||
$credits := $isV14 ? 20 : (
|
||||
($withTexture ? 20 : 10)
|
||||
+ (widgets.quad ? 5 : 0)
|
||||
+ ($isHdTexture ? 10 : 0)
|
||||
+ ($isDetailedGeometry ? 20 : 0);
|
||||
{"type":"usd","usd": $round($credits * 0.01, 2)}
|
||||
+ (($isDetailedGeometry and $isV3OrLater) ? 20 : 0)
|
||||
);
|
||||
{"type":"usd","usd": $round($credits * 0.01, 2), "format": {"approximate": true}}
|
||||
)
|
||||
""",
|
||||
),
|
||||
@ -239,7 +236,6 @@ class TripoImageToModelNode(IO.ComfyNode):
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=[
|
||||
"model_version",
|
||||
"style",
|
||||
"texture",
|
||||
"pbr",
|
||||
"quad",
|
||||
@ -250,20 +246,17 @@ class TripoImageToModelNode(IO.ComfyNode):
|
||||
expr="""
|
||||
(
|
||||
$isV14 := $contains(widgets.model_version,"v1.4");
|
||||
$style := widgets.style;
|
||||
$hasStyle := ($style != "" and $style != "none");
|
||||
$isV3OrLater := $contains(widgets.model_version,"v3.");
|
||||
$withTexture := widgets.texture or widgets.pbr;
|
||||
$isHdTexture := (widgets.texture_quality = "detailed");
|
||||
$isDetailedGeometry := (widgets.geometry_quality = "detailed");
|
||||
$baseCredits :=
|
||||
$isV14 ? 30 : ($withTexture ? 30 : 20);
|
||||
$credits :=
|
||||
$baseCredits
|
||||
+ ($hasStyle ? 5 : 0)
|
||||
$credits := $isV14 ? 30 : (
|
||||
($withTexture ? 30 : 20)
|
||||
+ (widgets.quad ? 5 : 0)
|
||||
+ ($isHdTexture ? 10 : 0)
|
||||
+ ($isDetailedGeometry ? 20 : 0);
|
||||
{"type":"usd","usd": $round($credits * 0.01, 2)}
|
||||
+ (($isDetailedGeometry and $isV3OrLater) ? 20 : 0)
|
||||
);
|
||||
{"type":"usd","usd": $round($credits * 0.01, 2), "format": {"approximate": true}}
|
||||
)
|
||||
""",
|
||||
),
|
||||
@ -358,7 +351,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
|
||||
"texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True, advanced=True
|
||||
),
|
||||
IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True, advanced=True),
|
||||
IO.Boolean.Input("quad", default=False, optional=True, advanced=True),
|
||||
IO.Boolean.Input("quad", default=False, optional=True, advanced=True, tooltip="This parameter is deprecated and does nothing."),
|
||||
IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True, advanced=True),
|
||||
],
|
||||
outputs=[
|
||||
@ -379,7 +372,6 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
|
||||
"model_version",
|
||||
"texture",
|
||||
"pbr",
|
||||
"quad",
|
||||
"texture_quality",
|
||||
"geometry_quality",
|
||||
],
|
||||
@ -387,17 +379,16 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
|
||||
expr="""
|
||||
(
|
||||
$isV14 := $contains(widgets.model_version,"v1.4");
|
||||
$isV3OrLater := $contains(widgets.model_version,"v3.");
|
||||
$withTexture := widgets.texture or widgets.pbr;
|
||||
$isHdTexture := (widgets.texture_quality = "detailed");
|
||||
$isDetailedGeometry := (widgets.geometry_quality = "detailed");
|
||||
$baseCredits :=
|
||||
$isV14 ? 30 : ($withTexture ? 30 : 20);
|
||||
$credits :=
|
||||
$baseCredits
|
||||
+ (widgets.quad ? 5 : 0)
|
||||
$credits := $isV14 ? 30 : (
|
||||
($withTexture ? 30 : 20)
|
||||
+ ($isHdTexture ? 10 : 0)
|
||||
+ ($isDetailedGeometry ? 20 : 0);
|
||||
{"type":"usd","usd": $round($credits * 0.01, 2)}
|
||||
+ (($isDetailedGeometry and $isV3OrLater) ? 20 : 0)
|
||||
);
|
||||
{"type":"usd","usd": $round($credits * 0.01, 2), "format": {"approximate": true}}
|
||||
)
|
||||
""",
|
||||
),
|
||||
@ -457,7 +448,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
|
||||
geometry_quality=geometry_quality,
|
||||
texture_alignment=texture_alignment,
|
||||
face_limit=face_limit if face_limit != -1 else None,
|
||||
quad=quad,
|
||||
quad=None,
|
||||
),
|
||||
)
|
||||
return await poll_until_finished(cls, response, average_duration=80)
|
||||
@ -498,7 +489,7 @@ class TripoTextureNode(IO.ComfyNode):
|
||||
expr="""
|
||||
(
|
||||
$tq := widgets.texture_quality;
|
||||
{"type":"usd","usd": ($contains($tq,"detailed") ? 0.2 : 0.1)}
|
||||
{"type":"usd","usd": ($contains($tq,"detailed") ? 0.2 : 0.1), "format": {"approximate": true}}
|
||||
)
|
||||
""",
|
||||
),
|
||||
@ -555,7 +546,7 @@ class TripoRefineNode(IO.ComfyNode):
|
||||
is_api_node=True,
|
||||
is_output_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
expr="""{"type":"usd","usd":0.3}""",
|
||||
expr="""{"type":"usd","usd":0.3, "format": {"approximate": true}}""",
|
||||
),
|
||||
)
|
||||
|
||||
@ -592,7 +583,7 @@ class TripoRigNode(IO.ComfyNode):
|
||||
is_api_node=True,
|
||||
is_output_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
expr="""{"type":"usd","usd":0.25}""",
|
||||
expr="""{"type":"usd","usd":0.25, "format": {"approximate": true}}""",
|
||||
),
|
||||
)
|
||||
|
||||
@ -652,7 +643,7 @@ class TripoRetargetNode(IO.ComfyNode):
|
||||
is_api_node=True,
|
||||
is_output_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
expr="""{"type":"usd","usd":0.1}""",
|
||||
expr="""{"type":"usd","usd":0.1, "format": {"approximate": true}}""",
|
||||
),
|
||||
)
|
||||
|
||||
@ -761,19 +752,10 @@ class TripoConversionNode(IO.ComfyNode):
|
||||
"face_limit",
|
||||
"texture_size",
|
||||
"texture_format",
|
||||
"force_symmetry",
|
||||
"flatten_bottom",
|
||||
"flatten_bottom_threshold",
|
||||
"pivot_to_center_bottom",
|
||||
"scale_factor",
|
||||
"with_animation",
|
||||
"pack_uv",
|
||||
"bake",
|
||||
"part_names",
|
||||
"fbx_preset",
|
||||
"export_vertex_colors",
|
||||
"export_orientation",
|
||||
"animate_in_place",
|
||||
],
|
||||
),
|
||||
expr="""
|
||||
@ -783,28 +765,16 @@ class TripoConversionNode(IO.ComfyNode):
|
||||
$flatThresh := (widgets.flatten_bottom_threshold != null) ? widgets.flatten_bottom_threshold : 0;
|
||||
$scale := (widgets.scale_factor != null) ? widgets.scale_factor : 1;
|
||||
$texFmt := (widgets.texture_format != "" ? widgets.texture_format : "jpeg");
|
||||
$part := widgets.part_names;
|
||||
$fbx := (widgets.fbx_preset != "" ? widgets.fbx_preset : "blender");
|
||||
$orient := (widgets.export_orientation != "" ? widgets.export_orientation : "default");
|
||||
$advanced :=
|
||||
widgets.quad or
|
||||
widgets.force_symmetry or
|
||||
widgets.flatten_bottom or
|
||||
widgets.pivot_to_center_bottom or
|
||||
widgets.with_animation or
|
||||
widgets.pack_uv or
|
||||
widgets.bake or
|
||||
widgets.export_vertex_colors or
|
||||
widgets.animate_in_place or
|
||||
($face != -1) or
|
||||
($texSize != 4096) or
|
||||
($flatThresh != 0) or
|
||||
($scale != 1) or
|
||||
($texFmt != "jpeg") or
|
||||
($part != "") or
|
||||
($fbx != "blender") or
|
||||
($orient != "default");
|
||||
{"type":"usd","usd": ($advanced ? 0.1 : 0.05)}
|
||||
($texFmt != "jpeg");
|
||||
{"type":"usd","usd": ($advanced ? 0.1 : 0.05), "format": {"approximate": true}}
|
||||
)
|
||||
""",
|
||||
),
|
||||
|
||||
@ -488,10 +488,30 @@ async def _diagnose_connectivity() -> dict[str, bool]:
|
||||
"api_accessible": False,
|
||||
}
|
||||
timeout = aiohttp.ClientTimeout(total=5.0)
|
||||
|
||||
# Probe Google and Baidu in parallel: Google is blocked by the GFW in mainland China, so a Baidu probe is required
|
||||
# to correctly detect that Chinese users with working internet do have working internet.
|
||||
internet_probe_urls = ("https://www.google.com", "https://www.baidu.com")
|
||||
|
||||
async with aiohttp.ClientSession(timeout=timeout) as session:
|
||||
with contextlib.suppress(ClientError, OSError):
|
||||
async with session.get("https://www.google.com") as resp:
|
||||
results["internet_accessible"] = resp.status < 500
|
||||
async def _probe(url: str) -> bool:
|
||||
try:
|
||||
async with session.get(url) as resp:
|
||||
return resp.status < 500
|
||||
except (ClientError, OSError, asyncio.TimeoutError):
|
||||
return False
|
||||
|
||||
probe_tasks = [asyncio.create_task(_probe(u)) for u in internet_probe_urls]
|
||||
try:
|
||||
for fut in asyncio.as_completed(probe_tasks):
|
||||
if await fut:
|
||||
results["internet_accessible"] = True
|
||||
break
|
||||
finally:
|
||||
for t in probe_tasks:
|
||||
if not t.done():
|
||||
t.cancel()
|
||||
await asyncio.gather(*probe_tasks, return_exceptions=True)
|
||||
if not results["internet_accessible"]:
|
||||
return results
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user