mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-06-04 21:37:40 +08:00
Merge remote-tracking branch 'upstream/master' into moge
This commit is contained in:
commit
6e1fa0d1df
@ -89,3 +89,12 @@ rules:
|
||||
then:
|
||||
field: description
|
||||
function: truthy
|
||||
|
||||
overrides:
|
||||
# /ws uses HTTP 101 (Switching Protocols) — a legitimate response for a
|
||||
# WebSocket upgrade, but not a 2xx, so operation-success-response fires
|
||||
# as a false positive. OpenAPI 3.x has no native WebSocket support.
|
||||
- files:
|
||||
- "openapi.yaml#/paths/~1ws"
|
||||
rules:
|
||||
operation-success-response: off
|
||||
|
||||
1412
blueprints/ControlNet (Z-Image-Turbo).json
Normal file
1412
blueprints/ControlNet (Z-Image-Turbo).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -4234,7 +4234,7 @@
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Depth to video",
|
||||
"description": "Generates video from depth maps using LTX-2, with optional synchronized audio."
|
||||
"description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio."
|
||||
},
|
||||
{
|
||||
"id": "38b60539-50a7-42f9-a5fe-bdeca26272e2",
|
||||
|
||||
3361
blueprints/First-Last-Frame to Video.json
Normal file
3361
blueprints/First-Last-Frame to Video.json
Normal file
File diff suppressed because it is too large
Load Diff
858
blueprints/Frame Interpolation.json
Normal file
858
blueprints/Frame Interpolation.json
Normal file
@ -0,0 +1,858 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 16,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 16,
|
||||
"type": "022693be-2baa-4009-870a-28921508a7ef",
|
||||
"pos": [
|
||||
-2990,
|
||||
-3240
|
||||
],
|
||||
"size": [
|
||||
410,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "multiplier",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "enable_fps_multiplier",
|
||||
"name": "value_1",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "value_1"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"label": "VIDEO",
|
||||
"name": "VIDEO_1",
|
||||
"type": "VIDEO",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"9",
|
||||
"value"
|
||||
],
|
||||
[
|
||||
"13",
|
||||
"value"
|
||||
],
|
||||
[
|
||||
"1",
|
||||
"model_name"
|
||||
]
|
||||
],
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Frame Interpolation"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "022693be-2baa-4009-870a-28921508a7ef",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 17,
|
||||
"lastLinkId": 28,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Frame Interpolation",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-2810,
|
||||
-3070,
|
||||
159.7421875,
|
||||
120
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-1270,
|
||||
-3075,
|
||||
120,
|
||||
80
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "05e31c51-dcb6-4a1e-9651-1b9ad4f7a287",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
2
|
||||
],
|
||||
"localized_name": "video",
|
||||
"pos": [
|
||||
-2670.2578125,
|
||||
-3050
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "feecb409-7d1c-4a99-9c63-50c5fecdd3c9",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
22
|
||||
],
|
||||
"label": "multiplier",
|
||||
"pos": [
|
||||
-2670.2578125,
|
||||
-3030
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "0b8a861b-b581-4068-9e8c-f8d15daf1ca6",
|
||||
"name": "value_1",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
23
|
||||
],
|
||||
"label": "enable_fps_multiplier",
|
||||
"pos": [
|
||||
-2670.2578125,
|
||||
-3010
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "a22b101e-8773-4e17-a297-7ee3aae09162",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
24
|
||||
],
|
||||
"pos": [
|
||||
-2670.2578125,
|
||||
-2990
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "ef2ada05-d5aa-492a-9394-6c3e71e39ebb",
|
||||
"name": "VIDEO_1",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
26
|
||||
],
|
||||
"label": "VIDEO",
|
||||
"pos": [
|
||||
-1250,
|
||||
-3055
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "5aacc622-2a07-4983-b31c-e04461f7f953",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
28
|
||||
],
|
||||
"pos": [
|
||||
-1250,
|
||||
-3035
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "FrameInterpolationModelLoader",
|
||||
"pos": [
|
||||
-2510,
|
||||
-3370
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
90
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model_name",
|
||||
"name": "model_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "model_name"
|
||||
},
|
||||
"link": 24
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INTERP_MODEL",
|
||||
"name": "INTERP_MODEL",
|
||||
"type": "INTERP_MODEL",
|
||||
"links": [
|
||||
1
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "FrameInterpolationModelLoader",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"models": [
|
||||
{
|
||||
"name": "film_net_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/frame_interpolation/resolve/main/frame_interpolation/film_net_fp16.safetensors",
|
||||
"directory": "frame_interpolation"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"film_net_fp16.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "FrameInterpolate",
|
||||
"pos": [
|
||||
-2040,
|
||||
-3370
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "interp_model",
|
||||
"name": "interp_model",
|
||||
"type": "INTERP_MODEL",
|
||||
"link": 1
|
||||
},
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 3
|
||||
},
|
||||
{
|
||||
"localized_name": "multiplier",
|
||||
"name": "multiplier",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "multiplier"
|
||||
},
|
||||
"link": 8
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
4,
|
||||
28
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "FrameInterpolate",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "CreateVideo",
|
||||
"pos": [
|
||||
-1600,
|
||||
-3370
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 4
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 5
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "fps"
|
||||
},
|
||||
"link": 12
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": [
|
||||
26
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CreateVideo",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
30
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
-2500,
|
||||
-2970
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
90
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 22
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
8,
|
||||
19
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Int (Multiplier)",
|
||||
"properties": {
|
||||
"Node name for S&R": "PrimitiveInt",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
2,
|
||||
"fixed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"type": "ComfySwitchNode",
|
||||
"pos": [
|
||||
-1610,
|
||||
-3120
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
130
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "on_false",
|
||||
"name": "on_false",
|
||||
"type": "*",
|
||||
"link": 11
|
||||
},
|
||||
{
|
||||
"localized_name": "on_true",
|
||||
"name": "on_true",
|
||||
"type": "*",
|
||||
"link": 13
|
||||
},
|
||||
{
|
||||
"localized_name": "switch",
|
||||
"name": "switch",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "switch"
|
||||
},
|
||||
"link": 15
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "output",
|
||||
"name": "output",
|
||||
"type": "*",
|
||||
"links": [
|
||||
12
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfySwitchNode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
true
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"type": "PrimitiveBoolean",
|
||||
"pos": [
|
||||
-2500,
|
||||
-2770
|
||||
],
|
||||
"size": [
|
||||
310,
|
||||
90
|
||||
],
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 23
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "BOOLEAN",
|
||||
"name": "BOOLEAN",
|
||||
"type": "BOOLEAN",
|
||||
"links": [
|
||||
15
|
||||
]
|
||||
}
|
||||
],
|
||||
"title": "Boolean (Apply multiplier to FPS?)",
|
||||
"properties": {
|
||||
"Node name for S&R": "PrimitiveBoolean",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
true
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
-2500,
|
||||
-3170
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
100
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 2
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
5
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
11,
|
||||
18
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-2090,
|
||||
-3070
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
210
|
||||
],
|
||||
"flags": {
|
||||
"collapsed": false
|
||||
},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT",
|
||||
"link": 18
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": 19
|
||||
},
|
||||
{
|
||||
"label": "c",
|
||||
"localized_name": "values.c",
|
||||
"name": "values.c",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3"
|
||||
},
|
||||
"widgets_values": [
|
||||
"min(abs(b), 16) * a"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "INTERP_MODEL"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"origin_id": 9,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": 5,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 1,
|
||||
"target_id": 5,
|
||||
"target_slot": 1,
|
||||
"type": "AUDIO"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"origin_id": 10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 5,
|
||||
"target_slot": 2,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 2,
|
||||
"target_id": 10,
|
||||
"target_slot": 0,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"origin_id": 11,
|
||||
"origin_slot": 0,
|
||||
"target_id": 10,
|
||||
"target_slot": 1,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"origin_id": 13,
|
||||
"origin_slot": 0,
|
||||
"target_id": 10,
|
||||
"target_slot": 2,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 2,
|
||||
"target_id": 11,
|
||||
"target_slot": 0,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"origin_id": 9,
|
||||
"origin_slot": 0,
|
||||
"target_id": 11,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 3,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 9,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 13,
|
||||
"target_slot": 0,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 1,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"origin_id": 5,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video Tools",
|
||||
"description": "Increases video frame rate by synthesizing intermediate frames with a frame interpolation model."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
485
blueprints/Get Any Video Frame.json
Normal file
485
blueprints/Get Any Video Frame.json
Normal file
@ -0,0 +1,485 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 98,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 98,
|
||||
"type": "dca6e78d-fb06-421e-97f7-6ce17a665260",
|
||||
"pos": [
|
||||
-410,
|
||||
-2230
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
104
|
||||
],
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "frame_index",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Get Any Video Frame",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"100",
|
||||
"value"
|
||||
]
|
||||
]
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "dca6e78d-fb06-421e-97f7-6ce17a665260",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 1,
|
||||
"lastNodeId": 136,
|
||||
"lastLinkId": 302,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Get Any Video Frame",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
380,
|
||||
-57,
|
||||
120,
|
||||
80
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
1460,
|
||||
-57,
|
||||
120,
|
||||
60
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "2ceec378-8dcf-4340-8570-155967f59a93",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
4
|
||||
],
|
||||
"pos": [
|
||||
480,
|
||||
-37
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "819955f6-c686-4896-8032-ff2d0059109a",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
283
|
||||
],
|
||||
"label": "frame_index",
|
||||
"pos": [
|
||||
480,
|
||||
-17
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "1ab0684d-6a44-45b6-8aa4-a0b971a1d41e",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
5
|
||||
],
|
||||
"pos": [
|
||||
1480,
|
||||
-37
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
560,
|
||||
-150
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 4
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
1,
|
||||
2
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "GetImageSize",
|
||||
"pos": [
|
||||
560,
|
||||
50
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 1
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "width",
|
||||
"name": "width",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "height",
|
||||
"name": "height",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_size",
|
||||
"name": "batch_size",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
285
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSize"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "ImageFromBatch",
|
||||
"pos": [
|
||||
1130,
|
||||
-150
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 2
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_index",
|
||||
"name": "batch_index",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "batch_index"
|
||||
},
|
||||
"link": 286
|
||||
},
|
||||
{
|
||||
"localized_name": "length",
|
||||
"name": "length",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "length"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
5
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ImageFromBatch"
|
||||
},
|
||||
"widgets_values": [
|
||||
0,
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 99,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
910,
|
||||
100
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT",
|
||||
"link": 284
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": 285
|
||||
},
|
||||
{
|
||||
"label": "c",
|
||||
"localized_name": "values.c",
|
||||
"name": "values.c",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
286
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression"
|
||||
},
|
||||
"widgets_values": [
|
||||
"min(max(int(a if a >= 0 else b + a), 0), b - 1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 100,
|
||||
"type": "PrimitiveInt",
|
||||
"pos": [
|
||||
560,
|
||||
250
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "value",
|
||||
"name": "value",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "value"
|
||||
},
|
||||
"link": 283
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
284
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "PrimitiveInt"
|
||||
},
|
||||
"widgets_values": [
|
||||
0,
|
||||
"fixed"
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 1,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 2,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"origin_id": 1,
|
||||
"origin_slot": 0,
|
||||
"target_id": 3,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 1,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"origin_id": 3,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 283,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 100,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 284,
|
||||
"origin_id": 100,
|
||||
"origin_slot": 0,
|
||||
"target_id": 99,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 285,
|
||||
"origin_id": 2,
|
||||
"origin_slot": 2,
|
||||
"target_id": 99,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 286,
|
||||
"origin_id": 99,
|
||||
"origin_slot": 1,
|
||||
"target_id": 3,
|
||||
"target_slot": 1,
|
||||
"type": "INT"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video Tools",
|
||||
"description": "Extracts one image frame from a video at a chosen index, with optional trim and FPS control."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 1.197015527856339,
|
||||
"offset": [
|
||||
-168.76833554248222,
|
||||
540.6638955283997
|
||||
]
|
||||
},
|
||||
"frontendVersion": "1.42.8"
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
2050
blueprints/Image Edit (Flux.2 Dev).json
Normal file
2050
blueprints/Image Edit (Flux.2 Dev).json
Normal file
File diff suppressed because it is too large
Load Diff
1947
blueprints/Image Edit (Qwen 2509).json
Normal file
1947
blueprints/Image Edit (Qwen 2509).json
Normal file
File diff suppressed because it is too large
Load Diff
714
blueprints/Image Segmentation (SAM3).json
Normal file
714
blueprints/Image Segmentation (SAM3).json
Normal file
@ -0,0 +1,714 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 99,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 99,
|
||||
"type": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
|
||||
"pos": [
|
||||
-1630,
|
||||
-3270
|
||||
],
|
||||
"size": [
|
||||
290,
|
||||
370
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"label": "object",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "positive_coords",
|
||||
"type": "STRING",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "negative_coords",
|
||||
"type": "STRING",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "threshold"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"78",
|
||||
"text"
|
||||
],
|
||||
[
|
||||
"75",
|
||||
"threshold"
|
||||
],
|
||||
[
|
||||
"75",
|
||||
"refine_iterations"
|
||||
],
|
||||
[
|
||||
"75",
|
||||
"individual_masks"
|
||||
],
|
||||
[
|
||||
"77",
|
||||
"ckpt_name"
|
||||
]
|
||||
],
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {
|
||||
"text": true
|
||||
},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Image Segmentation (SAM3)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 113,
|
||||
"lastLinkId": 283,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Image Segmentation (SAM3)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-2260,
|
||||
-3450,
|
||||
136.369140625,
|
||||
220
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-1130,
|
||||
-3305,
|
||||
120,
|
||||
80
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
264
|
||||
],
|
||||
"localized_name": "image",
|
||||
"label": "image",
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3430
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
265
|
||||
],
|
||||
"label": "object",
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3410
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
266
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3390
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899",
|
||||
"name": "positive_coords",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
267
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3370
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "c65f8b87-9bd7-48be-9fc2-823431e95019",
|
||||
"name": "negative_coords",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
268
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3350
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb",
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
269
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3330
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "b1439668-b050-490b-a5dc-fc4052c55666",
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
270
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3310
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "86e239e5-c098-4302-b54d-d42a38bc0f89",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
271
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3290
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "f9e0b9d4-b2f1-4907-a4a5-305656576706",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
272
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3270
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"linkIds": [
|
||||
231
|
||||
],
|
||||
"localized_name": "masks",
|
||||
"pos": [
|
||||
-1110,
|
||||
-3285
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "8f622e40-8528-4078-b7d3-147e9f872194",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
232
|
||||
],
|
||||
"localized_name": "bboxes",
|
||||
"pos": [
|
||||
-1110,
|
||||
-3265
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 75,
|
||||
"type": "SAM3_Detect",
|
||||
"pos": [
|
||||
-1470,
|
||||
-3460
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
260
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "model",
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 237
|
||||
},
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 264
|
||||
},
|
||||
{
|
||||
"label": "conditioning",
|
||||
"localized_name": "conditioning",
|
||||
"name": "conditioning",
|
||||
"shape": 7,
|
||||
"type": "CONDITIONING",
|
||||
"link": 200
|
||||
},
|
||||
{
|
||||
"label": "bboxes",
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"shape": 7,
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": 266
|
||||
},
|
||||
{
|
||||
"label": "positive_coords",
|
||||
"localized_name": "positive_coords",
|
||||
"name": "positive_coords",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"link": 267
|
||||
},
|
||||
{
|
||||
"label": "negative_coords",
|
||||
"localized_name": "negative_coords",
|
||||
"name": "negative_coords",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"link": 268
|
||||
},
|
||||
{
|
||||
"localized_name": "threshold",
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "threshold"
|
||||
},
|
||||
"link": 269
|
||||
},
|
||||
{
|
||||
"localized_name": "refine_iterations",
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": 270
|
||||
},
|
||||
{
|
||||
"localized_name": "individual_masks",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": 271
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
231
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": [
|
||||
232
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "SAM3_Detect",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
0.5,
|
||||
2,
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 77,
|
||||
"type": "CheckpointLoaderSimple",
|
||||
"pos": [
|
||||
-1970,
|
||||
-3200
|
||||
],
|
||||
"size": [
|
||||
330,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "ckpt_name",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": 272
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"links": [
|
||||
237
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "CLIP",
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
240
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "VAE",
|
||||
"name": "VAE",
|
||||
"type": "VAE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "CheckpointLoaderSimple",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"models": [
|
||||
{
|
||||
"name": "sam3.1_multiplex_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
|
||||
"directory": "checkpoints"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"sam3.1_multiplex_fp16.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 78,
|
||||
"type": "CLIPTextEncode",
|
||||
"pos": [
|
||||
-2000,
|
||||
-3000
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "clip",
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 240
|
||||
},
|
||||
{
|
||||
"localized_name": "text",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": 265
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CONDITIONING",
|
||||
"name": "CONDITIONING",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
200
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"version": "7.7",
|
||||
"input_ue_unconnectable": {}
|
||||
},
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
""
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 237,
|
||||
"origin_id": 77,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 0,
|
||||
"type": "MODEL"
|
||||
},
|
||||
{
|
||||
"id": 200,
|
||||
"origin_id": 78,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 2,
|
||||
"type": "CONDITIONING"
|
||||
},
|
||||
{
|
||||
"id": 240,
|
||||
"origin_id": 77,
|
||||
"origin_slot": 1,
|
||||
"target_id": 78,
|
||||
"target_slot": 0,
|
||||
"type": "CLIP"
|
||||
},
|
||||
{
|
||||
"id": 231,
|
||||
"origin_id": 75,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 232,
|
||||
"origin_id": 75,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 264,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 75,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 265,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 78,
|
||||
"target_slot": 1,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 266,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 75,
|
||||
"target_slot": 3,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 267,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 75,
|
||||
"target_slot": 4,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 268,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 75,
|
||||
"target_slot": 5,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 269,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 75,
|
||||
"target_slot": 6,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 270,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 75,
|
||||
"target_slot": 7,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 271,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 75,
|
||||
"target_slot": 8,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 272,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 77,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image Tools/Image Segmentation",
|
||||
"description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
@ -2028,7 +2028,7 @@
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Video generation and editing/Image to video",
|
||||
"description": "Generates video from an image and text prompt using Wan 2.2, supporting T2V and I2V."
|
||||
"description": "Image-to-video with Wan 2.2 using a start image plus text prompt to extend motion from the still frame."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
397
blueprints/Remove Background (BiRefNet).json
Normal file
397
blueprints/Remove Background (BiRefNet).json
Normal file
@ -0,0 +1,397 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 19,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 19,
|
||||
"type": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
|
||||
"pos": [
|
||||
-6411.330578108367,
|
||||
1940.2638932730042
|
||||
],
|
||||
"size": [
|
||||
349.609375,
|
||||
145.9375
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "bg_removal_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "bg_removal_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "mask",
|
||||
"type": "MASK",
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"14",
|
||||
"bg_removal_name"
|
||||
]
|
||||
]
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Remove Background (BiRefNet)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 21,
|
||||
"lastLinkId": 16,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Remove Background (BiRefNet)",
|
||||
"description": "Removes or replaces image backgrounds using BiRefNet segmentation and alpha compositing.",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-6728.534070722246,
|
||||
1475.2619799128663,
|
||||
150.9140625,
|
||||
88
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-6169.049695722246,
|
||||
1475.2619799128663,
|
||||
128,
|
||||
88
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "7bc321cd-df31-4c39-aaf7-7f0d01326189",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
5,
|
||||
7
|
||||
],
|
||||
"localized_name": "image",
|
||||
"pos": [
|
||||
-6601.620008222246,
|
||||
1499.2619799128663
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "e89d2cd8-daa3-4e29-8a69-851db85072cb",
|
||||
"name": "bg_removal_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
12
|
||||
],
|
||||
"pos": [
|
||||
-6601.620008222246,
|
||||
1519.2619799128663
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "16e7863c-4c38-46c2-aa74-e82991fbfe8d",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"linkIds": [
|
||||
8
|
||||
],
|
||||
"localized_name": "IMAGE",
|
||||
"pos": [
|
||||
-6145.049695722246,
|
||||
1499.2619799128663
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "f7240c19-5b80-406e-a8e2-9b12440ee2d6",
|
||||
"name": "mask",
|
||||
"type": "MASK",
|
||||
"linkIds": [
|
||||
11
|
||||
],
|
||||
"pos": [
|
||||
-6145.049695722246,
|
||||
1519.2619799128663
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 13,
|
||||
"type": "RemoveBackground",
|
||||
"pos": [
|
||||
-6536.764823982709,
|
||||
1444.9963409012412
|
||||
],
|
||||
"size": [
|
||||
302.25,
|
||||
72
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 5
|
||||
},
|
||||
{
|
||||
"localized_name": "bg_removal_model",
|
||||
"name": "bg_removal_model",
|
||||
"type": "BACKGROUND_REMOVAL",
|
||||
"link": 3
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "mask",
|
||||
"name": "mask",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
4,
|
||||
11
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "RemoveBackground"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"type": "LoadBackgroundRemovalModel",
|
||||
"pos": [
|
||||
-6540.534070722246,
|
||||
1302.223464635445
|
||||
],
|
||||
"size": [
|
||||
311.484375,
|
||||
85.515625
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "bg_removal_name",
|
||||
"name": "bg_removal_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "bg_removal_name"
|
||||
},
|
||||
"link": 12
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "bg_model",
|
||||
"name": "bg_model",
|
||||
"type": "BACKGROUND_REMOVAL",
|
||||
"links": [
|
||||
3
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LoadBackgroundRemovalModel",
|
||||
"models": [
|
||||
{
|
||||
"name": "birefnet.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/BiRefNet/resolve/main/background_removal/birefnet.safetensors",
|
||||
"directory": "background_removal"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"birefnet.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"type": "InvertMask",
|
||||
"pos": [
|
||||
-6532.446160529669,
|
||||
1571.1111286839914
|
||||
],
|
||||
"size": [
|
||||
285.984375,
|
||||
48
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "mask",
|
||||
"name": "mask",
|
||||
"type": "MASK",
|
||||
"link": 4
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MASK",
|
||||
"name": "MASK",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
6
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "InvertMask"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"type": "JoinImageWithAlpha",
|
||||
"pos": [
|
||||
-6527.4370171636665,
|
||||
1674.3004951902876
|
||||
],
|
||||
"size": [
|
||||
284.96875,
|
||||
72
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 7
|
||||
},
|
||||
{
|
||||
"localized_name": "alpha",
|
||||
"name": "alpha",
|
||||
"type": "MASK",
|
||||
"link": 6
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "IMAGE",
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
8
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "JoinImageWithAlpha"
|
||||
}
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 3,
|
||||
"origin_id": 14,
|
||||
"origin_slot": 0,
|
||||
"target_id": 13,
|
||||
"target_slot": 1,
|
||||
"type": "BACKGROUND_REMOVAL"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"origin_id": 13,
|
||||
"origin_slot": 0,
|
||||
"target_id": 15,
|
||||
"target_slot": 0,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"origin_id": 15,
|
||||
"origin_slot": 0,
|
||||
"target_id": 16,
|
||||
"target_slot": 1,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 13,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 16,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"origin_id": 16,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"origin_id": 13,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 14,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Image generation and editing/Background Removal"
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
2112
blueprints/Text to Image (Ernie Image Turbo).json
Normal file
2112
blueprints/Text to Image (Ernie Image Turbo).json
Normal file
File diff suppressed because it is too large
Load Diff
2190
blueprints/Text to Image (Ernie Image).json
Normal file
2190
blueprints/Text to Image (Ernie Image).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1030,7 +1030,7 @@
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "Generates images from text prompts using Flux.1 [dev], Black Forest Labs' 12B diffusion model."
|
||||
"description": "Generates images from prompts using FLUX.1 [dev]: a 12B rectified-flow MMDiT with dual CLIP plus T5-XXL text encoders and guidance-distilled sampling for sharp prompt following versus classic DDPM diffusion."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
@ -1024,7 +1024,7 @@
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"category": "Image generation and editing/Text to image",
|
||||
"description": "Generates images from text prompts using Flux.1 Krea Dev, a Black Forest Labs × Krea collaboration variant."
|
||||
"description": "FLUX.1 Krea [dev] (Black Forest Labs × Krea): open-weight 12B rectified-flow text-to-image drop-in alongside FLUX.1 [dev], tuned away from overcooked saturation toward more natural diversity in people, realism, and style while keeping ecosystem compatibility."
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
1870
blueprints/Text to Image (Flux.2 Dev).json
Normal file
1870
blueprints/Text to Image (Flux.2 Dev).json
Normal file
File diff suppressed because it is too large
Load Diff
1184
blueprints/Text to Image (Z-Image-Base).json
Normal file
1184
blueprints/Text to Image (Z-Image-Base).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,22 +1,21 @@
|
||||
{
|
||||
"id": "1c3eaa76-5cfa-4dc7-8571-97a570324e01",
|
||||
"revision": 0,
|
||||
"last_node_id": 34,
|
||||
"last_link_id": 40,
|
||||
"last_node_id": 57,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 5,
|
||||
"type": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
|
||||
"id": 57,
|
||||
"type": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
|
||||
"pos": [
|
||||
-2.5766491043910378e-05,
|
||||
1229.999928629805
|
||||
130,
|
||||
200
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
470
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -44,6 +43,22 @@
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "seed",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "seed"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "steps",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "steps"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "unet_name",
|
||||
"type": "COMBO",
|
||||
@ -80,15 +95,15 @@
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"27",
|
||||
"text"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"13",
|
||||
"width"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"13",
|
||||
"height"
|
||||
],
|
||||
[
|
||||
@ -97,19 +112,23 @@
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"control_after_generate"
|
||||
"steps"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"28",
|
||||
"unet_name"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"30",
|
||||
"clip_name"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"29",
|
||||
"vae_name"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"control_after_generate"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
@ -122,29 +141,21 @@
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
"",
|
||||
1024,
|
||||
1024,
|
||||
null,
|
||||
null,
|
||||
"z_image_turbo_bf16.safetensors",
|
||||
"qwen_3_4b.safetensors",
|
||||
"ae.safetensors"
|
||||
]
|
||||
"widgets_values": [],
|
||||
"title": "Text to Image (Z-Image-Turbo)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"groups": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
|
||||
"id": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 4,
|
||||
"lastNodeId": 34,
|
||||
"lastLinkId": 40,
|
||||
"lastNodeId": 61,
|
||||
"lastLinkId": 75,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
@ -153,17 +164,17 @@
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-80,
|
||||
425,
|
||||
-560,
|
||||
480,
|
||||
120,
|
||||
160
|
||||
200
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
1490,
|
||||
415,
|
||||
1670,
|
||||
320,
|
||||
120,
|
||||
60
|
||||
]
|
||||
@ -178,8 +189,8 @@
|
||||
],
|
||||
"label": "prompt",
|
||||
"pos": [
|
||||
20,
|
||||
445
|
||||
-460,
|
||||
500
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -190,8 +201,8 @@
|
||||
35
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
465
|
||||
-460,
|
||||
520
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -202,44 +213,68 @@
|
||||
36
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
485
|
||||
-460,
|
||||
540
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "23087d15-8412-4fbd-b71e-9b6d7ef76de1",
|
||||
"id": "f77677f7-6bf6-4c19-a71f-c4a553d5981e",
|
||||
"name": "seed",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
71
|
||||
],
|
||||
"pos": [
|
||||
-460,
|
||||
560
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ef9a9fb1-5983-4bc9-a60b-cf5aec48bff1",
|
||||
"name": "steps",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
72
|
||||
],
|
||||
"pos": [
|
||||
-460,
|
||||
580
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "a20a1b30-785f-4a04-bb6d-3d61adab9764",
|
||||
"name": "unet_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
38
|
||||
73
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
505
|
||||
-460,
|
||||
600
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "0677f5c3-2a3f-43d4-98ac-a4c56d5efdc0",
|
||||
"id": "4af8fc2b-4655-4086-8240-45f8cb38c6f6",
|
||||
"name": "clip_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
39
|
||||
74
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
525
|
||||
-460,
|
||||
620
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "c85c0445-2641-48b1-bbca-95057edf2fcf",
|
||||
"id": "4d518693-2807-439c-9cb6-cffd23ccba2c",
|
||||
"name": "vae_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
40
|
||||
75
|
||||
],
|
||||
"pos": [
|
||||
20,
|
||||
545
|
||||
-460,
|
||||
640
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -253,8 +288,8 @@
|
||||
],
|
||||
"localized_name": "IMAGE",
|
||||
"pos": [
|
||||
1510,
|
||||
435
|
||||
1690,
|
||||
340
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -264,15 +299,15 @@
|
||||
"id": 30,
|
||||
"type": "CLIPLoader",
|
||||
"pos": [
|
||||
109.99997264844609,
|
||||
329.99999029608756
|
||||
30,
|
||||
420
|
||||
],
|
||||
"size": [
|
||||
269.9869791666667,
|
||||
106
|
||||
270,
|
||||
150
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -282,7 +317,7 @@
|
||||
"widget": {
|
||||
"name": "clip_name"
|
||||
},
|
||||
"link": 39
|
||||
"link": 74
|
||||
},
|
||||
{
|
||||
"localized_name": "type",
|
||||
@ -315,9 +350,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPLoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "CLIPLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "qwen_3_4b.safetensors",
|
||||
@ -343,15 +378,15 @@
|
||||
"id": 29,
|
||||
"type": "VAELoader",
|
||||
"pos": [
|
||||
109.99997264844609,
|
||||
479.9999847172637
|
||||
30,
|
||||
650
|
||||
],
|
||||
"size": [
|
||||
269.9869791666667,
|
||||
58
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -361,7 +396,7 @@
|
||||
"widget": {
|
||||
"name": "vae_name"
|
||||
},
|
||||
"link": 40
|
||||
"link": 75
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -375,9 +410,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAELoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "VAELoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "ae.safetensors",
|
||||
@ -401,12 +436,12 @@
|
||||
"id": 33,
|
||||
"type": "ConditioningZeroOut",
|
||||
"pos": [
|
||||
639.9999103333332,
|
||||
620.0000271257795
|
||||
630,
|
||||
960
|
||||
],
|
||||
"size": [
|
||||
204.134765625,
|
||||
26
|
||||
230,
|
||||
80
|
||||
],
|
||||
"flags": {},
|
||||
"order": 8,
|
||||
@ -430,9 +465,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ConditioningZeroOut",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "ConditioningZeroOut",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -440,22 +475,21 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "VAEDecode",
|
||||
"pos": [
|
||||
1219.9999088104782,
|
||||
160.00009184959066
|
||||
1320,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
209.98697916666669,
|
||||
46
|
||||
230,
|
||||
100
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -483,9 +517,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAEDecode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"Node name for S&R": "VAEDecode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -493,22 +527,21 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 28,
|
||||
"type": "UNETLoader",
|
||||
"pos": [
|
||||
109.99997264844609,
|
||||
200.0000502647102
|
||||
30,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
269.9869791666667,
|
||||
82
|
||||
270,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -518,7 +551,7 @@
|
||||
"widget": {
|
||||
"name": "unet_name"
|
||||
},
|
||||
"link": 38
|
||||
"link": 73
|
||||
},
|
||||
{
|
||||
"localized_name": "weight_dtype",
|
||||
@ -541,9 +574,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "UNETLoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "UNETLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "z_image_turbo_bf16.safetensors",
|
||||
@ -568,15 +601,15 @@
|
||||
"id": 27,
|
||||
"type": "CLIPTextEncode",
|
||||
"pos": [
|
||||
429.99997828947767,
|
||||
200.0000502647102
|
||||
400,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
409.9869791666667,
|
||||
319.9869791666667
|
||||
450,
|
||||
650
|
||||
],
|
||||
"flags": {},
|
||||
"order": 7,
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -607,9 +640,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.73",
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -626,15 +659,15 @@
|
||||
"id": 13,
|
||||
"type": "EmptySD3LatentImage",
|
||||
"pos": [
|
||||
109.99997264844609,
|
||||
629.9999791384399
|
||||
40,
|
||||
890
|
||||
],
|
||||
"size": [
|
||||
259.9869791666667,
|
||||
106
|
||||
260,
|
||||
170
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -677,9 +710,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "EmptySD3LatentImage",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"Node name for S&R": "EmptySD3LatentImage",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -694,19 +727,77 @@
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "ModelSamplingAuraFlow",
|
||||
"pos": [
|
||||
950,
|
||||
230
|
||||
],
|
||||
"size": [
|
||||
310,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 26
|
||||
},
|
||||
{
|
||||
"localized_name": "shift",
|
||||
"name": "shift",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "shift"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"slot_index": 0,
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ModelSamplingAuraFlow",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
3
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "KSampler",
|
||||
"pos": [
|
||||
879.9999615530063,
|
||||
269.9999774911694
|
||||
950,
|
||||
400
|
||||
],
|
||||
"size": [
|
||||
314.9869791666667,
|
||||
262
|
||||
320,
|
||||
350
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -740,7 +831,7 @@
|
||||
"widget": {
|
||||
"name": "seed"
|
||||
},
|
||||
"link": null
|
||||
"link": 71
|
||||
},
|
||||
{
|
||||
"localized_name": "steps",
|
||||
@ -749,7 +840,7 @@
|
||||
"widget": {
|
||||
"name": "steps"
|
||||
},
|
||||
"link": null
|
||||
"link": 72
|
||||
},
|
||||
{
|
||||
"localized_name": "cfg",
|
||||
@ -800,9 +891,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "KSampler",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"Node name for S&R": "KSampler",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -814,81 +905,23 @@
|
||||
"widgets_values": [
|
||||
0,
|
||||
"randomize",
|
||||
4,
|
||||
8,
|
||||
1,
|
||||
"res_multistep",
|
||||
"simple",
|
||||
1
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"type": "ModelSamplingAuraFlow",
|
||||
"pos": [
|
||||
879.9999615530063,
|
||||
160.00009184959066
|
||||
],
|
||||
"size": [
|
||||
309.9869791666667,
|
||||
58
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 26
|
||||
},
|
||||
{
|
||||
"localized_name": "shift",
|
||||
"name": "shift",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "shift"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"slot_index": 0,
|
||||
"links": [
|
||||
13
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.3.64",
|
||||
"Node name for S&R": "ModelSamplingAuraFlow",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
3
|
||||
]
|
||||
}
|
||||
],
|
||||
"groups": [
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Image size",
|
||||
"title": "Step2 - Image size",
|
||||
"bounding": [
|
||||
100,
|
||||
560,
|
||||
290,
|
||||
200
|
||||
10,
|
||||
820,
|
||||
320,
|
||||
280
|
||||
],
|
||||
"color": "#3f789e",
|
||||
"font_size": 24,
|
||||
@ -896,12 +929,12 @@
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Prompt",
|
||||
"title": "Step3 - Prompt",
|
||||
"bounding": [
|
||||
410,
|
||||
360,
|
||||
130,
|
||||
450,
|
||||
540
|
||||
530,
|
||||
970
|
||||
],
|
||||
"color": "#3f789e",
|
||||
"font_size": 24,
|
||||
@ -909,12 +942,12 @@
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Models",
|
||||
"title": "Step1 - Load models",
|
||||
"bounding": [
|
||||
100,
|
||||
0,
|
||||
130,
|
||||
290,
|
||||
413.6
|
||||
330,
|
||||
660
|
||||
],
|
||||
"color": "#3f789e",
|
||||
"font_size": 24,
|
||||
@ -1027,25 +1060,41 @@
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 38,
|
||||
"id": 71,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 3,
|
||||
"target_slot": 4,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 72,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 3,
|
||||
"target_slot": 5,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 73,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 28,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 39,
|
||||
"id": 74,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"origin_slot": 6,
|
||||
"target_id": 30,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 40,
|
||||
"id": 75,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"origin_slot": 7,
|
||||
"target_id": 29,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
@ -1059,21 +1108,5 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
"config": {},
|
||||
"extra": {
|
||||
"frontendVersion": "1.37.10",
|
||||
"workflowRendererVersion": "LG",
|
||||
"VHS_latentpreview": false,
|
||||
"VHS_latentpreviewrate": 0,
|
||||
"VHS_MetadataImage": true,
|
||||
"VHS_KeepIntermediate": true,
|
||||
"ds": {
|
||||
"scale": 0.8401370345180755,
|
||||
"offset": [
|
||||
940.0587067393087,
|
||||
-830.7121087564725
|
||||
]
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
"extra": {}
|
||||
}
|
||||
1132
blueprints/Text to Image.json
Normal file
1132
blueprints/Text to Image.json
Normal file
File diff suppressed because it is too large
Load Diff
827
blueprints/Video Segmentation (SAM3).json
Normal file
827
blueprints/Video Segmentation (SAM3).json
Normal file
@ -0,0 +1,827 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 130,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 130,
|
||||
"type": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
|
||||
"pos": [
|
||||
-1210,
|
||||
-2780
|
||||
],
|
||||
"size": [
|
||||
300,
|
||||
370
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "positive_coords",
|
||||
"type": "STRING",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "negative_coords",
|
||||
"type": "STRING",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "threshold"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": []
|
||||
},
|
||||
{
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"125",
|
||||
"text"
|
||||
],
|
||||
[
|
||||
"126",
|
||||
"threshold"
|
||||
],
|
||||
[
|
||||
"126",
|
||||
"refine_iterations"
|
||||
],
|
||||
[
|
||||
"126",
|
||||
"individual_masks"
|
||||
],
|
||||
[
|
||||
"127",
|
||||
"ckpt_name"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [],
|
||||
"title": "Video Segmentation (SAM3)"
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 0,
|
||||
"lastNodeId": 130,
|
||||
"lastLinkId": 299,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "Video Segmentation (SAM3)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-2260,
|
||||
-3450,
|
||||
136.369140625,
|
||||
220
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-1050,
|
||||
-3510,
|
||||
120,
|
||||
120
|
||||
]
|
||||
},
|
||||
"inputs": [
|
||||
{
|
||||
"id": "680ffd88-32fe-48be-88d6-91ea44d5eaee",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"linkIds": [
|
||||
252
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3430
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "ceaf249c-32d7-4624-8bf6-e590e347ed90",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
254
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3410
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1ffbff36-da0c-4854-8cb4-88ad31e64f99",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
255
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3390
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "67b7f4c7-cec0-4e00-b154-23cc1abf880e",
|
||||
"name": "positive_coords",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
256
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3370
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "b090a498-2bde-46b9-9554-18501401d687",
|
||||
"name": "negative_coords",
|
||||
"type": "STRING",
|
||||
"linkIds": [
|
||||
257
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3350
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1a76dfcf-ce95-46af-bba5-c42160c683dd",
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
261
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3330
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "999523fa-c476-4c53-80c3-0a2f554d18ab",
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
262
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3310
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "d2371011-7fe5-4a39-b0c1-df2e0bbd6ece",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"linkIds": [
|
||||
263
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3290
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "675a8b37-17db-48d1-853c-2fe5d6a74582",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
273
|
||||
],
|
||||
"pos": [
|
||||
-2143.630859375,
|
||||
-3270
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"linkIds": [
|
||||
231
|
||||
],
|
||||
"localized_name": "masks",
|
||||
"pos": [
|
||||
-1030,
|
||||
-3490
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "8f622e40-8528-4078-b7d3-147e9f872194",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"linkIds": [
|
||||
232
|
||||
],
|
||||
"localized_name": "bboxes",
|
||||
"pos": [
|
||||
-1030,
|
||||
-3470
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "6c9924ec-f0fa-4509-83ea-8f97f5889bcc",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"linkIds": [
|
||||
259
|
||||
],
|
||||
"pos": [
|
||||
-1030,
|
||||
-3450
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "82c1cddc-ab11-44eb-9e2f-1a5c7ea5645b",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"linkIds": [
|
||||
260
|
||||
],
|
||||
"pos": [
|
||||
-1030,
|
||||
-3430
|
||||
]
|
||||
}
|
||||
],
|
||||
"widgets": [],
|
||||
"nodes": [
|
||||
{
|
||||
"id": 125,
|
||||
"type": "CLIPTextEncode",
|
||||
"pos": [
|
||||
-2010,
|
||||
-3040
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "clip",
|
||||
"name": "clip",
|
||||
"type": "CLIP",
|
||||
"link": 240
|
||||
},
|
||||
{
|
||||
"localized_name": "text",
|
||||
"name": "text",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "text"
|
||||
},
|
||||
"link": 254
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "CONDITIONING",
|
||||
"name": "CONDITIONING",
|
||||
"type": "CONDITIONING",
|
||||
"links": [
|
||||
200
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 126,
|
||||
"type": "SAM3_Detect",
|
||||
"pos": [
|
||||
-1520,
|
||||
-3520
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
290
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "model",
|
||||
"localized_name": "model",
|
||||
"name": "model",
|
||||
"type": "MODEL",
|
||||
"link": 237
|
||||
},
|
||||
{
|
||||
"label": "image",
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 253
|
||||
},
|
||||
{
|
||||
"label": "conditioning",
|
||||
"localized_name": "conditioning",
|
||||
"name": "conditioning",
|
||||
"shape": 7,
|
||||
"type": "CONDITIONING",
|
||||
"link": 200
|
||||
},
|
||||
{
|
||||
"label": "bboxes",
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"shape": 7,
|
||||
"type": "BOUNDING_BOX",
|
||||
"link": 255
|
||||
},
|
||||
{
|
||||
"label": "positive_coords",
|
||||
"localized_name": "positive_coords",
|
||||
"name": "positive_coords",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"link": 256
|
||||
},
|
||||
{
|
||||
"label": "negative_coords",
|
||||
"localized_name": "negative_coords",
|
||||
"name": "negative_coords",
|
||||
"shape": 7,
|
||||
"type": "STRING",
|
||||
"link": 257
|
||||
},
|
||||
{
|
||||
"localized_name": "threshold",
|
||||
"name": "threshold",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "threshold"
|
||||
},
|
||||
"link": 261
|
||||
},
|
||||
{
|
||||
"localized_name": "refine_iterations",
|
||||
"name": "refine_iterations",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "refine_iterations"
|
||||
},
|
||||
"link": 262
|
||||
},
|
||||
{
|
||||
"localized_name": "individual_masks",
|
||||
"name": "individual_masks",
|
||||
"type": "BOOLEAN",
|
||||
"widget": {
|
||||
"name": "individual_masks"
|
||||
},
|
||||
"link": 263
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "masks",
|
||||
"name": "masks",
|
||||
"type": "MASK",
|
||||
"links": [
|
||||
231
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "bboxes",
|
||||
"name": "bboxes",
|
||||
"type": "BOUNDING_BOX",
|
||||
"links": [
|
||||
232
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "SAM3_Detect",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
0.5,
|
||||
2,
|
||||
false
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 127,
|
||||
"type": "CheckpointLoaderSimple",
|
||||
"pos": [
|
||||
-1970,
|
||||
-3310
|
||||
],
|
||||
"size": [
|
||||
330,
|
||||
160
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "ckpt_name",
|
||||
"name": "ckpt_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "ckpt_name"
|
||||
},
|
||||
"link": 273
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "MODEL",
|
||||
"name": "MODEL",
|
||||
"type": "MODEL",
|
||||
"links": [
|
||||
237
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "CLIP",
|
||||
"name": "CLIP",
|
||||
"type": "CLIP",
|
||||
"links": [
|
||||
240
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "VAE",
|
||||
"name": "VAE",
|
||||
"type": "VAE",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CheckpointLoaderSimple",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65,
|
||||
"models": [
|
||||
{
|
||||
"name": "sam3.1_multiplex_fp16.safetensors",
|
||||
"url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
|
||||
"directory": "checkpoints"
|
||||
}
|
||||
]
|
||||
},
|
||||
"widgets_values": [
|
||||
"sam3.1_multiplex_fp16.safetensors"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 128,
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
-1910,
|
||||
-3540
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "video",
|
||||
"name": "video",
|
||||
"type": "VIDEO",
|
||||
"link": 252
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
253
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"type": "AUDIO",
|
||||
"links": [
|
||||
259
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"links": [
|
||||
260
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.19.3",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
"hasSecondTab": false,
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 129,
|
||||
"type": "Note",
|
||||
"pos": [
|
||||
-1980,
|
||||
-2790
|
||||
],
|
||||
"size": [
|
||||
370,
|
||||
250
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [],
|
||||
"outputs": [],
|
||||
"title": "Note: Prompt format",
|
||||
"properties": {},
|
||||
"widgets_values": [
|
||||
"Max tokens for this model is only 32, to separately prompt multiple subjects you can separate prompts with comma, and set the max amount of objects detected for each prompt with :N\n\nFor example above test prompt finds 2 cakes, one apron, 4 window panels"
|
||||
],
|
||||
"color": "#432",
|
||||
"bgcolor": "#653"
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"links": [
|
||||
{
|
||||
"id": 237,
|
||||
"origin_id": 127,
|
||||
"origin_slot": 0,
|
||||
"target_id": 126,
|
||||
"target_slot": 0,
|
||||
"type": "MODEL"
|
||||
},
|
||||
{
|
||||
"id": 200,
|
||||
"origin_id": 125,
|
||||
"origin_slot": 0,
|
||||
"target_id": 126,
|
||||
"target_slot": 2,
|
||||
"type": "CONDITIONING"
|
||||
},
|
||||
{
|
||||
"id": 240,
|
||||
"origin_id": 127,
|
||||
"origin_slot": 1,
|
||||
"target_id": 125,
|
||||
"target_slot": 0,
|
||||
"type": "CLIP"
|
||||
},
|
||||
{
|
||||
"id": 231,
|
||||
"origin_id": 126,
|
||||
"origin_slot": 0,
|
||||
"target_id": -20,
|
||||
"target_slot": 0,
|
||||
"type": "MASK"
|
||||
},
|
||||
{
|
||||
"id": 232,
|
||||
"origin_id": 126,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 1,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 252,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 0,
|
||||
"target_id": 128,
|
||||
"target_slot": 0,
|
||||
"type": "VIDEO"
|
||||
},
|
||||
{
|
||||
"id": 253,
|
||||
"origin_id": 128,
|
||||
"origin_slot": 0,
|
||||
"target_id": 126,
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 254,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 1,
|
||||
"target_id": 125,
|
||||
"target_slot": 1,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 255,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 2,
|
||||
"target_id": 126,
|
||||
"target_slot": 3,
|
||||
"type": "BOUNDING_BOX"
|
||||
},
|
||||
{
|
||||
"id": 256,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 3,
|
||||
"target_id": 126,
|
||||
"target_slot": 4,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 257,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 4,
|
||||
"target_id": 126,
|
||||
"target_slot": 5,
|
||||
"type": "STRING"
|
||||
},
|
||||
{
|
||||
"id": 259,
|
||||
"origin_id": 128,
|
||||
"origin_slot": 1,
|
||||
"target_id": -20,
|
||||
"target_slot": 2,
|
||||
"type": "AUDIO"
|
||||
},
|
||||
{
|
||||
"id": 260,
|
||||
"origin_id": 128,
|
||||
"origin_slot": 2,
|
||||
"target_id": -20,
|
||||
"target_slot": 3,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 261,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 126,
|
||||
"target_slot": 6,
|
||||
"type": "FLOAT"
|
||||
},
|
||||
{
|
||||
"id": 262,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 126,
|
||||
"target_slot": 7,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 263,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 126,
|
||||
"target_slot": 8,
|
||||
"type": "BOOLEAN"
|
||||
},
|
||||
{
|
||||
"id": 273,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 127,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {},
|
||||
"category": "Video Tools",
|
||||
"description": "Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts."
|
||||
}
|
||||
]
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
@ -1,21 +1,21 @@
|
||||
{
|
||||
"revision": 0,
|
||||
"last_node_id": 84,
|
||||
"last_node_id": 85,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 84,
|
||||
"type": "8e8aa94a-647e-436d-8440-8ee4691864de",
|
||||
"id": 85,
|
||||
"type": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
|
||||
"pos": [
|
||||
-6100,
|
||||
2620
|
||||
-880,
|
||||
-2260
|
||||
],
|
||||
"size": [
|
||||
290,
|
||||
160
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -76,31 +76,26 @@
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"79",
|
||||
"direction"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"79",
|
||||
"match_image_size"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"79",
|
||||
"spacing_width"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"79",
|
||||
"spacing_color"
|
||||
]
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
"right",
|
||||
true,
|
||||
0,
|
||||
"white"
|
||||
],
|
||||
"widgets_values": [],
|
||||
"title": "Video Stitch"
|
||||
}
|
||||
],
|
||||
@ -109,12 +104,12 @@
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "8e8aa94a-647e-436d-8440-8ee4691864de",
|
||||
"id": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 1,
|
||||
"lastNodeId": 84,
|
||||
"lastLinkId": 262,
|
||||
"lastNodeId": 97,
|
||||
"lastLinkId": 282,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
@ -123,8 +118,8 @@
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-6580,
|
||||
2649,
|
||||
-6810,
|
||||
2580,
|
||||
143.55859375,
|
||||
160
|
||||
]
|
||||
@ -132,8 +127,8 @@
|
||||
"outputNode": {
|
||||
"id": -20,
|
||||
"bounding": [
|
||||
-5720,
|
||||
2659,
|
||||
-4770,
|
||||
2600,
|
||||
120,
|
||||
60
|
||||
]
|
||||
@ -149,8 +144,8 @@
|
||||
"localized_name": "video",
|
||||
"label": "Before Video",
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2669
|
||||
-6686.44140625,
|
||||
2600
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -163,8 +158,8 @@
|
||||
"localized_name": "video_1",
|
||||
"label": "After Video",
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2689
|
||||
-6686.44140625,
|
||||
2620
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -175,8 +170,8 @@
|
||||
259
|
||||
],
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2709
|
||||
-6686.44140625,
|
||||
2640
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -187,8 +182,8 @@
|
||||
260
|
||||
],
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2729
|
||||
-6686.44140625,
|
||||
2660
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -199,8 +194,8 @@
|
||||
261
|
||||
],
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2749
|
||||
-6686.44140625,
|
||||
2680
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -211,8 +206,8 @@
|
||||
262
|
||||
],
|
||||
"pos": [
|
||||
-6456.44140625,
|
||||
2769
|
||||
-6686.44140625,
|
||||
2700
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -226,8 +221,8 @@
|
||||
],
|
||||
"localized_name": "VIDEO",
|
||||
"pos": [
|
||||
-5700,
|
||||
2679
|
||||
-4750,
|
||||
2620
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -238,11 +233,11 @@
|
||||
"type": "GetVideoComponents",
|
||||
"pos": [
|
||||
-6390,
|
||||
2560
|
||||
2600
|
||||
],
|
||||
"size": [
|
||||
193.530859375,
|
||||
66
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
@ -278,9 +273,9 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0",
|
||||
"Node name for S&R": "GetVideoComponents"
|
||||
"ver": "0.13.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -291,8 +286,8 @@
|
||||
2420
|
||||
],
|
||||
"size": [
|
||||
193.530859375,
|
||||
66
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
@ -332,21 +327,254 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetVideoComponents",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0",
|
||||
"Node name for S&R": "GetVideoComponents"
|
||||
"ver": "0.13.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 90,
|
||||
"type": "GetImageSize",
|
||||
"pos": [
|
||||
-6390,
|
||||
3030
|
||||
],
|
||||
"size": [
|
||||
230,
|
||||
120
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "image",
|
||||
"name": "image",
|
||||
"type": "IMAGE",
|
||||
"link": 266
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "width",
|
||||
"name": "width",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
274
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "height",
|
||||
"name": "height",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
276
|
||||
]
|
||||
},
|
||||
{
|
||||
"localized_name": "batch_size",
|
||||
"name": "batch_size",
|
||||
"type": "INT",
|
||||
"links": null
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSize"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 80,
|
||||
"type": "CreateVideo",
|
||||
"pos": [
|
||||
-5190,
|
||||
2420
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
130
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 282
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 251
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"widget": {
|
||||
"name": "fps"
|
||||
},
|
||||
"link": 252
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"links": [
|
||||
255
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CreateVideo",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
30
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 95,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-6040,
|
||||
3020
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT",
|
||||
"link": 274
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
279
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression"
|
||||
},
|
||||
"widgets_values": [
|
||||
"a & ~1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 96,
|
||||
"type": "ComfyMathExpression",
|
||||
"pos": [
|
||||
-6040,
|
||||
3290
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"label": "a",
|
||||
"localized_name": "values.a",
|
||||
"name": "values.a",
|
||||
"type": "FLOAT,INT",
|
||||
"link": 276
|
||||
},
|
||||
{
|
||||
"label": "b",
|
||||
"localized_name": "values.b",
|
||||
"name": "values.b",
|
||||
"shape": 7,
|
||||
"type": "FLOAT,INT",
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "expression",
|
||||
"name": "expression",
|
||||
"type": "STRING",
|
||||
"widget": {
|
||||
"name": "expression"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "FLOAT",
|
||||
"name": "FLOAT",
|
||||
"type": "FLOAT",
|
||||
"links": null
|
||||
},
|
||||
{
|
||||
"localized_name": "INT",
|
||||
"name": "INT",
|
||||
"type": "INT",
|
||||
"links": [
|
||||
280
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ComfyMathExpression"
|
||||
},
|
||||
"widgets_values": [
|
||||
"a & ~1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 79,
|
||||
"type": "ImageStitch",
|
||||
"pos": [
|
||||
-6390,
|
||||
2700
|
||||
2780
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
150
|
||||
160
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
@ -408,14 +636,15 @@
|
||||
"name": "IMAGE",
|
||||
"type": "IMAGE",
|
||||
"links": [
|
||||
250
|
||||
266,
|
||||
281
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ImageStitch",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0",
|
||||
"Node name for S&R": "ImageStitch"
|
||||
"ver": "0.13.0"
|
||||
},
|
||||
"widgets_values": [
|
||||
"right",
|
||||
@ -425,60 +654,91 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 80,
|
||||
"type": "CreateVideo",
|
||||
"id": 97,
|
||||
"type": "ResizeImageMaskNode",
|
||||
"pos": [
|
||||
-6040,
|
||||
2610
|
||||
-5560,
|
||||
2790
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
78
|
||||
160
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "images",
|
||||
"name": "images",
|
||||
"type": "IMAGE",
|
||||
"link": 250
|
||||
"localized_name": "input",
|
||||
"name": "input",
|
||||
"type": "IMAGE,MASK",
|
||||
"link": 281
|
||||
},
|
||||
{
|
||||
"localized_name": "audio",
|
||||
"name": "audio",
|
||||
"shape": 7,
|
||||
"type": "AUDIO",
|
||||
"link": 251
|
||||
},
|
||||
{
|
||||
"localized_name": "fps",
|
||||
"name": "fps",
|
||||
"type": "FLOAT",
|
||||
"localized_name": "resize_type",
|
||||
"name": "resize_type",
|
||||
"type": "COMFY_DYNAMICCOMBO_V3",
|
||||
"widget": {
|
||||
"name": "fps"
|
||||
"name": "resize_type"
|
||||
},
|
||||
"link": 252
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "width",
|
||||
"name": "resize_type.width",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resize_type.width"
|
||||
},
|
||||
"link": 279
|
||||
},
|
||||
{
|
||||
"localized_name": "height",
|
||||
"name": "resize_type.height",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "resize_type.height"
|
||||
},
|
||||
"link": 280
|
||||
},
|
||||
{
|
||||
"localized_name": "crop",
|
||||
"name": "resize_type.crop",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "resize_type.crop"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"localized_name": "scale_method",
|
||||
"name": "scale_method",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "scale_method"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
{
|
||||
"localized_name": "VIDEO",
|
||||
"name": "VIDEO",
|
||||
"type": "VIDEO",
|
||||
"localized_name": "resized",
|
||||
"name": "resized",
|
||||
"type": "*",
|
||||
"links": [
|
||||
255
|
||||
282
|
||||
]
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.13.0",
|
||||
"Node name for S&R": "CreateVideo"
|
||||
"Node name for S&R": "ResizeImageMaskNode"
|
||||
},
|
||||
"widgets_values": [
|
||||
30
|
||||
"scale dimensions",
|
||||
512,
|
||||
512,
|
||||
"center",
|
||||
"area"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -500,14 +760,6 @@
|
||||
"target_slot": 1,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 250,
|
||||
"origin_id": 79,
|
||||
"origin_slot": 0,
|
||||
"target_id": 80,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 251,
|
||||
"origin_id": 77,
|
||||
@ -579,6 +831,62 @@
|
||||
"target_id": 79,
|
||||
"target_slot": 5,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 266,
|
||||
"origin_id": 79,
|
||||
"origin_slot": 0,
|
||||
"target_id": 90,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 274,
|
||||
"origin_id": 90,
|
||||
"origin_slot": 0,
|
||||
"target_id": 95,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 276,
|
||||
"origin_id": 90,
|
||||
"origin_slot": 1,
|
||||
"target_id": 96,
|
||||
"target_slot": 0,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 279,
|
||||
"origin_id": 95,
|
||||
"origin_slot": 1,
|
||||
"target_id": 97,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 280,
|
||||
"origin_id": 96,
|
||||
"origin_slot": 1,
|
||||
"target_id": 97,
|
||||
"target_slot": 3,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 281,
|
||||
"origin_id": 79,
|
||||
"origin_slot": 0,
|
||||
"target_id": 97,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
},
|
||||
{
|
||||
"id": 282,
|
||||
"origin_id": 97,
|
||||
"origin_slot": 0,
|
||||
"target_id": 80,
|
||||
"target_slot": 0,
|
||||
"type": "IMAGE"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
@ -588,5 +896,6 @@
|
||||
"description": "Stitches multiple video clips into a single sequential video file."
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"extra": {}
|
||||
}
|
||||
@ -242,6 +242,7 @@ def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
@ -373,6 +374,7 @@ def sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
@ -686,6 +688,7 @@ def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=Non
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
|
||||
lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
|
||||
@ -747,6 +750,7 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
|
||||
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
@ -832,6 +836,7 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
|
||||
old_denoised = None
|
||||
h, h_last = None, None
|
||||
@ -889,6 +894,7 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
|
||||
denoised_1, denoised_2 = None, None
|
||||
h, h_1, h_2 = None, None, None
|
||||
@ -1006,23 +1012,39 @@ def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None,
|
||||
return generic_step_sampler(model, x, sigmas, extra_args, callback, disable, noise_sampler, DDPMSampler_step)
|
||||
|
||||
@torch.no_grad()
|
||||
def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
|
||||
def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, s_noise=1.0, s_noise_end=None, noise_clip_std=0.0):
|
||||
|
||||
# s_noise / s_noise_end: per-step noise multiplier, linearly interpolated across steps
|
||||
# noise_clip_std: clamp injected noise to +/- N stddevs (0 disables).
|
||||
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
for i in trange(len(sigmas) - 1, disable=disable):
|
||||
n_steps = max(1, len(sigmas) - 1)
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
|
||||
s_start = float(s_noise)
|
||||
s_end = s_start if s_noise_end is None else float(s_noise_end)
|
||||
for i in trange(n_steps, disable=disable):
|
||||
denoised = model(x, sigmas[i] * s_in, **extra_args)
|
||||
if callback is not None:
|
||||
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
|
||||
|
||||
x = denoised
|
||||
if sigmas[i + 1] > 0:
|
||||
x = model.inner_model.inner_model.model_sampling.noise_scaling(sigmas[i + 1], noise_sampler(sigmas[i], sigmas[i + 1]), x)
|
||||
noise = noise_sampler(sigmas[i], sigmas[i + 1])
|
||||
if noise_clip_std > 0:
|
||||
clip_val = noise_clip_std * noise.std()
|
||||
noise = noise.clamp(min=-clip_val, max=clip_val)
|
||||
t = (i / (n_steps - 1)) if n_steps > 1 else 0.0
|
||||
s_noise_i = s_start + (s_end - s_start) * t
|
||||
if s_noise_i != 1.0:
|
||||
noise = noise * s_noise_i
|
||||
x = model_sampling.noise_scaling(sigmas[i + 1], noise, x)
|
||||
return x
|
||||
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
|
||||
# From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
|
||||
@ -1249,6 +1271,7 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
|
||||
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
|
||||
uncond_denoised = None
|
||||
|
||||
@ -1296,6 +1319,7 @@ def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
|
||||
temp = [0]
|
||||
def post_cfg_function(args):
|
||||
@ -1371,6 +1395,7 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
sigma_fn = lambda t: t.neg().exp()
|
||||
t_fn = lambda sigma: sigma.log().neg()
|
||||
@ -1504,6 +1529,7 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
|
||||
extra_args = {} if extra_args is None else extra_args
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
|
||||
def default_er_sde_noise_scaler(x):
|
||||
@ -1574,9 +1600,10 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
@ -1645,9 +1672,10 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non
|
||||
seed = extra_args.get("seed", None)
|
||||
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
inject_noise = eta > 0 and s_noise > 0
|
||||
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
|
||||
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
@ -1713,6 +1741,7 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
|
||||
s_in = x.new_ones([x.shape[0]])
|
||||
|
||||
model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
|
||||
s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
|
||||
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
|
||||
lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)
|
||||
|
||||
|
||||
@ -792,6 +792,13 @@ class ZImagePixelSpace(ChromaRadiance):
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class HiDreamO1Pixel(ChromaRadiance):
|
||||
"""Pixel-space latent format for HiDream-O1.
|
||||
No VAE — model patches/unpatches raw RGB internally with patch_size=32.
|
||||
"""
|
||||
pass
|
||||
|
||||
class CogVideoX(LatentFormat):
|
||||
"""Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).
|
||||
|
||||
|
||||
41
comfy/ldm/hidream_o1/attention.py
Normal file
41
comfy/ldm/hidream_o1/attention.py
Normal file
@ -0,0 +1,41 @@
|
||||
"""HiDream-O1 two-pass attention: tokens [0, ar_len) are causal, [ar_len, T)
|
||||
attend full K/V. Splitting Q at the boundary avoids the (B, 1, T, T) additive
|
||||
mask the general-purpose path would build (~500 MB at T~16K) and lets the
|
||||
gen half hit the user's preferred backend via optimized_attention.
|
||||
"""
|
||||
|
||||
import torch
|
||||
|
||||
import comfy.ops
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
|
||||
|
||||
def make_two_pass_attention(ar_len: int, transformer_options=None):
|
||||
"""Build a two-pass attention callable. AR pass uses SDPA-causal directly, gen pass routes through optimized_attention.
|
||||
The AR pass goes through SDPA directand bypasses wrappers, it is only ~1% of T at typical edit sizes.
|
||||
"""
|
||||
|
||||
def two_pass_attention(q, k, v, heads, **kwargs):
|
||||
B, H, T, D = q.shape
|
||||
|
||||
if T < k.shape[2]: # KV-cache hot path: Q is shorter than K/V (cached AR prefix is in K/V only), all fresh Q positions are in the gen region, single full-attention call
|
||||
out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
|
||||
elif ar_len >= T:
|
||||
out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
|
||||
elif ar_len <= 0:
|
||||
out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
|
||||
else:
|
||||
out_ar = comfy.ops.scaled_dot_product_attention(
|
||||
q[:, :, :ar_len], k[:, :, :ar_len], v[:, :, :ar_len],
|
||||
attn_mask=None, dropout_p=0.0, is_causal=True,
|
||||
)
|
||||
out_gen = optimized_attention(
|
||||
q[:, :, ar_len:], k, v, heads,
|
||||
mask=None, skip_reshape=True, skip_output_reshape=True,
|
||||
transformer_options=transformer_options,
|
||||
)
|
||||
out = torch.cat([out_ar, out_gen], dim=2)
|
||||
|
||||
return out.transpose(1, 2).reshape(B, T, H * D)
|
||||
|
||||
return two_pass_attention
|
||||
230
comfy/ldm/hidream_o1/conditioning.py
Normal file
230
comfy/ldm/hidream_o1/conditioning.py
Normal file
@ -0,0 +1,230 @@
|
||||
"""HiDream-O1 conditioning prep — ref-image dual path + extra_conds assembly.
|
||||
|
||||
Each ref image goes through two paths: a 32x32 patchified stream concatenated
|
||||
to the noised target, and a Qwen3-VL ViT path producing tokens that scatter
|
||||
into input_ids at <|image_pad|> positions.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
|
||||
import comfy.utils
|
||||
from comfy.text_encoders.qwen_vl import process_qwen2vl_images
|
||||
|
||||
from .utils import (PATCH_SIZE, calculate_dimensions, cond_image_size, ref_max_size, resize_tensor)
|
||||
|
||||
# Qwen3-VL ViT preprocessing constants (preprocessor_config.json).
|
||||
VIT_PATCH = 16
|
||||
VIT_MERGE = 2
|
||||
VIT_IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||
VIT_IMAGE_STD = [0.5, 0.5, 0.5]
|
||||
|
||||
|
||||
def prepare_ref_images(
|
||||
ref_images: List[torch.Tensor],
|
||||
target_h: int,
|
||||
target_w: int,
|
||||
device: torch.device,
|
||||
dtype: torch.dtype,
|
||||
):
|
||||
"""Build the dual-path tensors for K reference images at (target_h, target_w).
|
||||
|
||||
Returns None for K=0, else a dict with ref_patches, ref_pixel_values,
|
||||
ref_image_grid_thw, per_ref_vit_tokens, per_ref_patch_grids.
|
||||
"""
|
||||
K = len(ref_images)
|
||||
if K == 0:
|
||||
return None
|
||||
max_size = ref_max_size(max(target_h, target_w), K)
|
||||
cis = cond_image_size(K)
|
||||
|
||||
refs_t = [img[0].clamp(0, 1).permute(2, 0, 1).unsqueeze(0).contiguous().float() for img in ref_images]
|
||||
refs_t = [resize_tensor(t, max_size, PATCH_SIZE) for t in refs_t]
|
||||
|
||||
# 32-patch path.
|
||||
ref_patches_per = []
|
||||
per_ref_patch_grids = []
|
||||
for t in refs_t:
|
||||
t_norm = (t.squeeze(0) - 0.5) / 0.5 # (3, H, W) in [-1, 1]
|
||||
h_p, w_p = t_norm.shape[-2] // PATCH_SIZE, t_norm.shape[-1] // PATCH_SIZE
|
||||
per_ref_patch_grids.append((h_p, w_p))
|
||||
patches = (
|
||||
t_norm.reshape(3, h_p, PATCH_SIZE, w_p, PATCH_SIZE)
|
||||
.permute(1, 3, 0, 2, 4)
|
||||
.reshape(h_p * w_p, 3 * PATCH_SIZE * PATCH_SIZE)
|
||||
)
|
||||
ref_patches_per.append(patches)
|
||||
ref_patches = torch.cat(ref_patches_per, dim=0).unsqueeze(0).to(device=device, dtype=dtype)
|
||||
|
||||
# ViT path.
|
||||
refs_vlm_t = []
|
||||
for t in refs_t:
|
||||
_, _, h, w = t.shape
|
||||
cond_w, cond_h = calculate_dimensions(cis, w / h)
|
||||
cond_w = max(cond_w, VIT_PATCH * VIT_MERGE)
|
||||
cond_h = max(cond_h, VIT_PATCH * VIT_MERGE)
|
||||
refs_vlm_t.append(comfy.utils.common_upscale(t, cond_w, cond_h, "lanczos", "disabled"))
|
||||
|
||||
pv_list, grid_list, per_ref_vit_tokens = [], [], []
|
||||
for t_v in refs_vlm_t:
|
||||
pv, grid_thw = process_qwen2vl_images(
|
||||
t_v.permute(0, 2, 3, 1),
|
||||
min_pixels=0, max_pixels=10**12,
|
||||
patch_size=VIT_PATCH, merge_size=VIT_MERGE,
|
||||
image_mean=VIT_IMAGE_MEAN, image_std=VIT_IMAGE_STD,
|
||||
)
|
||||
grid_thw = grid_thw[0]
|
||||
pv_list.append(pv.to(device=device, dtype=dtype))
|
||||
grid_list.append(grid_thw.to(device=device))
|
||||
# Post-merge token count = number of <|image_pad|> tokens this image expands to in input_ids.
|
||||
gh, gw = int(grid_thw[1].item()), int(grid_thw[2].item())
|
||||
per_ref_vit_tokens.append((gh // VIT_MERGE) * (gw // VIT_MERGE))
|
||||
|
||||
return {
|
||||
"ref_patches": ref_patches,
|
||||
"ref_pixel_values": torch.cat(pv_list, dim=0),
|
||||
"ref_image_grid_thw": torch.stack(grid_list, dim=0),
|
||||
"per_ref_vit_tokens": per_ref_vit_tokens,
|
||||
"per_ref_patch_grids": per_ref_patch_grids,
|
||||
}
|
||||
|
||||
|
||||
def build_ref_input_ids(
|
||||
text_input_ids: torch.Tensor,
|
||||
per_ref_vit_tokens: List[int],
|
||||
image_token_id: int,
|
||||
vision_start_id: int,
|
||||
vision_end_id: int,
|
||||
):
|
||||
"""Splice [vision_start, image_pad*N, vision_end] blocks into input_ids
|
||||
after the [im_start, user, \\n] prefix (matches original chat template).
|
||||
"""
|
||||
ids = text_input_ids[0].tolist()
|
||||
inserted = []
|
||||
for n_pad in per_ref_vit_tokens:
|
||||
inserted.extend([vision_start_id] + [image_token_id] * n_pad + [vision_end_id])
|
||||
new_ids = ids[:3] + inserted + ids[3:] # 3 = len([im_start, user, \n])
|
||||
return torch.tensor([new_ids], dtype=text_input_ids.dtype, device=text_input_ids.device)
|
||||
|
||||
|
||||
def build_extra_conds(
|
||||
text_input_ids: torch.Tensor,
|
||||
noise: torch.Tensor,
|
||||
ref_images: List[torch.Tensor] = None,
|
||||
target_patch_size: int = 32,
|
||||
):
|
||||
"""Assemble all conditioning tensors for HiDreamO1Transformer.forward:
|
||||
input_ids (with ref-vision tokens spliced in for the edit/IP path),
|
||||
position_ids (MRoPE), token_types, vinput_mask, plus the ref
|
||||
dual-path tensors when refs are provided.
|
||||
"""
|
||||
from .utils import get_rope_index_fix_point
|
||||
from comfy.text_encoders.hidream_o1 import (
|
||||
IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
|
||||
)
|
||||
|
||||
if text_input_ids.dim() == 1:
|
||||
text_input_ids = text_input_ids.unsqueeze(0)
|
||||
text_input_ids = text_input_ids.long().to(noise.device)
|
||||
B = noise.shape[0]
|
||||
if text_input_ids.shape[0] == 1 and B > 1:
|
||||
text_input_ids = text_input_ids.expand(B, -1)
|
||||
|
||||
H, W = noise.shape[-2], noise.shape[-1]
|
||||
h_p, w_p = H // target_patch_size, W // target_patch_size
|
||||
image_len = h_p * w_p
|
||||
image_grid_thw_tgt = torch.tensor(
|
||||
[[1, h_p, w_p]], dtype=torch.long, device=text_input_ids.device,
|
||||
)
|
||||
|
||||
out = {}
|
||||
if ref_images:
|
||||
ref = prepare_ref_images(ref_images, H, W, device=noise.device, dtype=noise.dtype)
|
||||
text_input_ids = build_ref_input_ids(
|
||||
text_input_ids, ref["per_ref_vit_tokens"],
|
||||
IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
|
||||
)
|
||||
new_txt_len = text_input_ids.shape[1]
|
||||
|
||||
# Each ref's patchified stream gets a [vision_start, image_pad*N-1]
|
||||
# block in the position-id stream after the noised target.
|
||||
ref_grid_lengths = [hp * wp for (hp, wp) in ref["per_ref_patch_grids"]]
|
||||
tgt_vision = torch.full((1, image_len), IMAGE_TOKEN_ID,
|
||||
dtype=text_input_ids.dtype, device=text_input_ids.device)
|
||||
tgt_vision[:, 0] = VISION_START_ID
|
||||
ref_vision_blocks = []
|
||||
for rl in ref_grid_lengths:
|
||||
blk = torch.full((1, rl), IMAGE_TOKEN_ID,
|
||||
dtype=text_input_ids.dtype, device=text_input_ids.device)
|
||||
blk[:, 0] = VISION_START_ID
|
||||
ref_vision_blocks.append(blk)
|
||||
ref_vision_cat = torch.cat([tgt_vision] + ref_vision_blocks, dim=1)
|
||||
input_ids_pad = torch.cat([text_input_ids, ref_vision_cat], dim=-1)
|
||||
total_ref_patches_len = sum(ref_grid_lengths)
|
||||
total_len = new_txt_len + image_len + total_ref_patches_len
|
||||
|
||||
# K (ViT, post-merge) + 1 (target) + K (ref-patches) image grids.
|
||||
K = len(ref_images)
|
||||
igthw_cond = ref["ref_image_grid_thw"].clone()
|
||||
igthw_cond[:, 1] //= 2
|
||||
igthw_cond[:, 2] //= 2
|
||||
image_grid_thw_ref = torch.tensor(
|
||||
[[1, hp, wp] for (hp, wp) in ref["per_ref_patch_grids"]],
|
||||
dtype=torch.long, device=text_input_ids.device,
|
||||
)
|
||||
igthw_all = torch.cat([
|
||||
igthw_cond.to(text_input_ids.device),
|
||||
image_grid_thw_tgt,
|
||||
image_grid_thw_ref,
|
||||
], dim=0)
|
||||
position_ids, _ = get_rope_index_fix_point(
|
||||
spatial_merge_size=1,
|
||||
image_token_id=IMAGE_TOKEN_ID,
|
||||
vision_start_token_id=VISION_START_ID,
|
||||
input_ids=input_ids_pad, image_grid_thw=igthw_all,
|
||||
attention_mask=None,
|
||||
skip_vision_start_token=[0] * K + [1] + [1] * K,
|
||||
fix_point=4096,
|
||||
)
|
||||
|
||||
# tms + target_image + ref_patches are all gen.
|
||||
tms_pos = new_txt_len - 1
|
||||
ar_len = tms_pos
|
||||
token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
|
||||
token_types[:, tms_pos:] = 1
|
||||
vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
|
||||
vinput_mask[:, new_txt_len:] = True
|
||||
|
||||
# Leading batch dim sidesteps CONDRegular.process_cond's repeat_to_batch_size truncation
|
||||
out["ref_pixel_values"] = ref["ref_pixel_values"].unsqueeze(0)
|
||||
out["ref_image_grid_thw"] = ref["ref_image_grid_thw"].unsqueeze(0)
|
||||
out["ref_patches"] = ref["ref_patches"]
|
||||
else:
|
||||
# T2I: text + noised target only, vision_start replaces the first image token
|
||||
txt_len = text_input_ids.shape[1]
|
||||
total_len = txt_len + image_len
|
||||
vision_tokens = torch.full((B, image_len), IMAGE_TOKEN_ID,
|
||||
dtype=text_input_ids.dtype, device=text_input_ids.device)
|
||||
vision_tokens[:, 0] = VISION_START_ID
|
||||
input_ids_pad = torch.cat([text_input_ids, vision_tokens], dim=-1)
|
||||
position_ids, _ = get_rope_index_fix_point(
|
||||
spatial_merge_size=1,
|
||||
image_token_id=IMAGE_TOKEN_ID,
|
||||
vision_start_token_id=VISION_START_ID,
|
||||
input_ids=input_ids_pad, image_grid_thw=image_grid_thw_tgt,
|
||||
attention_mask=None,
|
||||
skip_vision_start_token=[1],
|
||||
)
|
||||
ar_len = txt_len - 1
|
||||
token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
|
||||
token_types[:, ar_len:] = 1
|
||||
vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
|
||||
vinput_mask[:, txt_len:] = True
|
||||
|
||||
out["input_ids"] = text_input_ids
|
||||
out["position_ids"] = position_ids[:, 0].unsqueeze(0) # Collapse position_ids batch and add a leading dim so CONDRegular's batch-resize doesn't truncate the 3-axis MRoPE dim
|
||||
out["token_types"] = token_types
|
||||
out["vinput_mask"] = vinput_mask
|
||||
out["ar_len"] = ar_len
|
||||
return out
|
||||
306
comfy/ldm/hidream_o1/model.py
Normal file
306
comfy/ldm/hidream_o1/model.py
Normal file
@ -0,0 +1,306 @@
|
||||
"""HiDream-O1-Image transformer.
|
||||
|
||||
Pixel-space DiT built on Qwen3-VL: the vision tower (Qwen35VisionModel)
|
||||
encodes ref images, the Qwen3-VL-8B decoder (Llama2_ with interleaved MRoPE)
|
||||
processes a unified text+image sequence, and 32x32 patch embed/unembed
|
||||
shims map raw RGB in and out of LLM hidden space. The Qwen3-VL deepstack
|
||||
mergers go unused — their weights are dropped at load.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
|
||||
import einops
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
import comfy.patcher_extension
|
||||
from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
|
||||
from comfy.text_encoders.llama import Llama2_
|
||||
from comfy.text_encoders.qwen35 import Qwen35VisionModel
|
||||
|
||||
from .attention import make_two_pass_attention
|
||||
|
||||
|
||||
IMAGE_TOKEN_ID = 151655 # Qwen3-VL <|image_pad|>
|
||||
TMS_TOKEN_ID = 151673 # HiDream-O1 <|tms_token|>
|
||||
PATCH_SIZE = 32
|
||||
|
||||
|
||||
@dataclass
|
||||
class HiDreamO1TextConfig:
|
||||
"""Qwen3-VL-8B text-decoder dims (matches public Qwen3-VL-8B-Instruct)."""
|
||||
vocab_size: int = 151936
|
||||
hidden_size: int = 4096
|
||||
intermediate_size: int = 12288
|
||||
num_hidden_layers: int = 36
|
||||
num_attention_heads: int = 32
|
||||
num_key_value_heads: int = 8
|
||||
head_dim: int = 128
|
||||
max_position_embeddings: int = 128000
|
||||
rms_norm_eps: float = 1e-6
|
||||
rope_theta: float = 5000000.0
|
||||
rope_scale: Optional[float] = None
|
||||
rope_dims: List[int] = field(default_factory=lambda: [24, 20, 20])
|
||||
interleaved_mrope: bool = True
|
||||
transformer_type: str = "llama"
|
||||
rms_norm_add: bool = False
|
||||
mlp_activation: str = "silu"
|
||||
qkv_bias: bool = False
|
||||
q_norm: str = "gemma3"
|
||||
k_norm: str = "gemma3"
|
||||
final_norm: bool = True
|
||||
lm_head: bool = False
|
||||
stop_tokens: List[int] = field(default_factory=lambda: [151643, 151645])
|
||||
|
||||
|
||||
QWEN3VL_VISION_DEFAULTS = dict(
|
||||
hidden_size=1152,
|
||||
num_heads=16,
|
||||
intermediate_size=4304,
|
||||
depth=27,
|
||||
patch_size=16,
|
||||
temporal_patch_size=2,
|
||||
in_channels=3,
|
||||
spatial_merge_size=2,
|
||||
num_position_embeddings=2304,
|
||||
deepstack_visual_indexes=(8, 16, 24),
|
||||
out_hidden_size=4096, # final merger projects directly into LLM hidden
|
||||
)
|
||||
|
||||
|
||||
class BottleneckPatchEmbed(nn.Module):
|
||||
# 3072 -> 1024 -> 4096 (raw 32x32 RGB patch -> bottleneck -> LLM hidden).
|
||||
def __init__(self, patch_size=32, in_chans=3, pca_dim=1024, embed_dim=4096, bias=True, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
self.proj1 = ops.Linear(patch_size * patch_size * in_chans, pca_dim, bias=False, device=device, dtype=dtype)
|
||||
self.proj2 = ops.Linear(pca_dim, embed_dim, bias=bias, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
return self.proj2(self.proj1(x))
|
||||
|
||||
|
||||
class FinalLayer(nn.Module):
|
||||
# 4096 -> 3072 (LLM hidden -> flat pixel patch).
|
||||
def __init__(self, hidden_size, patch_size=32, out_channels=3, device=None, dtype=None, ops=None):
|
||||
super().__init__()
|
||||
self.linear = ops.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
return self.linear(x)
|
||||
|
||||
|
||||
class HiDreamO1Transformer(nn.Module):
|
||||
"""HiDream-O1 unified pixel-level transformer."""
|
||||
|
||||
def __init__(self, image_model=None, dtype=None, device=None, operations=None,
|
||||
text_config_overrides=None, vision_config_overrides=None, **kwargs):
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
|
||||
text_cfg = HiDreamO1TextConfig(**(text_config_overrides or {}))
|
||||
vision_cfg = dict(QWEN3VL_VISION_DEFAULTS)
|
||||
if vision_config_overrides:
|
||||
vision_cfg.update(vision_config_overrides)
|
||||
vision_cfg["out_hidden_size"] = text_cfg.hidden_size
|
||||
|
||||
self.text_config = text_cfg
|
||||
self.vision_config = vision_cfg
|
||||
self.hidden_size = text_cfg.hidden_size
|
||||
self.patch_size = PATCH_SIZE
|
||||
self.in_channels = 3
|
||||
self.tms_token_id = TMS_TOKEN_ID
|
||||
|
||||
self.visual = Qwen35VisionModel(vision_cfg, device=device, dtype=dtype, ops=operations)
|
||||
self.language_model = Llama2_(text_cfg, device=device, dtype=dtype, ops=operations)
|
||||
self.t_embedder1 = TimestepEmbedder(
|
||||
text_cfg.hidden_size, device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
self.x_embedder = BottleneckPatchEmbed(
|
||||
patch_size=self.patch_size, in_chans=self.in_channels,
|
||||
pca_dim=text_cfg.hidden_size // 4, embed_dim=text_cfg.hidden_size,
|
||||
bias=True, device=device, dtype=dtype, ops=operations,
|
||||
)
|
||||
self.final_layer2 = FinalLayer(
|
||||
text_cfg.hidden_size, patch_size=self.patch_size,
|
||||
out_channels=self.in_channels, device=device, dtype=dtype, ops=operations,
|
||||
)
|
||||
|
||||
self._visual_cache = None
|
||||
self._kv_cache_entries = []
|
||||
|
||||
def clear_kv_cache(self):
|
||||
self._kv_cache_entries = []
|
||||
self._visual_cache = None
|
||||
|
||||
def forward(self, x, timesteps, context=None, transformer_options={}, **kwargs):
|
||||
return comfy.patcher_extension.WrapperExecutor.new_class_executor(
|
||||
self._forward,
|
||||
self,
|
||||
comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
|
||||
).execute(x, timesteps, context, transformer_options, **kwargs)
|
||||
|
||||
def _forward(self, x, timesteps, context=None, transformer_options={}, input_ids=None, attention_mask=None, position_ids=None,
|
||||
vinput_mask=None, ar_len=None, ref_pixel_values=None, ref_image_grid_thw=None, ref_patches=None, **kwargs):
|
||||
"""Returns flow-match velocity (x - x_pred) / sigma"""
|
||||
|
||||
if input_ids is None or position_ids is None:
|
||||
raise ValueError("HiDreamO1Transformer requires input_ids and position_ids in conditioning")
|
||||
|
||||
B, _, H, W = x.shape
|
||||
h_p, w_p = H // self.patch_size, W // self.patch_size
|
||||
tgt_image_len = h_p * w_p
|
||||
|
||||
z = einops.rearrange(
|
||||
x, 'B C (H p1) (W p2) -> B (H W) (C p1 p2)',
|
||||
p1=self.patch_size, p2=self.patch_size,
|
||||
)
|
||||
vinputs = torch.cat([z, ref_patches.to(z.dtype)], dim=1) if ref_patches is not None else z
|
||||
|
||||
inputs_embeds = self.language_model.embed_tokens(input_ids).to(x.dtype)
|
||||
|
||||
if ref_pixel_values is not None and ref_image_grid_thw is not None:
|
||||
# ViT output is constant across sampling steps within a generation
|
||||
# identity-key by the input tensor so refs don't recompute every step.
|
||||
cached = self._visual_cache
|
||||
if cached is not None and cached[0] is ref_pixel_values:
|
||||
image_embeds = cached[1]
|
||||
else:
|
||||
ref_pv = ref_pixel_values.to(inputs_embeds.device)
|
||||
ref_grid = ref_image_grid_thw.to(inputs_embeds.device).long()
|
||||
# extra_conds wraps with a leading batch dim; refs are model-level so [0] always recovers them.
|
||||
if ref_pv.dim() == 3:
|
||||
ref_pv = ref_pv[0]
|
||||
if ref_grid.dim() == 3:
|
||||
ref_grid = ref_grid[0]
|
||||
image_embeds = self.visual(ref_pv, ref_grid).to(inputs_embeds.dtype)
|
||||
self._visual_cache = (ref_pixel_values, image_embeds)
|
||||
# image_pad positions identical across batch (input_ids shared cond/uncond).
|
||||
image_idx = (input_ids[0] == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0]
|
||||
if image_idx.shape[0] != image_embeds.shape[0]:
|
||||
raise ValueError(
|
||||
f"Image-token count {image_idx.shape[0]} != ViT output count "
|
||||
f"{image_embeds.shape[0]}; check tokenizer/processor alignment."
|
||||
)
|
||||
inputs_embeds[:, image_idx] = image_embeds.unsqueeze(0).expand(B, -1, -1)
|
||||
|
||||
sigma = timesteps.float() / 1000.0
|
||||
t_pixeldit = 1.0 - sigma
|
||||
t_emb = self.t_embedder1(t_pixeldit * 1000, inputs_embeds.dtype)
|
||||
tms_mask_3d = (input_ids == self.tms_token_id).unsqueeze(-1).expand_as(inputs_embeds)
|
||||
inputs_embeds = torch.where(tms_mask_3d, t_emb.unsqueeze(1).expand_as(inputs_embeds), inputs_embeds)
|
||||
|
||||
vinputs_embedded = self.x_embedder(vinputs.to(inputs_embeds.dtype))
|
||||
inputs_embeds = torch.cat([inputs_embeds, vinputs_embedded], dim=1)
|
||||
|
||||
# extra_conds stores position_ids as (1, 3, T); process_cond repeats dim 0 to B. Take row 0.
|
||||
freqs_cis = self.language_model.compute_freqs_cis(position_ids[0].to(x.device), x.device)
|
||||
freqs_cis = tuple(t.to(x.dtype) for t in freqs_cis)
|
||||
|
||||
two_pass_attn = make_two_pass_attention(ar_len, transformer_options=transformer_options)
|
||||
patches_replace = transformer_options.get("patches_replace", {})
|
||||
blocks_replace = patches_replace.get("dit", {})
|
||||
transformer_options["total_blocks"] = len(self.language_model.layers)
|
||||
transformer_options["block_type"] = "double"
|
||||
|
||||
# Cache prefix K/V across steps. Key includes input_ids (prompt), ref_id
|
||||
# (refs scatter into inputs_embeds), and position_ids (RoPE baked into cached K).
|
||||
can_cache = not blocks_replace and ar_len > 0
|
||||
cache_len = ar_len if can_cache else 0
|
||||
ref_id = id(ref_pixel_values) if ref_pixel_values is not None else None
|
||||
pos_ids_key = position_ids[..., :cache_len] if can_cache else position_ids
|
||||
cache_entries = self._kv_cache_entries
|
||||
# Drop stale entries from a previous device (model was unloaded and reloaded).
|
||||
if cache_entries and cache_entries[0]["input_ids"].device != input_ids.device:
|
||||
cache_entries = []
|
||||
self._kv_cache_entries = []
|
||||
kv_cache = None
|
||||
if can_cache:
|
||||
for entry in cache_entries:
|
||||
ck = entry["input_ids"]
|
||||
ep = entry["position_ids"]
|
||||
if (entry["cache_len"] == cache_len
|
||||
and ck.shape == input_ids.shape and torch.equal(ck, input_ids)
|
||||
and entry["ref_id"] == ref_id
|
||||
and ep.shape == pos_ids_key.shape and torch.equal(ep, pos_ids_key)):
|
||||
kv_cache = entry
|
||||
break
|
||||
|
||||
if kv_cache is not None:
|
||||
# Hot path: project Q/K/V only for fresh positions; past_key_value prepends cached AR K/V.
|
||||
hidden_states = inputs_embeds[:, cache_len:]
|
||||
sliced_freqs = tuple(t[..., cache_len:, :] for t in freqs_cis)
|
||||
for i, layer in enumerate(self.language_model.layers):
|
||||
transformer_options["block_index"] = i
|
||||
K_i, V_i = kv_cache["kv"][i]
|
||||
hidden_states, _ = layer(
|
||||
x=hidden_states, attention_mask=None, freqs_cis=sliced_freqs, optimized_attention=two_pass_attn,
|
||||
past_key_value=(K_i, V_i, cache_len),
|
||||
)
|
||||
else:
|
||||
# Cold path: run full sequence; if cacheable, snapshot K/V at AR positions.
|
||||
snapshots = [] if can_cache else None
|
||||
past_kv_cold = () if can_cache else None
|
||||
hidden_states = inputs_embeds
|
||||
for i, layer in enumerate(self.language_model.layers):
|
||||
transformer_options["block_index"] = i
|
||||
if ("double_block", i) in blocks_replace:
|
||||
def block_wrap(args, _layer=layer):
|
||||
out = {}
|
||||
out["x"], _ = _layer(
|
||||
x=args["x"], attention_mask=args.get("attention_mask"),
|
||||
freqs_cis=args["freqs_cis"], optimized_attention=args["optimized_attention"],
|
||||
past_key_value=None,
|
||||
)
|
||||
return out
|
||||
out = blocks_replace[("double_block", i)](
|
||||
{"x": hidden_states, "attention_mask": None,
|
||||
"freqs_cis": freqs_cis, "optimized_attention": two_pass_attn,
|
||||
"transformer_options": transformer_options},
|
||||
{"original_block": block_wrap},
|
||||
)
|
||||
hidden_states = out["x"]
|
||||
else:
|
||||
hidden_states, present_kv = layer(
|
||||
x=hidden_states, attention_mask=None,
|
||||
freqs_cis=freqs_cis, optimized_attention=two_pass_attn,
|
||||
past_key_value=past_kv_cold,
|
||||
)
|
||||
if snapshots is not None:
|
||||
K, V, _ = present_kv
|
||||
snapshots.append((K[:, :, :cache_len].contiguous(),
|
||||
V[:, :, :cache_len].contiguous()))
|
||||
if snapshots is not None:
|
||||
# Cap at 2 entries (cond + uncond). Multi-cond workflows LRU-evict.
|
||||
new_entry = {
|
||||
"input_ids": input_ids.clone(),
|
||||
"cache_len": cache_len,
|
||||
"kv": snapshots,
|
||||
"ref_id": ref_id,
|
||||
"position_ids": pos_ids_key.clone(),
|
||||
}
|
||||
self._kv_cache_entries = (cache_entries + [new_entry])[-2:]
|
||||
|
||||
if self.language_model.norm is not None:
|
||||
hidden_states = self.language_model.norm(hidden_states)
|
||||
|
||||
# Slice target-image positions before the final projection so the Linear only runs on tgt_image_len tokens.
|
||||
# In the hot path hidden_states starts at original position cache_len, so masks/indices shift by cache_len.
|
||||
sliced_offset = cache_len if kv_cache is not None else 0
|
||||
if vinput_mask is not None:
|
||||
vmask = vinput_mask.to(x.device).bool()
|
||||
if sliced_offset > 0:
|
||||
vmask = vmask[:, sliced_offset:]
|
||||
target_hidden = hidden_states[vmask].view(B, -1, hidden_states.shape[-1])[:, :tgt_image_len]
|
||||
else:
|
||||
txt_seq_len = input_ids.shape[1]
|
||||
start = txt_seq_len - sliced_offset
|
||||
target_hidden = hidden_states[:, start:start + tgt_image_len]
|
||||
x_pred_tgt = self.final_layer2(target_hidden)
|
||||
|
||||
# fp32 final subtraction, bf16 here noticeably degrades samples.
|
||||
x_pred_img = einops.rearrange(
|
||||
x_pred_tgt, 'B (H W) (C p1 p2) -> B C (H p1) (W p2)',
|
||||
H=h_p, W=w_p, p1=self.patch_size, p2=self.patch_size,
|
||||
)
|
||||
return (x.float() - x_pred_img.float()) / sigma.view(B, 1, 1, 1).clamp_min(1e-3)
|
||||
173
comfy/ldm/hidream_o1/utils.py
Normal file
173
comfy/ldm/hidream_o1/utils.py
Normal file
@ -0,0 +1,173 @@
|
||||
"""HiDream-O1 input-prep helpers: image/resolution math and unified-sequence
|
||||
RoPE position-id assembly. The fix_point offset in get_rope_index_fix_point
|
||||
lets the target image and patchified ref images share spatial RoPE positions
|
||||
despite living at different sequence indices — same 2D image plane.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
PATCH_SIZE = 32
|
||||
CONDITION_IMAGE_SIZE = 384 # ViT-side base size for ref images
|
||||
|
||||
|
||||
def resize_tensor(img_t, image_size, patch_size=16):
|
||||
"""img_t: (1, 3, H, W) float [0, 1]. Fit to image_size**2 area, patch-aligned, center-cropped."""
|
||||
|
||||
while min(img_t.shape[-2], img_t.shape[-1]) >= 2 * image_size: # Pre-halves with 2x2 box averaging while the image is still very large
|
||||
img_t = torch.nn.functional.avg_pool2d(img_t, kernel_size=2, stride=2)
|
||||
|
||||
_, _, height, width = img_t.shape
|
||||
m = patch_size
|
||||
s_max = image_size * image_size
|
||||
scale = math.sqrt(s_max / (width * height))
|
||||
|
||||
candidates = [
|
||||
(round(width * scale) // m * m, round(height * scale) // m * m),
|
||||
(round(width * scale) // m * m, math.floor(height * scale) // m * m),
|
||||
(math.floor(width * scale) // m * m, round(height * scale) // m * m),
|
||||
(math.floor(width * scale) // m * m, math.floor(height * scale) // m * m),
|
||||
]
|
||||
candidates = sorted(candidates, key=lambda x: x[0] * x[1], reverse=True)
|
||||
new_size = candidates[-1]
|
||||
for c in candidates:
|
||||
if c[0] * c[1] <= s_max:
|
||||
new_size = c
|
||||
break
|
||||
|
||||
new_w, new_h = new_size
|
||||
s1 = width / new_w
|
||||
s2 = height / new_h
|
||||
if s1 < s2:
|
||||
resize_w, resize_h = new_w, round(height / s1)
|
||||
else:
|
||||
resize_w, resize_h = round(width / s2), new_h
|
||||
img_t = torch.nn.functional.interpolate(img_t, size=(resize_h, resize_w), mode="bicubic")
|
||||
top = (resize_h - new_h) // 2
|
||||
left = (resize_w - new_w) // 2
|
||||
return img_t[..., top:top + new_h, left:left + new_w]
|
||||
|
||||
|
||||
def calculate_dimensions(max_size, ratio):
|
||||
"""(W, H) for an aspect ratio fitting in max_size**2 area, 32-aligned."""
|
||||
width = math.sqrt(max_size * max_size * ratio)
|
||||
height = width / ratio
|
||||
width = int(width / 32) * 32
|
||||
height = int(height / 32) * 32
|
||||
return width, height
|
||||
|
||||
|
||||
def ref_max_size(target_max_dim, k):
|
||||
"""K-dependent ref-image max dim before patchifying."""
|
||||
if k == 1:
|
||||
return target_max_dim
|
||||
if k == 2:
|
||||
return target_max_dim * 48 // 64
|
||||
if k <= 4:
|
||||
return target_max_dim // 2
|
||||
if k <= 8:
|
||||
return target_max_dim * 24 // 64
|
||||
return target_max_dim // 4
|
||||
|
||||
|
||||
def cond_image_size(k):
|
||||
"""K-dependent ViT-side image size."""
|
||||
if k <= 4:
|
||||
return CONDITION_IMAGE_SIZE
|
||||
if k <= 8:
|
||||
return CONDITION_IMAGE_SIZE * 48 // 64
|
||||
return CONDITION_IMAGE_SIZE // 2
|
||||
|
||||
|
||||
def get_rope_index_fix_point(
|
||||
spatial_merge_size: int,
|
||||
image_token_id: int,
|
||||
vision_start_token_id: int,
|
||||
input_ids: Optional[torch.LongTensor] = None,
|
||||
image_grid_thw: Optional[torch.LongTensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
skip_vision_start_token=None,
|
||||
fix_point: int = 4096,
|
||||
):
|
||||
mrope_position_deltas = []
|
||||
if input_ids is not None and image_grid_thw is not None:
|
||||
total_input_ids = input_ids
|
||||
if attention_mask is None:
|
||||
attention_mask = torch.ones_like(total_input_ids)
|
||||
position_ids = torch.ones(
|
||||
3, input_ids.shape[0], input_ids.shape[1],
|
||||
dtype=input_ids.dtype, device=input_ids.device,
|
||||
)
|
||||
attention_mask = attention_mask.to(total_input_ids.device)
|
||||
for i, input_ids_b in enumerate(total_input_ids):
|
||||
fp = fix_point
|
||||
image_index = 0
|
||||
input_ids_b = input_ids_b[attention_mask[i] == 1]
|
||||
vision_start_indices = torch.argwhere(input_ids_b == vision_start_token_id).squeeze(1)
|
||||
vision_tokens = input_ids_b[vision_start_indices + 1]
|
||||
image_nums = (vision_tokens == image_token_id).sum()
|
||||
input_tokens = input_ids_b.tolist()
|
||||
llm_pos_ids_list = []
|
||||
st = 0
|
||||
remain_images = image_nums
|
||||
for _ in range(image_nums):
|
||||
if image_token_id in input_tokens and remain_images > 0:
|
||||
ed = input_tokens.index(image_token_id, st)
|
||||
else:
|
||||
ed = len(input_tokens) + 1
|
||||
t = image_grid_thw[image_index][0]
|
||||
h = image_grid_thw[image_index][1]
|
||||
w = image_grid_thw[image_index][2]
|
||||
image_index += 1
|
||||
remain_images -= 1
|
||||
llm_grid_t = t.item()
|
||||
llm_grid_h = h.item() // spatial_merge_size
|
||||
llm_grid_w = w.item() // spatial_merge_size
|
||||
text_len = ed - st
|
||||
text_len -= skip_vision_start_token[image_index - 1]
|
||||
text_len = max(0, text_len)
|
||||
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
|
||||
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
|
||||
|
||||
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
|
||||
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
|
||||
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
|
||||
|
||||
if skip_vision_start_token[image_index - 1]:
|
||||
if fp > 0:
|
||||
fp = fp - st_idx
|
||||
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + fp + st_idx)
|
||||
fp = 0
|
||||
else:
|
||||
llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
|
||||
st = ed + llm_grid_t * llm_grid_h * llm_grid_w
|
||||
|
||||
if st < len(input_tokens):
|
||||
st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
|
||||
text_len = len(input_tokens) - st
|
||||
llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
|
||||
|
||||
llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
|
||||
position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
|
||||
mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
|
||||
mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
|
||||
return position_ids, mrope_position_deltas
|
||||
|
||||
if attention_mask is not None:
|
||||
position_ids = attention_mask.long().cumsum(-1) - 1
|
||||
position_ids.masked_fill_(attention_mask == 0, 1)
|
||||
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
|
||||
max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
|
||||
mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
|
||||
else:
|
||||
position_ids = (
|
||||
torch.arange(input_ids.shape[1], device=input_ids.device)
|
||||
.view(1, 1, -1).expand(3, input_ids.shape[0], -1)
|
||||
)
|
||||
mrope_position_deltas = torch.zeros(
|
||||
[input_ids.shape[0], 1], device=input_ids.device, dtype=input_ids.dtype,
|
||||
)
|
||||
return position_ids, mrope_position_deltas
|
||||
@ -97,12 +97,14 @@ def load_lora(lora, to_load, log_missing=True):
|
||||
|
||||
def model_lora_keys_clip(model, key_map={}):
|
||||
sdk = model.state_dict().keys()
|
||||
prefix_set = set()
|
||||
for k in sdk:
|
||||
if k.endswith(".weight"):
|
||||
key_map["text_encoders.{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names
|
||||
tp = k.find(".transformer.") #also map without wrapper prefix for composite text encoder models
|
||||
if tp > 0 and not k.startswith("clip_"):
|
||||
key_map["text_encoders.{}".format(k[tp + 1:-len(".weight")])] = k
|
||||
prefix_set.add(k.split('.')[0])
|
||||
|
||||
text_model_lora_key = "lora_te_text_model_encoder_layers_{}_{}"
|
||||
clip_l_present = False
|
||||
@ -163,6 +165,13 @@ def model_lora_keys_clip(model, key_map={}):
|
||||
lora_key = "lora_te1_{}".format(l_key.replace(".", "_"))
|
||||
key_map[lora_key] = k
|
||||
|
||||
if len(prefix_set) == 1:
|
||||
full_prefix = "{}.transformer.model.".format(next(iter(prefix_set))) # kohya anima and maybe other single TE models that use a single llama arch based te
|
||||
for k in sdk:
|
||||
if k.endswith(".weight"):
|
||||
if k.startswith(full_prefix):
|
||||
l_key = k[len(full_prefix):-len(".weight")]
|
||||
key_map["lora_te_{}".format(l_key.replace(".", "_"))] = k
|
||||
|
||||
k = "clip_g.transformer.text_projection.weight"
|
||||
if k in sdk:
|
||||
|
||||
@ -58,6 +58,8 @@ import comfy.ldm.cogvideo.model
|
||||
import comfy.ldm.rt_detr.rtdetr_v4
|
||||
import comfy.ldm.ernie.model
|
||||
import comfy.ldm.sam3.detector
|
||||
import comfy.ldm.hidream_o1.model
|
||||
from comfy.ldm.hidream_o1.conditioning import build_extra_conds
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.patcher_extension
|
||||
@ -1674,6 +1676,32 @@ class HiDream(BaseModel):
|
||||
out['image_cond'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_cond))
|
||||
return out
|
||||
|
||||
class HiDreamO1(BaseModel):
|
||||
"""HiDream-O1-Image: pixel-space DiT (no VAE). Refs from HiDreamO1ReferenceImages and tokens from the stub TE flow through
|
||||
extra_conds; the heavy preprocessing lives in comfy.ldm.hidream_o1.conditioning."""
|
||||
PATCH_SIZE = 32
|
||||
|
||||
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hidream_o1.model.HiDreamO1Transformer)
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = super().extra_conds(**kwargs)
|
||||
text_input_ids = kwargs.get("text_input_ids", None)
|
||||
noise = kwargs.get("noise", None)
|
||||
if text_input_ids is None or noise is None:
|
||||
return out
|
||||
|
||||
conds = build_extra_conds(
|
||||
text_input_ids, noise,
|
||||
ref_images=kwargs.get("reference_latents", None),
|
||||
target_patch_size=self.PATCH_SIZE,
|
||||
)
|
||||
for k, v in conds.items():
|
||||
# ar_len is a Python int (precomputed to avoid a GPU sync in forward).
|
||||
cls = comfy.conds.CONDConstant if k == "ar_len" else comfy.conds.CONDRegular
|
||||
out[k] = cls(v)
|
||||
return out
|
||||
|
||||
class Chroma(Flux):
|
||||
def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.chroma.model.Chroma):
|
||||
super().__init__(model_config, model_type, device=device, unet_model=unet_model)
|
||||
|
||||
@ -620,6 +620,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["guidance_cond_proj_dim"] = None#f"{key_prefix}t_embedder.cond_proj.weight" in state_dict_keys
|
||||
return dit_config
|
||||
|
||||
if '{}t_embedder1.mlp.0.weight'.format(key_prefix) in state_dict_keys and '{}x_embedder.proj1.weight'.format(key_prefix) in state_dict_keys: # HiDream-O1
|
||||
return {"image_model": "hidream_o1"}
|
||||
|
||||
if '{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys: # HiDream
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "hidream"
|
||||
|
||||
@ -242,6 +242,37 @@ class LazyCastingParam(torch.nn.Parameter):
|
||||
return self.model.patch_weight_to_device(self.key, device_to=self.model.load_device, return_weight=True).to("cpu")
|
||||
|
||||
|
||||
class LazyCastingQuantizedParam:
|
||||
def __init__(self, model, key):
|
||||
self.model = model
|
||||
self.key = key
|
||||
self.cpu_state_dict = None
|
||||
|
||||
def state_dict_tensor(self, state_dict_key):
|
||||
if self.cpu_state_dict is None:
|
||||
weight = self.model.patch_weight_to_device(self.key, device_to=self.model.load_device, return_weight=True)
|
||||
self.cpu_state_dict = {k: v.to("cpu") for k, v in weight.state_dict(self.key).items()}
|
||||
return self.cpu_state_dict[state_dict_key]
|
||||
|
||||
|
||||
class LazyCastingParamPiece(torch.nn.Parameter):
|
||||
def __new__(cls, caster, state_dict_key, tensor):
|
||||
return super().__new__(cls, tensor)
|
||||
|
||||
def __init__(self, caster, state_dict_key, tensor):
|
||||
self.caster = caster
|
||||
self.state_dict_key = state_dict_key
|
||||
|
||||
@property
|
||||
def device(self):
|
||||
return CustomTorchDevice
|
||||
|
||||
def to(self, *args, **kwargs):
|
||||
caster = self.caster
|
||||
del self.caster
|
||||
return caster.state_dict_tensor(self.state_dict_key)
|
||||
|
||||
|
||||
class ModelPatcher:
|
||||
def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
|
||||
self.size = size
|
||||
@ -1463,20 +1494,37 @@ class ModelPatcher:
|
||||
self.clear_cached_hook_weights()
|
||||
|
||||
def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
|
||||
unet_state_dict = self.model.diffusion_model.state_dict()
|
||||
for k, v in unet_state_dict.items():
|
||||
original_state_dict = self.model.diffusion_model.state_dict()
|
||||
unet_state_dict = {}
|
||||
keys = list(original_state_dict)
|
||||
while len(keys) > 0:
|
||||
k = keys.pop(0)
|
||||
v = original_state_dict[k]
|
||||
op_keys = k.rsplit('.', 1)
|
||||
if (len(op_keys) < 2) or op_keys[1] not in ["weight", "bias"]:
|
||||
unet_state_dict[k] = v
|
||||
continue
|
||||
try:
|
||||
op = comfy.utils.get_attr(self.model.diffusion_model, op_keys[0])
|
||||
except:
|
||||
unet_state_dict[k] = v
|
||||
continue
|
||||
if not op or not hasattr(op, "comfy_cast_weights") or \
|
||||
(hasattr(op, "comfy_patched_weights") and op.comfy_patched_weights == True):
|
||||
unet_state_dict[k] = v
|
||||
continue
|
||||
key = "diffusion_model." + k
|
||||
unet_state_dict[k] = LazyCastingParam(self, key, comfy.utils.get_attr(self.model, key))
|
||||
weight = comfy.utils.get_attr(self.model, key)
|
||||
if isinstance(weight, QuantizedTensor) and k in original_state_dict:
|
||||
qt_state_dict = weight.state_dict(k)
|
||||
caster = LazyCastingQuantizedParam(self, key)
|
||||
for group_key in (x for x in qt_state_dict if x in original_state_dict):
|
||||
if group_key in keys:
|
||||
keys.remove(group_key)
|
||||
unet_state_dict.pop(group_key, "")
|
||||
unet_state_dict[group_key] = LazyCastingParamPiece(caster, "diffusion_model." + group_key, original_state_dict[group_key])
|
||||
continue
|
||||
unet_state_dict[k] = LazyCastingParam(self, key, weight)
|
||||
return self.model.state_dict_for_saving(unet_state_dict, clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)
|
||||
|
||||
def __del__(self):
|
||||
|
||||
@ -93,7 +93,8 @@ class CONST:
|
||||
|
||||
def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
|
||||
sigma = reshape_sigma(sigma, noise.ndim)
|
||||
return sigma * noise + (1.0 - sigma) * latent_image
|
||||
s = getattr(self, "noise_scale", 1.0)
|
||||
return sigma * (s * noise) + (1.0 - sigma) * latent_image
|
||||
|
||||
def inverse_noise_scaling(self, sigma, latent):
|
||||
sigma = reshape_sigma(sigma, latent.ndim)
|
||||
@ -288,7 +289,11 @@ class ModelSamplingDiscreteFlow(torch.nn.Module):
|
||||
else:
|
||||
sampling_settings = {}
|
||||
|
||||
self.set_parameters(shift=sampling_settings.get("shift", 1.0), multiplier=sampling_settings.get("multiplier", 1000))
|
||||
self.set_noise_scale(sampling_settings.get("noise_scale", 1.0))
|
||||
self.set_parameters(
|
||||
shift=sampling_settings.get("shift", 1.0),
|
||||
multiplier=sampling_settings.get("multiplier", 1000),
|
||||
)
|
||||
|
||||
def set_parameters(self, shift=1.0, timesteps=1000, multiplier=1000):
|
||||
self.shift = shift
|
||||
@ -296,6 +301,9 @@ class ModelSamplingDiscreteFlow(torch.nn.Module):
|
||||
ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps) * multiplier)
|
||||
self.register_buffer('sigmas', ts)
|
||||
|
||||
def set_noise_scale(self, noise_scale):
|
||||
self.noise_scale = float(noise_scale)
|
||||
|
||||
@property
|
||||
def sigma_min(self):
|
||||
return self.sigmas[0]
|
||||
|
||||
@ -1285,7 +1285,8 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
|
||||
if quant_format in ["float8_e4m3fn", "float8_e5m2"] and weight_key in state_dict:
|
||||
self.quant_format = quant_format
|
||||
qconfig = QUANT_ALGOS[quant_format]
|
||||
layout_cls = get_layout_class(qconfig["comfy_tensor_layout"])
|
||||
self.layout_type = qconfig["comfy_tensor_layout"]
|
||||
layout_cls = get_layout_class(self.layout_type)
|
||||
weight = state_dict.pop(weight_key)
|
||||
manually_loaded_keys = [weight_key]
|
||||
|
||||
|
||||
@ -239,7 +239,8 @@ class CLIP:
|
||||
model_management.archive_model_dtypes(self.cond_stage_model)
|
||||
|
||||
self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
|
||||
ModelPatcher = comfy.model_patcher.ModelPatcher if disable_dynamic else comfy.model_patcher.CoreModelPatcher
|
||||
te_disable_dynamic = disable_dynamic or getattr(self.cond_stage_model, "disable_offload", False)
|
||||
ModelPatcher = comfy.model_patcher.ModelPatcher if te_disable_dynamic else comfy.model_patcher.CoreModelPatcher
|
||||
self.patcher = ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
|
||||
#Match torch.float32 hardcode upcast in TE implemention
|
||||
self.patcher.set_model_compute_dtype(torch.float32)
|
||||
@ -776,6 +777,7 @@ class VAE:
|
||||
self.latent_channels = 3
|
||||
self.latent_dim = 2
|
||||
self.output_channels = 3
|
||||
self.disable_offload = True
|
||||
elif "vocoder.activation_post.downsample.lowpass.filter" in sd: #MMAudio VAE
|
||||
sample_rate = 16000
|
||||
if sample_rate == 16000:
|
||||
|
||||
@ -28,6 +28,7 @@ import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.cogvideo
|
||||
import comfy.text_encoders.hidream_o1
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
@ -1431,6 +1432,50 @@ class HiDream(supported_models_base.BASE):
|
||||
def clip_target(self, state_dict={}):
|
||||
return None # TODO
|
||||
|
||||
class HiDreamO1(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "hidream_o1",
|
||||
}
|
||||
|
||||
sampling_settings = {
|
||||
"shift": 3.0,
|
||||
"noise_scale": 8.0,
|
||||
}
|
||||
|
||||
latent_format = latent_formats.HiDreamO1Pixel
|
||||
memory_usage_factor = 0.033
|
||||
# fp16 not supported: LM MLP down_proj activations fp16 overflow, causing NaNs
|
||||
supported_inference_dtypes = [torch.bfloat16, torch.float32]
|
||||
|
||||
vae_key_prefix = ["vae."]
|
||||
text_encoder_key_prefix = ["text_encoders."]
|
||||
|
||||
optimizations = {"fp8": False}
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
return model_base.HiDreamO1(self, device=device)
|
||||
|
||||
def process_unet_state_dict(self, state_dict):
|
||||
# Drop unused Qwen3-VL deepstack merger weights; upstream discards them at inference.
|
||||
for key in list(state_dict.keys()):
|
||||
if "visual.deepstack_merger_list" in key:
|
||||
del state_dict[key]
|
||||
return state_dict
|
||||
|
||||
def process_vae_state_dict(self, state_dict):
|
||||
# Pixel-space model: inject sentinel so VAE construction picks PixelspaceConversionVAE.
|
||||
return {"pixel_space_vae": torch.tensor(1.0)}
|
||||
|
||||
def process_clip_state_dict(self, state_dict):
|
||||
# Tokenizer-only TE: inject sentinel so load_state_dict_guess_config triggers CLIP init.
|
||||
return {"_hidream_o1_te_sentinel": torch.zeros(1)}
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
return supported_models_base.ClipTarget(
|
||||
comfy.text_encoders.hidream_o1.HiDreamO1Tokenizer,
|
||||
comfy.text_encoders.hidream_o1.HiDreamO1TE,
|
||||
)
|
||||
|
||||
class Chroma(supported_models_base.BASE):
|
||||
unet_config = {
|
||||
"image_model": "chroma",
|
||||
@ -2018,6 +2063,7 @@ models = [
|
||||
Hunyuan3Dv2,
|
||||
Hunyuan3Dv2_1,
|
||||
HiDream,
|
||||
HiDreamO1,
|
||||
Chroma,
|
||||
ChromaRadiance,
|
||||
ACEStep,
|
||||
|
||||
119
comfy/text_encoders/hidream_o1.py
Normal file
119
comfy/text_encoders/hidream_o1.py
Normal file
@ -0,0 +1,119 @@
|
||||
"""HiDream-O1-Image tokenizer-only text encoder.
|
||||
|
||||
The real Qwen3-VL backbone runs inside diffusion_model.* every step, so this
|
||||
module just tokenizes the prompt into text_input_ids and emits them as
|
||||
conditioning. Position ids / token_types / vinput_mask depend on target H/W
|
||||
and are built later in model_base.HiDreamO1.extra_conds.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import torch
|
||||
from transformers import Qwen2Tokenizer
|
||||
|
||||
from comfy import sd1_clip
|
||||
|
||||
|
||||
# Qwen3-VL special tokens
|
||||
IM_START_ID = 151644
|
||||
IM_END_ID = 151645
|
||||
ASSISTANT_ID = 77091
|
||||
USER_ID = 872
|
||||
NEWLINE_ID = 198
|
||||
VISION_START_ID = 151652
|
||||
VISION_END_ID = 151653
|
||||
IMAGE_TOKEN_ID = 151655
|
||||
VIDEO_TOKEN_ID = 151656
|
||||
# HiDream-O1-specific tokens
|
||||
BOI_TOKEN_ID = 151669
|
||||
BOR_TOKEN_ID = 151670
|
||||
EOR_TOKEN_ID = 151671
|
||||
BOT_TOKEN_ID = 151672
|
||||
TMS_TOKEN_ID = 151673
|
||||
|
||||
|
||||
class HiDreamO1QwenTokenizer(sd1_clip.SDTokenizer):
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
tokenizer_path = os.path.join(
|
||||
os.path.dirname(os.path.realpath(__file__)), "qwen25_tokenizer"
|
||||
)
|
||||
super().__init__(
|
||||
tokenizer_path,
|
||||
pad_with_end=False,
|
||||
embedding_size=4096,
|
||||
embedding_key="hidream_o1",
|
||||
tokenizer_class=Qwen2Tokenizer,
|
||||
has_start_token=False,
|
||||
has_end_token=False,
|
||||
pad_to_max_length=False,
|
||||
max_length=99999999,
|
||||
min_length=1,
|
||||
pad_token=151643,
|
||||
tokenizer_data=tokenizer_data,
|
||||
)
|
||||
|
||||
|
||||
class HiDreamO1Tokenizer(sd1_clip.SD1Tokenizer):
|
||||
"""Wraps prompt in the upstream chat template ending with boi/tms markers.
|
||||
Image tokens get spliced in at sample time once target H/W is known.
|
||||
"""
|
||||
|
||||
def __init__(self, embedding_directory=None, tokenizer_data={}):
|
||||
super().__init__(
|
||||
embedding_directory=embedding_directory,
|
||||
tokenizer_data=tokenizer_data,
|
||||
name="hidream_o1",
|
||||
tokenizer=HiDreamO1QwenTokenizer,
|
||||
)
|
||||
|
||||
def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
|
||||
text_tokens_dict = super().tokenize_with_weights(
|
||||
text, return_word_ids=return_word_ids, disable_weights=True, **kwargs
|
||||
)
|
||||
text_tuples = text_tokens_dict["hidream_o1"][0]
|
||||
text_tuples = [t for t in text_tuples if int(t[0]) != 151643] # strip pad
|
||||
|
||||
# <|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n<|boi|><|tms|>
|
||||
def tok(tid):
|
||||
return (tid, 1.0) if not return_word_ids else (tid, 1.0, 0)
|
||||
|
||||
prefix = [tok(IM_START_ID), tok(USER_ID), tok(NEWLINE_ID)]
|
||||
suffix = [
|
||||
tok(IM_END_ID), tok(NEWLINE_ID),
|
||||
tok(IM_START_ID), tok(ASSISTANT_ID), tok(NEWLINE_ID),
|
||||
tok(BOI_TOKEN_ID), tok(TMS_TOKEN_ID),
|
||||
]
|
||||
full = prefix + list(text_tuples) + suffix
|
||||
return {"hidream_o1": [full]}
|
||||
|
||||
|
||||
class HiDreamO1TE(torch.nn.Module):
|
||||
"""Passthrough TE: emits int token ids; the Qwen3-VL backbone in diffusion_model does the actual encoding."""
|
||||
|
||||
def __init__(self, device="cpu", dtype=None, model_options={}):
|
||||
super().__init__()
|
||||
self.dtypes = {torch.float32}
|
||||
self.disable_offload = True # skips dynamic VRAM management for this zero-parameter module
|
||||
self.device = torch.device("cpu") if device is None else torch.device(device)
|
||||
|
||||
def encode_token_weights(self, token_weight_pairs):
|
||||
tok_pairs = token_weight_pairs["hidream_o1"][0]
|
||||
ids = [int(t[0]) for t in tok_pairs]
|
||||
input_ids = torch.tensor([ids], dtype=torch.long)
|
||||
# Surrogate keeps the cross_attn slot non-empty for CONDITIONING
|
||||
# plumbing; the model reads text_input_ids out of `extra` instead.
|
||||
cross_attn = input_ids.unsqueeze(-1).to(torch.float32)
|
||||
extra = {"text_input_ids": input_ids}
|
||||
return cross_attn, None, extra
|
||||
|
||||
def load_sd(self, sd):
|
||||
return []
|
||||
|
||||
def get_sd(self):
|
||||
return {}
|
||||
|
||||
def reset_clip_options(self):
|
||||
pass
|
||||
|
||||
def set_clip_options(self, options):
|
||||
pass
|
||||
@ -397,7 +397,7 @@ class RMSNorm(nn.Module):
|
||||
|
||||
|
||||
|
||||
def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None):
|
||||
def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None, interleaved_mrope=False):
|
||||
if not isinstance(theta, list):
|
||||
theta = [theta]
|
||||
|
||||
@ -415,16 +415,27 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_di
|
||||
inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
|
||||
position_ids_expanded = position_ids[:, None, :].float()
|
||||
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
cos = emb.cos()
|
||||
sin = emb.sin()
|
||||
if rope_dims is not None and position_ids.shape[0] > 1:
|
||||
mrope_section = rope_dims * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
if rope_dims is not None and position_ids.shape[0] > 1 and interleaved_mrope:
|
||||
# Qwen3-VL interleaved MRoPE: T-freqs by default, H/W replace every 3rd dim.
|
||||
freqs_inter = freqs[0].clone()
|
||||
for axis_idx, offset in ((1, 1), (2, 2)):
|
||||
length = rope_dims[axis_idx] * 3
|
||||
idx = slice(offset, length, 3)
|
||||
freqs_inter[..., idx] = freqs[axis_idx, ..., idx]
|
||||
emb = torch.cat((freqs_inter, freqs_inter), dim=-1)
|
||||
cos = emb.cos().unsqueeze(0)
|
||||
sin = emb.sin().unsqueeze(0)
|
||||
else:
|
||||
cos = cos.unsqueeze(1)
|
||||
sin = sin.unsqueeze(1)
|
||||
emb = torch.cat((freqs, freqs), dim=-1)
|
||||
cos = emb.cos()
|
||||
sin = emb.sin()
|
||||
if rope_dims is not None and position_ids.shape[0] > 1:
|
||||
mrope_section = rope_dims * 2
|
||||
cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
|
||||
else:
|
||||
cos = cos.unsqueeze(1)
|
||||
sin = sin.unsqueeze(1)
|
||||
sin_split = sin.shape[-1] // 2
|
||||
out.append((cos, sin[..., : sin_split], -sin[..., sin_split :]))
|
||||
|
||||
@ -689,6 +700,7 @@ class Llama2_(nn.Module):
|
||||
self.config.rope_theta,
|
||||
self.config.rope_scale,
|
||||
self.config.rope_dims,
|
||||
interleaved_mrope=getattr(self.config, "interleaved_mrope", False),
|
||||
device=device)
|
||||
|
||||
def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None):
|
||||
|
||||
@ -451,9 +451,8 @@ class Qwen35VisionPatchEmbed(nn.Module):
|
||||
self.proj = ops.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
target_dtype = self.proj.weight.dtype
|
||||
x = x.view(-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size)
|
||||
return self.proj(x.to(target_dtype)).view(-1, self.embed_dim)
|
||||
return self.proj(x).view(-1, self.embed_dim)
|
||||
|
||||
|
||||
class Qwen35VisionMLP(nn.Module):
|
||||
@ -651,7 +650,7 @@ class Qwen35VisionModel(nn.Module):
|
||||
x = self.patch_embed(x)
|
||||
pos_embeds = self.fast_pos_embed_interpolate(grid_thw).to(x.device)
|
||||
x = x + pos_embeds
|
||||
rotary_pos_emb = self.rot_pos_emb(grid_thw)
|
||||
rotary_pos_emb = self.rot_pos_emb(grid_thw).to(x.device)
|
||||
seq_len = x.shape[0]
|
||||
x = x.reshape(seq_len, -1)
|
||||
rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
|
||||
|
||||
@ -1196,7 +1196,7 @@ def model_trange(*args, **kwargs):
|
||||
pbar.i1_time = time.time()
|
||||
pbar.set_postfix_str(" Model Initialization complete! ")
|
||||
elif pbar._i == 2:
|
||||
#bring forward the effective start time based the the diff between first and second iteration
|
||||
#bring forward the effective start time based the diff between first and second iteration
|
||||
#to attempt to remove load overhead from the final step rate estimate.
|
||||
pbar.start_t = pbar.i1_time - (time.time() - pbar.i1_time)
|
||||
pbar.set_postfix_str("")
|
||||
|
||||
@ -23,7 +23,7 @@ class BriaEditImageRequest(BaseModel):
|
||||
None,
|
||||
description="Mask image (black and white). Black areas will be preserved, white areas will be edited. "
|
||||
"If omitted, the edit applies to the entire image. "
|
||||
"The input image and the the input mask must be of the same size.",
|
||||
"The input image and the input mask must be of the same size.",
|
||||
)
|
||||
negative_prompt: str | None = Field(None)
|
||||
guidance_scale: float = Field(...)
|
||||
|
||||
@ -198,6 +198,62 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
|
||||
("Custom", None, None),
|
||||
]
|
||||
|
||||
_PRESETS_SEEDREAM_1K = [
|
||||
("(1K) 1024x1024 (1:1)", 1024, 1024),
|
||||
("(1K) 864x1152 (3:4)", 864, 1152),
|
||||
("(1K) 1152x864 (4:3)", 1152, 864),
|
||||
("(1K) 1312x736 (16:9)", 1312, 736),
|
||||
("(1K) 736x1312 (9:16)", 736, 1312),
|
||||
("(1K) 832x1248 (2:3)", 832, 1248),
|
||||
("(1K) 1248x832 (3:2)", 1248, 832),
|
||||
("(1K) 1568x672 (21:9)", 1568, 672),
|
||||
]
|
||||
|
||||
_PRESETS_SEEDREAM_2K = [
|
||||
("(2K) 2048x2048 (1:1)", 2048, 2048),
|
||||
("(2K) 1728x2304 (3:4)", 1728, 2304),
|
||||
("(2K) 2304x1728 (4:3)", 2304, 1728),
|
||||
("(2K) 2848x1600 (16:9)", 2848, 1600),
|
||||
("(2K) 1600x2848 (9:16)", 1600, 2848),
|
||||
("(2K) 1664x2496 (2:3)", 1664, 2496),
|
||||
("(2K) 2496x1664 (3:2)", 2496, 1664),
|
||||
("(2K) 3136x1344 (21:9)", 3136, 1344),
|
||||
]
|
||||
|
||||
_PRESETS_SEEDREAM_3K = [
|
||||
("(3K) 3072x3072 (1:1)", 3072, 3072),
|
||||
("(3K) 2592x3456 (3:4)", 2592, 3456),
|
||||
("(3K) 3456x2592 (4:3)", 3456, 2592),
|
||||
("(3K) 4096x2304 (16:9)", 4096, 2304),
|
||||
("(3K) 2304x4096 (9:16)", 2304, 4096),
|
||||
("(3K) 2496x3744 (2:3)", 2496, 3744),
|
||||
("(3K) 3744x2496 (3:2)", 3744, 2496),
|
||||
("(3K) 4704x2016 (21:9)", 4704, 2016),
|
||||
]
|
||||
|
||||
_PRESETS_SEEDREAM_4K = [
|
||||
("(4K) 4096x4096 (1:1)", 4096, 4096),
|
||||
("(4K) 3520x4704 (3:4)", 3520, 4704),
|
||||
("(4K) 4704x3520 (4:3)", 4704, 3520),
|
||||
("(4K) 5504x3040 (16:9)", 5504, 3040),
|
||||
("(4K) 3040x5504 (9:16)", 3040, 5504),
|
||||
("(4K) 3328x4992 (2:3)", 3328, 4992),
|
||||
("(4K) 4992x3328 (3:2)", 4992, 3328),
|
||||
("(4K) 6240x2656 (21:9)", 6240, 2656),
|
||||
]
|
||||
|
||||
_CUSTOM_PRESET = [("Custom", None, None)]
|
||||
|
||||
RECOMMENDED_PRESETS_SEEDREAM_5_LITE = (
|
||||
_PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_3K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
|
||||
)
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4_5 = (
|
||||
_PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
|
||||
)
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4_0 = (
|
||||
_PRESETS_SEEDREAM_1K + _PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
|
||||
)
|
||||
|
||||
# Seedance 2.0 reference video pixel count limits per model and output resolution.
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
|
||||
"dreamina-seedance-2-0-260128": {
|
||||
|
||||
@ -596,6 +596,7 @@ class Flux2ProImageNode(IO.ComfyNode):
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["width", "height"], inputs=["images"]),
|
||||
expr=cls.PRICE_BADGE_EXPR,
|
||||
),
|
||||
is_deprecated=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -674,6 +675,175 @@ class Flux2MaxImageNode(Flux2ProImageNode):
|
||||
"""
|
||||
|
||||
|
||||
_FLUX2_MODEL_ENDPOINTS = {
|
||||
"Flux.2 [pro]": "/proxy/bfl/flux-2-pro/generate",
|
||||
"Flux.2 [max]": "/proxy/bfl/flux-2-max/generate",
|
||||
}
|
||||
|
||||
|
||||
def _flux2_model_inputs():
|
||||
return [
|
||||
IO.Int.Input(
|
||||
"width",
|
||||
default=1024,
|
||||
min=256,
|
||||
max=2048,
|
||||
step=32,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"height",
|
||||
default=768,
|
||||
min=256,
|
||||
max=2048,
|
||||
step=32,
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, 9)],
|
||||
min=0,
|
||||
),
|
||||
tooltip="Optional reference image(s) for image-to-image generation. Up to 8 images.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class Flux2ImageNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls) -> IO.Schema:
|
||||
return IO.Schema(
|
||||
node_id="Flux2ImageNode",
|
||||
display_name="Flux.2 Image",
|
||||
category="api node/image/BFL",
|
||||
description="Generate images via Flux.2 [pro] or Flux.2 [max] from a prompt and optional reference images.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Prompt for the image generation or edit",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option("Flux.2 [pro]", _flux2_model_inputs()),
|
||||
IO.DynamicCombo.Option("Flux.2 [max]", _flux2_model_inputs()),
|
||||
],
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=0xFFFFFFFFFFFFFFFF,
|
||||
control_after_generate=True,
|
||||
tooltip="The random seed used for creating the noise.",
|
||||
),
|
||||
],
|
||||
outputs=[IO.Image.Output()],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=["model", "model.width", "model.height"],
|
||||
input_groups=["model.images"],
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$isMax := widgets.model = "flux.2 [max]";
|
||||
$MP := 1024 * 1024;
|
||||
$w := $lookup(widgets, "model.width");
|
||||
$h := $lookup(widgets, "model.height");
|
||||
$outMP := $max([1, $floor((($w * $h) + $MP - 1) / $MP)]);
|
||||
$outputCost := $isMax
|
||||
? (0.07 + 0.03 * ($outMP - 1))
|
||||
: (0.03 + 0.015 * ($outMP - 1));
|
||||
$refMin := $isMax ? 0.03 : 0.015;
|
||||
$refMax := $isMax ? 0.24 : 0.12;
|
||||
$hasRefs := $lookup(inputGroups, "model.images") > 0;
|
||||
$hasRefs
|
||||
? {
|
||||
"type": "range_usd",
|
||||
"min_usd": $outputCost + $refMin,
|
||||
"max_usd": $outputCost + $refMax,
|
||||
"format": { "approximate": true }
|
||||
}
|
||||
: {"type": "usd", "usd": $outputCost}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
seed: int,
|
||||
) -> IO.NodeOutput:
|
||||
model_choice = model["model"]
|
||||
endpoint = _FLUX2_MODEL_ENDPOINTS[model_choice]
|
||||
width = model["width"]
|
||||
height = model["height"]
|
||||
images_dict = model.get("images") or {}
|
||||
|
||||
image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
|
||||
n_images = sum(get_number_of_images(t) for t in image_tensors)
|
||||
if n_images > 8:
|
||||
raise ValueError("The current maximum number of supported images is 8.")
|
||||
|
||||
flat_tensors: list[torch.Tensor] = []
|
||||
for tensor in image_tensors:
|
||||
if len(tensor.shape) == 4:
|
||||
flat_tensors.extend(tensor[i] for i in range(tensor.shape[0]))
|
||||
else:
|
||||
flat_tensors.append(tensor)
|
||||
|
||||
reference_images: dict[str, str] = {}
|
||||
for idx, tensor in enumerate(flat_tensors):
|
||||
key_name = f"input_image_{idx + 1}" if idx else "input_image"
|
||||
reference_images[key_name] = tensor_to_base64_string(tensor, total_pixels=2048 * 2048)
|
||||
|
||||
initial_response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path=endpoint, method="POST"),
|
||||
response_model=BFLFluxProGenerateResponse,
|
||||
data=Flux2ProGenerateRequest(
|
||||
prompt=prompt,
|
||||
width=width,
|
||||
height=height,
|
||||
seed=seed,
|
||||
**reference_images,
|
||||
),
|
||||
)
|
||||
|
||||
def price_extractor(_r: BaseModel) -> float | None:
|
||||
return None if initial_response.cost is None else initial_response.cost / 100
|
||||
|
||||
response = await poll_op(
|
||||
cls,
|
||||
ApiEndpoint(initial_response.polling_url),
|
||||
response_model=BFLFluxStatusResponse,
|
||||
status_extractor=lambda r: r.status,
|
||||
progress_extractor=lambda r: r.progress,
|
||||
price_extractor=price_extractor,
|
||||
completed_statuses=[BFLStatus.ready],
|
||||
failed_statuses=[
|
||||
BFLStatus.request_moderated,
|
||||
BFLStatus.content_moderated,
|
||||
BFLStatus.error,
|
||||
BFLStatus.task_not_found,
|
||||
],
|
||||
queued_statuses=[],
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"]))
|
||||
|
||||
|
||||
class BFLExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
@ -685,6 +855,7 @@ class BFLExtension(ComfyExtension):
|
||||
FluxProFillNode,
|
||||
Flux2ProImageNode,
|
||||
Flux2MaxImageNode,
|
||||
Flux2ImageNode,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -10,6 +10,9 @@ from comfy_api.latest import IO, ComfyExtension, Input
|
||||
from comfy_api_nodes.apis.bytedance import (
|
||||
RECOMMENDED_PRESETS,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4_0,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_4_5,
|
||||
RECOMMENDED_PRESETS_SEEDREAM_5_LITE,
|
||||
SEEDANCE2_PRICE_PER_1K_TOKENS,
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS,
|
||||
VIDEO_TASKS_EXECUTION_TIME,
|
||||
@ -68,6 +71,12 @@ SEEDREAM_MODELS = {
|
||||
"seedream-4-0-250828": "seedream-4-0-250828",
|
||||
}
|
||||
|
||||
SEEDREAM_PRESETS = {
|
||||
"seedream-5-0-260128": RECOMMENDED_PRESETS_SEEDREAM_5_LITE,
|
||||
"seedream-4-5-251128": RECOMMENDED_PRESETS_SEEDREAM_4_5,
|
||||
"seedream-4-0-250828": RECOMMENDED_PRESETS_SEEDREAM_4_0,
|
||||
}
|
||||
|
||||
# Long-running tasks endpoints(e.g., video)
|
||||
BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"
|
||||
BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks" # + /{task_id}
|
||||
@ -562,6 +571,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
|
||||
)
|
||||
""",
|
||||
),
|
||||
is_deprecated=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -651,6 +661,226 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
|
||||
return IO.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls]))
|
||||
|
||||
|
||||
def _seedream_model_inputs(*, max_ref_images: int, presets: list):
|
||||
return [
|
||||
IO.Combo.Input(
|
||||
"size_preset",
|
||||
options=[label for label, _, _ in presets],
|
||||
tooltip="Pick a recommended size. Select Custom to use the width and height below.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"width",
|
||||
default=2048,
|
||||
min=1024,
|
||||
max=6240,
|
||||
step=2,
|
||||
tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"height",
|
||||
default=2048,
|
||||
min=1024,
|
||||
max=4992,
|
||||
step=2,
|
||||
tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"max_images",
|
||||
default=1,
|
||||
min=1,
|
||||
max=max_ref_images,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
tooltip="Maximum number of images to generate. With 1, exactly one image is produced. "
|
||||
"With >1, the model generates between 1 and max_images related images "
|
||||
"(e.g., story scenes, character variations). "
|
||||
"Total images (input + generated) cannot exceed 15.",
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, max_ref_images + 1)],
|
||||
min=0,
|
||||
),
|
||||
tooltip=f"Optional reference image(s) for image-to-image or multi-reference generation. "
|
||||
f"Up to {max_ref_images} images.",
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
"fail_on_partial",
|
||||
default=False,
|
||||
tooltip="If enabled, abort execution if any requested images are missing or return an error.",
|
||||
advanced=True,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class ByteDanceSeedreamNodeV2(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="ByteDanceSeedreamNodeV2",
|
||||
display_name="ByteDance Seedream 4.5 & 5.0",
|
||||
category="api node/image/ByteDance",
|
||||
description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="Text prompt for creating or editing an image.",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option(
|
||||
"seedream 5.0 lite",
|
||||
_seedream_model_inputs(max_ref_images=14, presets=RECOMMENDED_PRESETS_SEEDREAM_5_LITE),
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"seedream-4-5-251128",
|
||||
_seedream_model_inputs(max_ref_images=10, presets=RECOMMENDED_PRESETS_SEEDREAM_4_5),
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"seedream-4-0-250828",
|
||||
_seedream_model_inputs(max_ref_images=10, presets=RECOMMENDED_PRESETS_SEEDREAM_4_0),
|
||||
),
|
||||
],
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2147483647,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
control_after_generate=True,
|
||||
tooltip="Seed to use for generation.",
|
||||
),
|
||||
IO.Boolean.Input(
|
||||
"watermark",
|
||||
default=False,
|
||||
tooltip='Whether to add an "AI generated" watermark to the image.',
|
||||
advanced=True,
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
IO.Image.Output(),
|
||||
],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model"]),
|
||||
expr="""
|
||||
(
|
||||
$price := $contains(widgets.model, "5.0 lite") ? 0.035 :
|
||||
$contains(widgets.model, "4-5") ? 0.04 : 0.03;
|
||||
{
|
||||
"type":"usd",
|
||||
"usd": $price,
|
||||
"format": { "suffix":" x images/Run", "approximate": true }
|
||||
}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
seed: int = 0,
|
||||
watermark: bool = False,
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||
model_id = SEEDREAM_MODELS[model["model"]]
|
||||
presets = SEEDREAM_PRESETS[model_id]
|
||||
|
||||
size_preset = model.get("size_preset", presets[0][0])
|
||||
width = model.get("width", 2048)
|
||||
height = model.get("height", 2048)
|
||||
max_images = model.get("max_images", 1)
|
||||
sequential_image_generation = "disabled" if max_images == 1 else "auto"
|
||||
images_dict = model.get("images") or {}
|
||||
fail_on_partial = model.get("fail_on_partial", False)
|
||||
|
||||
w = h = None
|
||||
for label, tw, th in presets:
|
||||
if label == size_preset:
|
||||
w, h = tw, th
|
||||
break
|
||||
if w is None or h is None:
|
||||
w, h = width, height
|
||||
|
||||
out_num_pixels = w * h
|
||||
mp_provided = out_num_pixels / 1_000_000.0
|
||||
if ("seedream-4-5" in model_id or "seedream-5-0" in model_id) and out_num_pixels < 3686400:
|
||||
raise ValueError(
|
||||
f"Minimum image resolution for the selected model is 3.68MP, but {mp_provided:.2f}MP provided."
|
||||
)
|
||||
if "seedream-4-0" in model_id and out_num_pixels < 921600:
|
||||
raise ValueError(
|
||||
f"Minimum image resolution that the selected model can generate is 0.92MP, "
|
||||
f"but {mp_provided:.2f}MP provided."
|
||||
)
|
||||
if out_num_pixels > 16_777_216:
|
||||
raise ValueError(
|
||||
f"Maximum image resolution for the selected model is 16.78MP, but {mp_provided:.2f}MP provided."
|
||||
)
|
||||
|
||||
image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
|
||||
n_input_images = sum(get_number_of_images(t) for t in image_tensors)
|
||||
max_num_of_images = 14 if model_id == "seedream-5-0-260128" else 10
|
||||
if n_input_images > max_num_of_images:
|
||||
raise ValueError(
|
||||
f"Maximum of {max_num_of_images} reference images are supported, but {n_input_images} received."
|
||||
)
|
||||
if sequential_image_generation == "auto" and n_input_images + max_images > 15:
|
||||
raise ValueError(
|
||||
"The maximum number of generated images plus the number of reference images cannot exceed 15."
|
||||
)
|
||||
|
||||
reference_images_urls: list[str] = []
|
||||
if image_tensors:
|
||||
for tensor in image_tensors:
|
||||
validate_image_aspect_ratio(tensor, (1, 3), (3, 1))
|
||||
reference_images_urls = await upload_images_to_comfyapi(
|
||||
cls,
|
||||
image_tensors,
|
||||
max_images=n_input_images,
|
||||
mime_type="image/png",
|
||||
wait_label="Uploading reference images",
|
||||
)
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"),
|
||||
response_model=ImageTaskCreationResponse,
|
||||
data=Seedream4TaskCreationRequest(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
image=reference_images_urls,
|
||||
size=f"{w}x{h}",
|
||||
seed=seed,
|
||||
sequential_image_generation=sequential_image_generation,
|
||||
sequential_image_generation_options=Seedream4Options(max_images=max_images),
|
||||
watermark=watermark,
|
||||
),
|
||||
)
|
||||
if len(response.data) == 1:
|
||||
return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response)))
|
||||
urls = [str(d["url"]) for d in response.data if isinstance(d, dict) and "url" in d]
|
||||
if fail_on_partial and len(urls) < len(response.data):
|
||||
raise RuntimeError(f"Only {len(urls)} of {len(response.data)} images were generated before error.")
|
||||
return IO.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls]))
|
||||
|
||||
|
||||
class ByteDanceTextToVideoNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -2105,6 +2335,7 @@ class ByteDanceExtension(ComfyExtension):
|
||||
return [
|
||||
ByteDanceImageNode,
|
||||
ByteDanceSeedreamNode,
|
||||
ByteDanceSeedreamNodeV2,
|
||||
ByteDanceTextToVideoNode,
|
||||
ByteDanceImageToVideoNode,
|
||||
ByteDanceFirstLastFrameNode,
|
||||
|
||||
@ -162,6 +162,61 @@ class GrokImageNode(IO.ComfyNode):
|
||||
)
|
||||
|
||||
|
||||
_GROK_IMAGE_EDIT_ASPECT_RATIO_OPTIONS = [
|
||||
"auto",
|
||||
"1:1",
|
||||
"2:3",
|
||||
"3:2",
|
||||
"3:4",
|
||||
"4:3",
|
||||
"9:16",
|
||||
"16:9",
|
||||
"9:19.5",
|
||||
"19.5:9",
|
||||
"9:20",
|
||||
"20:9",
|
||||
"1:2",
|
||||
"2:1",
|
||||
]
|
||||
|
||||
|
||||
def _grok_image_edit_model_inputs(*, max_ref_images: int, with_aspect_ratio: bool):
|
||||
inputs = [
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, max_ref_images + 1)],
|
||||
min=1,
|
||||
),
|
||||
tooltip=(
|
||||
"Reference image to edit."
|
||||
if max_ref_images == 1
|
||||
else f"Reference image(s) to edit. Up to {max_ref_images} images."
|
||||
),
|
||||
),
|
||||
IO.Combo.Input("resolution", options=["1K", "2K"]),
|
||||
IO.Int.Input(
|
||||
"number_of_images",
|
||||
default=1,
|
||||
min=1,
|
||||
max=10,
|
||||
step=1,
|
||||
tooltip="Number of edited images to generate",
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
),
|
||||
]
|
||||
if with_aspect_ratio:
|
||||
inputs.append(
|
||||
IO.Combo.Input(
|
||||
"aspect_ratio",
|
||||
options=_GROK_IMAGE_EDIT_ASPECT_RATIO_OPTIONS,
|
||||
tooltip="Only allowed when multiple images are connected.",
|
||||
)
|
||||
)
|
||||
return inputs
|
||||
|
||||
|
||||
class GrokImageEditNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -256,6 +311,7 @@ class GrokImageEditNode(IO.ComfyNode):
|
||||
)
|
||||
""",
|
||||
),
|
||||
is_deprecated=True,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
@ -303,6 +359,143 @@ class GrokImageEditNode(IO.ComfyNode):
|
||||
)
|
||||
|
||||
|
||||
class GrokImageEditNodeV2(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="GrokImageEditNodeV2",
|
||||
display_name="Grok Image Edit",
|
||||
category="api node/image/Grok",
|
||||
description="Modify an existing image based on a text prompt",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
multiline=True,
|
||||
default="",
|
||||
tooltip="The text prompt used to generate the image",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option(
|
||||
"grok-imagine-image-quality",
|
||||
_grok_image_edit_model_inputs(max_ref_images=3, with_aspect_ratio=True),
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"grok-imagine-image-pro",
|
||||
_grok_image_edit_model_inputs(max_ref_images=1, with_aspect_ratio=False),
|
||||
),
|
||||
IO.DynamicCombo.Option(
|
||||
"grok-imagine-image",
|
||||
_grok_image_edit_model_inputs(max_ref_images=3, with_aspect_ratio=True),
|
||||
),
|
||||
],
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2147483647,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
control_after_generate=True,
|
||||
tooltip="Seed to determine if node should re-run; "
|
||||
"actual results are nondeterministic regardless of seed.",
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
IO.Image.Output(),
|
||||
],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(
|
||||
widgets=["model", "model.resolution", "model.number_of_images"],
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$isQualityModel := widgets.model = "grok-imagine-image-quality";
|
||||
$isPro := $contains(widgets.model, "pro");
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$n := $lookup(widgets, "model.number_of_images");
|
||||
$rate := $isQualityModel
|
||||
? ($res = "1k" ? 0.05 : 0.07)
|
||||
: ($isPro ? 0.07 : 0.02);
|
||||
$base := $isQualityModel ? 0.01 : 0.002;
|
||||
$output := $rate * $n;
|
||||
$isPro
|
||||
? {"type":"usd","usd": $base + $output}
|
||||
: {"type":"range_usd","min_usd": $base + $output, "max_usd": 3 * $base + $output}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
seed: int,
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=True, min_length=1)
|
||||
model_id = model["model"]
|
||||
resolution = model["resolution"]
|
||||
number_of_images = model["number_of_images"]
|
||||
images_dict = model.get("images") or {}
|
||||
aspect_ratio = model.get("aspect_ratio", "auto")
|
||||
|
||||
image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
|
||||
n_images = sum(get_number_of_images(t) for t in image_tensors)
|
||||
if n_images < 1:
|
||||
raise ValueError("At least one image is required for editing.")
|
||||
if model_id == "grok-imagine-image-pro" and n_images > 1:
|
||||
raise ValueError("The pro model supports only 1 input image.")
|
||||
if model_id != "grok-imagine-image-pro" and n_images > 3:
|
||||
raise ValueError("A maximum of 3 input images is supported.")
|
||||
if aspect_ratio != "auto" and n_images == 1:
|
||||
raise ValueError(
|
||||
"Custom aspect ratio is only allowed when multiple images are connected to the image input."
|
||||
)
|
||||
|
||||
flat_tensors: list[torch.Tensor] = []
|
||||
for tensor in image_tensors:
|
||||
if len(tensor.shape) == 4:
|
||||
flat_tensors.extend(tensor[i] for i in range(tensor.shape[0]))
|
||||
else:
|
||||
flat_tensors.append(tensor)
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/xai/v1/images/edits", method="POST"),
|
||||
data=ImageEditRequest(
|
||||
model=model_id,
|
||||
images=[
|
||||
InputUrlObject(url=f"data:image/png;base64,{tensor_to_base64_string(i)}") for i in flat_tensors
|
||||
],
|
||||
prompt=prompt,
|
||||
resolution=resolution.lower(),
|
||||
n=number_of_images,
|
||||
seed=seed,
|
||||
aspect_ratio=None if aspect_ratio == "auto" else aspect_ratio,
|
||||
),
|
||||
response_model=ImageGenerationResponse,
|
||||
price_extractor=_extract_grok_price,
|
||||
)
|
||||
if len(response.data) == 1:
|
||||
return IO.NodeOutput(await download_url_to_image_tensor(response.data[0].url))
|
||||
return IO.NodeOutput(
|
||||
torch.cat(
|
||||
[await download_url_to_image_tensor(i) for i in [str(d.url) for d in response.data if d.url]],
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class GrokVideoNode(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -737,6 +930,7 @@ class GrokExtension(ComfyExtension):
|
||||
return [
|
||||
GrokImageNode,
|
||||
GrokImageEditNode,
|
||||
GrokImageEditNodeV2,
|
||||
GrokVideoNode,
|
||||
GrokVideoReferenceNode,
|
||||
GrokVideoEditNode,
|
||||
|
||||
@ -27,6 +27,7 @@ from comfy_api_nodes.util import (
|
||||
ApiEndpoint,
|
||||
download_url_to_bytesio,
|
||||
downscale_image_tensor,
|
||||
get_number_of_images,
|
||||
poll_op,
|
||||
sync_op,
|
||||
tensor_to_base64_string,
|
||||
@ -372,6 +373,7 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
display_name="OpenAI GPT Image 2",
|
||||
category="api node/image/OpenAI",
|
||||
description="Generates images synchronously via OpenAI's GPT Image endpoint.",
|
||||
is_deprecated=True,
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
@ -640,6 +642,316 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
return IO.NodeOutput(await validate_and_cast_response(response))
|
||||
|
||||
|
||||
def _gpt_image_shared_inputs():
|
||||
"""Inputs shared by all GPT Image models (quality + reference images + mask)."""
|
||||
return [
|
||||
IO.Combo.Input(
|
||||
"quality",
|
||||
default="low",
|
||||
options=["low", "medium", "high"],
|
||||
tooltip="Image quality, affects cost and generation time.",
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"images",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, 17)],
|
||||
min=0,
|
||||
),
|
||||
tooltip="Optional reference image(s) for image editing. Up to 16 images.",
|
||||
),
|
||||
IO.Mask.Input(
|
||||
"mask",
|
||||
optional=True,
|
||||
tooltip="Optional mask for inpainting (white areas will be replaced). "
|
||||
"Requires exactly one reference image.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _gpt_image_legacy_model_inputs():
|
||||
"""Per-model widget set for legacy gpt-image-1 / gpt-image-1.5 (4 base sizes, transparent bg allowed)."""
|
||||
return [
|
||||
IO.Combo.Input(
|
||||
"size",
|
||||
default="auto",
|
||||
options=["auto", "1024x1024", "1024x1536", "1536x1024"],
|
||||
tooltip="Image size.",
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"background",
|
||||
default="auto",
|
||||
options=["auto", "opaque", "transparent"],
|
||||
tooltip="Return image with or without background.",
|
||||
),
|
||||
*_gpt_image_shared_inputs(),
|
||||
]
|
||||
|
||||
|
||||
class OpenAIGPTImageNodeV2(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="OpenAIGPTImageNodeV2",
|
||||
display_name="OpenAI GPT Image 2",
|
||||
category="api node/image/OpenAI",
|
||||
description="Generates images via OpenAI's GPT Image endpoint.",
|
||||
inputs=[
|
||||
IO.String.Input(
|
||||
"prompt",
|
||||
default="",
|
||||
multiline=True,
|
||||
tooltip="Text prompt for GPT Image",
|
||||
),
|
||||
IO.DynamicCombo.Input(
|
||||
"model",
|
||||
options=[
|
||||
IO.DynamicCombo.Option(
|
||||
"gpt-image-2",
|
||||
[
|
||||
IO.Combo.Input(
|
||||
"size",
|
||||
default="auto",
|
||||
options=[
|
||||
"auto",
|
||||
"1024x1024",
|
||||
"1024x1536",
|
||||
"1536x1024",
|
||||
"2048x2048",
|
||||
"2048x1152",
|
||||
"1152x2048",
|
||||
"3840x2160",
|
||||
"2160x3840",
|
||||
"Custom",
|
||||
],
|
||||
tooltip="Image size. Select 'Custom' to use the custom width and height.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_width",
|
||||
default=1024,
|
||||
min=1024,
|
||||
max=3840,
|
||||
step=16,
|
||||
tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16.",
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_height",
|
||||
default=1024,
|
||||
min=1024,
|
||||
max=3840,
|
||||
step=16,
|
||||
tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16.",
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"background",
|
||||
default="auto",
|
||||
options=["auto", "opaque"],
|
||||
tooltip="Return image with or without background.",
|
||||
),
|
||||
*_gpt_image_shared_inputs(),
|
||||
],
|
||||
),
|
||||
IO.DynamicCombo.Option("gpt-image-1.5", _gpt_image_legacy_model_inputs()),
|
||||
IO.DynamicCombo.Option("gpt-image-1", _gpt_image_legacy_model_inputs()),
|
||||
],
|
||||
),
|
||||
IO.Int.Input(
|
||||
"n",
|
||||
default=1,
|
||||
min=1,
|
||||
max=8,
|
||||
step=1,
|
||||
tooltip="How many images to generate",
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
min=0,
|
||||
max=2147483647,
|
||||
step=1,
|
||||
display_mode=IO.NumberDisplay.number,
|
||||
control_after_generate=True,
|
||||
tooltip="not implemented yet in backend",
|
||||
),
|
||||
],
|
||||
outputs=[IO.Image.Output()],
|
||||
hidden=[
|
||||
IO.Hidden.auth_token_comfy_org,
|
||||
IO.Hidden.api_key_comfy_org,
|
||||
IO.Hidden.unique_id,
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["model", "model.quality", "n"]),
|
||||
expr="""
|
||||
(
|
||||
$ranges := {
|
||||
"gpt-image-1": {
|
||||
"low": [0.011, 0.02],
|
||||
"medium": [0.042, 0.07],
|
||||
"high": [0.167, 0.25]
|
||||
},
|
||||
"gpt-image-1.5": {
|
||||
"low": [0.009, 0.02],
|
||||
"medium": [0.034, 0.062],
|
||||
"high": [0.133, 0.22]
|
||||
},
|
||||
"gpt-image-2": {
|
||||
"low": [0.0048, 0.019],
|
||||
"medium": [0.041, 0.168],
|
||||
"high": [0.165, 0.67]
|
||||
}
|
||||
};
|
||||
$range := $lookup($lookup($ranges, widgets.model), $lookup(widgets, "model.quality"));
|
||||
$nRaw := widgets.n;
|
||||
$n := ($nRaw != null and $nRaw != 0) ? $nRaw : 1;
|
||||
($n = 1)
|
||||
? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1], "format": {"approximate": true}}
|
||||
: {
|
||||
"type":"range_usd",
|
||||
"min_usd": $range[0] * $n,
|
||||
"max_usd": $range[1] * $n,
|
||||
"format": { "suffix": "/Run", "approximate": true }
|
||||
}
|
||||
)
|
||||
""",
|
||||
),
|
||||
)
|
||||
|
||||
@classmethod
|
||||
async def execute(
|
||||
cls,
|
||||
prompt: str,
|
||||
model: dict,
|
||||
n: int,
|
||||
seed: int,
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
|
||||
model_id = model["model"]
|
||||
size = model["size"]
|
||||
background = model["background"]
|
||||
quality = model["quality"]
|
||||
custom_width = model.get("custom_width", 1024)
|
||||
custom_height = model.get("custom_height", 1024)
|
||||
|
||||
images_dict = model.get("images") or {}
|
||||
image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
|
||||
n_images = sum(get_number_of_images(t) for t in image_tensors)
|
||||
mask = model.get("mask")
|
||||
|
||||
if mask is not None and n_images == 0:
|
||||
raise ValueError("Cannot use a mask without an input image")
|
||||
|
||||
if size == "Custom":
|
||||
if custom_width % 16 != 0 or custom_height % 16 != 0:
|
||||
raise ValueError(
|
||||
f"Custom width and height must be multiples of 16, got {custom_width}x{custom_height}"
|
||||
)
|
||||
if max(custom_width, custom_height) > 3840:
|
||||
raise ValueError(
|
||||
f"Custom resolution max edge must be <= 3840, got {custom_width}x{custom_height}"
|
||||
)
|
||||
ratio = max(custom_width, custom_height) / min(custom_width, custom_height)
|
||||
if ratio > 3:
|
||||
raise ValueError(
|
||||
f"Custom resolution aspect ratio must not exceed 3:1, got {custom_width}x{custom_height}"
|
||||
)
|
||||
total_pixels = custom_width * custom_height
|
||||
if not 655_360 <= total_pixels <= 8_294_400:
|
||||
raise ValueError(
|
||||
f"Custom resolution total pixels must be between 655,360 and 8,294,400, got {total_pixels}"
|
||||
)
|
||||
size = f"{custom_width}x{custom_height}"
|
||||
|
||||
if model_id == "gpt-image-1":
|
||||
price_extractor = calculate_tokens_price_image_1
|
||||
elif model_id == "gpt-image-1.5":
|
||||
price_extractor = calculate_tokens_price_image_1_5
|
||||
elif model_id == "gpt-image-2":
|
||||
price_extractor = calculate_tokens_price_image_2_0
|
||||
else:
|
||||
raise ValueError(f"Unknown model: {model_id}")
|
||||
|
||||
if image_tensors:
|
||||
flat: list[torch.Tensor] = []
|
||||
for tensor in image_tensors:
|
||||
if len(tensor.shape) == 4:
|
||||
flat.extend(tensor[i : i + 1] for i in range(tensor.shape[0]))
|
||||
else:
|
||||
flat.append(tensor.unsqueeze(0))
|
||||
|
||||
files = []
|
||||
for i, single_image in enumerate(flat):
|
||||
scaled_image = downscale_image_tensor(single_image, total_pixels=2048 * 2048).squeeze()
|
||||
image_np = (scaled_image.numpy() * 255).astype(np.uint8)
|
||||
img = Image.fromarray(image_np)
|
||||
img_byte_arr = BytesIO()
|
||||
img.save(img_byte_arr, format="PNG")
|
||||
img_byte_arr.seek(0)
|
||||
|
||||
if len(flat) == 1:
|
||||
files.append(("image", (f"image_{i}.png", img_byte_arr, "image/png")))
|
||||
else:
|
||||
files.append(("image[]", (f"image_{i}.png", img_byte_arr, "image/png")))
|
||||
|
||||
if mask is not None:
|
||||
if len(flat) != 1:
|
||||
raise Exception("Cannot use a mask with multiple image")
|
||||
ref_image = flat[0]
|
||||
if mask.shape[1:] != ref_image.shape[1:-1]:
|
||||
raise Exception("Mask and Image must be the same size")
|
||||
_, height, width = mask.shape
|
||||
rgba_mask = torch.zeros(height, width, 4, device="cpu")
|
||||
rgba_mask[:, :, 3] = 1 - mask.squeeze().cpu()
|
||||
scaled_mask = downscale_image_tensor(
|
||||
rgba_mask.unsqueeze(0), total_pixels=2048 * 2048
|
||||
).squeeze()
|
||||
mask_np = (scaled_mask.numpy() * 255).astype(np.uint8)
|
||||
mask_img = Image.fromarray(mask_np)
|
||||
mask_img_byte_arr = BytesIO()
|
||||
mask_img.save(mask_img_byte_arr, format="PNG")
|
||||
mask_img_byte_arr.seek(0)
|
||||
files.append(("mask", ("mask.png", mask_img_byte_arr, "image/png")))
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/openai/images/edits", method="POST"),
|
||||
response_model=OpenAIImageGenerationResponse,
|
||||
data=OpenAIImageEditRequest(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
quality=quality,
|
||||
background=background,
|
||||
n=n,
|
||||
size=size,
|
||||
moderation="low",
|
||||
),
|
||||
content_type="multipart/form-data",
|
||||
files=files,
|
||||
price_extractor=price_extractor,
|
||||
)
|
||||
else:
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/openai/images/generations", method="POST"),
|
||||
response_model=OpenAIImageGenerationResponse,
|
||||
data=OpenAIImageGenerationRequest(
|
||||
model=model_id,
|
||||
prompt=prompt,
|
||||
quality=quality,
|
||||
background=background,
|
||||
n=n,
|
||||
size=size,
|
||||
moderation="low",
|
||||
),
|
||||
price_extractor=price_extractor,
|
||||
)
|
||||
return IO.NodeOutput(await validate_and_cast_response(response))
|
||||
|
||||
|
||||
class OpenAIChatNode(IO.ComfyNode):
|
||||
"""
|
||||
Node to generate text responses from an OpenAI model.
|
||||
@ -999,6 +1311,7 @@ class OpenAIExtension(ComfyExtension):
|
||||
OpenAIDalle2,
|
||||
OpenAIDalle3,
|
||||
OpenAIGPTImage1,
|
||||
OpenAIGPTImageNodeV2,
|
||||
OpenAIChatNode,
|
||||
OpenAIInputFiles,
|
||||
OpenAIChatConfig,
|
||||
|
||||
@ -143,7 +143,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
|
||||
if reference_images:
|
||||
references = []
|
||||
for key in reference_images:
|
||||
url = await upload_image_to_comfyapi(cls, reference_images[key])
|
||||
url = await upload_image_to_comfyapi(cls, reference_images[key], mime_type="image/png")
|
||||
references.append(QuiverImageObject(url=url))
|
||||
if len(references) > 4:
|
||||
raise ValueError("Maximum 4 reference images are allowed.")
|
||||
@ -252,7 +252,7 @@ class QuiverImageToSVGNode(IO.ComfyNode):
|
||||
model: dict,
|
||||
seed: int,
|
||||
) -> IO.NodeOutput:
|
||||
image_url = await upload_image_to_comfyapi(cls, image)
|
||||
image_url = await upload_image_to_comfyapi(cls, image, mime_type="image/png")
|
||||
|
||||
response = await sync_op(
|
||||
cls,
|
||||
|
||||
@ -86,6 +86,37 @@ def sample_euler_pp(model, x, sigmas, extra_args=None, callback=None, disable=No
|
||||
return x
|
||||
|
||||
|
||||
class SamplerLCM(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="SamplerLCM",
|
||||
category="sampling/samplers",
|
||||
description=("LCM sampler with tunable per-step noise. s_noise is a multiplier on the model's training noise scale"),
|
||||
inputs=[
|
||||
io.Float.Input("s_noise", default=1.0, min=0.0, max=64.0, step=0.01,
|
||||
tooltip="Per-step noise multiplier at the first step (1.0 = match training)."),
|
||||
io.Float.Input("s_noise_end", default=1.0, min=0.0, max=64.0, step=0.01,
|
||||
tooltip="Per-step noise multiplier at the last step. Set equal to s_noise for a constant schedule."),
|
||||
io.Float.Input("noise_clip_std", default=0.0, min=0.0, max=10.0, step=0.01,
|
||||
tooltip="Clamp per-step noise to +/- N*std. 0 disables."),
|
||||
],
|
||||
outputs=[io.Sampler.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, s_noise, s_noise_end, noise_clip_std) -> io.NodeOutput:
|
||||
sampler = comfy.samplers.ksampler(
|
||||
"lcm",
|
||||
{
|
||||
"s_noise": float(s_noise),
|
||||
"s_noise_end": float(s_noise_end),
|
||||
"noise_clip_std": float(noise_clip_std),
|
||||
},
|
||||
)
|
||||
return io.NodeOutput(sampler)
|
||||
|
||||
|
||||
class SamplerEulerCFGpp(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
@ -114,6 +145,7 @@ class AdvancedSamplersExtension(ComfyExtension):
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
SamplerLCMUpscale,
|
||||
SamplerLCM,
|
||||
SamplerEulerCFGpp,
|
||||
]
|
||||
|
||||
|
||||
@ -297,6 +297,7 @@ class LoadAudio(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
input_dir = folder_paths.get_input_directory()
|
||||
os.makedirs(input_dir, exist_ok=True)
|
||||
files = folder_paths.filter_files_content_types(os.listdir(input_dir), ["audio", "video"])
|
||||
return IO.Schema(
|
||||
node_id="LoadAudio",
|
||||
|
||||
256
comfy_extras/nodes_hidream_o1.py
Normal file
256
comfy_extras/nodes_hidream_o1.py
Normal file
@ -0,0 +1,256 @@
|
||||
from typing_extensions import override
|
||||
|
||||
import torch
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.patcher_extension
|
||||
import node_helpers
|
||||
from comfy_api.latest import ComfyExtension, io
|
||||
|
||||
|
||||
class EmptyHiDreamO1LatentImage(io.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="EmptyHiDreamO1LatentImage",
|
||||
display_name="Empty HiDream-O1 Latent Image",
|
||||
category="latent/image",
|
||||
description=(
|
||||
"Empty pixel-space latent for HiDream-O1-Image. The model was "
|
||||
"trained at ~4 megapixels; lower resolutions go off-distribution "
|
||||
"and quality regresses noticeably. Trained resolutions: "
|
||||
"2048x2048, 2304x1728, 1728x2304, 2560x1440, 1440x2560, "
|
||||
"2496x1664, 1664x2496, 3104x1312, 1312x3104, 2304x1792, 1792x2304."
|
||||
),
|
||||
inputs=[
|
||||
io.Int.Input(id="width", default=2048, min=64, max=4096, step=32),
|
||||
io.Int.Input(id="height", default=2048, min=64, max=4096, step=32),
|
||||
io.Int.Input(id="batch_size", default=1, min=1, max=64),
|
||||
],
|
||||
outputs=[io.Latent().Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, *, width: int, height: int, batch_size: int = 1) -> io.NodeOutput:
|
||||
latent = torch.zeros(
|
||||
(batch_size, 3, height, width),
|
||||
device=comfy.model_management.intermediate_device(),
|
||||
)
|
||||
return io.NodeOutput({"samples": latent})
|
||||
|
||||
|
||||
class HiDreamO1ReferenceImages(io.ComfyNode):
|
||||
"""Attach reference images to both positive and negative conditioning."""
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="HiDreamO1ReferenceImages",
|
||||
display_name="HiDream-O1 Reference Images",
|
||||
category="conditioning/image",
|
||||
description=(
|
||||
"Attach 1-10 reference images to conditioning, one for edit instruction"
|
||||
"or multiple for subject-driven personalization."
|
||||
),
|
||||
inputs=[
|
||||
io.Conditioning.Input(id="positive"),
|
||||
io.Conditioning.Input(id="negative"),
|
||||
io.Autogrow.Input(
|
||||
"images",
|
||||
template=io.Autogrow.TemplateNames(
|
||||
io.Image.Input("image"),
|
||||
names=[f"image_{i}" for i in range(1, 11)],
|
||||
min=1,
|
||||
),
|
||||
tooltip=("Reference images. 1 image = instruction edit; 2-10 images = multi reference."
|
||||
),
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
io.Conditioning.Output(display_name="positive"),
|
||||
io.Conditioning.Output(display_name="negative"),
|
||||
],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, *, positive, negative, images: io.Autogrow.Type) -> io.NodeOutput:
|
||||
refs = [images[f"image_{i}"] for i in range(1, 11) if f"image_{i}" in images]
|
||||
positive = node_helpers.conditioning_set_values(positive, {"reference_latents": refs}, append=True)
|
||||
negative = node_helpers.conditioning_set_values(negative, {"reference_latents": refs}, append=True)
|
||||
return io.NodeOutput(positive, negative)
|
||||
|
||||
|
||||
class HiDreamO1PatchSeamSmoothing(io.ComfyNode):
|
||||
PATCH_SIZE = 32
|
||||
EDGE_FEATHER = 4
|
||||
|
||||
# Shift presets per (pattern, N). 8-pass = 4-quadrant + 4 quarter-patch offsets.
|
||||
SHIFTS_BY_PATTERN = {
|
||||
("single_shift", 2): [(0, 0), (16, 16)],
|
||||
("single_shift", 4): [(0, 0), (16, 0), (0, 16), (16, 16)],
|
||||
("single_shift", 8): [(0, 0), (16, 0), (0, 16), (16, 16),
|
||||
(8, 8), (24, 8), (8, 24), (24, 24)],
|
||||
("symmetric", 2): [(-8, -8), (8, 8)],
|
||||
("symmetric", 4): [(-8, -8), (8, -8), (-8, 8), (8, 8)],
|
||||
("symmetric", 8): [(-12, -12), (4, -12), (-12, 4), (4, 4),
|
||||
(-4, -4), (12, -4), (-4, 12), (12, 12)],
|
||||
}
|
||||
RAMP_LEVELS = {
|
||||
"2": [2],
|
||||
"4": [4],
|
||||
"ramp_2_4": [2, 4],
|
||||
"ramp_2_4_8": [2, 4, 8],
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def _hann_tile(cy: int, cx: int, size: int = 32) -> torch.Tensor:
|
||||
"""size x size Hann tile peaking at (cy, cx) within a patch."""
|
||||
half = size // 2
|
||||
yy = torch.arange(size).view(size, 1)
|
||||
xx = torch.arange(size).view(1, size)
|
||||
dy = ((yy - cy + half) % size) - half
|
||||
dx = ((xx - cx + half) % size) - half
|
||||
return 0.25 * (1 + torch.cos(torch.pi * dy / half)) * (1 + torch.cos(torch.pi * dx / half))
|
||||
|
||||
@classmethod
|
||||
def define_schema(cls) -> io.Schema:
|
||||
return io.Schema(
|
||||
node_id="HiDreamO1PatchSeamSmoothing",
|
||||
display_name="HiDream-O1 Patch Seam Smoothing",
|
||||
category="advanced/model",
|
||||
is_experimental=True,
|
||||
description=(
|
||||
"Average the model output across multiple shifted patch-grid "
|
||||
"positions during the late portion of sampling. Cancels seams."
|
||||
),
|
||||
inputs=[
|
||||
io.Model.Input(id="model"),
|
||||
io.Float.Input(id="start_percent", default=0.8, min=0.0, max=1.0, step=0.01,
|
||||
tooltip="Sampling progress (0=start, 1=end) at which the blend turns ON.",
|
||||
),
|
||||
io.Float.Input(id="end_percent", default=1.0, min=0.0, max=1.0, step=0.01,
|
||||
tooltip="Sampling progress at which the blend turns OFF.",
|
||||
),
|
||||
io.Combo.Input(
|
||||
id="pattern",
|
||||
options=["single_shift", "symmetric"],
|
||||
default="single_shift",
|
||||
tooltip="Shift layout. single_shift: one pass at the natural patch grid + others offset. symmetric: all passes off-grid, shifts split around origin.",
|
||||
),
|
||||
io.Combo.Input(
|
||||
id="passes",
|
||||
options=["2", "4", "ramp_2_4", "ramp_2_4_8"],
|
||||
default="2",
|
||||
tooltip="Number of passes per gated step. 2/4 = fixed. ramp_*: pass count increases as sampling approaches end (more smoothing where seams are most visible).",
|
||||
),
|
||||
io.Combo.Input(
|
||||
id="blend",
|
||||
options=["average", "window", "median"],
|
||||
default="average",
|
||||
tooltip="average: equal-weight mean. window: Hann-windowed weighting favoring each pass away from its patch boundaries. median: per-pixel median, rejects wraparound-outlier passes.",
|
||||
),
|
||||
io.Float.Input(id="strength", default=1.0, min=0.0, max=1.0, step=0.01,
|
||||
tooltip="Interpolation between the natural-grid pred (0) and the averaged result (1).",
|
||||
),
|
||||
],
|
||||
outputs=[io.Model.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, *, model, start_percent: float, end_percent: float, pattern: str, passes: str, blend: str, strength: float) -> io.NodeOutput:
|
||||
if strength <= 0.0 or end_percent <= start_percent:
|
||||
return io.NodeOutput(model)
|
||||
|
||||
P = cls.PATCH_SIZE
|
||||
half = P // 2
|
||||
shift_levels = [cls.SHIFTS_BY_PATTERN[(pattern, n)] for n in cls.RAMP_LEVELS[passes]]
|
||||
|
||||
if blend == "window":
|
||||
window_tile_levels = [
|
||||
torch.stack([cls._hann_tile((half - sy) % P, (half - sx) % P, P) for sy, sx in lst], dim=0)
|
||||
for lst in shift_levels
|
||||
]
|
||||
else:
|
||||
window_tile_levels = [None] * len(shift_levels)
|
||||
|
||||
m = model.clone()
|
||||
model_sampling = m.get_model_object("model_sampling")
|
||||
multiplier = float(model_sampling.multiplier)
|
||||
start_t = float(model_sampling.percent_to_sigma(start_percent)) * multiplier
|
||||
end_t = float(model_sampling.percent_to_sigma(end_percent)) * multiplier
|
||||
|
||||
edge_ramp_cache: dict = {}
|
||||
|
||||
def get_edge_ramp(H: int, W: int, device, dtype) -> torch.Tensor:
|
||||
key = (H, W, device, dtype)
|
||||
cached = edge_ramp_cache.get(key)
|
||||
if cached is not None:
|
||||
return cached
|
||||
feather = cls.EDGE_FEATHER
|
||||
ys = torch.minimum(torch.arange(H, device=device, dtype=torch.float32),
|
||||
(H - 1) - torch.arange(H, device=device, dtype=torch.float32))
|
||||
xs = torch.minimum(torch.arange(W, device=device, dtype=torch.float32),
|
||||
(W - 1) - torch.arange(W, device=device, dtype=torch.float32))
|
||||
y_mask = ((ys - P) / feather).clamp(0, 1)
|
||||
x_mask = ((xs - P) / feather).clamp(0, 1)
|
||||
ramp = (y_mask[:, None] * x_mask[None, :]).to(dtype)
|
||||
edge_ramp_cache[key] = ramp
|
||||
return ramp
|
||||
|
||||
def smoothing_wrapper(executor, *args, **kwargs):
|
||||
x = args[0]
|
||||
t = float(args[1][0])
|
||||
pred = executor(*args, **kwargs)
|
||||
if not (end_t <= t <= start_t):
|
||||
return pred
|
||||
# Pick shift-level by sigma phase across the gated range.
|
||||
if len(shift_levels) == 1:
|
||||
level_idx = 0
|
||||
else:
|
||||
phase = (start_t - t) / max(start_t - end_t, 1e-8)
|
||||
level_idx = min(int(phase * len(shift_levels)), len(shift_levels) - 1)
|
||||
shifts = shift_levels[level_idx]
|
||||
window_tiles = window_tile_levels[level_idx]
|
||||
|
||||
preds = []
|
||||
for sy, sx in shifts:
|
||||
if sy == 0 and sx == 0:
|
||||
preds.append(pred)
|
||||
continue
|
||||
x_rolled = torch.roll(x, shifts=(sy, sx), dims=(-2, -1))
|
||||
pred_rolled = executor(x_rolled, *args[1:], **kwargs)
|
||||
preds.append(torch.roll(pred_rolled, shifts=(-sy, -sx), dims=(-2, -1)))
|
||||
stacked = torch.stack(preds, dim=0) # (N, B, C, H, W)
|
||||
_, _, _, H, W = stacked.shape
|
||||
if blend == "window":
|
||||
N = stacked.shape[0]
|
||||
tiles = window_tiles.to(device=stacked.device, dtype=stacked.dtype)
|
||||
w = tiles.repeat(1, H // P, W // P)[:, :H, :W]
|
||||
sum_w = w.sum(dim=0, keepdim=True)
|
||||
w = torch.where(sum_w < 1e-3, torch.full_like(w, 1.0 / N), w / sum_w.clamp(min=1e-8))
|
||||
avg = (stacked * w[:, None, None, :, :]).sum(dim=0)
|
||||
elif blend == "median":
|
||||
avg = torch.median(stacked, dim=0).values
|
||||
else:
|
||||
avg = stacked.mean(dim=0)
|
||||
|
||||
# Mask out the P-px wraparound contamination strip at each edge.
|
||||
mask = get_edge_ramp(H, W, pred.device, pred.dtype)
|
||||
return pred * (1.0 - mask * strength) + avg * (mask * strength)
|
||||
|
||||
m.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "hidream_o1_patch_seam_smoothing", smoothing_wrapper)
|
||||
return io.NodeOutput(m)
|
||||
|
||||
|
||||
class HiDreamO1Extension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[io.ComfyNode]]:
|
||||
return [
|
||||
EmptyHiDreamO1LatentImage,
|
||||
HiDreamO1ReferenceImages,
|
||||
HiDreamO1PatchSeamSmoothing,
|
||||
]
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> HiDreamO1Extension:
|
||||
return HiDreamO1Extension()
|
||||
@ -338,8 +338,25 @@ class LTXVAddGuide(io.ComfyNode):
|
||||
noise_mask = get_noise_mask(latent)
|
||||
|
||||
_, _, latent_length, latent_height, latent_width = latent_image.shape
|
||||
|
||||
# For mid-video multi-frame guides, prepend+strip a throwaway first frame so the VAE's "first latent = 1 pixel frame" asymmetry lands on the discarded slot
|
||||
time_scale_factor = scale_factors[0]
|
||||
num_frames_to_keep = ((image.shape[0] - 1) // time_scale_factor) * time_scale_factor + 1
|
||||
resolved_frame_idx = frame_idx
|
||||
if frame_idx < 0:
|
||||
_, num_keyframes = get_keyframe_idxs(positive)
|
||||
resolved_frame_idx = max((latent_length - num_keyframes - 1) * time_scale_factor + 1 + frame_idx, 0)
|
||||
causal_fix = resolved_frame_idx == 0 or num_frames_to_keep == 1
|
||||
|
||||
if not causal_fix:
|
||||
image = torch.cat([image[:1], image], dim=0)
|
||||
|
||||
image, t = cls.encode(vae, latent_width, latent_height, image, scale_factors)
|
||||
|
||||
if not causal_fix:
|
||||
t = t[:, :, 1:, :, :]
|
||||
image = image[1:]
|
||||
|
||||
frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors)
|
||||
assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence."
|
||||
|
||||
@ -352,6 +369,7 @@ class LTXVAddGuide(io.ComfyNode):
|
||||
t,
|
||||
strength,
|
||||
scale_factors,
|
||||
causal_fix=causal_fix,
|
||||
)
|
||||
|
||||
# Track this guide for per-reference attention control.
|
||||
|
||||
@ -40,23 +40,13 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou
|
||||
|
||||
inverse_mask = torch.ones_like(mask) - mask
|
||||
|
||||
source_rgb = source[:, :3, :visible_height, :visible_width]
|
||||
dest_slice = destination[..., top:bottom, left:right]
|
||||
|
||||
if destination.shape[1] == 4:
|
||||
if torch.max(dest_slice) == 0:
|
||||
destination[:, :3, top:bottom, left:right] = source_rgb
|
||||
destination[:, 3:4, top:bottom, left:right] = mask
|
||||
else:
|
||||
destination[:, :3, top:bottom, left:right] = (mask * source_rgb) + (inverse_mask * dest_slice[:, :3])
|
||||
destination[:, 3:4, top:bottom, left:right] = torch.max(mask, dest_slice[:, 3:4])
|
||||
else:
|
||||
source_portion = mask * source_rgb
|
||||
destination_portion = inverse_mask * dest_slice
|
||||
destination[..., top:bottom, left:right] = source_portion + destination_portion
|
||||
source_portion = mask * source[..., :visible_height, :visible_width]
|
||||
destination_portion = inverse_mask * destination[..., top:bottom, left:right]
|
||||
|
||||
destination[..., top:bottom, left:right] = source_portion + destination_portion
|
||||
return destination
|
||||
|
||||
|
||||
class LatentCompositeMasked(IO.ComfyNode):
|
||||
@classmethod
|
||||
def define_schema(cls):
|
||||
@ -95,23 +85,18 @@ class ImageCompositeMasked(IO.ComfyNode):
|
||||
display_name="Image Composite Masked",
|
||||
category="image",
|
||||
inputs=[
|
||||
IO.Image.Input("destination"),
|
||||
IO.Image.Input("source"),
|
||||
IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1),
|
||||
IO.Int.Input("y", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1),
|
||||
IO.Boolean.Input("resize_source", default=False),
|
||||
IO.Image.Input("destination", optional=True),
|
||||
IO.Mask.Input("mask", optional=True),
|
||||
],
|
||||
outputs=[IO.Image.Output()],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def execute(cls, source, x, y, resize_source, destination = None, mask = None) -> IO.NodeOutput:
|
||||
if destination is None: # transparent rgba
|
||||
B, H, W, C = source.shape
|
||||
destination = torch.zeros((B, H, W, 4), dtype=source.dtype, device=source.device)
|
||||
if C == 3:
|
||||
source = torch.nn.functional.pad(source, (0, 1), value=1.0)
|
||||
def execute(cls, destination, source, x, y, resize_source, mask = None) -> IO.NodeOutput:
|
||||
destination, source = node_helpers.image_alpha_fix(destination, source)
|
||||
destination = destination.clone().movedim(-1, 1)
|
||||
output = composite(destination, source.movedim(-1, 1), x, y, mask, 1, resize_source).movedim(1, -1)
|
||||
|
||||
@ -300,6 +300,29 @@ class RescaleCFG:
|
||||
m.set_model_sampler_cfg_function(rescale_cfg)
|
||||
return (m, )
|
||||
|
||||
class ModelNoiseScale:
|
||||
@classmethod
|
||||
def INPUT_TYPES(s):
|
||||
return {"required": { "model": ("MODEL",),
|
||||
"noise_scale": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 64.0, "step": 0.01,
|
||||
"tooltip": "Absolute training noise scale. For example HiDream-O1 base: 8.0, dev: 7.5."}),
|
||||
}}
|
||||
|
||||
RETURN_TYPES = ("MODEL",)
|
||||
FUNCTION = "patch"
|
||||
|
||||
CATEGORY = "advanced/model"
|
||||
|
||||
def patch(self, model, noise_scale):
|
||||
m = model.clone()
|
||||
original = m.model.model_sampling
|
||||
ms = type(original)(m.model.model_config)
|
||||
ms.set_parameters(shift=original.shift, multiplier=original.multiplier)
|
||||
ms.set_noise_scale(noise_scale)
|
||||
m.add_object_patch("model_sampling", ms)
|
||||
return (m, )
|
||||
|
||||
|
||||
class ModelComputeDtype:
|
||||
SEARCH_ALIASES = ["model precision", "change dtype"]
|
||||
@classmethod
|
||||
@ -327,6 +350,7 @@ NODE_CLASS_MAPPINGS = {
|
||||
"ModelSamplingSD3": ModelSamplingSD3,
|
||||
"ModelSamplingAuraFlow": ModelSamplingAuraFlow,
|
||||
"ModelSamplingFlux": ModelSamplingFlux,
|
||||
"ModelNoiseScale": ModelNoiseScale,
|
||||
"RescaleCFG": RescaleCFG,
|
||||
"ModelComputeDtype": ModelComputeDtype,
|
||||
}
|
||||
|
||||
@ -116,7 +116,7 @@ class EmptyQwenImageLayeredLatentImage(io.ComfyNode):
|
||||
inputs=[
|
||||
io.Int.Input("width", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("height", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16),
|
||||
io.Int.Input("layers", default=3, min=0, max=nodes.MAX_RESOLUTION, step=1, advanced=True),
|
||||
io.Int.Input("layers", default=3, min=0, max=nodes.MAX_RESOLUTION, step=1),
|
||||
io.Int.Input("batch_size", default=1, min=1, max=4096),
|
||||
],
|
||||
outputs=[
|
||||
|
||||
@ -123,6 +123,7 @@ class CreateVideo(io.ComfyNode):
|
||||
search_aliases=["images to video"],
|
||||
display_name="Create Video",
|
||||
category="video",
|
||||
essentials_category="Video Tools",
|
||||
description="Create a video from images.",
|
||||
inputs=[
|
||||
io.Image.Input("images", tooltip="The images to create a video from."),
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.20.1"
|
||||
__version__ = "0.21.0"
|
||||
|
||||
1
nodes.py
1
nodes.py
@ -2435,6 +2435,7 @@ async def init_builtin_extra_nodes():
|
||||
"nodes_sam3.py",
|
||||
"nodes_void.py",
|
||||
"nodes_wandancer.py",
|
||||
"nodes_hidream_o1.py",
|
||||
"nodes_save_3d.py",
|
||||
"nodes_moge.py",
|
||||
]
|
||||
|
||||
35
openapi.yaml
35
openapi.yaml
@ -2071,7 +2071,6 @@ paths:
|
||||
type: integer
|
||||
description: Number of assets marked as missing
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Cloud-runtime FE-facing operations
|
||||
#
|
||||
@ -2122,7 +2121,11 @@ paths:
|
||||
operationId: getCloudJobStatus
|
||||
tags: [queue]
|
||||
summary: Get status of a cloud job
|
||||
description: "[cloud-only] Returns the current execution status of a cloud job."
|
||||
deprecated: true
|
||||
description: |
|
||||
**Deprecated.** This endpoint is superseded by `GET /api/jobs/{job_id}`.
|
||||
Clients should migrate; the endpoint is retained for backward
|
||||
compatibility but will be removed in a future release.
|
||||
x-runtime: [cloud]
|
||||
parameters:
|
||||
- name: job_id
|
||||
@ -2192,7 +2195,11 @@ paths:
|
||||
operationId: getHistoryV2
|
||||
tags: [history]
|
||||
summary: Get paginated execution history (v2)
|
||||
description: "[cloud-only] Returns a paginated list of execution history entries in the v2 format, with richer metadata than the legacy history endpoint."
|
||||
deprecated: true
|
||||
description: |
|
||||
**Deprecated.** This endpoint is superseded by `GET /api/jobs`.
|
||||
Clients should migrate; the endpoint is retained for backward
|
||||
compatibility but will be removed in a future release.
|
||||
x-runtime: [cloud]
|
||||
parameters:
|
||||
- name: limit
|
||||
@ -2231,7 +2238,11 @@ paths:
|
||||
operationId: getHistoryV2ByPromptId
|
||||
tags: [history]
|
||||
summary: Get v2 history for a specific prompt
|
||||
description: "[cloud-only] Returns the v2 history entry for a specific prompt execution."
|
||||
deprecated: true
|
||||
description: |
|
||||
**Deprecated.** This endpoint is superseded by `GET /api/jobs/{prompt_id}`.
|
||||
Clients should migrate; the endpoint is retained for backward
|
||||
compatibility but will be removed in a future release.
|
||||
x-runtime: [cloud]
|
||||
parameters:
|
||||
- name: prompt_id
|
||||
@ -2266,7 +2277,12 @@ paths:
|
||||
operationId: getCloudLogs
|
||||
tags: [system]
|
||||
summary: Get cloud execution logs
|
||||
description: "[cloud-only] Returns execution logs for the authenticated user's cloud jobs."
|
||||
deprecated: true
|
||||
description: |
|
||||
**Deprecated.** This endpoint returns a static placeholder response and
|
||||
provides no real log data. It is retained only to avoid breaking clients
|
||||
that still call it. Clients should remove their dependency; the endpoint
|
||||
will be removed in a future release.
|
||||
x-runtime: [cloud]
|
||||
parameters:
|
||||
- name: job_id
|
||||
@ -5370,7 +5386,12 @@ paths:
|
||||
operationId: viewVideo
|
||||
tags: [view]
|
||||
summary: View or download a video file
|
||||
description: "[cloud-only] Serves a video file from the output directory. Used by the frontend video player."
|
||||
deprecated: true
|
||||
description: |
|
||||
**Deprecated.** This endpoint is an alias of `GET /api/view` added for
|
||||
legacy history-queue video playback. Callers should use `/api/view`
|
||||
directly; the endpoint is retained for backward compatibility but will
|
||||
be removed in a future release.
|
||||
x-runtime: [cloud]
|
||||
parameters:
|
||||
- name: filename
|
||||
@ -5523,7 +5544,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/CloudError"
|
||||
|
||||
|
||||
components:
|
||||
parameters:
|
||||
ComfyUserHeader:
|
||||
@ -6875,7 +6895,6 @@ components:
|
||||
error:
|
||||
type: string
|
||||
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
# Cloud-runtime schemas
|
||||
#
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "ComfyUI"
|
||||
version = "0.20.1"
|
||||
version = "0.21.0"
|
||||
readme = "README.md"
|
||||
license = { file = "LICENSE" }
|
||||
requires-python = ">=3.10"
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
comfyui-frontend-package==1.43.18
|
||||
comfyui-workflow-templates==0.9.73
|
||||
comfyui-embedded-docs==0.4.4
|
||||
comfyui-embedded-docs==0.5.0
|
||||
torch
|
||||
torchsde
|
||||
torchvision
|
||||
|
||||
Loading…
Reference in New Issue
Block a user