mirror of
https://github.com/comfyanonymous/ComfyUI.git
synced 2026-05-26 17:07:25 +08:00
Compare commits
53 Commits
5fef188fa1
...
77d7b6126e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
77d7b6126e | ||
|
|
96f1cee9f5 | ||
|
|
97f58baaaf | ||
|
|
e8e8fee224 | ||
|
|
e9c311b245 | ||
|
|
e6e0936128 | ||
|
|
b633244635 | ||
|
|
38ecad8f8a | ||
|
|
a7d82baa06 | ||
|
|
d10fc2d652 | ||
|
|
a164c82913 | ||
|
|
5eeae3f1d8 | ||
|
|
0e25a6936e | ||
|
|
fce0398470 | ||
|
|
dae3d34751 | ||
|
|
c7a517c2f9 | ||
|
|
e514119e1e | ||
|
|
13519934ba | ||
|
|
24de8dc01b | ||
|
|
c0d77a5d53 | ||
|
|
ed201fff08 | ||
|
|
b47f15f25a | ||
|
|
3cbf015578 | ||
|
|
64b8457f55 | ||
|
|
75143eeb06 | ||
|
|
1233f077b1 | ||
|
|
6968a70e60 | ||
|
|
115f418b64 | ||
|
|
7385eb2800 | ||
|
|
df22bcd5e1 | ||
|
|
5e3f15a830 | ||
|
|
4304c15e9b | ||
|
|
7636599389 | ||
|
|
443074eee9 | ||
|
|
2e0503780d | ||
|
|
00d2f4047d | ||
|
|
c5d9edacd0 | ||
|
|
47ccecaee0 | ||
|
|
2327fa1c90 | ||
|
|
084e08c6e2 | ||
|
|
ef8f3cbcdc | ||
|
|
6fbb6b6f49 | ||
|
|
abf3d56f27 | ||
|
|
2a14e1e96a | ||
|
|
5edbdf4364 | ||
|
|
3cdc0d523f | ||
|
|
749d5b4e8d | ||
|
|
e988df72f8 | ||
|
|
0be87b082a | ||
|
|
ec4b1659ab | ||
|
|
cb388e2912 | ||
|
|
9949c19c63 | ||
|
|
cc6f9500a1 |
45
.github/workflows/tag-dispatch-cloud.yml
vendored
Normal file
45
.github/workflows/tag-dispatch-cloud.yml
vendored
Normal file
@ -0,0 +1,45 @@
|
||||
name: Tag Dispatch to Cloud
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- 'v*'
|
||||
|
||||
jobs:
|
||||
dispatch-cloud:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Send repository dispatch to cloud
|
||||
env:
|
||||
DISPATCH_TOKEN: ${{ secrets.CLOUD_REPO_DISPATCH_TOKEN }}
|
||||
RELEASE_TAG: ${{ github.ref_name }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
if [ -z "${DISPATCH_TOKEN:-}" ]; then
|
||||
echo "::error::CLOUD_REPO_DISPATCH_TOKEN is required but not set."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
RELEASE_URL="https://github.com/${{ github.repository }}/releases/tag/${RELEASE_TAG}"
|
||||
|
||||
PAYLOAD="$(jq -n \
|
||||
--arg release_tag "$RELEASE_TAG" \
|
||||
--arg release_url "$RELEASE_URL" \
|
||||
'{
|
||||
event_type: "comfyui_tag_pushed",
|
||||
client_payload: {
|
||||
release_tag: $release_tag,
|
||||
release_url: $release_url
|
||||
}
|
||||
}')"
|
||||
|
||||
curl -fsSL \
|
||||
-X POST \
|
||||
-H "Accept: application/vnd.github+json" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: Bearer ${DISPATCH_TOKEN}" \
|
||||
https://api.github.com/repos/Comfy-Org/cloud/dispatches \
|
||||
-d "$PAYLOAD"
|
||||
|
||||
echo "✅ Dispatched ComfyUI tag ${RELEASE_TAG} to Comfy-Org/cloud"
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@ -21,6 +21,5 @@ venv*/
|
||||
*.log
|
||||
web_custom_versions/
|
||||
.DS_Store
|
||||
openapi.yaml
|
||||
filtered-openapi.yaml
|
||||
uv.lock
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
# Admins
|
||||
* @comfyanonymous @kosinkadink @guill
|
||||
* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
precision mediump float;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform vec2 u_resolution;
|
||||
uniform int u_int0; // Blend mode
|
||||
uniform int u_int1; // Color tint
|
||||
uniform float u_float0; // Intensity
|
||||
@ -75,7 +74,7 @@ void main() {
|
||||
float t0 = threshold - 0.15;
|
||||
float t1 = threshold + 0.15;
|
||||
|
||||
vec2 texelSize = 1.0 / u_resolution;
|
||||
vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));
|
||||
float radius2 = radius * radius;
|
||||
|
||||
float sampleScale = clamp(radius * 0.75, 0.35, 1.0);
|
||||
|
||||
@ -12,7 +12,6 @@ const int RADIAL_SAMPLES = 12;
|
||||
const float RADIAL_STRENGTH = 0.0003;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform vec2 u_resolution;
|
||||
uniform int u_int0; // Blur type (BLUR_GAUSSIAN, BLUR_BOX, BLUR_RADIAL)
|
||||
uniform float u_float0; // Blur radius/amount
|
||||
uniform int u_pass; // Pass index (0 = horizontal, 1 = vertical)
|
||||
@ -25,7 +24,7 @@ float gaussian(float x, float sigma) {
|
||||
}
|
||||
|
||||
void main() {
|
||||
vec2 texelSize = 1.0 / u_resolution;
|
||||
vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));
|
||||
float radius = max(u_float0, 0.0);
|
||||
|
||||
// Radial (angular) blur - single pass, doesn't use separable
|
||||
|
||||
@ -2,14 +2,13 @@
|
||||
precision highp float;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform vec2 u_resolution;
|
||||
uniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0
|
||||
|
||||
in vec2 v_texCoord;
|
||||
layout(location = 0) out vec4 fragColor0;
|
||||
|
||||
void main() {
|
||||
vec2 texel = 1.0 / u_resolution;
|
||||
vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));
|
||||
|
||||
// Sample center and neighbors
|
||||
vec4 center = texture(u_image0, v_texCoord);
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
precision highp float;
|
||||
|
||||
uniform sampler2D u_image0;
|
||||
uniform vec2 u_resolution;
|
||||
uniform float u_float0; // amount [0.0 - 3.0] typical: 0.5-1.5
|
||||
uniform float u_float1; // radius [0.5 - 10.0] blur radius in pixels
|
||||
uniform float u_float2; // threshold [0.0 - 0.1] min difference to sharpen
|
||||
@ -19,7 +18,7 @@ float getLuminance(vec3 color) {
|
||||
}
|
||||
|
||||
void main() {
|
||||
vec2 texel = 1.0 / u_resolution;
|
||||
vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));
|
||||
float radius = max(u_float1, 0.5);
|
||||
float amount = u_float0;
|
||||
float threshold = u_float2;
|
||||
|
||||
1620
blueprints/Crop Images 2x2.json
Normal file
1620
blueprints/Crop Images 2x2.json
Normal file
File diff suppressed because it is too large
Load Diff
2957
blueprints/Crop Images 3x3.json
Normal file
2957
blueprints/Crop Images 3x3.json
Normal file
File diff suppressed because it is too large
Load Diff
@ -160,7 +160,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Depth to Image (Z-Image-Turbo)",
|
||||
"name": "Depth to Image (Z-Image-Turbo)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -2482,4 +2482,4 @@
|
||||
"VHS_KeepIntermediate": true
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -261,7 +261,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Depth to Video (LTX 2.0)",
|
||||
"name": "Depth to Video (LTX 2.0)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -5208,4 +5208,4 @@
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
3360
blueprints/First-Last-Frame to Video (LTX-2.3).json
Normal file
3360
blueprints/First-Last-Frame to Video (LTX-2.3).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -268,7 +268,7 @@
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\nprecision mediump float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform int u_int0; // Blend mode\nuniform int u_int1; // Color tint\nuniform float u_float0; // Intensity\nuniform float u_float1; // Radius\nuniform float u_float2; // Threshold\n\nin vec2 v_texCoord;\nout vec4 fragColor;\n\nconst int BLEND_ADD = 0;\nconst int BLEND_SCREEN = 1;\nconst int BLEND_SOFT = 2;\nconst int BLEND_OVERLAY = 3;\nconst int BLEND_LIGHTEN = 4;\n\nconst float GOLDEN_ANGLE = 2.39996323;\nconst int MAX_SAMPLES = 48;\nconst vec3 LUMA = vec3(0.299, 0.587, 0.114);\n\nfloat hash(vec2 p) {\n p = fract(p * vec2(123.34, 456.21));\n p += dot(p, p + 45.32);\n return fract(p.x * p.y);\n}\n\nvec3 hexToRgb(int h) {\n return vec3(\n float((h >> 16) & 255),\n float((h >> 8) & 255),\n float(h & 255)\n ) * (1.0 / 255.0);\n}\n\nvec3 blend(vec3 base, vec3 glow, int mode) {\n if (mode == BLEND_SCREEN) {\n return 1.0 - (1.0 - base) * (1.0 - glow);\n }\n if (mode == BLEND_SOFT) {\n return mix(\n base - (1.0 - 2.0 * glow) * base * (1.0 - base),\n base + (2.0 * glow - 1.0) * (sqrt(base) - base),\n step(0.5, glow)\n );\n }\n if (mode == BLEND_OVERLAY) {\n return mix(\n 2.0 * base * glow,\n 1.0 - 2.0 * (1.0 - base) * (1.0 - glow),\n step(0.5, base)\n );\n }\n if (mode == BLEND_LIGHTEN) {\n return max(base, glow);\n }\n return base + glow;\n}\n\nvoid main() {\n vec4 original = texture(u_image0, v_texCoord);\n \n float intensity = u_float0 * 0.05;\n float radius = u_float1 * u_float1 * 0.012;\n \n if (intensity < 0.001 || radius < 0.1) {\n fragColor = original;\n return;\n }\n \n float threshold = 1.0 - u_float2 * 0.01;\n float t0 = threshold - 0.15;\n float t1 = threshold + 0.15;\n \n vec2 texelSize = 1.0 / u_resolution;\n float radius2 = radius * radius;\n \n float sampleScale = clamp(radius * 0.75, 0.35, 1.0);\n int samples = int(float(MAX_SAMPLES) * sampleScale);\n \n float noise = hash(gl_FragCoord.xy);\n float angleOffset = noise * GOLDEN_ANGLE;\n float radiusJitter 
= 0.85 + noise * 0.3;\n \n float ca = cos(GOLDEN_ANGLE);\n float sa = sin(GOLDEN_ANGLE);\n vec2 dir = vec2(cos(angleOffset), sin(angleOffset));\n \n vec3 glow = vec3(0.0);\n float totalWeight = 0.0;\n \n // Center tap\n float centerMask = smoothstep(t0, t1, dot(original.rgb, LUMA));\n glow += original.rgb * centerMask * 2.0;\n totalWeight += 2.0;\n \n for (int i = 1; i < MAX_SAMPLES; i++) {\n if (i >= samples) break;\n \n float fi = float(i);\n float dist = sqrt(fi / float(samples)) * radius * radiusJitter;\n \n vec2 offset = dir * dist * texelSize;\n vec3 c = texture(u_image0, v_texCoord + offset).rgb;\n float mask = smoothstep(t0, t1, dot(c, LUMA));\n \n float w = 1.0 - (dist * dist) / (radius2 * 1.5);\n w = max(w, 0.0);\n w *= w;\n \n glow += c * mask * w;\n totalWeight += w;\n \n dir = vec2(\n dir.x * ca - dir.y * sa,\n dir.x * sa + dir.y * ca\n );\n }\n \n glow *= intensity / max(totalWeight, 0.001);\n \n if (u_int1 > 0) {\n glow *= hexToRgb(u_int1);\n }\n \n vec3 result = blend(original.rgb, glow, u_int0);\n result += (noise - 0.5) * (1.0 / 255.0);\n \n fragColor = vec4(clamp(result, 0.0, 1.0), original.a);\n}",
|
||||
"#version 300 es\nprecision mediump float;\n\nuniform sampler2D u_image0;\nuniform int u_int0; // Blend mode\nuniform int u_int1; // Color tint\nuniform float u_float0; // Intensity\nuniform float u_float1; // Radius\nuniform float u_float2; // Threshold\n\nin vec2 v_texCoord;\nout vec4 fragColor;\n\nconst int BLEND_ADD = 0;\nconst int BLEND_SCREEN = 1;\nconst int BLEND_SOFT = 2;\nconst int BLEND_OVERLAY = 3;\nconst int BLEND_LIGHTEN = 4;\n\nconst float GOLDEN_ANGLE = 2.39996323;\nconst int MAX_SAMPLES = 48;\nconst vec3 LUMA = vec3(0.299, 0.587, 0.114);\n\nfloat hash(vec2 p) {\n p = fract(p * vec2(123.34, 456.21));\n p += dot(p, p + 45.32);\n return fract(p.x * p.y);\n}\n\nvec3 hexToRgb(int h) {\n return vec3(\n float((h >> 16) & 255),\n float((h >> 8) & 255),\n float(h & 255)\n ) * (1.0 / 255.0);\n}\n\nvec3 blend(vec3 base, vec3 glow, int mode) {\n if (mode == BLEND_SCREEN) {\n return 1.0 - (1.0 - base) * (1.0 - glow);\n }\n if (mode == BLEND_SOFT) {\n return mix(\n base - (1.0 - 2.0 * glow) * base * (1.0 - base),\n base + (2.0 * glow - 1.0) * (sqrt(base) - base),\n step(0.5, glow)\n );\n }\n if (mode == BLEND_OVERLAY) {\n return mix(\n 2.0 * base * glow,\n 1.0 - 2.0 * (1.0 - base) * (1.0 - glow),\n step(0.5, base)\n );\n }\n if (mode == BLEND_LIGHTEN) {\n return max(base, glow);\n }\n return base + glow;\n}\n\nvoid main() {\n vec4 original = texture(u_image0, v_texCoord);\n \n float intensity = u_float0 * 0.05;\n float radius = u_float1 * u_float1 * 0.012;\n \n if (intensity < 0.001 || radius < 0.1) {\n fragColor = original;\n return;\n }\n \n float threshold = 1.0 - u_float2 * 0.01;\n float t0 = threshold - 0.15;\n float t1 = threshold + 0.15;\n \n vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));\n float radius2 = radius * radius;\n \n float sampleScale = clamp(radius * 0.75, 0.35, 1.0);\n int samples = int(float(MAX_SAMPLES) * sampleScale);\n \n float noise = hash(gl_FragCoord.xy);\n float angleOffset = noise * GOLDEN_ANGLE;\n float radiusJitter = 0.85 + 
noise * 0.3;\n \n float ca = cos(GOLDEN_ANGLE);\n float sa = sin(GOLDEN_ANGLE);\n vec2 dir = vec2(cos(angleOffset), sin(angleOffset));\n \n vec3 glow = vec3(0.0);\n float totalWeight = 0.0;\n \n // Center tap\n float centerMask = smoothstep(t0, t1, dot(original.rgb, LUMA));\n glow += original.rgb * centerMask * 2.0;\n totalWeight += 2.0;\n \n for (int i = 1; i < MAX_SAMPLES; i++) {\n if (i >= samples) break;\n \n float fi = float(i);\n float dist = sqrt(fi / float(samples)) * radius * radiusJitter;\n \n vec2 offset = dir * dist * texelSize;\n vec3 c = texture(u_image0, v_texCoord + offset).rgb;\n float mask = smoothstep(t0, t1, dot(c, LUMA));\n \n float w = 1.0 - (dist * dist) / (radius2 * 1.5);\n w = max(w, 0.0);\n w *= w;\n \n glow += c * mask * w;\n totalWeight += w;\n \n dir = vec2(\n dir.x * ca - dir.y * sa,\n dir.x * sa + dir.y * ca\n );\n }\n \n glow *= intensity / max(totalWeight, 0.001);\n \n if (u_int1 > 0) {\n glow *= hexToRgb(u_int1);\n }\n \n vec3 result = blend(original.rgb, glow, u_int0);\n result += (noise - 0.5) * (1.0 / 255.0);\n \n fragColor = vec4(clamp(result, 0.0, 1.0), original.a);\n}",
|
||||
"from_input"
|
||||
]
|
||||
},
|
||||
|
||||
@ -331,7 +331,7 @@
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\n#pragma passes 2\nprecision highp float;\n\n// Blur type constants\nconst int BLUR_GAUSSIAN = 0;\nconst int BLUR_BOX = 1;\nconst int BLUR_RADIAL = 2;\n\n// Radial blur config\nconst int RADIAL_SAMPLES = 12;\nconst float RADIAL_STRENGTH = 0.0003;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform int u_int0; // Blur type (BLUR_GAUSSIAN, BLUR_BOX, BLUR_RADIAL)\nuniform float u_float0; // Blur radius/amount\nuniform int u_pass; // Pass index (0 = horizontal, 1 = vertical)\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nfloat gaussian(float x, float sigma) {\n return exp(-(x * x) / (2.0 * sigma * sigma));\n}\n\nvoid main() {\n vec2 texelSize = 1.0 / u_resolution;\n float radius = max(u_float0, 0.0);\n\n // Radial (angular) blur - single pass, doesn't use separable\n if (u_int0 == BLUR_RADIAL) {\n // Only execute on first pass\n if (u_pass > 0) {\n fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n vec2 center = vec2(0.5);\n vec2 dir = v_texCoord - center;\n float dist = length(dir);\n\n if (dist < 1e-4) {\n fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n vec4 sum = vec4(0.0);\n float totalWeight = 0.0;\n float angleStep = radius * RADIAL_STRENGTH;\n\n dir /= dist;\n\n float cosStep = cos(angleStep);\n float sinStep = sin(angleStep);\n\n float negAngle = -float(RADIAL_SAMPLES) * angleStep;\n vec2 rotDir = vec2(\n dir.x * cos(negAngle) - dir.y * sin(negAngle),\n dir.x * sin(negAngle) + dir.y * cos(negAngle)\n );\n\n for (int i = -RADIAL_SAMPLES; i <= RADIAL_SAMPLES; i++) {\n vec2 uv = center + rotDir * dist;\n float w = 1.0 - abs(float(i)) / float(RADIAL_SAMPLES);\n sum += texture(u_image0, uv) * w;\n totalWeight += w;\n\n rotDir = vec2(\n rotDir.x * cosStep - rotDir.y * sinStep,\n rotDir.x * sinStep + rotDir.y * cosStep\n );\n }\n\n fragColor0 = sum / max(totalWeight, 0.001);\n return;\n }\n\n // Separable Gaussian / Box blur\n int samples = int(ceil(radius));\n\n if (samples == 0) {\n 
fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n // Direction: pass 0 = horizontal, pass 1 = vertical\n vec2 dir = (u_pass == 0) ? vec2(1.0, 0.0) : vec2(0.0, 1.0);\n\n vec4 color = vec4(0.0);\n float totalWeight = 0.0;\n float sigma = radius / 2.0;\n\n for (int i = -samples; i <= samples; i++) {\n vec2 offset = dir * float(i) * texelSize;\n vec4 sample_color = texture(u_image0, v_texCoord + offset);\n\n float weight;\n if (u_int0 == BLUR_GAUSSIAN) {\n weight = gaussian(float(i), sigma);\n } else {\n // BLUR_BOX\n weight = 1.0;\n }\n\n color += sample_color * weight;\n totalWeight += weight;\n }\n\n fragColor0 = color / totalWeight;\n}\n",
|
||||
"#version 300 es\n#pragma passes 2\nprecision highp float;\n\n// Blur type constants\nconst int BLUR_GAUSSIAN = 0;\nconst int BLUR_BOX = 1;\nconst int BLUR_RADIAL = 2;\n\n// Radial blur config\nconst int RADIAL_SAMPLES = 12;\nconst float RADIAL_STRENGTH = 0.0003;\n\nuniform sampler2D u_image0;\nuniform int u_int0; // Blur type (BLUR_GAUSSIAN, BLUR_BOX, BLUR_RADIAL)\nuniform float u_float0; // Blur radius/amount\nuniform int u_pass; // Pass index (0 = horizontal, 1 = vertical)\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nfloat gaussian(float x, float sigma) {\n return exp(-(x * x) / (2.0 * sigma * sigma));\n}\n\nvoid main() {\n vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));\n float radius = max(u_float0, 0.0);\n\n // Radial (angular) blur - single pass, doesn't use separable\n if (u_int0 == BLUR_RADIAL) {\n // Only execute on first pass\n if (u_pass > 0) {\n fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n vec2 center = vec2(0.5);\n vec2 dir = v_texCoord - center;\n float dist = length(dir);\n\n if (dist < 1e-4) {\n fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n vec4 sum = vec4(0.0);\n float totalWeight = 0.0;\n float angleStep = radius * RADIAL_STRENGTH;\n\n dir /= dist;\n\n float cosStep = cos(angleStep);\n float sinStep = sin(angleStep);\n\n float negAngle = -float(RADIAL_SAMPLES) * angleStep;\n vec2 rotDir = vec2(\n dir.x * cos(negAngle) - dir.y * sin(negAngle),\n dir.x * sin(negAngle) + dir.y * cos(negAngle)\n );\n\n for (int i = -RADIAL_SAMPLES; i <= RADIAL_SAMPLES; i++) {\n vec2 uv = center + rotDir * dist;\n float w = 1.0 - abs(float(i)) / float(RADIAL_SAMPLES);\n sum += texture(u_image0, uv) * w;\n totalWeight += w;\n\n rotDir = vec2(\n rotDir.x * cosStep - rotDir.y * sinStep,\n rotDir.x * sinStep + rotDir.y * cosStep\n );\n }\n\n fragColor0 = sum / max(totalWeight, 0.001);\n return;\n }\n\n // Separable Gaussian / Box blur\n int samples = int(ceil(radius));\n\n if (samples == 0) {\n fragColor0 = 
texture(u_image0, v_texCoord);\n return;\n }\n\n // Direction: pass 0 = horizontal, pass 1 = vertical\n vec2 dir = (u_pass == 0) ? vec2(1.0, 0.0) : vec2(0.0, 1.0);\n\n vec4 color = vec4(0.0);\n float totalWeight = 0.0;\n float sigma = radius / 2.0;\n\n for (int i = -samples; i <= samples; i++) {\n vec2 offset = dir * float(i) * texelSize;\n vec4 sample_color = texture(u_image0, v_texCoord + offset);\n\n float weight;\n if (u_int0 == BLUR_GAUSSIAN) {\n weight = gaussian(float(i), sigma);\n } else {\n // BLUR_BOX\n weight = 1.0;\n }\n\n color += sample_color * weight;\n totalWeight += weight;\n }\n\n fragColor0 = color / totalWeight;\n}\n",
|
||||
"from_input"
|
||||
]
|
||||
}
|
||||
|
||||
2148
blueprints/Image Edit (FireRed Image Edit 1.1).json
Normal file
2148
blueprints/Image Edit (FireRed Image Edit 1.1).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -128,7 +128,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Image Edit (Flux.2 Klein 4B)",
|
||||
"name": "Image Edit (Flux.2 Klein 4B)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -1837,4 +1837,4 @@
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
1427
blueprints/Image Edit (LongCat Image Edit).json
Normal file
1427
blueprints/Image Edit (LongCat Image Edit).json
Normal file
File diff suppressed because it is too large
Load Diff
1205
blueprints/Image Inpainting (Flux.1 Fill Dev).json
Normal file
1205
blueprints/Image Inpainting (Flux.1 Fill Dev).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -124,7 +124,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Image Inpainting (Qwen-image)",
|
||||
"name": "Image Inpainting (Qwen-image)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -1923,4 +1923,4 @@
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -204,7 +204,7 @@
|
||||
},
|
||||
"revision": 0,
|
||||
"config": {},
|
||||
"name": "local-Image Outpainting (Qwen-Image)",
|
||||
"name": "Image Outpainting (Qwen-Image)",
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
@ -2749,4 +2749,4 @@
|
||||
}
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
}
|
||||
@ -1,15 +1,14 @@
|
||||
{
|
||||
"id": "1a761372-7c82-4016-b9bf-fa285967e1e9",
|
||||
"revision": 0,
|
||||
"last_node_id": 83,
|
||||
"last_node_id": 176,
|
||||
"last_link_id": 0,
|
||||
"nodes": [
|
||||
{
|
||||
"id": 83,
|
||||
"type": "f754a936-daaf-4b6e-9658-41fdc54d301d",
|
||||
"id": 176,
|
||||
"type": "2d2e3c8e-53b3-4618-be52-6d1d99382f0e",
|
||||
"pos": [
|
||||
61.999827823554256,
|
||||
153.3332507624185
|
||||
-1150,
|
||||
200
|
||||
],
|
||||
"size": [
|
||||
400,
|
||||
@ -56,6 +55,38 @@
|
||||
"name": "layers"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "seed",
|
||||
"type": "INT",
|
||||
"widget": {
|
||||
"name": "seed"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "unet_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "unet_name"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "clip_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "clip_name"
|
||||
},
|
||||
"link": null
|
||||
},
|
||||
{
|
||||
"name": "vae_name",
|
||||
"type": "COMBO",
|
||||
"widget": {
|
||||
"name": "vae_name"
|
||||
},
|
||||
"link": null
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -66,28 +97,41 @@
|
||||
"links": []
|
||||
}
|
||||
],
|
||||
"title": "Image to Layers (Qwen-Image-Layered)",
|
||||
"properties": {
|
||||
"proxyWidgets": [
|
||||
[
|
||||
"-1",
|
||||
"6",
|
||||
"text"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"3",
|
||||
"steps"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"3",
|
||||
"cfg"
|
||||
],
|
||||
[
|
||||
"-1",
|
||||
"83",
|
||||
"layers"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"seed"
|
||||
],
|
||||
[
|
||||
"37",
|
||||
"unet_name"
|
||||
],
|
||||
[
|
||||
"38",
|
||||
"clip_name"
|
||||
],
|
||||
[
|
||||
"39",
|
||||
"vae_name"
|
||||
],
|
||||
[
|
||||
"3",
|
||||
"control_after_generate"
|
||||
@ -95,6 +139,11 @@
|
||||
],
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -103,25 +152,20 @@
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": [
|
||||
"",
|
||||
20,
|
||||
2.5,
|
||||
2
|
||||
]
|
||||
"widgets_values": []
|
||||
}
|
||||
],
|
||||
"links": [],
|
||||
"groups": [],
|
||||
"version": 0.4,
|
||||
"definitions": {
|
||||
"subgraphs": [
|
||||
{
|
||||
"id": "f754a936-daaf-4b6e-9658-41fdc54d301d",
|
||||
"id": "2d2e3c8e-53b3-4618-be52-6d1d99382f0e",
|
||||
"version": 1,
|
||||
"state": {
|
||||
"lastGroupId": 3,
|
||||
"lastNodeId": 83,
|
||||
"lastLinkId": 159,
|
||||
"lastGroupId": 8,
|
||||
"lastNodeId": 176,
|
||||
"lastLinkId": 380,
|
||||
"lastRerouteId": 0
|
||||
},
|
||||
"revision": 0,
|
||||
@ -130,10 +174,10 @@
|
||||
"inputNode": {
|
||||
"id": -10,
|
||||
"bounding": [
|
||||
-510,
|
||||
523,
|
||||
-720,
|
||||
720,
|
||||
120,
|
||||
140
|
||||
220
|
||||
]
|
||||
},
|
||||
"outputNode": {
|
||||
@ -156,8 +200,8 @@
|
||||
],
|
||||
"localized_name": "image",
|
||||
"pos": [
|
||||
-410,
|
||||
543
|
||||
-620,
|
||||
740
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -168,8 +212,8 @@
|
||||
150
|
||||
],
|
||||
"pos": [
|
||||
-410,
|
||||
563
|
||||
-620,
|
||||
760
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -180,8 +224,8 @@
|
||||
153
|
||||
],
|
||||
"pos": [
|
||||
-410,
|
||||
583
|
||||
-620,
|
||||
780
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -192,8 +236,8 @@
|
||||
154
|
||||
],
|
||||
"pos": [
|
||||
-410,
|
||||
603
|
||||
-620,
|
||||
800
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -204,8 +248,56 @@
|
||||
159
|
||||
],
|
||||
"pos": [
|
||||
-410,
|
||||
623
|
||||
-620,
|
||||
820
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "9f76338b-f4ca-4bb3-b61a-57b3f233061e",
|
||||
"name": "seed",
|
||||
"type": "INT",
|
||||
"linkIds": [
|
||||
377
|
||||
],
|
||||
"pos": [
|
||||
-620,
|
||||
840
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "8d0422d5-5eee-4f7e-9817-dc613cc62eca",
|
||||
"name": "unet_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
378
|
||||
],
|
||||
"pos": [
|
||||
-620,
|
||||
860
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "552eece2-a735-4d00-ae78-ded454622bc1",
|
||||
"name": "clip_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
379
|
||||
],
|
||||
"pos": [
|
||||
-620,
|
||||
880
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": "1e6d141c-d0f9-4a2b-895c-b6780e57cfa0",
|
||||
"name": "vae_name",
|
||||
"type": "COMBO",
|
||||
"linkIds": [
|
||||
380
|
||||
],
|
||||
"pos": [
|
||||
-620,
|
||||
900
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -231,14 +323,14 @@
|
||||
"type": "CLIPLoader",
|
||||
"pos": [
|
||||
-320,
|
||||
310
|
||||
360
|
||||
],
|
||||
"size": [
|
||||
346.7470703125,
|
||||
106
|
||||
350,
|
||||
150
|
||||
],
|
||||
"flags": {},
|
||||
"order": 0,
|
||||
"order": 5,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -248,7 +340,7 @@
|
||||
"widget": {
|
||||
"name": "clip_name"
|
||||
},
|
||||
"link": null
|
||||
"link": 379
|
||||
},
|
||||
{
|
||||
"localized_name": "type",
|
||||
@ -283,9 +375,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPLoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "CLIPLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
|
||||
@ -312,14 +409,14 @@
|
||||
"type": "VAELoader",
|
||||
"pos": [
|
||||
-320,
|
||||
460
|
||||
580
|
||||
],
|
||||
"size": [
|
||||
346.7470703125,
|
||||
58
|
||||
350,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 1,
|
||||
"order": 6,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -329,7 +426,7 @@
|
||||
"widget": {
|
||||
"name": "vae_name"
|
||||
},
|
||||
"link": null
|
||||
"link": 380
|
||||
}
|
||||
],
|
||||
"outputs": [
|
||||
@ -345,9 +442,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAELoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "VAELoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "qwen_image_layered_vae.safetensors",
|
||||
@ -375,11 +477,11 @@
|
||||
420
|
||||
],
|
||||
"size": [
|
||||
425.27801513671875,
|
||||
180.6060791015625
|
||||
430,
|
||||
190
|
||||
],
|
||||
"flags": {},
|
||||
"order": 3,
|
||||
"order": 2,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -411,9 +513,14 @@
|
||||
],
|
||||
"title": "CLIP Text Encode (Negative Prompt)",
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -432,12 +539,12 @@
|
||||
"id": 70,
|
||||
"type": "ReferenceLatent",
|
||||
"pos": [
|
||||
330,
|
||||
670
|
||||
140,
|
||||
700
|
||||
],
|
||||
"size": [
|
||||
204.1666717529297,
|
||||
46
|
||||
210,
|
||||
50
|
||||
],
|
||||
"flags": {
|
||||
"collapsed": true
|
||||
@ -470,9 +577,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ReferenceLatent",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "ReferenceLatent",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -480,19 +592,18 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 69,
|
||||
"type": "ReferenceLatent",
|
||||
"pos": [
|
||||
330,
|
||||
710
|
||||
160,
|
||||
820
|
||||
],
|
||||
"size": [
|
||||
204.1666717529297,
|
||||
46
|
||||
210,
|
||||
50
|
||||
],
|
||||
"flags": {
|
||||
"collapsed": true
|
||||
@ -525,9 +636,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ReferenceLatent",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "ReferenceLatent",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -535,8 +651,7 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 66,
|
||||
@ -547,10 +662,10 @@
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
58
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 4,
|
||||
"order": 7,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -580,9 +695,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "ModelSamplingAuraFlow",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "ModelSamplingAuraFlow",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -600,11 +720,11 @@
|
||||
"type": "LatentCutToBatch",
|
||||
"pos": [
|
||||
830,
|
||||
160
|
||||
140
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
82
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 11,
|
||||
@ -646,9 +766,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "LatentCutToBatch",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "LatentCutToBatch",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -666,12 +791,12 @@
|
||||
"id": 71,
|
||||
"type": "VAEEncode",
|
||||
"pos": [
|
||||
100,
|
||||
690
|
||||
-280,
|
||||
780
|
||||
],
|
||||
"size": [
|
||||
140,
|
||||
46
|
||||
230,
|
||||
100
|
||||
],
|
||||
"flags": {
|
||||
"collapsed": false
|
||||
@ -704,9 +829,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAEEncode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "VAEEncode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -714,24 +844,23 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"type": "VAEDecode",
|
||||
"pos": [
|
||||
850,
|
||||
310
|
||||
370
|
||||
],
|
||||
"size": [
|
||||
210,
|
||||
46
|
||||
50
|
||||
],
|
||||
"flags": {
|
||||
"collapsed": true
|
||||
},
|
||||
"order": 7,
|
||||
"order": 3,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -759,9 +888,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "VAEDecode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "VAEDecode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -769,8 +903,7 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
@ -780,11 +913,11 @@
|
||||
180
|
||||
],
|
||||
"size": [
|
||||
422.84503173828125,
|
||||
164.31304931640625
|
||||
430,
|
||||
170
|
||||
],
|
||||
"flags": {},
|
||||
"order": 6,
|
||||
"order": 1,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -816,9 +949,14 @@
|
||||
],
|
||||
"title": "CLIP Text Encode (Positive Prompt)",
|
||||
"properties": {
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "CLIPTextEncode",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -838,14 +976,14 @@
|
||||
"type": "KSampler",
|
||||
"pos": [
|
||||
530,
|
||||
280
|
||||
340
|
||||
],
|
||||
"size": [
|
||||
270,
|
||||
400
|
||||
],
|
||||
"flags": {},
|
||||
"order": 5,
|
||||
"order": 0,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -879,7 +1017,7 @@
|
||||
"widget": {
|
||||
"name": "seed"
|
||||
},
|
||||
"link": null
|
||||
"link": 377
|
||||
},
|
||||
{
|
||||
"localized_name": "steps",
|
||||
@ -939,9 +1077,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "KSampler",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "KSampler",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -964,12 +1107,12 @@
|
||||
"id": 78,
|
||||
"type": "GetImageSize",
|
||||
"pos": [
|
||||
80,
|
||||
790
|
||||
-280,
|
||||
930
|
||||
],
|
||||
"size": [
|
||||
210,
|
||||
136
|
||||
230,
|
||||
140
|
||||
],
|
||||
"flags": {},
|
||||
"order": 12,
|
||||
@ -1007,9 +1150,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "GetImageSize",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "GetImageSize",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -1017,23 +1165,23 @@
|
||||
"secondTabText": "Send Back",
|
||||
"secondTabOffset": 80,
|
||||
"secondTabWidth": 65
|
||||
},
|
||||
"widgets_values": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 83,
|
||||
"type": "EmptyQwenImageLayeredLatentImage",
|
||||
"pos": [
|
||||
320,
|
||||
790
|
||||
-280,
|
||||
1120
|
||||
],
|
||||
"size": [
|
||||
330.9341796875,
|
||||
130
|
||||
340,
|
||||
200
|
||||
],
|
||||
"flags": {},
|
||||
"order": 13,
|
||||
"mode": 0,
|
||||
"showAdvanced": true,
|
||||
"inputs": [
|
||||
{
|
||||
"localized_name": "width",
|
||||
@ -1083,9 +1231,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "EmptyQwenImageLayeredLatentImage",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "EmptyQwenImageLayeredLatentImage",
|
||||
"enableTabs": false,
|
||||
"tabWidth": 65,
|
||||
"tabXOffset": 10,
|
||||
@ -1109,11 +1262,11 @@
|
||||
180
|
||||
],
|
||||
"size": [
|
||||
346.7470703125,
|
||||
82
|
||||
350,
|
||||
110
|
||||
],
|
||||
"flags": {},
|
||||
"order": 2,
|
||||
"order": 4,
|
||||
"mode": 0,
|
||||
"inputs": [
|
||||
{
|
||||
@ -1123,7 +1276,7 @@
|
||||
"widget": {
|
||||
"name": "unet_name"
|
||||
},
|
||||
"link": null
|
||||
"link": 378
|
||||
},
|
||||
{
|
||||
"localized_name": "weight_dtype",
|
||||
@ -1147,9 +1300,14 @@
|
||||
}
|
||||
],
|
||||
"properties": {
|
||||
"Node name for S&R": "UNETLoader",
|
||||
"cnr_id": "comfy-core",
|
||||
"ver": "0.5.1",
|
||||
"ue_properties": {
|
||||
"widget_ue_connectable": {},
|
||||
"input_ue_unconnectable": {},
|
||||
"version": "7.7"
|
||||
},
|
||||
"Node name for S&R": "UNETLoader",
|
||||
"models": [
|
||||
{
|
||||
"name": "qwen_image_layered_bf16.safetensors",
|
||||
@ -1191,8 +1349,8 @@
|
||||
"bounding": [
|
||||
-330,
|
||||
110,
|
||||
366.7470703125,
|
||||
421.6
|
||||
370,
|
||||
610
|
||||
],
|
||||
"color": "#3f789e",
|
||||
"font_size": 24,
|
||||
@ -1391,6 +1549,38 @@
|
||||
"target_id": 83,
|
||||
"target_slot": 2,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 377,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 5,
|
||||
"target_id": 3,
|
||||
"target_slot": 4,
|
||||
"type": "INT"
|
||||
},
|
||||
{
|
||||
"id": 378,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 6,
|
||||
"target_id": 37,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 379,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 7,
|
||||
"target_id": 38,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
},
|
||||
{
|
||||
"id": 380,
|
||||
"origin_id": -10,
|
||||
"origin_slot": 8,
|
||||
"target_id": 39,
|
||||
"target_slot": 0,
|
||||
"type": "COMBO"
|
||||
}
|
||||
],
|
||||
"extra": {
|
||||
@ -1400,7 +1590,6 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
"config": {},
|
||||
"extra": {
|
||||
"ds": {
|
||||
"scale": 1.14,
|
||||
@ -1409,7 +1598,6 @@
|
||||
6.855893974423647
|
||||
]
|
||||
},
|
||||
"workflowRendererVersion": "LG"
|
||||
},
|
||||
"version": 0.4
|
||||
}
|
||||
"ue_links": []
|
||||
}
|
||||
}
|
||||
4233
blueprints/Image to Video (LTX-2.3).json
Normal file
4233
blueprints/Image to Video (LTX-2.3).json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -267,7 +267,7 @@
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nvoid main() {\n vec2 texel = 1.0 / u_resolution;\n \n // Sample center and neighbors\n vec4 center = texture(u_image0, v_texCoord);\n vec4 top = texture(u_image0, v_texCoord + vec2( 0.0, -texel.y));\n vec4 bottom = texture(u_image0, v_texCoord + vec2( 0.0, texel.y));\n vec4 left = texture(u_image0, v_texCoord + vec2(-texel.x, 0.0));\n vec4 right = texture(u_image0, v_texCoord + vec2( texel.x, 0.0));\n \n // Edge enhancement (Laplacian)\n vec4 edges = center * 4.0 - top - bottom - left - right;\n \n // Add edges back scaled by strength\n vec4 sharpened = center + edges * u_float0;\n \n fragColor0 = vec4(clamp(sharpened.rgb, 0.0, 1.0), center.a);\n}",
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nvoid main() {\n vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));\n \n // Sample center and neighbors\n vec4 center = texture(u_image0, v_texCoord);\n vec4 top = texture(u_image0, v_texCoord + vec2( 0.0, -texel.y));\n vec4 bottom = texture(u_image0, v_texCoord + vec2( 0.0, texel.y));\n vec4 left = texture(u_image0, v_texCoord + vec2(-texel.x, 0.0));\n vec4 right = texture(u_image0, v_texCoord + vec2( texel.x, 0.0));\n \n // Edge enhancement (Laplacian)\n vec4 edges = center * 4.0 - top - bottom - left - right;\n \n // Add edges back scaled by strength\n vec4 sharpened = center + edges * u_float0;\n \n fragColor0 = vec4(clamp(sharpened.rgb, 0.0, 1.0), center.a);\n}",
|
||||
"from_input"
|
||||
]
|
||||
}
|
||||
|
||||
1046
blueprints/Text to Image (Flux.1 Dev).json
Normal file
1046
blueprints/Text to Image (Flux.1 Dev).json
Normal file
File diff suppressed because it is too large
Load Diff
1040
blueprints/Text to Image (Flux.1 Krea Dev).json
Normal file
1040
blueprints/Text to Image (Flux.1 Krea Dev).json
Normal file
File diff suppressed because it is too large
Load Diff
1468
blueprints/Text to Image (NetaYume Lumina).json
Normal file
1468
blueprints/Text to Image (NetaYume Lumina).json
Normal file
File diff suppressed because it is too large
Load Diff
1951
blueprints/Text to Image (Qwen-Image 2512).json
Normal file
1951
blueprints/Text to Image (Qwen-Image 2512).json
Normal file
File diff suppressed because it is too large
Load Diff
1881
blueprints/Text to Image (Qwen-Image).json
Normal file
1881
blueprints/Text to Image (Qwen-Image).json
Normal file
File diff suppressed because it is too large
Load Diff
4296
blueprints/Text to Video (LTX-2.3).json
Normal file
4296
blueprints/Text to Video (LTX-2.3).json
Normal file
File diff suppressed because it is too large
Load Diff
@ -383,7 +383,7 @@
|
||||
"Node name for S&R": "GLSLShader"
|
||||
},
|
||||
"widgets_values": [
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform float u_float0; // amount [0.0 - 3.0] typical: 0.5-1.5\nuniform float u_float1; // radius [0.5 - 10.0] blur radius in pixels\nuniform float u_float2; // threshold [0.0 - 0.1] min difference to sharpen\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nfloat gaussian(float x, float sigma) {\n return exp(-(x * x) / (2.0 * sigma * sigma));\n}\n\nfloat getLuminance(vec3 color) {\n return dot(color, vec3(0.2126, 0.7152, 0.0722));\n}\n\nvoid main() {\n vec2 texel = 1.0 / u_resolution;\n float radius = max(u_float1, 0.5);\n float amount = u_float0;\n float threshold = u_float2;\n\n vec4 original = texture(u_image0, v_texCoord);\n\n // Gaussian blur for the \"unsharp\" mask\n int samples = int(ceil(radius));\n float sigma = radius / 2.0;\n\n vec4 blurred = vec4(0.0);\n float totalWeight = 0.0;\n\n for (int x = -samples; x <= samples; x++) {\n for (int y = -samples; y <= samples; y++) {\n vec2 offset = vec2(float(x), float(y)) * texel;\n vec4 sample_color = texture(u_image0, v_texCoord + offset);\n\n float dist = length(vec2(float(x), float(y)));\n float weight = gaussian(dist, sigma);\n blurred += sample_color * weight;\n totalWeight += weight;\n }\n }\n blurred /= totalWeight;\n\n // Unsharp mask = original - blurred\n vec3 mask = original.rgb - blurred.rgb;\n\n // Luminance-based threshold with smooth falloff\n float lumaDelta = abs(getLuminance(original.rgb) - getLuminance(blurred.rgb));\n float thresholdScale = smoothstep(0.0, threshold, lumaDelta);\n mask *= thresholdScale;\n\n // Sharpen: original + mask * amount\n vec3 sharpened = original.rgb + mask * amount;\n\n fragColor0 = vec4(clamp(sharpened, 0.0, 1.0), original.a);\n}\n",
|
||||
"#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform float u_float0; // amount [0.0 - 3.0] typical: 0.5-1.5\nuniform float u_float1; // radius [0.5 - 10.0] blur radius in pixels\nuniform float u_float2; // threshold [0.0 - 0.1] min difference to sharpen\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nfloat gaussian(float x, float sigma) {\n return exp(-(x * x) / (2.0 * sigma * sigma));\n}\n\nfloat getLuminance(vec3 color) {\n return dot(color, vec3(0.2126, 0.7152, 0.0722));\n}\n\nvoid main() {\n vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));\n float radius = max(u_float1, 0.5);\n float amount = u_float0;\n float threshold = u_float2;\n\n vec4 original = texture(u_image0, v_texCoord);\n\n // Gaussian blur for the \"unsharp\" mask\n int samples = int(ceil(radius));\n float sigma = radius / 2.0;\n\n vec4 blurred = vec4(0.0);\n float totalWeight = 0.0;\n\n for (int x = -samples; x <= samples; x++) {\n for (int y = -samples; y <= samples; y++) {\n vec2 offset = vec2(float(x), float(y)) * texel;\n vec4 sample_color = texture(u_image0, v_texCoord + offset);\n\n float dist = length(vec2(float(x), float(y)));\n float weight = gaussian(dist, sigma);\n blurred += sample_color * weight;\n totalWeight += weight;\n }\n }\n blurred /= totalWeight;\n\n // Unsharp mask = original - blurred\n vec3 mask = original.rgb - blurred.rgb;\n\n // Luminance-based threshold with smooth falloff\n float lumaDelta = abs(getLuminance(original.rgb) - getLuminance(blurred.rgb));\n float thresholdScale = smoothstep(0.0, threshold, lumaDelta);\n mask *= thresholdScale;\n\n // Sharpen: original + mask * amount\n vec3 sharpened = original.rgb + mask * amount;\n\n fragColor0 = vec4(clamp(sharpened, 0.0, 1.0), original.a);\n}\n",
|
||||
"from_input"
|
||||
]
|
||||
}
|
||||
|
||||
@ -224,6 +224,7 @@ class Flux2(LatentFormat):
|
||||
|
||||
self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
|
||||
self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
|
||||
self.taesd_decoder_name = "taef2_decoder"
|
||||
|
||||
def process_in(self, latent):
|
||||
return latent
|
||||
@ -783,3 +784,10 @@ class ZImagePixelSpace(ChromaRadiance):
|
||||
No VAE encoding/decoding — the model operates directly on RGB pixels.
|
||||
"""
|
||||
pass
|
||||
|
||||
class CogVideoX(LatentFormat):
    """Latent format descriptor for CogVideoX: 16 latent channels, 3 latent dimensions."""

    latent_dimensions = 3
    latent_channels = 16

    def __init__(self):
        # Per-model latent scaling constant.
        self.scale_factor = 1.15258426
|
||||
|
||||
0
comfy/ldm/cogvideo/__init__.py
Normal file
0
comfy/ldm/cogvideo/__init__.py
Normal file
573
comfy/ldm/cogvideo/model.py
Normal file
573
comfy/ldm/cogvideo/model.py
Normal file
@ -0,0 +1,573 @@
|
||||
# CogVideoX 3D Transformer - ported to ComfyUI native ops
|
||||
# Architecture reference: diffusers CogVideoXTransformer3DModel
|
||||
# Style reference: comfy/ldm/wan/model.py
|
||||
|
||||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
import comfy.patcher_extension
|
||||
import comfy.ldm.common_dit
|
||||
|
||||
|
||||
def _get_1d_rotary_pos_embed(dim, pos, theta=10000.0):
|
||||
"""Returns (cos, sin) each with shape [seq_len, dim].
|
||||
|
||||
Frequencies are computed at dim//2 resolution then repeat_interleaved
|
||||
to full dim, matching CogVideoX's interleaved (real, imag) pair format.
|
||||
"""
|
||||
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim))
|
||||
angles = torch.outer(pos.float(), freqs.float())
|
||||
cos = angles.cos().repeat_interleave(2, dim=-1).float()
|
||||
sin = angles.sin().repeat_interleave(2, dim=-1).float()
|
||||
return (cos, sin)
|
||||
|
||||
|
||||
def apply_rotary_emb(x, freqs_cos_sin):
    """Apply an interleaved-pair rotary embedding to a query or key tensor.

    Args:
        x: tensor of shape ``[B, heads, seq_len, head_dim]``.
        freqs_cos_sin: ``(cos, sin)`` tables, each of shape
            ``[seq_len, head_dim]``. They are multiplied elementwise against
            the full ``head_dim`` axis of ``x``, so every angle must already be
            repeated for both members of a channel pair.

    Returns:
        Tensor with the shape and dtype of ``x``. The arithmetic is carried
        out in float32 and the result is cast back to ``x.dtype``.
    """
    cos, sin = freqs_cos_sin
    # Add broadcast axes for batch and heads, and move the tables next to x.
    cos = cos[None, None, :, :].to(x.device)
    sin = sin[None, None, :, :].to(x.device)

    # View head_dim as (head_dim // 2) pairs: [B, H, S, D] -> [B, H, S, D//2, 2].
    x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)
    # Each pair (a, b) becomes (-b, a); flatten(3) restores the [B, H, S, D] layout.
    x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)

    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
|
||||
|
||||
|
||||
def get_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1, max_period=10000):
    """Sinusoidal embedding of a 1-D tensor of timesteps.

    Args:
        timesteps: 1-D tensor with one value per batch element.
        dim: width of the returned embedding.
        flip_sin_to_cos: when true the layout is ``[cos | sin]``, otherwise
            ``[sin | cos]``.
        downscale_freq_shift: subtracted from the number of frequencies in the
            exponent's denominator (``half - downscale_freq_shift``). With the
            default of 0 the frequencies are ``max_period ** (-i / half)``.
        scale: multiplier applied to the sinusoid arguments.
        max_period: controls the lowest frequency of the embedding.

    Returns:
        float32 tensor of shape ``[len(timesteps), dim]``; when ``dim`` is odd
        the last column is zero padding.
    """
    half = dim // 2
    exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device)
    # Honour downscale_freq_shift instead of silently dropping it; a shift of 0
    # reproduces the plain "/ half" spacing exactly.
    freqs = torch.exp(exponent / (half - downscale_freq_shift))
    args = timesteps[:, None].float() * freqs[None] * scale
    embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
    if flip_sin_to_cos:
        # Swap the two halves so the cosine terms come first.
        embedding = torch.cat([embedding[:, half:], embedding[:, :half]], dim=-1)
    if dim % 2:
        # Odd widths get one trailing zero column.
        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
    return embedding
|
||||
|
||||
|
||||
def get_3d_sincos_pos_embed(embed_dim, spatial_size, temporal_size, spatial_interpolation_scale=1.0, temporal_interpolation_scale=1.0, device=None):
    """Fixed sin-cos positional embedding over a (time, height, width) grid.

    Args:
        embed_dim: requested channel count; ``embed_dim // 3`` channels encode
            time and ``2 * (embed_dim // 3)`` encode space, so the returned
            width is ``3 * (embed_dim // 3)``.
        spatial_size: ``(width, height)`` in grid cells, or a single int used
            for both.
        temporal_size: number of grid cells along time.
        spatial_interpolation_scale: divisor applied to spatial coordinates.
        temporal_interpolation_scale: divisor applied to temporal coordinates.
        device: device on which the grids are created.

    Returns:
        Tensor of shape ``[T, H, W, C]`` with the temporal channels first.
    """
    # NOTE(review): the diffusers reference this file cites is believed to split
    # channels 1/4 temporal : 3/4 spatial, whereas this uses 1/3 : 2/3 - confirm.
    if isinstance(spatial_size, int):
        spatial_size = (spatial_size, spatial_size)

    def _axis(length, divisor):
        return torch.arange(length, dtype=torch.float32, device=device) / divisor

    w_coords = _axis(spatial_size[0], spatial_interpolation_scale)
    h_coords = _axis(spatial_size[1], spatial_interpolation_scale)
    t_coords = _axis(temporal_size, temporal_interpolation_scale)

    # "ij" indexing keeps the argument order, so every mesh is [T, H, W].
    t_mesh, h_mesh, w_mesh = torch.meshgrid(t_coords, h_coords, w_coords, indexing="ij")

    temporal_dim = embed_dim // 3
    spatial_dim = 2 * (embed_dim // 3)

    spatial_part = _get_2d_sincos_pos_embed(spatial_dim, h_mesh, w_mesh, device=device)
    # The time coordinate is constant across H and W, so one column is enough.
    temporal_part = _get_1d_sincos_pos_embed(temporal_dim, t_mesh[:, 0, 0], device=device)

    _, rows, cols = t_mesh.shape
    temporal_part = temporal_part[:, None, None, :].expand(-1, rows, cols, -1)

    return torch.cat([temporal_part, spatial_part], dim=-1)
|
||||
|
||||
|
||||
def _get_2d_sincos_pos_embed(embed_dim, grid_h, grid_w, device=None):
    """Sin-cos embedding of per-cell (row, column) coordinates.

    ``grid_h`` and ``grid_w`` are 3-D coordinate meshes of identical shape
    ``[T, H, W]``. Each axis receives ``embed_dim // 2`` channels; the
    height channels come first in the result.

    Returns:
        Tensor of shape ``[T, H, W, 2 * (embed_dim // 2)]``.
    """
    frames, rows, cols = grid_h.shape
    per_axis = embed_dim // 2

    def _encode(mesh):
        # Encode the flattened coordinates, then restore the grid layout.
        flat = _get_1d_sincos_pos_embed(per_axis, mesh.reshape(-1), device=device)
        return flat.reshape(frames, rows, cols, per_axis)

    height_part = _encode(grid_h)
    width_part = _encode(grid_w)
    return torch.cat([height_part, width_part], dim=-1)
|
||||
|
||||
|
||||
def _get_1d_sincos_pos_embed(embed_dim, pos, device=None):
|
||||
half = embed_dim // 2
|
||||
freqs = torch.exp(-math.log(10000.0) * torch.arange(start=0, end=half, dtype=torch.float32, device=device) / half)
|
||||
args = pos.float().reshape(-1)[:, None] * freqs[None]
|
||||
embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
|
||||
if embed_dim % 2:
|
||||
embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
|
||||
return embedding
|
||||
|
||||
|
||||
|
||||
class CogVideoXPatchEmbed(nn.Module):
    """Turn text embeddings and video latents into one joint token sequence.

    Text embeddings are projected to ``dim`` with a linear layer. Video latents
    are patchified either per frame with a strided ``Conv2d`` (when
    ``patch_size_t`` is ``None``) or as 3-D patches fed to a ``Linear`` layer.
    The output sequence is ``[text tokens, image tokens]``; positional
    embeddings, when enabled, are added to the image tokens only.
    """

    def __init__(self, patch_size=2, patch_size_t=None, in_channels=16, dim=1920,
                 text_dim=4096, bias=True, sample_width=90, sample_height=60,
                 sample_frames=49, temporal_compression_ratio=4,
                 max_text_seq_length=226, spatial_interpolation_scale=1.875,
                 temporal_interpolation_scale=1.0, use_positional_embeddings=True,
                 use_learned_positional_embeddings=True,
                 device=None, dtype=None, operations=None):
        super().__init__()
        # Patch geometry and output width.
        self.patch_size = patch_size
        self.patch_size_t = patch_size_t
        self.dim = dim
        # Reference sample geometry used to size the stored positional table.
        self.sample_height = sample_height
        self.sample_width = sample_width
        self.sample_frames = sample_frames
        self.temporal_compression_ratio = temporal_compression_ratio
        self.max_text_seq_length = max_text_seq_length
        # Positional-embedding configuration.
        self.spatial_interpolation_scale = spatial_interpolation_scale
        self.temporal_interpolation_scale = temporal_interpolation_scale
        self.use_positional_embeddings = use_positional_embeddings
        self.use_learned_positional_embeddings = use_learned_positional_embeddings

        factory = dict(device=device, dtype=dtype)
        if patch_size_t is None:
            # 2-D patches: one strided convolution applied to every frame.
            self.proj = operations.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size, bias=bias, **factory)
        else:
            # 3-D patches: each flattened (p_t, p, p, C) patch goes through a linear layer.
            patch_volume = in_channels * patch_size * patch_size * patch_size_t
            self.proj = operations.Linear(patch_volume, dim, **factory)

        self.text_proj = operations.Linear(text_dim, dim, **factory)

        if use_positional_embeddings or use_learned_positional_embeddings:
            table = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
            # The table is only persistent (part of the state dict) when it is learned.
            self.register_buffer("pos_embedding", table, persistent=use_learned_positional_embeddings)

    def _get_positional_embeddings(self, sample_height, sample_width, sample_frames, device=None):
        """Return a ``[1, max_text_seq_length + num_patches, dim]`` table.

        The text rows are zero; the remaining rows hold the flattened 3-D
        sin-cos embedding for the given sample size.
        """
        grid_h = sample_height // self.patch_size
        grid_w = sample_width // self.patch_size
        grid_t = (sample_frames - 1) // self.temporal_compression_ratio + 1
        if self.patch_size_t is not None:
            grid_t = grid_t // self.patch_size_t
        num_patches = grid_h * grid_w * grid_t

        image_table = get_3d_sincos_pos_embed(
            self.dim,
            (grid_w, grid_h),
            grid_t,
            self.spatial_interpolation_scale,
            self.temporal_interpolation_scale,
            device=device,
        ).reshape(-1, self.dim)

        joint_table = image_table.new_zeros(
            1, self.max_text_seq_length + num_patches, self.dim, requires_grad=False
        )
        joint_table.data[:, self.max_text_seq_length:].copy_(image_table)
        return joint_table

    def forward(self, text_embeds, image_embeds):
        """Embed text and latents and return the concatenated token sequence.

        Args:
            text_embeds: ``[B, text_len, text_dim]``; its dtype is the dtype of
                the result.
            image_embeds: ``[B, T, C, H, W]`` latents.

        Returns:
            ``[B, text_len + num_image_tokens, dim]``.
        """
        out_dtype = text_embeds.dtype
        # Run each projection in the dtype of its own weights, then cast back.
        text_embeds = self.text_proj(text_embeds.to(self.text_proj.weight.dtype)).to(out_dtype)
        batch_size, num_frames, channels, height, width = image_embeds.shape

        proj_dtype = self.proj.weight.dtype
        if self.patch_size_t is None:
            frames = image_embeds.reshape(-1, channels, height, width)
            frames = self.proj(frames.to(proj_dtype)).to(out_dtype)
            frames = frames.view(batch_size, num_frames, *frames.shape[1:])
            # [B, T, D, h*w] -> [B, T, h*w, D] -> [B, T*h*w, D]
            image_embeds = frames.flatten(3).transpose(2, 3).flatten(1, 2)
        else:
            p = self.patch_size
            p_t = self.patch_size_t
            # Channels last, then split every axis into (grid, patch) pairs.
            patches = image_embeds.permute(0, 1, 3, 4, 2).reshape(
                batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
            )
            # Gather (C, p_t, p, p) as the feature axis and merge the grid axes into tokens.
            patches = patches.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
            image_embeds = self.proj(patches.to(proj_dtype)).to(out_dtype)

        embeds = torch.cat([text_embeds, image_embeds], dim=1).contiguous()

        if not (self.use_positional_embeddings or self.use_learned_positional_embeddings):
            return embeds

        text_seq_length = text_embeds.shape[1]
        num_image_patches = image_embeds.shape[1]

        if self.use_learned_positional_embeddings:
            # Slice the image rows out of the stored table.
            first = self.max_text_seq_length
            image_pos = self.pos_embedding[:, first:first + num_image_patches].to(
                device=embeds.device, dtype=embeds.dtype
            )
        else:
            # Recompute the sin-cos table for the actual input resolution.
            grid_h = height // self.patch_size
            grid_w = width // self.patch_size
            image_pos = get_3d_sincos_pos_embed(
                self.dim,
                (grid_w, grid_h),
                num_image_patches // (grid_h * grid_w),
                self.spatial_interpolation_scale,
                self.temporal_interpolation_scale,
                device=embeds.device,
            ).reshape(1, num_image_patches, self.dim).to(dtype=embeds.dtype)

        # Joint table: zeros for the text rows, positional values for the image rows.
        joint_pos = torch.zeros(1, text_seq_length + num_image_patches, self.dim, device=embeds.device, dtype=embeds.dtype)
        joint_pos[:, text_seq_length:] = image_pos
        return embeds + joint_pos
|
||||
|
||||
|
||||
class CogVideoXLayerNormZero(nn.Module):
    """Timestep-conditioned LayerNorm for the video and text streams.

    One linear layer maps the SiLU-activated time embedding to six ``dim``-wide
    vectors: shift / scale / gate for the hidden states and shift / scale /
    gate for the encoder (text) states. Both streams share the same LayerNorm.
    """

    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5, bias=True,
                 device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype)
        self.silu = nn.SiLU()
        self.linear = operations.Linear(time_dim, 6 * dim, bias=bias, **factory)
        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, **factory)

    def forward(self, hidden_states, encoder_hidden_states, temb):
        """Return ``(hidden, encoder_hidden, gate, enc_gate)``.

        Both normalized streams are modulated as ``norm(x) * (1 + scale) + shift``.
        The gates are returned with a singleton sequence axis, ``[B, 1, dim]``.
        """
        modulation = self.linear(self.silu(temb))
        # Insert the sequence axis once so every piece broadcasts over tokens.
        shift, scale, gate, enc_shift, enc_scale, enc_gate = (
            piece[:, None, :] for piece in modulation.chunk(6, dim=1)
        )
        hidden_states = self.norm(hidden_states) * (1 + scale) + shift
        encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale) + enc_shift
        return hidden_states, encoder_hidden_states, gate, enc_gate
|
||||
|
||||
|
||||
class CogVideoXAdaLayerNorm(nn.Module):
    """LayerNorm whose shift and scale are predicted from the time embedding."""

    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5,
                 device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype)
        self.silu = nn.SiLU()
        self.linear = operations.Linear(time_dim, 2 * dim, **factory)
        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, **factory)

    def forward(self, x, temb):
        """Return ``norm(x) * (1 + scale) + shift`` with per-sample shift / scale.

        ``temb`` is ``[B, time_dim]``; the first half of the projected vector is
        the shift and the second half the scale, both broadcast over tokens.
        """
        shift, scale = self.linear(self.silu(temb)).chunk(2, dim=1)
        normed = self.norm(x)
        return normed * (1 + scale)[:, None, :] + shift[:, None, :]
|
||||
|
||||
|
||||
class CogVideoXBlock(nn.Module):
    """One CogVideoX transformer layer operating on text and video tokens jointly.

    The layer runs gated self-attention over the concatenated
    ``[text, video]`` sequence followed by a gated tanh-GELU feed-forward. Each
    half is preceded by a timestep-conditioned ``CogVideoXLayerNormZero``.
    """

    def __init__(self, dim, num_heads, head_dim, time_dim,
                 eps=1e-5, ff_inner_dim=None, ff_bias=True,
                 device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype)
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = head_dim

        self.norm1 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, operations=operations, **factory)

        # Joint self-attention: Q/K/V projections, per-head Q/K LayerNorm, output projection.
        self.q = operations.Linear(dim, dim, bias=True, **factory)
        self.k = operations.Linear(dim, dim, bias=True, **factory)
        self.v = operations.Linear(dim, dim, bias=True, **factory)
        self.norm_q = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, **factory)
        self.norm_k = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, **factory)
        self.attn_out = operations.Linear(dim, dim, bias=True, **factory)

        self.norm2 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, operations=operations, **factory)

        # Feed-forward; the hidden width defaults to 4 * dim.
        hidden_width = ff_inner_dim or dim * 4
        self.ff_proj = operations.Linear(dim, hidden_width, bias=ff_bias, **factory)
        self.ff_out = operations.Linear(hidden_width, dim, bias=ff_bias, **factory)

    def forward(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=None, transformer_options=None):
        """Run the layer and return ``(hidden_states, encoder_hidden_states)``.

        Args:
            hidden_states: video tokens ``[B, img_len, dim]``.
            encoder_hidden_states: text tokens ``[B, text_len, dim]``.
            temb: time embedding consumed by the two modulated norms.
            image_rotary_emb: optional ``(cos, sin)`` tables applied to the
                queries and keys of the video tokens only.
            transformer_options: forwarded to ``optimized_attention``; ``None``
                is treated as an empty dict.
        """
        if transformer_options is None:
            transformer_options = {}
        text_len = encoder_hidden_states.size(1)

        # --- attention half ---
        mod_hidden, mod_text, gate_msa, text_gate_msa = self.norm1(hidden_states, encoder_hidden_states, temb)

        joint = torch.cat([mod_text, mod_hidden], dim=1)
        batch, seq_len, _ = joint.shape
        heads, width = self.num_heads, self.head_dim

        query = self.q(joint).view(batch, seq_len, heads, width)
        key = self.k(joint).view(batch, seq_len, heads, width)
        value = self.v(joint)

        # LayerNorm over head_dim for queries and keys.
        query = self.norm_q(query).view(batch, seq_len, heads, width)
        key = self.norm_k(key).view(batch, seq_len, heads, width)

        if image_rotary_emb is not None:
            # Rotary helper expects [B, heads, seq, head_dim]; text tokens stay unrotated.
            query_img = apply_rotary_emb(query[:, text_len:].transpose(1, 2), image_rotary_emb)
            key_img = apply_rotary_emb(key[:, text_len:].transpose(1, 2), image_rotary_emb)
            query = torch.cat([query[:, :text_len], query_img.transpose(1, 2)], dim=1)
            key = torch.cat([key[:, :text_len], key_img.transpose(1, 2)], dim=1)

        attended = optimized_attention(
            query.reshape(batch, seq_len, heads * width),
            key.reshape(batch, seq_len, heads * width),
            value,
            heads=self.num_heads,
            transformer_options=transformer_options,
        )
        attended = self.attn_out(attended)

        attn_text, attn_hidden = attended.split([text_len, seq_len - text_len], dim=1)

        hidden_states = hidden_states + gate_msa * attn_hidden
        encoder_hidden_states = encoder_hidden_states + text_gate_msa * attn_text

        # --- feed-forward half ---
        mod_hidden, mod_text, gate_ff, text_gate_ff = self.norm2(hidden_states, encoder_hidden_states, temb)

        # The MLP is applied once to the concatenated text + video tokens.
        ff_in = torch.cat([mod_text, mod_hidden], dim=1)
        ff_result = self.ff_out(F.gelu(self.ff_proj(ff_in), approximate="tanh"))

        hidden_states = hidden_states + gate_ff * ff_result[:, text_len:]
        encoder_hidden_states = encoder_hidden_states + text_gate_ff * ff_result[:, :text_len]

        return hidden_states, encoder_hidden_states
|
||||
|
||||
|
||||
class CogVideoXTransformer3DModel(nn.Module):
|
||||
def __init__(self,
|
||||
num_attention_heads=30,
|
||||
attention_head_dim=64,
|
||||
in_channels=16,
|
||||
out_channels=16,
|
||||
flip_sin_to_cos=True,
|
||||
freq_shift=0,
|
||||
time_embed_dim=512,
|
||||
ofs_embed_dim=None,
|
||||
text_embed_dim=4096,
|
||||
num_layers=30,
|
||||
dropout=0.0,
|
||||
attention_bias=True,
|
||||
sample_width=90,
|
||||
sample_height=60,
|
||||
sample_frames=49,
|
||||
patch_size=2,
|
||||
patch_size_t=None,
|
||||
temporal_compression_ratio=4,
|
||||
max_text_seq_length=226,
|
||||
spatial_interpolation_scale=1.875,
|
||||
temporal_interpolation_scale=1.0,
|
||||
use_rotary_positional_embeddings=False,
|
||||
use_learned_positional_embeddings=False,
|
||||
patch_bias=True,
|
||||
image_model=None,
|
||||
device=None,
|
||||
dtype=None,
|
||||
operations=None,
|
||||
):
|
||||
super().__init__()
|
||||
self.dtype = dtype
|
||||
dim = num_attention_heads * attention_head_dim
|
||||
self.dim = dim
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.attention_head_dim = attention_head_dim
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.patch_size = patch_size
|
||||
self.patch_size_t = patch_size_t
|
||||
self.max_text_seq_length = max_text_seq_length
|
||||
self.use_rotary_positional_embeddings = use_rotary_positional_embeddings
|
||||
|
||||
# 1. Patch embedding
|
||||
self.patch_embed = CogVideoXPatchEmbed(
|
||||
patch_size=patch_size,
|
||||
patch_size_t=patch_size_t,
|
||||
in_channels=in_channels,
|
||||
dim=dim,
|
||||
text_dim=text_embed_dim,
|
||||
bias=patch_bias,
|
||||
sample_width=sample_width,
|
||||
sample_height=sample_height,
|
||||
sample_frames=sample_frames,
|
||||
temporal_compression_ratio=temporal_compression_ratio,
|
||||
max_text_seq_length=max_text_seq_length,
|
||||
spatial_interpolation_scale=spatial_interpolation_scale,
|
||||
temporal_interpolation_scale=temporal_interpolation_scale,
|
||||
use_positional_embeddings=not use_rotary_positional_embeddings,
|
||||
use_learned_positional_embeddings=use_learned_positional_embeddings,
|
||||
device=device, dtype=torch.float32, operations=operations,
|
||||
)
|
||||
|
||||
# 2. Time embedding
|
||||
self.time_proj_dim = dim
|
||||
self.time_proj_flip = flip_sin_to_cos
|
||||
self.time_proj_shift = freq_shift
|
||||
self.time_embedding_linear_1 = operations.Linear(dim, time_embed_dim, device=device, dtype=dtype)
|
||||
self.time_embedding_act = nn.SiLU()
|
||||
self.time_embedding_linear_2 = operations.Linear(time_embed_dim, time_embed_dim, device=device, dtype=dtype)
|
||||
|
||||
# Optional OFS embedding (CogVideoX 1.5 I2V)
|
||||
self.ofs_proj_dim = ofs_embed_dim
|
||||
if ofs_embed_dim:
|
||||
self.ofs_embedding_linear_1 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
|
||||
self.ofs_embedding_act = nn.SiLU()
|
||||
self.ofs_embedding_linear_2 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
|
||||
else:
|
||||
self.ofs_embedding_linear_1 = None
|
||||
|
||||
# 3. Transformer blocks
|
||||
self.blocks = nn.ModuleList([
|
||||
CogVideoXBlock(
|
||||
dim=dim,
|
||||
num_heads=num_attention_heads,
|
||||
head_dim=attention_head_dim,
|
||||
time_dim=time_embed_dim,
|
||||
eps=1e-5,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
for _ in range(num_layers)
|
||||
])
|
||||
|
||||
self.norm_final = operations.LayerNorm(dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype)
|
||||
|
||||
# 4. Output
|
||||
self.norm_out = CogVideoXAdaLayerNorm(
|
||||
time_dim=time_embed_dim, dim=dim, eps=1e-5,
|
||||
device=device, dtype=dtype, operations=operations,
|
||||
)
|
||||
|
||||
if patch_size_t is None:
|
||||
output_dim = patch_size * patch_size * out_channels
|
||||
else:
|
||||
output_dim = patch_size * patch_size * patch_size_t * out_channels
|
||||
|
||||
self.proj_out = operations.Linear(dim, output_dim, device=device, dtype=dtype)
|
||||
|
||||
self.spatial_interpolation_scale = spatial_interpolation_scale
|
||||
self.temporal_interpolation_scale = temporal_interpolation_scale
|
||||
self.temporal_compression_ratio = temporal_compression_ratio
|
||||
|
||||
def forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
    """Public entry point: run ``_forward`` through the diffusion-model wrappers.

    A missing ``transformer_options`` is replaced by an empty dict before the
    wrappers registered under ``WrappersMP.DIFFUSION_MODEL`` are looked up.
    All arguments are handed to the executor unchanged.
    """
    options = {} if transformer_options is None else transformer_options
    wrappers = comfy.patcher_extension.get_all_wrappers(
        comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, options
    )
    executor = comfy.patcher_extension.WrapperExecutor.new_class_executor(
        self._forward, self, wrappers
    )
    return executor.execute(x, timestep, context, ofs, options, **kwargs)
|
||||
|
||||
def _forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
    """Run the transformer on a channel-first video latent.

    The input is padded to a whole number of patches, embedded together with
    the text context, passed through every transformer block, projected back
    to patches, unpatchified, and finally cropped to the input's T/H/W.

    Args:
        x: latent tensor laid out as [B, C, T, H, W].
        timestep: value(s) handed to ``get_timestep_embedding``.
        context: text conditioning; its dimension 1 is the text sequence length.
        ofs: optional OFS conditioning, used only when the model has an OFS
            embedding and a value is supplied.
        transformer_options: dict forwarded to each block (empty if None).

    Returns:
        Tensor laid out as [B, C_out, T, H, W] with the input's T, H and W.
    """
    if transformer_options is None:
        transformer_options = {}

    # Keep the caller's unpadded extent so the result can be cropped back.
    _, _, t, h, w = x.shape

    temporal_patch = 1 if self.patch_size_t is None else self.patch_size_t
    x = comfy.ldm.common_dit.pad_to_patch_size(x, (temporal_patch, self.patch_size, self.patch_size))

    # [B, C, T, H, W] -> [B, T, C, H, W], the layout the patch embedding expects.
    x = x.permute(0, 2, 1, 3, 4)
    batch_size, num_frames, _, height, width = x.shape

    # Timestep conditioning: projection -> Linear -> SiLU -> Linear.
    t_emb = get_timestep_embedding(timestep, self.time_proj_dim, self.time_proj_flip, self.time_proj_shift)
    t_emb = t_emb.to(dtype=x.dtype)
    emb = self.time_embedding_linear_1(t_emb)
    emb = self.time_embedding_act(emb)
    emb = self.time_embedding_linear_2(emb)

    # Optional OFS conditioning is embedded the same way and added on top.
    if self.ofs_embedding_linear_1 is not None and ofs is not None:
        ofs_emb = get_timestep_embedding(ofs, self.ofs_proj_dim, self.time_proj_flip, self.time_proj_shift)
        ofs_emb = ofs_emb.to(dtype=x.dtype)
        ofs_emb = self.ofs_embedding_linear_1(ofs_emb)
        ofs_emb = self.ofs_embedding_act(ofs_emb)
        ofs_emb = self.ofs_embedding_linear_2(ofs_emb)
        emb = emb + ofs_emb

    # The patch embedding returns text tokens followed by video tokens.
    tokens = self.patch_embed(context, x)
    text_len = context.shape[1]
    encoder_hidden_states = tokens[:, :text_len]
    hidden_states = tokens[:, text_len:]

    image_rotary_emb = None
    if self.use_rotary_positional_embeddings:
        grid_h = height // self.patch_size
        grid_w = width // self.patch_size
        grid_t = num_frames if self.patch_size_t is None else num_frames // self.patch_size_t
        image_rotary_emb = self._get_rotary_emb(grid_h, grid_w, grid_t, device=x.device)

    for block in self.blocks:
        hidden_states, encoder_hidden_states = block(
            hidden_states=hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            temb=emb,
            image_rotary_emb=image_rotary_emb,
            transformer_options=transformer_options,
        )

    hidden_states = self.norm_final(hidden_states)
    hidden_states = self.norm_out(hidden_states, temb=emb)
    hidden_states = self.proj_out(hidden_states)

    # Unpatchify back to [B, T, C, H, W].
    p = self.patch_size
    p_t = self.patch_size_t
    if p_t is not None:
        output = hidden_states.reshape(
            batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
        )
        output = output.permute(0, 1, 5, 4, 2, 6, 3, 7)
        output = output.flatten(6, 7).flatten(4, 5).flatten(1, 2)
    else:
        output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
        output = output.permute(0, 1, 4, 2, 5, 3, 6)
        output = output.flatten(5, 6).flatten(3, 4)

    # Back to [B, C, T, H, W]; drop whatever pad_to_patch_size added.
    return output.permute(0, 2, 1, 3, 4)[:, :, :t, :h, :w]
|
||||
|
||||
def _get_rotary_emb(self, h, w, t, device):
    """Build the 3D rotary position tables for a ``t`` x ``h`` x ``w`` token grid.

    When ``patch_size_t`` is set ("slice" mode) the spatial positions are
    integer indices generated up to max(sample size in patches, actual size)
    and then cut down to the actual size. Otherwise ("linspace" mode) they
    are ``linspace(0, n - 1, n)`` multiplied by ``spatial_interpolation_scale``.
    Temporal positions are integer indices in both modes.

    Returns:
        ``(cos, sin)``, each reshaped to ``[t * h * w, -1]`` with the temporal,
        height and width parts concatenated along the last dimension.
    """
    head_dim = self.attention_head_dim
    dim_t = head_dim // 4
    dim_h = head_dim // 8 * 3
    dim_w = head_dim // 8 * 3

    def indices(count):
        return torch.arange(count, device=device, dtype=torch.float32)

    if self.patch_size_t is None:
        scale = self.spatial_interpolation_scale
        grid_h = torch.linspace(0, h - 1, h, device=device, dtype=torch.float32) * scale
        grid_w = torch.linspace(0, w - 1, w, device=device, dtype=torch.float32) * scale
    else:
        sample_h = self.patch_embed.sample_height // self.patch_size
        sample_w = self.patch_embed.sample_width // self.patch_size
        grid_h = indices(max(sample_h, h))
        grid_w = indices(max(sample_w, w))
    grid_t = indices(t)

    freqs_t = _get_1d_rotary_pos_embed(dim_t, grid_t)
    freqs_h = _get_1d_rotary_pos_embed(dim_h, grid_h)
    freqs_w = _get_1d_rotary_pos_embed(dim_w, grid_w)

    # Cut each table to the real grid size (a no-op unless the slice-mode grid
    # was larger) and broadcast it over the other two axes.
    t_cos, t_sin = (part[:t, None, None, :].expand(-1, h, w, -1) for part in freqs_t)
    h_cos, h_sin = (part[None, :h, None, :].expand(t, -1, w, -1) for part in freqs_h)
    w_cos, w_sin = (part[None, None, :w, :].expand(t, h, -1, -1) for part in freqs_w)

    tokens = t * h * w
    cos = torch.cat([t_cos, h_cos, w_cos], dim=-1).reshape(tokens, -1)
    sin = torch.cat([t_sin, h_sin, w_sin], dim=-1).reshape(tokens, -1)
    return (cos, sin)
|
||||
566
comfy/ldm/cogvideo/vae.py
Normal file
566
comfy/ldm/cogvideo/vae.py
Normal file
@ -0,0 +1,566 @@
|
||||
# CogVideoX VAE - ported to ComfyUI native ops
|
||||
# Architecture reference: diffusers AutoencoderKLCogVideoX
|
||||
# Style reference: comfy/ldm/wan/vae.py
|
||||
|
||||
import numpy as np
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
import comfy.ops
|
||||
ops = comfy.ops.disable_weight_init
|
||||
|
||||
|
||||
class CausalConv3d(nn.Module):
    """3D convolution that is causal along the time axis.

    Temporal context comes only from the past: the input is extended at the
    front by ``time_kernel - 1`` frames, taken from ``conv_cache`` when the
    caller supplies one (chunked processing) or from copies of the first
    frame otherwise.

    ``pad_mode == "replicate"`` pads time, height and width with
    ``F.pad(..., mode="replicate")`` and never produces a cache. Any other
    ``pad_mode`` uses the cache / first-frame scheme above, with zero spatial
    padding applied by the convolution itself.

    ``forward`` returns ``(output, conv_cache)``, where ``conv_cache`` holds
    the last ``time_kernel - 1`` frames of the extended input (or None when
    there is nothing to carry over).
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, pad_mode="constant"):
        super().__init__()
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size,) * 3

        time_kernel, height_kernel, width_kernel = kernel_size
        self.time_kernel_size = time_kernel
        self.pad_mode = pad_mode

        height_pad = (height_kernel - 1) // 2
        width_pad = (width_kernel - 1) // 2
        # F.pad order: (W left, W right, H top, H bottom, T front, T back).
        self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_kernel - 1, 0)

        stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
        dilation = (dilation, 1, 1)
        # In "replicate" mode forward() already pads H and W through F.pad, so
        # the convolution must not pad them a second time (that would enlarge
        # the output). The other modes rely on the conv's own zero padding.
        conv_padding = 0 if pad_mode == "replicate" else (0, height_pad, width_pad)
        self.conv = ops.Conv3d(
            in_channels, out_channels, kernel_size,
            stride=stride, dilation=dilation,
            padding=conv_padding,
        )

    def forward(self, x, conv_cache=None):
        """Apply the causal convolution.

        Args:
            x: input indexed as [B, C, T, H, W] (time is dimension 2).
            conv_cache: trailing frames returned by the previous call when
                processing a sequence in temporal chunks, or None.

        Returns:
            ``(out, conv_cache)``; pass the cache to the next chunk's call.
        """
        if self.pad_mode == "replicate":
            x = F.pad(x, self.time_causal_padding, mode="replicate")
            return self.conv(x), None

        kernel_t = self.time_kernel_size
        if kernel_t <= 1:
            # No temporal extent: nothing to pad and nothing to remember.
            return self.conv(x), None

        if conv_cache is None and x.shape[2] == 1:
            # Fast path: a single frame with no history. Every padded frame
            # would be a copy of this frame, so the 3D conv equals a conv with
            # the kernel summed over its temporal taps.
            w = comfy.ops.cast_to_input(self.conv.weight, x)
            b = comfy.ops.cast_to_input(self.conv.bias, x) if self.conv.bias is not None else None
            w2d = w.sum(dim=2, keepdim=True)
            out = F.conv3d(x, w2d, b,
                           self.conv.stride, self.conv.padding,
                           self.conv.dilation, self.conv.groups)
            # Hand back the same cache the general path would have produced
            # (the last kernel_t - 1 frames of the padded input, all copies of
            # this frame). Returning None here would make the next chunk pad
            # with its own first frame instead of this one.
            return out, x.repeat(1, 1, kernel_t - 1, 1, 1)

        history = [conv_cache] if conv_cache is not None else [x[:, :, :1]] * (kernel_t - 1)
        x = torch.cat(history + [x], dim=2)
        conv_cache = x[:, :, -kernel_t + 1:].clone()

        out = self.conv(x)
        return out, conv_cache
|
||||
|
||||
|
||||
def _interpolate_zq(zq, target_size):
|
||||
"""Interpolate latent z to target (T, H, W), matching CogVideoX's first-frame-special handling."""
|
||||
t = target_size[0]
|
||||
if t > 1 and t % 2 == 1:
|
||||
z_first = F.interpolate(zq[:, :, :1], size=(1, target_size[1], target_size[2]))
|
||||
z_rest = F.interpolate(zq[:, :, 1:], size=(t - 1, target_size[1], target_size[2]))
|
||||
return torch.cat([z_first, z_rest], dim=2)
|
||||
return F.interpolate(zq, size=target_size)
|
||||
|
||||
|
||||
class SpatialNorm3D(nn.Module):
    """Group norm whose output is scaled and shifted by projections of a latent.

    ``forward(f, zq)`` returns ``norm(f) * conv_y(zq) + conv_b(zq)`` together
    with the caches of the two 1x1x1 causal convolutions. ``zq`` is resized
    to ``f``'s (T, H, W) first when the two differ.
    """
    def __init__(self, f_channels, zq_channels, groups=32):
        super().__init__()
        self.norm_layer = ops.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
        # Per-position scale (conv_y) and shift (conv_b) derived from zq.
        self.conv_y = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
        self.conv_b = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)

    def forward(self, f, zq, conv_cache=None):
        previous = conv_cache or {}

        target = f.shape[-3:]
        if zq.shape[-3:] != target:
            zq = _interpolate_zq(zq, target)

        scale, scale_cache = self.conv_y(zq, conv_cache=previous.get("conv_y"))
        shift, shift_cache = self.conv_b(zq, conv_cache=previous.get("conv_b"))

        updated = {"conv_y": scale_cache, "conv_b": shift_cache}
        return self.norm_layer(f) * scale + shift, updated
|
||||
|
||||
|
||||
class ResnetBlock3D(nn.Module):
    """Residual block: norm -> SiLU -> causal conv, twice, plus a skip path.

    The norms are plain group norms when ``spatial_norm_dim`` is None and
    ``SpatialNorm3D`` (conditioned on a latent ``zq``) otherwise. ``forward``
    returns ``(output, cache)`` where ``cache`` collects the conv caches of
    the sub-layers under the keys "norm1", "conv1", "norm2" and "conv2".
    """
    def __init__(self, in_channels, out_channels=None, temb_channels=512, groups=32,
                 eps=1e-6, act_fn="silu", spatial_norm_dim=None, pad_mode="first"):
        super().__init__()
        out_channels = out_channels or in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.spatial_norm_dim = spatial_norm_dim

        # Every value of act_fn ("silu", "swish" or anything else) selects SiLU.
        self.nonlinearity = nn.SiLU()

        if spatial_norm_dim is not None:
            self.norm1 = SpatialNorm3D(in_channels, spatial_norm_dim, groups=groups)
            self.norm2 = SpatialNorm3D(out_channels, spatial_norm_dim, groups=groups)
        else:
            self.norm1 = ops.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
            self.norm2 = ops.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)

        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode)

        # The time-embedding projection exists only when there is an embedding.
        if temb_channels > 0:
            self.temb_proj = ops.Linear(temb_channels, out_channels)

        self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, pad_mode=pad_mode)

        # A 1x1x1 conv matches channel counts on the skip path when they differ.
        self.conv_shortcut = None
        if in_channels != out_channels:
            self.conv_shortcut = ops.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        previous = conv_cache or {}
        updated = {}
        skip = x

        def normalize(layer, key, value):
            # Spatial norms take zq and return a cache; group norms do neither.
            if zq is None:
                return layer(value)
            value, updated[key] = layer(value, zq, conv_cache=previous.get(key))
            return value

        x = normalize(self.norm1, "norm1", x)
        x = self.nonlinearity(x)
        x, updated["conv1"] = self.conv1(x, conv_cache=previous.get("conv1"))

        if temb is not None and hasattr(self, "temb_proj"):
            projected = self.temb_proj(self.nonlinearity(temb))
            x = x + projected[:, :, None, None, None]

        x = normalize(self.norm2, "norm2", x)
        x = self.nonlinearity(x)
        x, updated["conv2"] = self.conv2(x, conv_cache=previous.get("conv2"))

        if self.conv_shortcut is not None:
            skip = self.conv_shortcut(skip)

        return x + skip, updated
|
||||
|
||||
|
||||
class Downsample3D(nn.Module):
    """Per-frame strided 2D conv downsampling, optionally halving time first.

    With ``compress_time`` the time axis is average-pooled in pairs; for an
    odd frame count the first frame is kept as is and only the rest is pooled.
    """
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=0, compress_time=False):
        super().__init__()
        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.compress_time = compress_time

    def forward(self, x):
        if self.compress_time:
            b, c, t, h, w = x.shape
            # Fold the spatial positions into the batch so time is the pooled axis.
            seq = x.permute(0, 3, 4, 1, 2).reshape(b * h * w, c, t)
            if t % 2 == 1:
                first, rest = seq[..., 0], seq[..., 1:]
                if rest.shape[-1] > 0:
                    rest = F.avg_pool1d(rest, kernel_size=2, stride=2)
                seq = torch.cat([first[..., None], rest], dim=-1)
            else:
                seq = F.avg_pool1d(seq, kernel_size=2, stride=2)
            x = seq.reshape(b, h, w, c, seq.shape[-1]).permute(0, 3, 4, 1, 2)

        # One extra zero column on the right and row at the bottom before the conv.
        x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)

        b, c, t, h, w = x.shape
        frames = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
        frames = self.conv(frames)
        frames = frames.reshape(b, t, frames.shape[1], frames.shape[2], frames.shape[3])
        return frames.permute(0, 2, 1, 3, 4)
|
||||
|
||||
|
||||
class Upsample3D(nn.Module):
    """2x upsampling followed by a per-frame 2D conv.

    Without ``compress_time`` only height and width are doubled. With it, a
    multi-frame input is also doubled in time; for an odd frame count the
    first frame is upsampled spatially only and the rest in all three axes.
    A single-frame input is upsampled spatially only.
    """
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, compress_time=False):
        super().__init__()
        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
        self.compress_time = compress_time

    def forward(self, x):
        if not self.compress_time:
            b, c, t, h, w = x.shape
            flat = x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w)
            flat = F.interpolate(flat, scale_factor=2.0)
            x = flat.reshape(b, t, c, *flat.shape[2:]).permute(0, 2, 1, 3, 4)
        else:
            frames = x.shape[2]
            if frames <= 1:
                single = F.interpolate(x.squeeze(2), scale_factor=2.0)
                x = single[:, :, None, :, :]
            elif frames % 2 == 1:
                head = F.interpolate(x[:, :, 0], scale_factor=2.0)
                tail = F.interpolate(x[:, :, 1:], scale_factor=2.0)
                x = torch.cat([head[:, :, None, :, :], tail], dim=2)
            else:
                x = F.interpolate(x, scale_factor=2.0)

        # Apply the 2D conv to every frame independently.
        b, c, t, h, w = x.shape
        flat = self.conv(x.permute(0, 2, 1, 3, 4).reshape(b * t, c, h, w))
        return flat.reshape(b, t, *flat.shape[1:]).permute(0, 2, 1, 3, 4)
|
||||
|
||||
|
||||
class DownBlock3D(nn.Module):
    """A stack of ``ResnetBlock3D`` layers followed by an optional downsampler.

    ``forward`` returns ``(x, cache)``; ``cache`` maps ``"resnet_<i>"`` to the
    cache of the i-th resnet. The downsampler keeps no cache.
    """
    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
                 eps=1e-6, act_fn="silu", groups=32, add_downsample=True,
                 compress_time=False, pad_mode="first"):
        super().__init__()
        layers = []
        for index in range(num_layers):
            # Only the first layer sees the block's input channel count.
            layers.append(ResnetBlock3D(
                in_channels=out_channels if index else in_channels,
                out_channels=out_channels,
                temb_channels=temb_channels,
                groups=groups, eps=eps, act_fn=act_fn, pad_mode=pad_mode,
            ))
        self.resnets = nn.ModuleList(layers)

        if add_downsample:
            self.downsamplers = nn.ModuleList([Downsample3D(out_channels, out_channels, compress_time=compress_time)])
        else:
            self.downsamplers = None

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        previous = conv_cache or {}
        updated = {}
        for index, layer in enumerate(self.resnets):
            name = f"resnet_{index}"
            x, updated[name] = layer(x, temb, zq, conv_cache=previous.get(name))

        for downsampler in (self.downsamplers if self.downsamplers is not None else ()):
            x = downsampler(x)
        return x, updated
|
||||
|
||||
|
||||
class MidBlock3D(nn.Module):
    """A stack of channel-preserving ``ResnetBlock3D`` layers.

    ``forward`` returns ``(x, cache)``; ``cache`` maps ``"resnet_<i>"`` to the
    cache of the i-th resnet.
    """
    def __init__(self, in_channels, temb_channels=0, num_layers=1,
                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=None, pad_mode="first"):
        super().__init__()
        layers = []
        for _ in range(num_layers):
            layers.append(ResnetBlock3D(
                in_channels=in_channels, out_channels=in_channels,
                temb_channels=temb_channels, groups=groups, eps=eps,
                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
            ))
        self.resnets = nn.ModuleList(layers)

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        previous = conv_cache or {}
        updated = {}
        for index, layer in enumerate(self.resnets):
            name = f"resnet_{index}"
            x, updated[name] = layer(x, temb, zq, conv_cache=previous.get(name))
        return x, updated
|
||||
|
||||
|
||||
class UpBlock3D(nn.Module):
    """A stack of ``ResnetBlock3D`` layers followed by an optional upsampler.

    ``forward`` returns ``(x, cache)``; ``cache`` maps ``"resnet_<i>"`` to the
    cache of the i-th resnet. The upsampler keeps no cache.
    """
    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=16,
                 add_upsample=True, compress_time=False, pad_mode="first"):
        super().__init__()
        layers = []
        for index in range(num_layers):
            # Only the first layer sees the block's input channel count.
            layers.append(ResnetBlock3D(
                in_channels=out_channels if index else in_channels,
                out_channels=out_channels,
                temb_channels=temb_channels, groups=groups, eps=eps,
                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
            ))
        self.resnets = nn.ModuleList(layers)

        if add_upsample:
            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, out_channels, compress_time=compress_time)])
        else:
            self.upsamplers = None

    def forward(self, x, temb=None, zq=None, conv_cache=None):
        previous = conv_cache or {}
        updated = {}
        for index, layer in enumerate(self.resnets):
            name = f"resnet_{index}"
            x, updated[name] = layer(x, temb, zq, conv_cache=previous.get(name))

        for upsampler in (self.upsamplers if self.upsamplers is not None else ()):
            x = upsampler(x)
        return x, updated
|
||||
|
||||
|
||||
class Encoder3D(nn.Module):
    """Encoder: causal conv in, down blocks, mid block, norm + SiLU + conv out.

    The first ``log2(temporal_compression_ratio)`` down blocks also compress
    time; every down block except the last downsamples. The output has
    ``2 * out_channels`` channels. ``forward`` returns ``(x, cache)`` so that
    a sequence can be encoded in temporal chunks.
    """
    def __init__(self, in_channels=3, out_channels=16,
                 block_out_channels=(128, 256, 256, 512),
                 layers_per_block=3, act_fn="silu",
                 eps=1e-6, groups=32, pad_mode="first",
                 temporal_compression_ratio=4):
        super().__init__()
        temporal_levels = int(np.log2(temporal_compression_ratio))
        first_channels = block_out_channels[0]
        last_channels = block_out_channels[-1]
        last_index = len(block_out_channels) - 1

        self.conv_in = CausalConv3d(in_channels, first_channels, kernel_size=3, pad_mode=pad_mode)

        self.down_blocks = nn.ModuleList()
        previous_channels = first_channels
        for index, channels in enumerate(block_out_channels):
            self.down_blocks.append(DownBlock3D(
                in_channels=previous_channels, out_channels=channels,
                temb_channels=0, num_layers=layers_per_block,
                eps=eps, act_fn=act_fn, groups=groups,
                add_downsample=index != last_index,
                compress_time=index < temporal_levels,
            ))
            previous_channels = channels

        self.mid_block = MidBlock3D(
            in_channels=last_channels, temb_channels=0,
            num_layers=2, eps=eps, act_fn=act_fn, groups=groups, pad_mode=pad_mode,
        )

        self.norm_out = ops.GroupNorm(groups, last_channels, eps=1e-6)
        self.conv_act = nn.SiLU()
        self.conv_out = CausalConv3d(last_channels, 2 * out_channels, kernel_size=3, pad_mode=pad_mode)

    def forward(self, x, conv_cache=None):
        previous = conv_cache or {}
        updated = {}

        x, updated["conv_in"] = self.conv_in(x, conv_cache=previous.get("conv_in"))

        for index, block in enumerate(self.down_blocks):
            name = f"down_block_{index}"
            x, updated[name] = block(x, None, None, previous.get(name))

        x, updated["mid_block"] = self.mid_block(x, None, None, conv_cache=previous.get("mid_block"))

        x = self.conv_act(self.norm_out(x))
        x, updated["conv_out"] = self.conv_out(x, conv_cache=previous.get("conv_out"))
        return x, updated
|
||||
|
||||
|
||||
class Decoder3D(nn.Module):
    """Decoder: causal conv in, mid block, up blocks, spatial norm + SiLU + conv out.

    Channel widths are ``block_out_channels`` in reverse. Every norm inside
    the mid block, the up blocks and ``norm_out`` is conditioned on the input
    latent. The first ``log2(temporal_compression_ratio)`` up blocks also
    expand time; every up block except the last upsamples. ``forward``
    returns ``(x, cache)`` so a sequence can be decoded in temporal chunks.
    """
    def __init__(self, in_channels=16, out_channels=3,
                 block_out_channels=(128, 256, 256, 512),
                 layers_per_block=3, act_fn="silu",
                 eps=1e-6, groups=32, pad_mode="first",
                 temporal_compression_ratio=4):
        super().__init__()
        widths = list(reversed(block_out_channels))
        temporal_levels = int(np.log2(temporal_compression_ratio))
        last_index = len(block_out_channels) - 1

        self.conv_in = CausalConv3d(in_channels, widths[0], kernel_size=3, pad_mode=pad_mode)

        self.mid_block = MidBlock3D(
            in_channels=widths[0], temb_channels=0,
            num_layers=2, eps=eps, act_fn=act_fn, groups=groups,
            spatial_norm_dim=in_channels, pad_mode=pad_mode,
        )

        self.up_blocks = nn.ModuleList()
        previous_channels = widths[0]
        for index in range(len(block_out_channels)):
            channels = widths[index]
            self.up_blocks.append(UpBlock3D(
                in_channels=previous_channels, out_channels=channels,
                temb_channels=0, num_layers=layers_per_block + 1,
                eps=eps, act_fn=act_fn, groups=groups,
                spatial_norm_dim=in_channels,
                add_upsample=index != last_index,
                compress_time=index < temporal_levels,
            ))
            previous_channels = channels

        self.norm_out = SpatialNorm3D(widths[-1], in_channels, groups=groups)
        self.conv_act = nn.SiLU()
        self.conv_out = CausalConv3d(widths[-1], out_channels, kernel_size=3, pad_mode=pad_mode)

    def forward(self, sample, conv_cache=None):
        previous = conv_cache or {}
        updated = {}

        x, updated["conv_in"] = self.conv_in(sample, conv_cache=previous.get("conv_in"))

        # The input latent itself is the conditioning signal (zq) for every norm.
        x, updated["mid_block"] = self.mid_block(x, None, sample, conv_cache=previous.get("mid_block"))

        for index, block in enumerate(self.up_blocks):
            name = f"up_block_{index}"
            x, updated[name] = block(x, None, sample, conv_cache=previous.get(name))

        x, updated["norm_out"] = self.norm_out(x, sample, conv_cache=previous.get("norm_out"))
        x = self.conv_act(x)
        x, updated["conv_out"] = self.conv_out(x, conv_cache=previous.get("conv_out"))
        return x, updated
|
||||
|
||||
|
||||
|
||||
class AutoencoderKLCogVideoX(nn.Module):
    """CogVideoX video VAE (encoder + decoder) with chunked temporal processing.

    ``encode`` feeds the encoder ``num_sample_frames_batch_size`` frames at a
    time and returns the mean half of its output. ``decode`` uses the rolling
    scheme in ``_decode_rolling``: the decoder's conv_in, mid block and
    time-expanding up blocks run on the whole tensor, then the remaining up
    blocks, norm_out and conv_out run on short temporal chunks, with conv
    caches carrying the causal state from one chunk to the next.
    """

    def __init__(self,
                 in_channels=3, out_channels=3,
                 block_out_channels=(128, 256, 256, 512),
                 latent_channels=16, layers_per_block=3,
                 act_fn="silu", eps=1e-6, groups=32,
                 temporal_compression_ratio=4,
                 ):
        super().__init__()
        self.latent_channels = latent_channels
        self.temporal_compression_ratio = temporal_compression_ratio

        shared = dict(
            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
            act_fn=act_fn, eps=eps, groups=groups,
            temporal_compression_ratio=temporal_compression_ratio,
        )
        self.encoder = Encoder3D(in_channels=in_channels, out_channels=latent_channels, **shared)
        self.decoder = Decoder3D(in_channels=latent_channels, out_channels=out_channels, **shared)

        # Chunk lengths: latent frames per decoder call (batched decode) and
        # sample frames per encoder call.
        self.num_latent_frames_batch_size = 2
        self.num_sample_frames_batch_size = 8

    def encode(self, x):
        """Encode ``x`` (time on dimension 2) in temporal chunks; returns the mean."""
        total = x.shape[2]
        step = self.num_sample_frames_batch_size
        head = total % step

        # Any leftover frames go first, so only the first chunk can have an odd
        # length, which is the case Downsample3D's first-frame handling covers.
        spans = [(0, head)] if head > 0 else []
        spans.extend((start, start + step) for start in range(head, total, step))

        cache = None
        pieces = []
        for start, stop in spans:
            piece, cache = self.encoder(x[:, :, start:stop], conv_cache=cache)
            pieces.append(piece.to(x.device))

        mean, _ = torch.cat(pieces, dim=2).chunk(2, dim=1)
        return mean

    def decode(self, z):
        return self._decode_rolling(z)

    def _decode_batched(self, z):
        """Decode by sending groups of latent frames through the whole decoder.

        Leftover frames (``t % batch``) are absorbed into the first group.
        """
        total = z.shape[2]
        step = self.num_latent_frames_batch_size
        leftover = total % step

        cache = None
        pieces = []
        for index in range(max(total // step, 1)):
            start = step * index + (leftover if index else 0)
            stop = step * (index + 1) + leftover
            piece, cache = self.decoder(z[:, :, start:stop], conv_cache=cache)
            pieces.append(piece.cpu())
        return torch.cat(pieces, dim=2).to(z.device)

    def _decode_rolling(self, z):
        """Decode with the early layers on the full tensor and the late layers chunked.

        Phase 1 runs conv_in, the mid block and the first
        ``log2(temporal_compression_ratio)`` up blocks on the whole latent.
        Phase 2 runs the remaining up blocks, norm_out and conv_out on chunks
        of 4 frames, passing each layer's conv cache on to the next chunk.
        Finished chunks are moved to the CPU and the concatenated result is
        moved back to ``z``'s device.
        """
        dec = self.decoder
        out_device = z.device

        # Up blocks below this index were built with compress_time=True.
        split_at = int(np.log2(self.temporal_compression_ratio))

        # Phase 1: whole tensor.
        x, _ = dec.conv_in(z)
        x, _ = dec.mid_block(x, None, z)
        for index in range(split_at):
            x, _ = dec.up_blocks[index](x, None, z)

        # Phase 2: the remaining up blocks and the output layers.
        late_blocks = list(range(split_at, len(dec.up_blocks)))
        chunk_size = 4  # frames per chunk through the late layers
        t_expanded = x.shape[2]

        if len(late_blocks) == 0 or t_expanded <= chunk_size:
            # Short enough (or nothing left to chunk): finish in one pass.
            for index in late_blocks:
                x, _ = dec.up_blocks[index](x, None, z)
            x, _ = dec.norm_out(x, z)
            x = dec.conv_act(x)
            x, _ = dec.conv_out(x)
            return x

        # Stretch z along time once, at latent spatial resolution; the spatial
        # resize is done per chunk below so no full-length high-resolution
        # copy of z is ever allocated.
        z_time_expanded = _interpolate_zq(z, (t_expanded, z.shape[3], z.shape[4]))

        pieces = []
        caches = {}
        for start in range(0, t_expanded, chunk_size):
            stop = min(start + chunk_size, t_expanded)
            x_chunk = x[:, :, start:stop]
            z_chunk = z_time_expanded[:, :, start:stop]
            z_by_size = {}

            def z_matching(feature):
                # z_chunk resized to feature's (H, W), memoised within this chunk.
                size = (feature.shape[3], feature.shape[4])
                if size not in z_by_size:
                    if (z_chunk.shape[3], z_chunk.shape[4]) == size:
                        z_by_size[size] = z_chunk
                    else:
                        z_by_size[size] = F.interpolate(z_chunk, size=(z_chunk.shape[2], size[0], size[1]))
                return z_by_size[size]

            for index in late_blocks:
                name = f"up_block_{index}"
                x_chunk, caches[name] = dec.up_blocks[index](
                    x_chunk, None, z_matching(x_chunk), conv_cache=caches.get(name)
                )

            x_chunk, caches["norm_out"] = dec.norm_out(
                x_chunk, z_matching(x_chunk), conv_cache=caches.get("norm_out")
            )
            x_chunk = dec.conv_act(x_chunk)
            x_chunk, caches["conv_out"] = dec.conv_out(x_chunk, conv_cache=caches.get("conv_out"))

            pieces.append(x_chunk.cpu())
            del z_by_size

        del x, z_time_expanded
        return torch.cat(pieces, dim=2).to(out_device)
|
||||
596
comfy/ldm/sam3/detector.py
Normal file
596
comfy/ldm/sam3/detector.py
Normal file
@ -0,0 +1,596 @@
|
||||
# SAM3 detector: transformer encoder-decoder, segmentation head, geometry encoder, scoring.
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torchvision.ops import roi_align
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
from comfy.ldm.sam3.tracker import SAM3Tracker, SAM31Tracker
|
||||
from comfy.ldm.sam3.sam import SAM3VisionBackbone # noqa: used in __init__
|
||||
from comfy.ldm.sam3.sam import MLP, PositionEmbeddingSine
|
||||
|
||||
TRACKER_CLASSES = {"SAM3": SAM3Tracker, "SAM31": SAM31Tracker}
|
||||
from comfy.ops import cast_to_input
|
||||
|
||||
|
||||
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center x, center y, width, height) to (x0, y0, x1, y1).

    The four box values are read from, and written back to, the last dimension.
    """
    cx, cy, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = (cx - half_w, cy - half_h, cx + half_w, cy + half_h)
    return torch.stack(corners, dim=-1)
|
||||
|
||||
|
||||
def gen_sineembed_for_position(pos_tensor, num_feats=256):
    """Sinusoidal embedding applied independently to each coordinate.

    Every entry along the last dimension is multiplied by 2*pi, divided by
    ``num_feats // 2`` frequencies, and turned into interleaved sin/cos
    values (sin from the even frequency slots, cos from the odd ones).

    Shape: (..., N) -> (..., N * (num_feats // 2)). The result is cast back to
    ``pos_tensor``'s dtype; the computation itself runs in float32.
    """
    assert num_feats % 2 == 0
    half = num_feats // 2
    exponent = 2 * (torch.arange(half, dtype=torch.float32, device=pos_tensor.device) // 2) / half
    freqs = 10000.0 ** exponent

    def embed(coord):
        phase = (pos_tensor[..., coord].float() * 2 * math.pi).unsqueeze(-1) / freqs
        pair = torch.stack([phase[..., 0::2].sin(), phase[..., 1::2].cos()], dim=-1)
        return pair.flatten(-2)

    parts = [embed(coord) for coord in range(pos_tensor.shape[-1])]
    return torch.cat(parts, dim=-1).to(pos_tensor.dtype)
|
||||
|
||||
|
||||
class SplitMHA(nn.Module):
    """Multi-head attention with four separate Linear layers for Q, K, V and output.

    ``forward(q_input)`` alone is self-attention. When ``k_input`` is given,
    keys come from it and values from ``v_input`` (falling back to
    ``k_input``). Note that ``v_input`` is not consulted when ``k_input`` is
    None.
    """
    def __init__(self, d_model, num_heads=8, device=None, dtype=None, operations=None):
        super().__init__()
        self.num_heads = num_heads
        linear_kwargs = dict(device=device, dtype=dtype)
        self.q_proj = operations.Linear(d_model, d_model, **linear_kwargs)
        self.k_proj = operations.Linear(d_model, d_model, **linear_kwargs)
        self.v_proj = operations.Linear(d_model, d_model, **linear_kwargs)
        self.out_proj = operations.Linear(d_model, d_model, **linear_kwargs)

    def forward(self, q_input, k_input=None, v_input=None, mask=None):
        if k_input is None:
            key_source = value_source = q_input
        else:
            key_source = k_input
            value_source = k_input if v_input is None else v_input

        q = self.q_proj(q_input)
        k = self.k_proj(key_source)
        v = self.v_proj(value_source)

        # A 2-D mask gets two singleton axes: [B, T] -> [B, 1, 1, T].
        if mask is not None and mask.ndim == 2:
            mask = mask[:, None, None, :]

        # The projections may come back in different dtypes; follow q's.
        target = q.dtype
        attended = optimized_attention(
            q, k.to(target), v.to(target), self.num_heads, mask=mask, low_precision_attention=False
        )
        return self.out_proj(attended)
|
||||
|
||||
|
||||
class MLPWithNorm(nn.Module):
    """Linear stack with ReLU between layers, an optional skip connection and a final LayerNorm.

    The skip connection is only active when ``residual`` is requested and the
    input and output widths are equal.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, residual=True, device=None, dtype=None, operations=None):
        super().__init__()
        widths = [input_dim, *([hidden_dim] * (num_layers - 1)), output_dim]
        linears = nn.ModuleList()
        for idx in range(num_layers):
            linears.append(operations.Linear(widths[idx], widths[idx + 1], device=device, dtype=dtype))
        self.layers = linears
        self.out_norm = operations.LayerNorm(output_dim, device=device, dtype=dtype)
        self.residual = residual and (input_dim == output_dim)

    def forward(self, x):
        """Run the stack, add the input back when the skip is active, then normalise."""
        skip = x
        last = len(self.layers) - 1
        for idx, linear in enumerate(self.layers):
            x = linear(x)
            if idx != last:
                x = F.relu(x)
        out = x + skip if self.residual else x
        return self.out_norm(out)
|
||||
|
||||
|
||||
class EncoderLayer(nn.Module):
    """Pre-norm transformer block: self-attention, optional cross-attention, feed-forward.

    Each sub-block normalises its input and adds its result back onto the
    running hidden state.
    """

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.self_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.cross_attn_image = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.linear1 = operations.Linear(d_model, dim_ff, **factory)
        self.linear2 = operations.Linear(dim_ff, d_model, **factory)
        self.norm1 = operations.LayerNorm(d_model, **factory)
        self.norm2 = operations.LayerNorm(d_model, **factory)
        self.norm3 = operations.LayerNorm(d_model, **factory)

    def forward(self, x, pos, text_memory=None, text_mask=None):
        """Apply the block; the cross-attention step is skipped when ``text_memory`` is None."""
        # Self-attention: position is added to queries and keys, not to values.
        hidden = self.norm1(x)
        with_pos = hidden + pos
        x = x + self.self_attn(with_pos, with_pos, hidden)

        if text_memory is not None:
            attended = self.cross_attn_image(self.norm2(x), text_memory, text_memory, mask=text_mask)
            x = x + attended

        feed_forward = self.linear2(F.relu(self.linear1(self.norm3(x))))
        return x + feed_forward
|
||||
|
||||
|
||||
class TransformerEncoder(nn.Module):
    """Sequential stack of EncoderLayer blocks.

    Checkpoint: transformer.encoder.layers.N.*
    """

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, num_layers=6, device=None, dtype=None, operations=None):
        super().__init__()
        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderLayer(d_model, num_heads, dim_ff, device=device, dtype=dtype, operations=operations))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x, pos, text_memory=None, text_mask=None):
        """Feed ``x`` through every layer with the same ``pos``, ``text_memory`` and ``text_mask``."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, pos, text_memory, text_mask)
        return hidden
|
||||
|
||||
|
||||
class DecoderLayer(nn.Module):
    """Post-norm decoder block: self-attention, optional text cross-attention, image cross-attention, feed-forward."""

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.self_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.cross_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.ca_text = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.norm1 = operations.LayerNorm(d_model, **factory)
        self.norm2 = operations.LayerNorm(d_model, **factory)
        self.norm3 = operations.LayerNorm(d_model, **factory)
        self.catext_norm = operations.LayerNorm(d_model, **factory)
        self.linear1 = operations.Linear(d_model, dim_ff, **factory)
        self.linear2 = operations.Linear(dim_ff, d_model, **factory)

    def forward(self, x, memory, x_pos, memory_pos, text_memory=None, text_mask=None, cross_attn_bias=None):
        """Refine queries ``x`` against ``memory`` (and ``text_memory`` when given).

        ``cross_attn_bias`` is handed to the image cross-attention as its mask.
        Note the norm naming: ``norm2`` follows self-attention and ``norm1``
        follows the image cross-attention.
        """
        queries = x + x_pos
        x = self.norm2(x + self.self_attn(queries, queries, x))

        if text_memory is not None:
            text_ctx = self.ca_text(x + x_pos, text_memory, text_memory, mask=text_mask)
            x = self.catext_norm(x + text_ctx)

        image_ctx = self.cross_attn(x + x_pos, memory + memory_pos, memory, mask=cross_attn_bias)
        x = self.norm1(x + image_ctx)

        feed_forward = self.linear2(F.relu(self.linear1(x)))
        return self.norm3(x + feed_forward)
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Module):
    """Query decoder with layer-by-layer box refinement and a presence token.

    Learned query embeddings and learned anchor boxes are refined by each
    layer. A single presence token is prepended to the queries before every
    layer and split off again afterwards.
    """

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, num_layers=6,
                 num_queries=200, device=None, dtype=None, operations=None):
        super().__init__()
        self.d_model = d_model
        self.num_queries = num_queries

        decoder_layers = []
        for _ in range(num_layers):
            decoder_layers.append(DecoderLayer(d_model, num_heads, dim_ff, device=device, dtype=dtype, operations=operations))
        self.layers = nn.ModuleList(decoder_layers)
        self.norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.query_embed = operations.Embedding(num_queries, d_model, device=device, dtype=dtype)
        # Learned anchor boxes: one 4-vector per query.
        self.reference_points = operations.Embedding(num_queries, 4, device=device, dtype=dtype)
        # Input width d_model * 2: 4 coordinates, each with d_model // 2 sine features.
        self.ref_point_head = MLP(d_model * 2, d_model, d_model, 2, device=device, dtype=dtype, operations=operations)
        self.bbox_embed = MLP(d_model, d_model, 4, 3, device=device, dtype=dtype, operations=operations)

        self.boxRPB_embed_x = MLP(2, d_model, num_heads, 2, device=device, dtype=dtype, operations=operations)
        self.boxRPB_embed_y = MLP(2, d_model, num_heads, 2, device=device, dtype=dtype, operations=operations)

        self.presence_token = operations.Embedding(1, d_model, device=device, dtype=dtype)
        self.presence_token_head = MLP(d_model, d_model, 1, 3, device=device, dtype=dtype, operations=operations)
        self.presence_token_out_norm = operations.LayerNorm(d_model, device=device, dtype=dtype)

    @staticmethod
    def _inverse_sigmoid(x):
        """Approximate logit of ``x`` with 1e-6 offsets guarding the division and the log."""
        odds = x / (1 - x + 1e-6)
        return torch.log(odds + 1e-6)

    def _compute_box_rpb(self, ref_points, H, W):
        """Box relative position bias: (B, Q, 4) cxcywh -> (B, n_heads, Q+1, H*W) bias.

        The extra row at index 0 of the query axis is all zeros and belongs to
        the presence token.
        """
        xyxy = box_cxcywh_to_xyxy(ref_points)
        B, _, _ = xyxy.shape
        device = ref_points.device
        grid_y = torch.arange(H, device=device, dtype=torch.float32) / H
        grid_x = torch.arange(W, device=device, dtype=torch.float32) / W
        # Offsets of every grid column / row to both box edges on that axis.
        dx = grid_x.view(1, 1, -1, 1) - xyxy[:, :, None, 0:3:2]
        dy = grid_y.view(1, 1, -1, 1) - xyxy[:, :, None, 1:4:2]

        denom = float(math.log2(8))

        def compress(delta):
            # Signed log2 compression of the scaled offset.
            scaled = delta * 8
            return torch.sign(scaled) * torch.log2(torch.abs(scaled) + 1.0) / denom

        bias_x = self.boxRPB_embed_x(compress(dx).to(ref_points.dtype))
        bias_y = self.boxRPB_embed_y(compress(dy).to(ref_points.dtype))

        # (B, Q, H, W, heads) -> (B, Q, H*W, heads) -> (B, heads, Q, H*W)
        combined = bias_y.unsqueeze(3) + bias_x.unsqueeze(2)
        bias = combined.flatten(2, 3).permute(0, 3, 1, 2)
        presence_row = torch.zeros(B, bias.shape[1], 1, bias.shape[3], device=bias.device, dtype=bias.dtype)
        return torch.cat([presence_row, bias], dim=2)

    def forward(self, memory, memory_pos, text_memory=None, text_mask=None, H=72, W=72):
        """Decode queries against ``memory``.

        Returns a dict with ``decoder_output`` (normalised queries),
        ``pred_boxes`` (sigmoid box predictions) and ``presence`` (output of
        the presence head with its last dimension squeezed).
        """
        B = memory.shape[0]
        tgt = cast_to_input(self.query_embed.weight, memory).unsqueeze(0).expand(B, -1, -1)
        presence_out = cast_to_input(self.presence_token.weight, memory)[None].expand(B, -1, -1)
        ref_points = cast_to_input(self.reference_points.weight, memory).unsqueeze(0).expand(B, -1, -1).sigmoid()

        final_idx = len(self.layers) - 1
        for layer_idx, layer in enumerate(self.layers):
            query_pos = self.ref_point_head(gen_sineembed_for_position(ref_points, self.d_model))
            stacked_tgt = torch.cat([presence_out, tgt], dim=1)
            # The presence token carries no positional embedding.
            stacked_pos = torch.cat([torch.zeros_like(presence_out), query_pos], dim=1)
            attn_bias = self._compute_box_rpb(ref_points, H, W)
            stacked_tgt = layer(stacked_tgt, memory, stacked_pos, memory_pos,
                                text_memory, text_mask, attn_bias)
            presence_out = stacked_tgt[:, :1]
            tgt = stacked_tgt[:, 1:]
            if layer_idx < final_idx:
                # Refine the anchors for the next layer in logit space.
                logits = self._inverse_sigmoid(ref_points)
                ref_points = (logits + self.bbox_embed(self.norm(tgt))).sigmoid().detach()

        query_out = self.norm(tgt)
        logits = self._inverse_sigmoid(ref_points)
        boxes = (logits + self.bbox_embed(query_out)).sigmoid()
        presence = self.presence_token_head(self.presence_token_out_norm(presence_out)).squeeze(-1)
        return {"decoder_output": query_out, "pred_boxes": boxes, "presence": presence}
|
||||
|
||||
|
||||
class Transformer(nn.Module):
    """Container holding the detector's encoder and decoder under ``encoder`` / ``decoder``.

    Defines no ``forward``; the two halves are called individually.
    """

    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, enc_layers=6, dec_layers=6,
                 num_queries=200, device=None, dtype=None, operations=None):
        super().__init__()
        shared = {"device": device, "dtype": dtype, "operations": operations}
        self.encoder = TransformerEncoder(d_model, num_heads, dim_ff, enc_layers, **shared)
        self.decoder = TransformerDecoder(d_model, num_heads, dim_ff, dec_layers, num_queries, **shared)
|
||||
|
||||
|
||||
class GeometryEncoder(nn.Module):
    """Encodes point and box prompts into prompt tokens using pooled image features."""

    def __init__(self, d_model=256, num_heads=8, num_layers=3, roi_size=7, device=None, dtype=None, operations=None):
        super().__init__()
        self.d_model = d_model
        self.roi_size = roi_size
        self.pos_enc = PositionEmbeddingSine(num_pos_feats=d_model, normalize=True)
        self.points_direct_project = operations.Linear(2, d_model, device=device, dtype=dtype)
        self.points_pool_project = operations.Linear(d_model, d_model, device=device, dtype=dtype)
        self.points_pos_enc_project = operations.Linear(d_model, d_model, device=device, dtype=dtype)
        self.boxes_direct_project = operations.Linear(4, d_model, device=device, dtype=dtype)
        self.boxes_pool_project = operations.Conv2d(d_model, d_model, kernel_size=roi_size, device=device, dtype=dtype)
        self.boxes_pos_enc_project = operations.Linear(d_model + 2, d_model, device=device, dtype=dtype)
        self.label_embed = operations.Embedding(2, d_model, device=device, dtype=dtype)
        self.cls_embed = operations.Embedding(1, d_model, device=device, dtype=dtype)
        self.norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.img_pre_norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.encode = nn.ModuleList([
            EncoderLayer(d_model, num_heads, 2048, device=device, dtype=dtype, operations=operations)
            for _ in range(num_layers)
        ])
        self.encode_norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
        self.final_proj = operations.Linear(d_model, d_model, device=device, dtype=dtype)

    def _encode_points(self, coords, labels, img_feat_2d):
        """Encode point prompts: direct + pool + pos_enc + label. coords: [B, N, 2] normalized."""
        B, N, _ = coords.shape
        embed = self.points_direct_project(coords)
        # Pool features from backbone at point locations via grid_sample
        grid = (coords * 2 - 1).unsqueeze(2)  # [B, N, 1, 2] in [-1, 1]
        sampled = F.grid_sample(img_feat_2d, grid, align_corners=False)  # [B, C, N, 1]
        embed = embed + self.points_pool_project(sampled.squeeze(-1).permute(0, 2, 1))  # [B, N, C]
        # Positional encoding of coordinates
        x, y = coords[:, :, 0], coords[:, :, 1]  # [B, N]
        pos_x, pos_y = self.pos_enc._encode_xy(x.flatten(), y.flatten())
        enc = torch.cat([pos_x, pos_y], dim=-1).view(B, N, -1)
        embed = embed + self.points_pos_enc_project(cast_to_input(enc, embed))
        embed = embed + cast_to_input(self.label_embed(labels.long()), embed)
        return embed

    def _encode_boxes(self, boxes, labels, img_feat_2d):
        """Encode box prompts: direct + pool + pos_enc + label. boxes: [B, N, 4] normalized cxcywh."""
        B, N, _ = boxes.shape
        embed = self.boxes_direct_project(boxes)
        # ROI align from backbone at box regions
        H, W = img_feat_2d.shape[-2:]
        boxes_xyxy = box_cxcywh_to_xyxy(boxes)
        scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype, device=boxes_xyxy.device)
        boxes_scaled = boxes_xyxy * scale
        sampled = roi_align(img_feat_2d, boxes_scaled.view(-1, 4).split(N), self.roi_size)
        proj = self.boxes_pool_project(sampled).view(B, N, -1)  # Conv2d(roi_size) -> [B*N, C, 1, 1] -> [B, N, C]
        embed = embed + proj
        # Positional encoding of box center + size
        cx, cy, w, h = boxes[:, :, 0], boxes[:, :, 1], boxes[:, :, 2], boxes[:, :, 3]
        enc = self.pos_enc.encode_boxes(cx.flatten(), cy.flatten(), w.flatten(), h.flatten())
        enc = enc.view(B, N, -1)
        embed = embed + self.boxes_pos_enc_project(cast_to_input(enc, embed))
        embed = embed + cast_to_input(self.label_embed(labels.long()), embed)
        return embed

    def forward(self, points=None, boxes=None, image_features=None):
        """Encode geometry prompts. image_features: [B, HW, C] flattened backbone features.

        Args:
            points: optional ``(coords, labels)`` pair, see ``_encode_points``.
            boxes: optional ``[B, N, 4]`` normalized cxcywh boxes; every box
                is given label 1.
            image_features: flattened features of a square feature map.
                Required whenever ``points`` or ``boxes`` is given, because
                both encoders pool from it.

        Returns:
            ``None`` when neither prompt type is given, otherwise the prompt
            tokens (points first, then boxes) after the encoder layers,
            ``encode_norm`` and ``final_proj``.

        Raises:
            ValueError: prompts were given without ``image_features``, or the
                token count ``HW`` is not a perfect square.
        """
        if points is None and boxes is None:
            return None
        if image_features is None:
            # Previously this surfaced as an unrelated error inside
            # grid_sample / roi_align.
            raise ValueError("GeometryEncoder requires image_features when points or boxes are given")

        # Prepare 2D image features for pooling. The layout is recovered from
        # the token count alone, so only square maps are supported; a
        # non-square map whose area happens to be a perfect square cannot be
        # detected here.
        B, HW, C = image_features.shape[0], image_features.shape[1], image_features.shape[2]
        hw = math.isqrt(HW)
        if hw * hw != HW:
            raise ValueError(f"GeometryEncoder expects a square feature map, got {HW} tokens")
        img_normed = self.img_pre_norm(image_features)
        img_feat_2d = img_normed.permute(0, 2, 1).view(B, C, hw, hw)

        embeddings = []
        if points is not None:
            coords, labels = points
            embeddings.append(self._encode_points(coords, labels, img_feat_2d))
        if boxes is not None:
            box_labels = torch.ones(boxes.shape[0], boxes.shape[1], dtype=torch.long, device=boxes.device)
            embeddings.append(self._encode_boxes(boxes, box_labels, img_feat_2d))

        geo = self.norm(torch.cat(embeddings, dim=1))
        # The layers cross-attend to the un-normalised image tokens with a
        # zero positional embedding.
        # NOTE(review): SAM3Detector._detect applies final_proj and norm to
        # the CLS token *before* these layers, while here final_proj comes
        # last - confirm the two orderings are both intended.
        for layer in self.encode:
            geo = layer(geo, torch.zeros_like(geo), image_features)
        geo = self.encode_norm(geo)
        return self.final_proj(geo)
|
||||
|
||||
|
||||
class PixelDecoder(nn.Module):
    """Top-down FPN pixel decoder with GroupNorm + ReLU + nearest interpolation.

    Starts from the last feature map and walks the remaining maps in reverse
    order, upsampling the running map to each one's size before fusing.
    """

    def __init__(self, d_model=256, num_stages=3, device=None, dtype=None, operations=None):
        super().__init__()
        convs = []
        for _ in range(num_stages):
            convs.append(operations.Conv2d(d_model, d_model, kernel_size=3, padding=1, device=device, dtype=dtype))
        self.conv_layers = nn.ModuleList(convs)
        group_norms = []
        for _ in range(num_stages):
            group_norms.append(operations.GroupNorm(8, d_model, device=device, dtype=dtype))
        self.norms = nn.ModuleList(group_norms)

    def forward(self, backbone_features):
        """Fuse the feature maps and return the map at the size of ``backbone_features[0]``."""
        top = backbone_features[-1]
        remaining = backbone_features[:-1][::-1]
        for stage, lateral in enumerate(remaining):
            upsampled = F.interpolate(top, size=lateral.shape[-2:], mode="nearest")
            fused = self.conv_layers[stage](lateral + upsampled)
            top = F.relu(self.norms[stage](fused))
        return top
|
||||
|
||||
|
||||
class MaskPredictor(nn.Module):
    """Produces one mask logit map per query as a dot product with per-pixel features."""

    def __init__(self, d_model=256, device=None, dtype=None, operations=None):
        super().__init__()
        self.mask_embed = MLP(d_model, d_model, d_model, 3, device=device, dtype=dtype, operations=operations)

    def forward(self, query_embeddings, pixel_features):
        """Project the queries, then contract the channel axis: (b, q, c) x (b, c, h, w) -> (b, q, h, w)."""
        projected = self.mask_embed(query_embeddings)
        masks = torch.einsum("bqc,bchw->bqhw", projected, pixel_features)
        return masks
|
||||
|
||||
|
||||
class SegmentationHead(nn.Module):
    """Mask head: optional prompt cross-attention on the encoder output, FPN pixel decoder, per-query masks."""

    def __init__(self, d_model=256, num_heads=8, device=None, dtype=None, operations=None):
        super().__init__()
        factory = {"device": device, "dtype": dtype}
        self.d_model = d_model
        self.pixel_decoder = PixelDecoder(d_model, 3, operations=operations, **factory)
        self.mask_predictor = MaskPredictor(d_model, operations=operations, **factory)
        self.cross_attend_prompt = SplitMHA(d_model, num_heads, operations=operations, **factory)
        self.cross_attn_norm = operations.LayerNorm(d_model, **factory)
        self.instance_seg_head = operations.Conv2d(d_model, d_model, kernel_size=1, **factory)
        # Not used by forward() below.
        self.semantic_seg_head = operations.Conv2d(d_model, 1, kernel_size=1, **factory)

    def forward(self, query_embeddings, backbone_features, encoder_hidden_states=None, prompt=None, prompt_mask=None):
        """Predict mask logits for every query.

        When ``encoder_hidden_states`` is given, its first ``H * W`` tokens
        (``H``, ``W`` taken from the last backbone feature map) replace that
        last feature map before pixel decoding. If ``prompt`` is also given,
        the encoder states first cross-attend to it with a residual add.
        """
        if encoder_hidden_states is not None:
            if prompt is not None:
                normed = self.cross_attn_norm(encoder_hidden_states)
                attended = self.cross_attend_prompt(normed, prompt, prompt, mask=prompt_mask)
                encoder_hidden_states = attended + encoder_hidden_states

            B = encoder_hidden_states.shape[0]
            last_map = backbone_features[-1]
            H, W = last_map.shape[-2], last_map.shape[-1]
            visual_tokens = encoder_hidden_states[:, :H * W]
            encoder_visual = visual_tokens.permute(0, 2, 1).view(B, self.d_model, H, W)
            backbone_features = list(backbone_features)
            backbone_features[-1] = encoder_visual

        pixel_features = self.pixel_decoder(backbone_features)
        instance_features = self.instance_seg_head(pixel_features)
        return self.mask_predictor(query_embeddings, instance_features)
|
||||
|
||||
|
||||
class DotProductScoring(nn.Module):
    """Scores each query by a scaled dot product with the pooled prompt embedding."""

    def __init__(self, d_model=256, device=None, dtype=None, operations=None):
        super().__init__()
        self.hs_proj = operations.Linear(d_model, d_model, device=device, dtype=dtype)
        self.prompt_proj = operations.Linear(d_model, d_model, device=device, dtype=dtype)
        self.prompt_mlp = MLPWithNorm(d_model, 2048, d_model, 2, device=device, dtype=dtype, operations=operations)
        self.scale = 1.0 / (d_model ** 0.5)

    def forward(self, query_embeddings, prompt_embeddings, prompt_mask=None):
        """Return one score per query, clamped to [-12, 12].

        With ``prompt_mask`` the prompt tokens are averaged using the mask as
        weights (the divisor is clamped to at least 1); without it a plain
        mean over the token axis is used.
        """
        prompt = self.prompt_mlp(prompt_embeddings)
        if prompt_mask is None:
            pooled = prompt.mean(dim=1)
        else:
            weight = prompt_mask.unsqueeze(-1).to(dtype=prompt.dtype)
            total = (prompt * weight).sum(dim=1)
            pooled = total / weight.sum(dim=1).clamp(min=1)

        queries = self.hs_proj(query_embeddings)
        prompt_vec = self.prompt_proj(pooled).unsqueeze(-1).to(queries.dtype)
        raw_scores = torch.matmul(queries, prompt_vec) * self.scale
        return raw_scores.clamp(-12.0, 12.0).squeeze(-1)
|
||||
|
||||
|
||||
class SAM3Detector(nn.Module):
    """Prompted detector: vision backbone, text resizer, encoder-decoder transformer, scoring and mask heads."""

    def __init__(self, d_model=256, embed_dim=1024, num_queries=200, device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        image_model = kwargs.pop("image_model", "SAM3")
        # Dropped so they are not forwarded to the vision backbone.
        for k in ("num_heads", "num_head_channels"):
            kwargs.pop(k, None)
        multiplex = image_model == "SAM31"
        # SAM3: 4 FPN levels, drop last (scalp=1); SAM3.1: 3 levels, use all (scalp=0)
        self.scalp = 0 if multiplex else 1
        self.backbone = nn.ModuleDict({
            "vision_backbone": SAM3VisionBackbone(embed_dim=embed_dim, d_model=d_model, multiplex=multiplex, device=device, dtype=dtype, operations=operations, **kwargs),
            "language_backbone": nn.ModuleDict({"resizer": operations.Linear(embed_dim, d_model, device=device, dtype=dtype)}),
        })
        self.transformer = Transformer(d_model=d_model, num_queries=num_queries, device=device, dtype=dtype, operations=operations)
        self.segmentation_head = SegmentationHead(d_model=d_model, device=device, dtype=dtype, operations=operations)
        self.geometry_encoder = GeometryEncoder(d_model=d_model, device=device, dtype=dtype, operations=operations)
        self.dot_prod_scoring = DotProductScoring(d_model=d_model, device=device, dtype=dtype, operations=operations)

    def _get_backbone_features(self, images):
        """Run backbone and return (detector_features, detector_positions, tracker_features, tracker_positions)."""
        bb = self.backbone["vision_backbone"]
        if bb.multiplex:
            all_f, all_p, tf, tp = bb(images, tracker_mode="propagation")
        else:
            all_f, all_p, tf, tp = bb(images, need_tracker=True)
        return all_f, all_p, tf, tp

    @staticmethod
    def _run_geo_layer(layer, x, memory, memory_pos):
        """Run one geometry EncoderLayer on ``x`` with positional cross-attention.

        Unlike ``EncoderLayer.forward`` this adds ``memory_pos`` to the
        cross-attention keys and uses no positional term in self-attention.
        """
        x = x + layer.self_attn(layer.norm1(x))
        x = x + layer.cross_attn_image(layer.norm2(x), memory + memory_pos, memory)
        x = x + layer.linear2(F.relu(layer.linear1(layer.norm3(x))))
        return x

    def _detect(self, features, positions, text_embeddings=None, text_mask=None,
                points=None, boxes=None):
        """Shared detection: geometry encoding, transformer, scoring, segmentation.

        Returns ``(boxes_xyxy, scores, masks, dec_out)`` where ``dec_out`` is
        the raw decoder output dict.
        """
        B = features[0].shape[0]
        # Scalp for encoder (use top-level feature), but keep all levels for segmentation head
        # NOTE(review): SegmentationHead sizes its replacement map from
        # seg_features[-1]; with scalp > 0 that is the dropped level, not
        # enc_feat - confirm both have the same spatial size.
        seg_features = features
        if self.scalp > 0:
            features = features[:-self.scalp]
            positions = positions[:-self.scalp]
        enc_feat, enc_pos = features[-1], positions[-1]
        _, _, H, W = enc_feat.shape
        img_flat = enc_feat.flatten(2).permute(0, 2, 1)
        pos_flat = enc_pos.flatten(2).permute(0, 2, 1)

        has_prompts = text_embeddings is not None or points is not None or boxes is not None
        if has_prompts:
            geo_enc = self.geometry_encoder
            geo_prompts = geo_enc(points=points, boxes=boxes, image_features=img_flat)
            # A CLS token is always appended, even for text-only prompts.
            geo_cls = geo_enc.norm(geo_enc.final_proj(cast_to_input(geo_enc.cls_embed.weight, img_flat).view(1, 1, -1).expand(B, -1, -1)))
            for layer in geo_enc.encode:
                geo_cls = self._run_geo_layer(layer, geo_cls, img_flat, pos_flat)
            geo_cls = geo_enc.encode_norm(geo_cls)
            # Broadcast prompts whose batch size differs from the image batch.
            if text_embeddings is not None and text_embeddings.shape[0] != B:
                text_embeddings = text_embeddings.expand(B, -1, -1)
            if text_mask is not None and text_mask.shape[0] != B:
                text_mask = text_mask.expand(B, -1)
            parts = [t for t in [text_embeddings, geo_prompts, geo_cls] if t is not None]
            text_embeddings = torch.cat(parts, dim=1)
            # Geometry / CLS tokens appended after the text are always valid.
            n_new = text_embeddings.shape[1] - (text_mask.shape[1] if text_mask is not None else 0)
            if text_mask is not None:
                text_mask = torch.cat([text_mask, torch.ones(B, n_new, dtype=torch.bool, device=text_mask.device)], dim=1)
            else:
                text_mask = torch.ones(B, text_embeddings.shape[1], dtype=torch.bool, device=text_embeddings.device)

        memory = self.transformer.encoder(img_flat, pos_flat, text_embeddings, text_mask)
        dec_out = self.transformer.decoder(memory, pos_flat, text_embeddings, text_mask, H, W)
        query_out, pred_boxes = dec_out["decoder_output"], dec_out["pred_boxes"]

        if text_embeddings is not None:
            scores = self.dot_prod_scoring(query_out, text_embeddings, text_mask)
        else:
            # Follow the decoder output dtype so both branches return scores
            # of the same dtype (this used to be float32 unconditionally).
            scores = torch.zeros(B, query_out.shape[1], device=query_out.device, dtype=query_out.dtype)

        masks = self.segmentation_head(query_out, seg_features, encoder_hidden_states=memory, prompt=text_embeddings, prompt_mask=text_mask)
        return box_cxcywh_to_xyxy(pred_boxes), scores, masks, dec_out

    def forward(self, images, text_embeddings=None, text_mask=None, points=None, boxes=None, threshold=0.3, orig_size=None):
        """Detect objects in ``images`` for the given text / point / box prompts.

        Args:
            images: input passed to the vision backbone.
            text_embeddings: optional text features; resized here through
                ``language_backbone.resizer``.
            text_mask: optional mask for the text tokens, converted to bool.
            points, boxes: optional geometry prompts for the geometry encoder.
            threshold: accepted for interface compatibility; not used here.
            orig_size: optional ``(height, width)``; when given, boxes are
                scaled to that size and masks are bilinearly resized to it.

        Returns:
            Dict with ``boxes`` (xyxy), ``scores``, ``masks`` and ``presence``.
        """
        features, positions, _, _ = self._get_backbone_features(images)

        if text_embeddings is not None:
            text_embeddings = self.backbone["language_backbone"]["resizer"](text_embeddings)
            if text_mask is not None:
                text_mask = text_mask.bool()

        boxes_xyxy, scores, masks, dec_out = self._detect(
            features, positions, text_embeddings, text_mask, points, boxes)

        if orig_size is not None:
            oh, ow = orig_size
            boxes_xyxy = boxes_xyxy * torch.tensor([ow, oh, ow, oh], device=boxes_xyxy.device, dtype=boxes_xyxy.dtype)
            masks = F.interpolate(masks, size=orig_size, mode="bilinear", align_corners=False)

        return {
            "boxes": boxes_xyxy,
            "scores": scores,
            "masks": masks,
            "presence": dec_out.get("presence"),
        }

    def forward_from_trunk(self, trunk_out, text_embeddings, text_mask):
        """Run detection using a pre-computed ViTDet trunk output.

        text_embeddings must already be resized through language_backbone.resizer.
        Returns dict with boxes (normalized xyxy), scores, masks at detector resolution.
        """
        bb = self.backbone["vision_backbone"]
        features = [conv(trunk_out) for conv in bb.convs]
        positions = [cast_to_input(bb.position_encoding(f), f) for f in features]

        if text_mask is not None:
            text_mask = text_mask.bool()

        boxes_xyxy, scores, masks, _ = self._detect(features, positions, text_embeddings, text_mask)
        return {"boxes": boxes_xyxy, "scores": scores, "masks": masks}
|
||||
|
||||
|
||||
class SAM3Model(nn.Module):
    """Top-level model pairing a SAM3Detector with the tracker class selected by ``image_model``."""

    def __init__(self, device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        self.dtype = dtype
        tracker_cls = TRACKER_CLASSES[kwargs.get("image_model", "SAM3")]
        shared = {"device": device, "dtype": dtype, "operations": operations}
        self.detector = SAM3Detector(**shared, **kwargs)
        self.tracker = tracker_cls(**shared, **kwargs)

    def forward(self, images, **kwargs):
        """Delegate to the detector."""
        return self.detector(images, **kwargs)

    def forward_segment(self, images, point_inputs=None, box_inputs=None, mask_inputs=None):
        """Interactive segmentation using SAM decoder with point/box/mask prompts.

        Args:
            images: [B, 3, 1008, 1008] preprocessed images
            point_inputs: {"point_coords": [B, N, 2], "point_labels": [B, N]} in 1008x1008 pixel space
            box_inputs: [B, 2, 2] box corners (top-left, bottom-right) in 1008x1008 pixel space
            mask_inputs: [B, 1, H, W] coarse mask logits to refine
        Returns:
            [B, 1, image_size, image_size] high-res mask logits
        """
        bb = self.detector.backbone["vision_backbone"]
        if bb.multiplex:
            backbone_out = bb(images, tracker_mode="interactive")
        else:
            backbone_out = bb(images, need_tracker=True)
        _, _, tracker_features, tracker_positions = backbone_out

        scalp = self.detector.scalp
        if scalp > 0:
            tracker_features = tracker_features[:-scalp]
            tracker_positions = tracker_positions[:-scalp]

        high_res = list(tracker_features[:-1])
        backbone_feat = tracker_features[-1]
        B, C, H, W = backbone_feat.shape

        # Add no-memory embedding (init frame path); prefer the interactive
        # variant when the tracker has one.
        no_mem = getattr(self.tracker, 'interactivity_no_mem_embed', None)
        if no_mem is None:
            no_mem = getattr(self.tracker, 'no_mem_embed', None)
        if no_mem is not None:
            tokens = backbone_feat.flatten(2).permute(0, 2, 1)
            tokens = tokens + cast_to_input(no_mem, tokens)
            backbone_feat = tokens.view(B, H, W, C).permute(0, 3, 1, 2)

        if point_inputs is None:
            num_pts = 0
        else:
            num_pts = point_inputs["point_labels"].size(1)
        # Multiple mask candidates are requested only for exactly one point.
        want_multimask = 0 < num_pts <= 1
        _, high_res_masks, _, _ = self.tracker._forward_sam_heads(
            backbone_features=backbone_feat,
            point_inputs=point_inputs,
            mask_inputs=mask_inputs,
            box_inputs=box_inputs,
            high_res_features=high_res,
            multimask_output=want_multimask,
        )
        return high_res_masks

    def forward_video(self, images, initial_masks, pbar=None, text_prompts=None,
                      new_det_thresh=0.5, max_objects=0, detect_interval=1):
        """Track video with optional per-frame text-prompted detection."""
        bb = self.detector.backbone["vision_backbone"]

        def backbone_fn(frame, frame_idx=None):
            # The trunk runs once per frame and is returned for reuse by detect_fn.
            trunk_out = bb.trunk(frame)
            if bb.multiplex:
                outputs = bb(frame, tracker_mode="propagation", cached_trunk=trunk_out, tracker_only=True)
            else:
                outputs = bb(frame, need_tracker=True, cached_trunk=trunk_out, tracker_only=True)
            _, _, tf, tp = outputs
            return tf, tp, trunk_out

        detect_fn = None
        if text_prompts:
            resizer = self.detector.backbone["language_backbone"]["resizer"]
            # Resize every prompt once up front instead of once per frame.
            resized = []
            for emb, m in text_prompts:
                resized.append((resizer(emb), m.bool() if m is not None else None))

            def detect_fn(trunk_out):
                score_list = []
                mask_list = []
                for emb, mask in resized:
                    det = self.detector.forward_from_trunk(trunk_out, emb, mask)
                    score_list.append(det["scores"])
                    mask_list.append(det["masks"])
                return {"scores": torch.cat(score_list, dim=1), "masks": torch.cat(mask_list, dim=1)}

        if hasattr(self.tracker, 'track_video_with_detection'):
            return self.tracker.track_video_with_detection(
                backbone_fn, images, initial_masks, detect_fn,
                new_det_thresh=new_det_thresh, max_objects=max_objects,
                detect_interval=detect_interval, backbone_obj=bb, pbar=pbar)

        # SAM3 (non-multiplex) — no detection support, requires initial masks
        if initial_masks is None:
            raise ValueError("SAM3 (non-multiplex) requires initial_mask for video tracking")
        return self.tracker.track_video(backbone_fn, images, initial_masks, pbar=pbar, backbone_obj=bb)
|
||||
425
comfy/ldm/sam3/sam.py
Normal file
425
comfy/ldm/sam3/sam.py
Normal file
@ -0,0 +1,425 @@
|
||||
# SAM3 shared components: primitives, ViTDet backbone, FPN neck, position encodings.
|
||||
|
||||
import math
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from comfy.ldm.modules.attention import optimized_attention
|
||||
from comfy.ldm.flux.math import apply_rope
|
||||
from comfy.ldm.flux.layers import EmbedND
|
||||
from comfy.ops import cast_to_input
|
||||
|
||||
|
||||
class MLP(nn.Module):
    """Stack of Linear layers with ReLU between them and an optional sigmoid on the output."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, sigmoid_output=False, device=None, dtype=None, operations=None):
        super().__init__()
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        stack = nn.ModuleList()
        for idx in range(num_layers):
            stack.append(operations.Linear(widths[idx], widths[idx + 1], device=device, dtype=dtype))
        self.layers = stack
        self.sigmoid_output = sigmoid_output

    def forward(self, x):
        """Apply every layer; ReLU follows all layers except the last."""
        final_idx = len(self.layers) - 1
        for idx, linear in enumerate(self.layers):
            x = linear(x)
            if idx < final_idx:
                x = F.relu(x)
        if self.sigmoid_output:
            return torch.sigmoid(x)
        return x
|
||||
|
||||
|
||||
class SAMAttention(nn.Module):
    """Attention whose Q/K/V are projected to ``embedding_dim // downsample_rate`` channels.

    Keys and values may come from inputs of a different width (``kv_in_dim``).
    """

    def __init__(self, embedding_dim, num_heads, downsample_rate=1, kv_in_dim=None, device=None, dtype=None, operations=None):
        super().__init__()
        self.num_heads = num_heads
        inner = embedding_dim // downsample_rate
        kv_width = embedding_dim if kv_in_dim is None else kv_in_dim
        factory = {"device": device, "dtype": dtype}
        self.q_proj = operations.Linear(embedding_dim, inner, **factory)
        self.k_proj = operations.Linear(kv_width, inner, **factory)
        self.v_proj = operations.Linear(kv_width, inner, **factory)
        self.out_proj = operations.Linear(inner, embedding_dim, **factory)

    def forward(self, q, k, v):
        """Project the three inputs, attend, and project back to ``embedding_dim``."""
        queries = self.q_proj(q)
        keys = self.k_proj(k)
        values = self.v_proj(v)
        attended = optimized_attention(queries, keys, values, self.num_heads, low_precision_attention=False)
        return self.out_proj(attended)
|
||||
|
||||
|
||||
class TwoWayAttentionBlock(nn.Module):
    """One layer of attention in both directions between ``queries`` and ``keys``.

    Steps, each ending in a LayerNorm: self-attention on the queries, cross-attention
    from queries to keys, an MLP on the queries, then cross-attention from keys to
    queries. When ``skip_first_layer_pe`` is set, the self-attention step uses the
    raw queries with no positional term and no residual add.
    """

    def __init__(self, embedding_dim, num_heads, mlp_dim=2048, attention_downsample_rate=2, skip_first_layer_pe=False, device=None, dtype=None, operations=None):
        super().__init__()
        self.skip_first_layer_pe = skip_first_layer_pe
        factory = dict(device=device, dtype=dtype)

        self.self_attn = SAMAttention(embedding_dim, num_heads, operations=operations, **factory)
        self.cross_attn_token_to_image = SAMAttention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate, operations=operations, **factory
        )
        self.cross_attn_image_to_token = SAMAttention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate, operations=operations, **factory
        )
        self.mlp = nn.Sequential(
            operations.Linear(embedding_dim, mlp_dim, **factory),
            nn.ReLU(),
            operations.Linear(mlp_dim, embedding_dim, **factory),
        )
        self.norm1 = operations.LayerNorm(embedding_dim, **factory)
        self.norm2 = operations.LayerNorm(embedding_dim, **factory)
        self.norm3 = operations.LayerNorm(embedding_dim, **factory)
        self.norm4 = operations.LayerNorm(embedding_dim, **factory)

    def forward(self, queries, keys, query_pe, key_pe):
        # 1) Self-attention over the queries.
        if not self.skip_first_layer_pe:
            q_pos = queries + query_pe
            queries = self.norm1(queries + self.self_attn(q_pos, q_pos, queries))
        else:
            queries = self.norm1(self.self_attn(queries, queries, queries))

        # 2) Queries attend to keys; positional terms go on q/k only, not on the values.
        attn_out = self.cross_attn_token_to_image(queries + query_pe, keys + key_pe, keys)
        queries = self.norm2(queries + attn_out)

        # 3) MLP on the queries.
        queries = self.norm3(queries + self.mlp(queries))

        # 4) Keys attend to the updated queries.
        attn_out = self.cross_attn_image_to_token(keys + key_pe, queries + query_pe, queries)
        keys = self.norm4(keys + attn_out)
        return queries, keys
|
||||
|
||||
|
||||
class TwoWayTransformer(nn.Module):
    """Stack of ``TwoWayAttentionBlock`` layers plus a final query-to-key attention.

    Only the first layer is built with ``skip_first_layer_pe=True``. ``forward``
    returns the updated ``(queries, keys)`` pair.
    """

    def __init__(self, depth=2, embedding_dim=256, num_heads=8, mlp_dim=2048, attention_downsample_rate=2, device=None, dtype=None, operations=None):
        super().__init__()
        blocks = []
        for layer_idx in range(depth):
            blocks.append(TwoWayAttentionBlock(
                embedding_dim, num_heads, mlp_dim, attention_downsample_rate,
                skip_first_layer_pe=(layer_idx == 0),
                device=device, dtype=dtype, operations=operations,
            ))
        self.layers = nn.ModuleList(blocks)
        self.final_attn_token_to_image = SAMAttention(
            embedding_dim, num_heads, downsample_rate=attention_downsample_rate,
            device=device, dtype=dtype, operations=operations,
        )
        self.norm_final = operations.LayerNorm(embedding_dim, device=device, dtype=dtype)

    def forward(self, image_embedding, image_pe, point_embedding):
        queries = point_embedding
        keys = image_embedding
        # The original point embedding doubles as the query positional term in every layer.
        for block in self.layers:
            queries, keys = block(queries, keys, point_embedding, image_pe)

        final_out = self.final_attn_token_to_image(queries + point_embedding, keys + image_pe, keys)
        queries = self.norm_final(queries + final_out)
        return queries, keys
|
||||
|
||||
|
||||
class PositionEmbeddingRandom(nn.Module):
    """Fourier-feature positional encoding built on a random gaussian projection.

    The projection matrix is a ``(2, num_pos_feats)`` buffer drawn once at
    construction and multiplied by ``scale`` (1.0 when ``scale`` is None or falsy).
    Encodings have ``2 * num_pos_feats`` channels: sines followed by cosines.
    """

    def __init__(self, num_pos_feats=64, scale=None):
        super().__init__()
        gain = scale or 1.0
        self.register_buffer("positional_encoding_gaussian_matrix", gain * torch.randn(2, num_pos_feats))

    def _encode(self, normalized_coords):
        """Encode coordinates normalized to [0, 1]; math runs in fp32, result is cast back to the input dtype."""
        input_dtype = normalized_coords.dtype
        gaussian = self.positional_encoding_gaussian_matrix.to(device=normalized_coords.device, dtype=torch.float32)
        # Shift [0, 1] to [-1, 1] before projecting.
        centered = 2 * normalized_coords.float() - 1
        angles = (2 * math.pi * centered) @ gaussian
        features = torch.cat([angles.sin(), angles.cos()], dim=-1)
        return features.to(input_dtype)

    def forward(self, size, device=None):
        """Return the encoding of an ``(h, w)`` grid of cell centres, shaped ``(1, 2 * num_pos_feats, h, w)``."""
        h, w = size
        if device is None:
            device = self.positional_encoding_gaussian_matrix.device
        # Cell centres sit at (index + 0.5) / extent along each axis.
        xs = (torch.arange(w, device=device, dtype=torch.float32) + 0.5) / w
        ys = (torch.arange(h, device=device, dtype=torch.float32) + 0.5) / h
        grid_xy = torch.stack([xs[None, :].expand(h, w), ys[:, None].expand(h, w)], dim=-1)
        return self._encode(grid_xy).permute(2, 0, 1).unsqueeze(0)

    def forward_with_coords(self, pixel_coords, image_size):
        """Encode pixel ``(x, y)`` coordinates; ``image_size`` is ``(height, width)``. The input is not modified."""
        scaled = pixel_coords.clone()
        scaled[:, :, 0].div_(image_size[1])
        scaled[:, :, 1].div_(image_size[0])
        return self._encode(scaled)
|
||||
|
||||
|
||||
# ViTDet backbone + FPN neck
|
||||
|
||||
def window_partition(x: torch.Tensor, window_size: int):
    """Split a ``[B, H, W, C]`` tensor into non-overlapping square windows.

    H and W are zero-padded on the bottom/right up to the next multiple of
    ``window_size``. Returns ``(windows, (Hp, Wp))`` where ``windows`` has shape
    ``[B * num_windows, window_size, window_size, C]`` and ``(Hp, Wp)`` is the
    padded spatial size.
    """
    batch, height, width, channels = x.shape
    # Amount needed to reach the next multiple of window_size (0 if already aligned).
    extra_h = -height % window_size
    extra_w = -width % window_size
    if extra_h or extra_w:
        # F.pad lists the last dimension first: (C, C, W_left, W_right, H_top, H_bottom).
        x = F.pad(x, (0, 0, 0, extra_w, 0, extra_h))

    padded_h = height + extra_h
    padded_w = width + extra_w
    rows = padded_h // window_size
    cols = padded_w // window_size
    tiles = x.view(batch, rows, window_size, cols, window_size, channels)
    tiles = tiles.permute(0, 1, 3, 2, 4, 5).contiguous()
    return tiles.view(-1, window_size, window_size, channels), (padded_h, padded_w)
|
||||
|
||||
|
||||
def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw, hw):
    """Reassemble windows into a ``[B, H, W, C]`` tensor, cropping any padding.

    ``pad_hw`` is the padded ``(Hp, Wp)`` the windows were cut from and ``hw`` is
    the ``(H, W)`` to crop back to.
    """
    padded_h, padded_w = pad_hw
    out_h, out_w = hw
    windows_per_image = padded_h * padded_w // window_size // window_size
    batch = windows.shape[0] // windows_per_image

    rows = padded_h // window_size
    cols = padded_w // window_size
    grid = windows.view(batch, rows, cols, window_size, window_size, -1)
    # Interleave window rows with in-window rows so the spatial axes are contiguous again.
    merged = grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, padded_h, padded_w, -1)

    if padded_h > out_h or padded_w > out_w:
        merged = merged[:, :out_h, :out_w, :].contiguous()
    return merged
|
||||
|
||||
|
||||
def rope_2d(end_x: int, end_y: int, dim: int, theta: float = 10000.0, scale_pos: float = 1.0):
    """Build 2D axial RoPE frequencies with flux ``EmbedND``. Returns [1, 1, HW, dim//2, 2, 2].

    Positions are enumerated row-major over an ``end_x``-wide grid; both the column
    and row index are multiplied by ``scale_pos``, and each axis gets ``dim // 2``
    of the rotary dimensions.
    """
    flat_index = torch.arange(end_x * end_y, dtype=torch.float32)
    col_pos = (flat_index % end_x) * scale_pos
    row_pos = torch.div(flat_index, end_x, rounding_mode="floor") * scale_pos
    ids = torch.stack([col_pos, row_pos], dim=-1)
    embedder = EmbedND(dim=dim, theta=theta, axes_dim=[dim // 2, dim // 2])
    return embedder(ids.unsqueeze(0))
|
||||
|
||||
|
||||
class _ViTMLP(nn.Module):
|
||||
def __init__(self, dim, mlp_ratio=4.0, device=None, dtype=None, operations=None):
|
||||
super().__init__()
|
||||
hidden = int(dim * mlp_ratio)
|
||||
self.fc1 = operations.Linear(dim, hidden, device=device, dtype=dtype)
|
||||
self.act = nn.GELU()
|
||||
self.fc2 = operations.Linear(hidden, dim, device=device, dtype=dtype)
|
||||
|
||||
def forward(self, x):
|
||||
return self.fc2(self.act(self.fc1(x)))
|
||||
|
||||
|
||||
class Attention(nn.Module):
    """ViTDet multi-head attention with fused QKV projection.

    Rotary embeddings are applied to q and k only when the module was built with
    ``use_rope=True`` and ``freqs_cis`` is passed to ``forward``.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=True, use_rope=False, device=None, dtype=None, operations=None):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.use_rope = use_rope
        factory = dict(device=device, dtype=dtype)
        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, **factory)
        self.proj = operations.Linear(dim, dim, **factory)

    def forward(self, x, freqs_cis=None):
        batch, tokens, _ = x.shape
        fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, self.head_dim)
        # Move the q/k/v axis to the front and heads ahead of tokens: each is [B, heads, N, head_dim].
        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(dim=0)
        if freqs_cis is not None and self.use_rope:
            q, k = apply_rope(q, k, freqs_cis)
        attended = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, low_precision_attention=False)
        return self.proj(attended)
|
||||
|
||||
|
||||
class Block(nn.Module):
    """Pre-norm transformer block operating on ``[B, H, W, C]`` feature maps.

    With ``window_size > 0`` attention runs independently inside each
    ``window_size x window_size`` window; with ``window_size == 0`` it runs over
    all ``H * W`` positions at once.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4.0, qkv_bias=True, window_size=0, use_rope=False, device=None, dtype=None, operations=None):
        super().__init__()
        self.window_size = window_size
        factory = dict(device=device, dtype=dtype)
        self.norm1 = operations.LayerNorm(dim, **factory)
        self.attn = Attention(dim, num_heads, qkv_bias, use_rope, operations=operations, **factory)
        self.norm2 = operations.LayerNorm(dim, **factory)
        self.mlp = _ViTMLP(dim, mlp_ratio, operations=operations, **factory)

    def forward(self, x, freqs_cis=None):
        residual = x
        x = self.norm1(x)

        win = self.window_size
        if win > 0:
            height, width = x.shape[1], x.shape[2]
            x, padded_hw = window_partition(x, win)
            # Flatten each window into a token sequence for attention.
            x = x.view(x.shape[0], win * win, -1)
            x = self.attn(x, freqs_cis=freqs_cis)
            x = x.view(-1, win, win, x.shape[-1])
            x = window_unpartition(x, win, padded_hw, (height, width))
        else:
            batch, height, width, channels = x.shape
            x = x.view(batch, height * width, channels)
            x = self.attn(x, freqs_cis=freqs_cis)
            x = x.view(batch, height, width, channels)

        x = residual + x
        return x + self.mlp(self.norm2(x))
|
||||
|
||||
|
||||
class PatchEmbed(nn.Module):
    """Patchify an image with one bias-free convolution whose kernel and stride both equal ``patch_size``."""

    def __init__(self, patch_size=14, in_chans=3, embed_dim=1024, device=None, dtype=None, operations=None):
        super().__init__()
        self.proj = operations.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            bias=False,
            device=device,
            dtype=dtype,
        )

    def forward(self, x):
        patches = self.proj(x)
        return patches
|
||||
|
||||
|
||||
class ViTDet(nn.Module):
    """ViT trunk mixing windowed and global attention blocks.

    Blocks whose index is in ``global_att_blocks`` attend globally; all others use
    ``window_size`` windows. ``pos_embed`` is sized for the pretraining grid plus
    one cls slot and is tiled to the actual grid at run time. ``forward`` takes an
    image batch and returns a ``[B, C, Hp, Wp]`` feature map.
    """

    def __init__(self, img_size=1008, patch_size=14, embed_dim=1024, depth=32, num_heads=16, mlp_ratio=4.625, qkv_bias=True, window_size=24,
                 global_att_blocks=(7, 15, 23, 31), use_rope=True, pretrain_img_size=336, device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.global_att_blocks = set(global_att_blocks)

        factory = dict(device=device, dtype=dtype)
        self.patch_embed = PatchEmbed(patch_size, 3, embed_dim, operations=operations, **factory)

        pretrain_grid = pretrain_img_size // patch_size
        grid_size = img_size // patch_size

        # +1 for cls token
        num_patches = pretrain_grid ** 2 + 1
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim, **factory))

        self.ln_pre = operations.LayerNorm(embed_dim, **factory)

        self.blocks = nn.ModuleList()
        for index in range(depth):
            block_window = 0 if index in self.global_att_blocks else window_size
            self.blocks.append(Block(
                embed_dim, num_heads, mlp_ratio, qkv_bias,
                window_size=block_window,
                use_rope=use_rope,
                operations=operations, **factory,
            ))

        if not use_rope:
            self.freqs_cis = None
            self.freqs_cis_window = None
        else:
            head_dim = embed_dim // num_heads
            # Global positions are rescaled by pretrain_grid / grid_size; window positions are not.
            global_freqs = rope_2d(grid_size, grid_size, head_dim, scale_pos=pretrain_grid / grid_size)
            window_freqs = rope_2d(window_size, window_size, head_dim)
            self.register_buffer("freqs_cis", global_freqs, persistent=False)
            self.register_buffer("freqs_cis_window", window_freqs, persistent=False)

    def _get_pos_embed(self, num_tokens):
        """Return ``pos_embed`` tiled to cover ``num_tokens`` (cls slot + square grid)."""
        table = self.pos_embed
        if table.shape[1] == num_tokens:
            return table

        cls_part, grid_part = table[:, :1], table[:, 1:]
        src_side = int(math.sqrt(grid_part.shape[1]))
        dst_side = src_side if num_tokens <= 1 else int(math.sqrt(num_tokens - 1))

        grid_2d = grid_part.reshape(1, src_side, src_side, -1).permute(0, 3, 1, 2)
        # Repeat the pretraining grid enough times to cover the target, then crop.
        repeats = dst_side // src_side + 1
        tiled = grid_2d.tile([1, 1, repeats, repeats])[:, :, :dst_side, :dst_side]
        flat = tiled.permute(0, 2, 3, 1).reshape(1, dst_side * dst_side, -1)
        return torch.cat([cls_part, flat], dim=1)

    def forward(self, x):
        x = self.patch_embed(x)
        batch, channels, grid_h, grid_w = x.shape
        n_patches = grid_h * grid_w
        x = x.permute(0, 2, 3, 1).reshape(batch, n_patches, channels)

        pos = cast_to_input(self._get_pos_embed(n_patches + 1), x)
        # Drop the cls slot; only the spatial entries are added.
        x = x + pos[:, 1:n_patches + 1]

        x = x.view(batch, grid_h, grid_w, channels)
        x = self.ln_pre(x)

        global_freqs = self.freqs_cis
        if global_freqs is not None:
            global_freqs = cast_to_input(global_freqs, x)
        window_freqs = self.freqs_cis_window
        if window_freqs is not None:
            window_freqs = cast_to_input(window_freqs, x)

        for block in self.blocks:
            block_freqs = window_freqs if block.window_size > 0 else global_freqs
            x = block(x, freqs_cis=block_freqs)

        return x.permute(0, 3, 1, 2)
|
||||
|
||||
|
||||
class FPNScaleConv(nn.Module):
    """One FPN branch: rescale the feature map, then apply 1x1 and 3x3 convolutions.

    Supported ``scale`` values:
      * ``4.0`` - two stride-2 transposed convolutions with a GELU in between
        (channels ``in_dim -> in_dim // 2 -> in_dim // 4``)
      * ``2.0`` - one stride-2 transposed convolution (``in_dim -> in_dim // 2``)
      * ``1.0`` - no resampling
      * ``0.5`` - 2x2 max pooling with stride 2

    Raises:
        ValueError: if ``scale`` is not one of the values above. The original
            code fell through to an ``UnboundLocalError`` on ``proj_in``.
    """

    def __init__(self, in_dim, out_dim, scale, device=None, dtype=None, operations=None):
        super().__init__()
        factory = dict(device=device, dtype=dtype)
        if scale == 4.0:
            self.dconv_2x2_0 = operations.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2, **factory)
            self.dconv_2x2_1 = operations.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2, **factory)
            proj_in = in_dim // 4
        elif scale == 2.0:
            self.dconv_2x2 = operations.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2, **factory)
            proj_in = in_dim // 2
        elif scale == 1.0:
            proj_in = in_dim
        elif scale == 0.5:
            self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
            proj_in = in_dim
        else:
            # Fail here with a clear message rather than on the undefined proj_in below.
            raise ValueError(f"FPNScaleConv: unsupported scale {scale!r}; expected one of 4.0, 2.0, 1.0, 0.5")
        self.scale = scale
        self.conv_1x1 = operations.Conv2d(proj_in, out_dim, kernel_size=1, **factory)
        self.conv_3x3 = operations.Conv2d(out_dim, out_dim, kernel_size=3, padding=1, **factory)

    def forward(self, x):
        if self.scale == 4.0:
            x = F.gelu(self.dconv_2x2_0(x))
            x = self.dconv_2x2_1(x)
        elif self.scale == 2.0:
            x = self.dconv_2x2(x)
        elif self.scale == 0.5:
            x = self.pool(x)
        # scale == 1.0 goes straight to the projection convolutions.
        x = self.conv_1x1(x)
        x = self.conv_3x3(x)
        return x
|
||||
|
||||
|
||||
class PositionEmbeddingSine(nn.Module):
    """2D sinusoidal position encoding (DETR-style) with result caching.

    ``num_pos_feats`` is the total channel count of ``forward``'s output; each
    axis gets half of it, and that half is interleaved as sin/cos pairs. The
    total must therefore be divisible by 4.

    ``forward`` results are cached per ``(H, W, device)`` and returned as an
    expanded view of the cached tensor.
    """

    def __init__(self, num_pos_feats=256, temperature=10000.0, normalize=True, scale=None):
        super().__init__()
        # The per-axis half is itself split into equal sin and cos parts, so a merely
        # even num_pos_feats (e.g. 6) would only fail later inside torch.stack.
        assert num_pos_feats % 4 == 0, "num_pos_feats must be divisible by 4"
        self.half_dim = num_pos_feats // 2
        self.temperature = temperature
        self.normalize = normalize
        self.scale = scale if scale is not None else 2 * math.pi
        self._cache = {}

    def _freqs(self, device):
        """Per-channel divisors; consecutive channel pairs share one frequency."""
        channel = torch.arange(self.half_dim, dtype=torch.float32, device=device)
        return self.temperature ** (2 * (channel // 2) / self.half_dim)

    def _sincos(self, vals):
        """Encode values of any shape to interleaved sin/cos features on a new last axis of size ``half_dim``."""
        raw = vals[..., None] * self.scale / self._freqs(vals.device)
        return torch.stack((raw[..., 0::2].sin(), raw[..., 1::2].cos()), dim=-1).flatten(-2)

    def _encode_xy(self, x, y):
        """Encode normalized x, y coordinates to sinusoidal features. Returns (pos_x, pos_y) each [N, half_dim]."""
        # Uses the same encoder as forward() so the two paths cannot drift apart.
        return self._sincos(x), self._sincos(y)

    def encode_boxes(self, cx, cy, w, h):
        """Encode box center + size to [N, d_model+2] features, ordered (pos_y, pos_x, h, w)."""
        pos_x, pos_y = self._encode_xy(cx, cy)
        return torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)

    def forward(self, x):
        """Return the ``[B, num_pos_feats, H, W]`` encoding for a ``[B, C, H, W]`` input; only its shape and device are used."""
        B, C, H, W = x.shape
        key = (H, W, x.device)
        if key not in self._cache:
            gy = torch.arange(H, dtype=torch.float32, device=x.device)
            gx = torch.arange(W, dtype=torch.float32, device=x.device)
            if self.normalize:
                # The epsilon keeps a size-1 axis from dividing by zero.
                gy, gx = gy / (H - 1 + 1e-6), gx / (W - 1 + 1e-6)
            yy, xx = torch.meshgrid(gy, gx, indexing="ij")
            # Channel order: y features first, then x features.
            self._cache[key] = torch.cat((self._sincos(yy), self._sincos(xx)), dim=-1).permute(2, 0, 1).unsqueeze(0)
        return self._cache[key].expand(B, -1, -1, -1)
|
||||
|
||||
|
||||
class SAM3VisionBackbone(nn.Module):
    """ViTDet trunk followed by FPN necks for the detector and the tracker.

    With ``multiplex=True`` there are three 3-scale necks (``convs``,
    ``propagation_convs``, ``interactive_convs``); otherwise there are two 4-scale
    necks (``convs`` and ``sam2_convs``). ``forward`` always returns the 4-tuple
    ``(features, positions, tracker_features, tracker_positions)``, with ``None``
    in the slots that were not computed.
    """

    def __init__(self, embed_dim=1024, d_model=256, multiplex=False, device=None, dtype=None, operations=None, **kwargs):
        super().__init__()
        self.trunk = ViTDet(embed_dim=embed_dim, device=device, dtype=dtype, operations=operations, **kwargs)
        self.position_encoding = PositionEmbeddingSine(num_pos_feats=d_model, normalize=True)
        self.multiplex = multiplex

        def build_neck(scales):
            return nn.ModuleList([
                FPNScaleConv(embed_dim, d_model, s, device=device, dtype=dtype, operations=operations)
                for s in scales
            ])

        if multiplex:
            scales = [4.0, 2.0, 1.0]
            self.convs = build_neck(scales)
            self.propagation_convs = build_neck(scales)
            self.interactive_convs = build_neck(scales)
        else:
            scales = [4.0, 2.0, 1.0, 0.5]
            self.convs = build_neck(scales)
            self.sam2_convs = build_neck(scales)

    def _neck_outputs(self, neck, trunk_out):
        """Run every conv of ``neck`` on the trunk output and build a matching position encoding per level."""
        feats = [conv(trunk_out) for conv in neck]
        pos = [cast_to_input(self.position_encoding(f), f) for f in feats]
        return feats, pos

    def forward(self, images, need_tracker=False, tracker_mode=None, cached_trunk=None, tracker_only=False):
        trunk_out = self.trunk(images) if cached_trunk is None else cached_trunk

        if tracker_only:
            # Skip detector FPN when only tracker features are needed (video tracking)
            if not self.multiplex:
                neck = self.sam2_convs
            elif tracker_mode == "propagation":
                neck = self.propagation_convs
            else:
                neck = self.interactive_convs
            tracker_features, tracker_positions = self._neck_outputs(neck, trunk_out)
            return None, None, tracker_features, tracker_positions

        features, positions = self._neck_outputs(self.convs, trunk_out)

        # Choose the tracker neck, if any; None means detector outputs only.
        if self.multiplex:
            if tracker_mode == "propagation":
                neck = self.propagation_convs
            elif tracker_mode == "interactive":
                neck = self.interactive_convs
            else:
                neck = None
        else:
            neck = self.sam2_convs if need_tracker else None

        if neck is None:
            return features, positions, None, None

        tracker_features, tracker_positions = self._neck_outputs(neck, trunk_out)
        return features, positions, tracker_features, tracker_positions
|
||||
1785
comfy/ldm/sam3/tracker.py
Normal file
1785
comfy/ldm/sam3/tracker.py
Normal file
File diff suppressed because it is too large
Load Diff
@ -342,6 +342,12 @@ def model_lora_keys_unet(model, key_map={}):
|
||||
key_map["base_model.model.{}".format(key_lora)] = k # Official base model loras
|
||||
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k # LyCORIS/LoKR format
|
||||
|
||||
if isinstance(model, comfy.model_base.ErnieImage):
|
||||
for k in sdk:
|
||||
if k.startswith("diffusion_model.") and k.endswith(".weight"):
|
||||
key_lora = k[len("diffusion_model."):-len(".weight")]
|
||||
key_map["transformer.{}".format(key_lora)] = k
|
||||
|
||||
return key_map
|
||||
|
||||
|
||||
|
||||
@ -52,8 +52,10 @@ import comfy.ldm.qwen_image.model
|
||||
import comfy.ldm.kandinsky5.model
|
||||
import comfy.ldm.anima.model
|
||||
import comfy.ldm.ace.ace_step15
|
||||
import comfy.ldm.cogvideo.model
|
||||
import comfy.ldm.rt_detr.rtdetr_v4
|
||||
import comfy.ldm.ernie.model
|
||||
import comfy.ldm.sam3.detector
|
||||
|
||||
import comfy.model_management
|
||||
import comfy.patcher_extension
|
||||
@ -80,6 +82,7 @@ class ModelType(Enum):
|
||||
IMG_TO_IMG = 9
|
||||
FLOW_COSMOS = 10
|
||||
IMG_TO_IMG_FLOW = 11
|
||||
V_PREDICTION_DDPM = 12
|
||||
|
||||
|
||||
def model_sampling(model_config, model_type):
|
||||
@ -114,6 +117,8 @@ def model_sampling(model_config, model_type):
|
||||
s = comfy.model_sampling.ModelSamplingCosmosRFlow
|
||||
elif model_type == ModelType.IMG_TO_IMG_FLOW:
|
||||
c = comfy.model_sampling.IMG_TO_IMG_FLOW
|
||||
elif model_type == ModelType.V_PREDICTION_DDPM:
|
||||
c = comfy.model_sampling.V_PREDICTION_DDPM
|
||||
|
||||
class ModelSampling(s, c):
|
||||
pass
|
||||
@ -578,8 +583,8 @@ class Stable_Zero123(BaseModel):
|
||||
def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
|
||||
super().__init__(model_config, model_type, device=device)
|
||||
self.cc_projection = comfy.ops.manual_cast.Linear(cc_projection_weight.shape[1], cc_projection_weight.shape[0], dtype=self.get_dtype(), device=device)
|
||||
self.cc_projection.weight.copy_(cc_projection_weight)
|
||||
self.cc_projection.bias.copy_(cc_projection_bias)
|
||||
self.cc_projection.weight = torch.nn.Parameter(cc_projection_weight.clone())
|
||||
self.cc_projection.bias = torch.nn.Parameter(cc_projection_bias.clone())
|
||||
|
||||
def extra_conds(self, **kwargs):
|
||||
out = {}
|
||||
@ -1974,3 +1979,63 @@ class ErnieImage(BaseModel):
|
||||
if cross_attn is not None:
|
||||
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
|
||||
return out
|
||||
|
||||
class SAM3(BaseModel):
    """``BaseModel`` subclass that passes ``comfy.ldm.sam3.detector.SAM3Model`` as ``unet_model``."""

    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        network_cls = comfy.ldm.sam3.detector.SAM3Model
        super().__init__(
            model_config,
            model_type,
            device=device,
            unet_model=network_cls,
        )
|
||||
|
||||
class CogVideoX(BaseModel):
    """``BaseModel`` subclass built around ``CogVideoXTransformer3DModel``.

    ``concat_cond`` builds the extra input channels the transformer expects beyond
    the noise latent; ``extra_conds`` adds the ``ofs`` conditioning when the
    transformer has an OFS projection.
    """

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_DDPM, image_to_video=False, device=None):
        super().__init__(
            model_config,
            model_type,
            device=device,
            unet_model=comfy.ldm.cogvideo.model.CogVideoXTransformer3DModel,
        )
        self.image_to_video = image_to_video

    def concat_cond(self, **kwargs):
        noise = kwargs.get("noise", None)
        # Detect extra channels needed (e.g. 32 - 16 = 16 for ref latent)
        extra_channels = self.diffusion_model.in_channels - noise.shape[1]
        if extra_channels == 0:
            return None

        image = kwargs.get("concat_latent_image", None)
        device = kwargs["device"]

        if image is None:
            # No reference latent supplied: fill the extra channels with zeros.
            zero_shape = list(noise.shape)
            zero_shape[1] = extra_channels
            return torch.zeros(zero_shape, dtype=noise.dtype, layout=noise.layout, device=noise.device)

        latent_dim = self.latent_format.latent_channels
        image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")

        if noise.ndim == 5 and image.ndim == 5:
            have_frames = image.shape[-3]
            want_frames = noise.shape[-3]
            if have_frames < want_frames:
                # Zero-pad at the end of the third-from-last axis.
                image = torch.nn.functional.pad(image, (0, 0, 0, 0, 0, want_frames - have_frames), "constant", 0)
            elif have_frames > want_frames:
                image = image[:, :, :want_frames]

        # Apply process_latent_in to each latent-sized channel group, written back in place.
        for start in range(0, image.shape[1], latent_dim):
            group = slice(start, start + latent_dim)
            image[:, group] = self.process_latent_in(image[:, group])
        image = utils.resize_to_batch_size(image, noise.shape[0])

        have_channels = image.shape[1]
        if have_channels > extra_channels:
            image = image[:, :extra_channels]
        elif have_channels < extra_channels:
            # Repeat the channels to fill the gap, plus a partial copy for any remainder.
            repeats, remainder = divmod(extra_channels, have_channels)
            parts = [image] * repeats
            if remainder > 0:
                parts.append(image[:, :remainder])
            image = torch.cat(parts, dim=1)

        return image

    def extra_conds(self, **kwargs):
        out = super().extra_conds(**kwargs)
        # OFS embedding (CogVideoX 1.5 I2V), default 2.0 as used by SparkVSR
        if self.diffusion_model.ofs_proj_dim is None:
            return out

        ofs = kwargs.get("ofs", None)
        if ofs is None:
            noise = kwargs.get("noise", None)
            ofs = torch.full((noise.shape[0],), 2.0, device=noise.device, dtype=noise.dtype)
        out['ofs'] = comfy.conds.CONDRegular(ofs)
        return out
|
||||
|
||||
@ -490,6 +490,54 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}blocks.0.norm1.linear.weight'.format(key_prefix) in state_dict_keys: # CogVideoX
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "cogvideox"
|
||||
|
||||
# Extract config from weight shapes
|
||||
norm1_weight = state_dict['{}blocks.0.norm1.linear.weight'.format(key_prefix)]
|
||||
time_embed_dim = norm1_weight.shape[1]
|
||||
dim = norm1_weight.shape[0] // 6
|
||||
|
||||
dit_config["num_attention_heads"] = dim // 64
|
||||
dit_config["attention_head_dim"] = 64
|
||||
dit_config["time_embed_dim"] = time_embed_dim
|
||||
dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
|
||||
|
||||
# Detect in_channels from patch_embed
|
||||
patch_proj_key = '{}patch_embed.proj.weight'.format(key_prefix)
|
||||
if patch_proj_key in state_dict_keys:
|
||||
w = state_dict[patch_proj_key]
|
||||
if w.ndim == 4:
|
||||
# Conv2d: [out, in, kh, kw] — CogVideoX 1.0
|
||||
dit_config["in_channels"] = w.shape[1]
|
||||
dit_config["patch_size"] = w.shape[2]
|
||||
elif w.ndim == 2:
|
||||
# Linear: [out, in_channels * patch_size * patch_size * patch_size_t] — CogVideoX 1.5
|
||||
dit_config["patch_size"] = 2
|
||||
dit_config["patch_size_t"] = 2
|
||||
dit_config["in_channels"] = w.shape[1] // (2 * 2 * 2) # 256 // 8 = 32
|
||||
|
||||
text_proj_key = '{}patch_embed.text_proj.weight'.format(key_prefix)
|
||||
if text_proj_key in state_dict_keys:
|
||||
dit_config["text_embed_dim"] = state_dict[text_proj_key].shape[1]
|
||||
|
||||
# Detect OFS embedding
|
||||
ofs_key = '{}ofs_embedding_linear_1.weight'.format(key_prefix)
|
||||
if ofs_key in state_dict_keys:
|
||||
dit_config["ofs_embed_dim"] = state_dict[ofs_key].shape[1]
|
||||
|
||||
# Detect positional embedding type
|
||||
pos_key = '{}patch_embed.pos_embedding'.format(key_prefix)
|
||||
if pos_key in state_dict_keys:
|
||||
dit_config["use_learned_positional_embeddings"] = True
|
||||
dit_config["use_rotary_positional_embeddings"] = False
|
||||
else:
|
||||
dit_config["use_learned_positional_embeddings"] = False
|
||||
dit_config["use_rotary_positional_embeddings"] = True
|
||||
|
||||
return dit_config
|
||||
|
||||
if '{}head.modulation'.format(key_prefix) in state_dict_keys: # Wan 2.1
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "wan2.1"
|
||||
@ -718,6 +766,14 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
|
||||
dit_config["image_model"] = "ernie"
|
||||
return dit_config
|
||||
|
||||
if 'detector.backbone.vision_backbone.trunk.blocks.0.attn.qkv.weight' in state_dict_keys: # SAM3 / SAM3.1
|
||||
if 'detector.transformer.decoder.query_embed.weight' in state_dict_keys:
|
||||
dit_config = {}
|
||||
dit_config["image_model"] = "SAM3"
|
||||
if 'detector.backbone.vision_backbone.propagation_convs.0.conv_1x1.weight' in state_dict_keys:
|
||||
dit_config["image_model"] = "SAM31"
|
||||
return dit_config
|
||||
|
||||
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
|
||||
return None
|
||||
|
||||
@ -873,6 +929,10 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
|
||||
return model_config
|
||||
|
||||
def unet_prefix_from_state_dict(state_dict):
|
||||
# SAM3: detector.* and tracker.* at top level, no common prefix
|
||||
if any(k.startswith("detector.") for k in state_dict) and any(k.startswith("tracker.") for k in state_dict):
|
||||
return ""
|
||||
|
||||
candidates = ["model.diffusion_model.", #ldm/sgm models
|
||||
"model.model.", #audio models
|
||||
"net.", #cosmos
|
||||
|
||||
@ -663,6 +663,7 @@ def minimum_inference_memory():
|
||||
|
||||
def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins_required=0, ram_required=0):
|
||||
cleanup_models_gc()
|
||||
comfy.memory_management.extra_ram_release(max(pins_required, ram_required))
|
||||
unloaded_model = []
|
||||
can_unload = []
|
||||
unloaded_models = []
|
||||
@ -1801,7 +1802,7 @@ def debug_memory_summary():
|
||||
return torch.cuda.memory.memory_summary()
|
||||
return ""
|
||||
|
||||
class InterruptProcessingException(Exception):
|
||||
class InterruptProcessingException(BaseException):
|
||||
pass
|
||||
|
||||
interrupt_processing_mutex = threading.RLock()
|
||||
|
||||
@ -31,6 +31,7 @@ import comfy.float
|
||||
import comfy.hooks
|
||||
import comfy.lora
|
||||
import comfy.model_management
|
||||
import comfy.ops
|
||||
import comfy.patcher_extension
|
||||
import comfy.utils
|
||||
from comfy.comfy_types import UnetWrapperFunction
|
||||
@ -685,9 +686,9 @@ class ModelPatcher:
|
||||
sd.pop(k)
|
||||
return sd
|
||||
|
||||
def patch_weight_to_device(self, key, device_to=None, inplace_update=False, return_weight=False):
|
||||
def patch_weight_to_device(self, key, device_to=None, inplace_update=False, return_weight=False, force_cast=False):
|
||||
weight, set_func, convert_func = get_key_weight(self.model, key)
|
||||
if key not in self.patches:
|
||||
if key not in self.patches and not force_cast:
|
||||
return weight
|
||||
|
||||
inplace_update = self.weight_inplace_update or inplace_update
|
||||
@ -695,7 +696,7 @@ class ModelPatcher:
|
||||
if key not in self.backup and not return_weight:
|
||||
self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
|
||||
|
||||
temp_dtype = comfy.model_management.lora_compute_dtype(device_to)
|
||||
temp_dtype = comfy.model_management.lora_compute_dtype(device_to) if key in self.patches else None
|
||||
if device_to is not None:
|
||||
temp_weight = comfy.model_management.cast_to_device(weight, device_to, temp_dtype, copy=True)
|
||||
else:
|
||||
@ -703,9 +704,10 @@ class ModelPatcher:
|
||||
if convert_func is not None:
|
||||
temp_weight = convert_func(temp_weight, inplace=True)
|
||||
|
||||
out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
|
||||
out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key) if key in self.patches else temp_weight
|
||||
if set_func is None:
|
||||
out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=comfy.utils.string_to_seed(key))
|
||||
if key in self.patches:
|
||||
out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=comfy.utils.string_to_seed(key))
|
||||
if return_weight:
|
||||
return out_weight
|
||||
elif inplace_update:
|
||||
@ -855,7 +857,9 @@ class ModelPatcher:
|
||||
if m.comfy_patched_weights == True:
|
||||
continue
|
||||
|
||||
for param in params:
|
||||
for param, param_value in params.items():
|
||||
if hasattr(m, "comfy_cast_weights") and getattr(param_value, "is_meta", False):
|
||||
comfy.ops.disable_weight_init._zero_init_parameter(m, param)
|
||||
key = key_param_name_to_key(n, param)
|
||||
self.unpin_weight(key)
|
||||
self.patch_weight_to_device(key, device_to=device_to)
|
||||
@ -1584,7 +1588,7 @@ class ModelPatcherDynamic(ModelPatcher):
|
||||
key = key_param_name_to_key(n, param_key)
|
||||
if key in self.backup:
|
||||
comfy.utils.set_attr_param(self.model, key, self.backup[key].weight)
|
||||
self.patch_weight_to_device(key, device_to=device_to)
|
||||
self.patch_weight_to_device(key, device_to=device_to, force_cast=True)
|
||||
weight, _, _ = get_key_weight(self.model, key)
|
||||
if weight is not None:
|
||||
self.model.model_loaded_weight_memory += weight.numel() * weight.element_size()
|
||||
@ -1609,6 +1613,10 @@ class ModelPatcherDynamic(ModelPatcher):
|
||||
m._v = vbar.alloc(v_weight_size)
|
||||
allocated_size += v_weight_size
|
||||
|
||||
for param in params:
|
||||
if param not in ("weight", "bias"):
|
||||
force_load_param(self, param, device_to)
|
||||
|
||||
else:
|
||||
for param in params:
|
||||
key = key_param_name_to_key(n, param)
|
||||
|
||||
@ -54,6 +54,30 @@ class V_PREDICTION(EPS):
|
||||
sigma = reshape_sigma(sigma, model_output.ndim)
|
||||
return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
|
||||
|
||||
class V_PREDICTION_DDPM:
    """CogVideoX v-prediction: the model is fed the raw x_t (unscaled) and predicts velocity v.

    Denoised estimate:
        x_0 = sqrt(alpha) * x_t - sqrt(1-alpha) * v
            = x_t / sqrt(sigma^2 + 1) - v * sigma / sqrt(sigma^2 + 1)
    """

    def calculate_input(self, sigma, noise):
        # The noisy sample is handed to the model unchanged; sigma is not used.
        return noise

    def calculate_denoised(self, sigma, model_output, model_input):
        """Return the x_0 estimate from the model output and the noisy input."""
        sigma = reshape_sigma(sigma, model_output.ndim)
        # Both terms share the same sqrt(sigma^2 + 1) denominator.
        norm = (sigma ** 2 + 1.0) ** 0.5
        return model_input / norm - model_output * sigma / norm

    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
        """Scale `noise` and add `latent_image` to it.

        With `max_denoise` the noise is multiplied by sqrt(1 + sigma^2),
        otherwise by sigma.
        """
        sigma = reshape_sigma(sigma, noise.ndim)
        gain = torch.sqrt(1.0 + sigma ** 2.0) if max_denoise else sigma
        noisy = noise * gain
        # In-place add on the freshly created product; the caller's `noise` is untouched.
        noisy += latent_image
        return noisy

    def inverse_noise_scaling(self, sigma, latent):
        # Identity: the latent is returned as-is.
        return latent
|
||||
|
||||
class EDM(V_PREDICTION):
|
||||
def calculate_denoised(self, sigma, model_output, model_input):
|
||||
sigma = reshape_sigma(sigma, model_output.ndim)
|
||||
|
||||
16
comfy/ops.py
16
comfy/ops.py
@ -79,14 +79,21 @@ def cast_to_input(weight, input, non_blocking=False, copy=True):
|
||||
return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
|
||||
|
||||
|
||||
def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant):
|
||||
def materialize_meta_param(s, param_keys):
    """Replace meta-tensor attributes of `s` with real zero-filled parameters.

    For every name in `param_keys`, if the attribute exists on `s` and reports
    `is_meta`, it is swapped for a `torch.nn.Parameter` of zeros with the same
    shape, dtype and `requires_grad` flag. No device is passed to
    `torch.zeros`. Missing, `None` and non-meta attributes are left untouched.
    """
    for name in param_keys:
        current = getattr(s, name, None)
        if current is None:
            continue
        if not getattr(current, "is_meta", False):
            continue
        zeros = torch.zeros(current.shape, dtype=current.dtype)
        setattr(s, name, torch.nn.Parameter(zeros, requires_grad=current.requires_grad))
|
||||
|
||||
|
||||
def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant):
|
||||
#vbar doesn't support CPU weights, but some custom nodes have weird paths
|
||||
#that might switch the layer to the CPU and expect it to work. We have to take
|
||||
#a clone conservatively as we are mmapped and some SFT files are packed misaligned
|
||||
#If you are a custom node author reading this, please move your layer to the GPU
|
||||
#or declare your ModelPatcher as CPU in the first place.
|
||||
if comfy.model_management.is_device_cpu(device):
|
||||
materialize_meta_param(s, ["weight", "bias"])
|
||||
weight = s.weight.to(dtype=dtype, copy=True)
|
||||
if isinstance(weight, QuantizedTensor):
|
||||
weight = weight.dequantize()
|
||||
@ -108,6 +115,7 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
|
||||
xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
|
||||
|
||||
if not resident:
|
||||
materialize_meta_param(s, ["weight", "bias"])
|
||||
cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ])
|
||||
cast_dest = None
|
||||
|
||||
@ -306,6 +314,12 @@ class CastWeightBiasOp:
|
||||
bias_function = []
|
||||
|
||||
class disable_weight_init:
|
||||
@staticmethod
|
||||
def _zero_init_parameter(module, name):
|
||||
param = getattr(module, name)
|
||||
device = None if getattr(param, "is_meta", False) else param.device
|
||||
setattr(module, name, torch.nn.Parameter(torch.zeros(param.shape, device=device, dtype=param.dtype), requires_grad=False))
|
||||
|
||||
@staticmethod
|
||||
def _lazy_load_from_state_dict(module, state_dict, prefix, local_metadata,
|
||||
missing_keys, unexpected_keys, weight_shape,
|
||||
|
||||
@ -2,7 +2,6 @@ import comfy.model_management
|
||||
import comfy.memory_management
|
||||
import comfy_aimdo.host_buffer
|
||||
import comfy_aimdo.torch
|
||||
import psutil
|
||||
|
||||
from comfy.cli_args import args
|
||||
|
||||
@ -12,11 +11,6 @@ def get_pin(module):
|
||||
def pin_memory(module):
|
||||
if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None:
|
||||
return
|
||||
#FIXME: This is a RAM cache trigger event
|
||||
ram_headroom = comfy.memory_management.RAM_CACHE_HEADROOM
|
||||
#we split the difference and assume half the RAM cache headroom is for us
|
||||
if ram_headroom > 0 and psutil.virtual_memory().available < (ram_headroom * 0.5):
|
||||
comfy.memory_management.extra_ram_release(ram_headroom)
|
||||
|
||||
size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
|
||||
|
||||
|
||||
17
comfy/sd.py
17
comfy/sd.py
@ -18,6 +18,7 @@ import comfy.ldm.wan.vae
|
||||
import comfy.ldm.wan.vae2_2
|
||||
import comfy.ldm.hunyuan3d.vae
|
||||
import comfy.ldm.ace.vae.music_dcae_pipeline
|
||||
import comfy.ldm.cogvideo.vae
|
||||
import comfy.ldm.hunyuan_video.vae
|
||||
import comfy.ldm.mmaudio.vae.autoencoder
|
||||
import comfy.pixel_space_convert
|
||||
@ -478,7 +479,10 @@ class VAE:
|
||||
encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': encoder_config},
|
||||
decoder_config={'target': "comfy.ldm.modules.temporal_ae.VideoDecoder", 'params': decoder_config})
|
||||
elif "taesd_decoder.1.weight" in sd:
|
||||
self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
|
||||
if isinstance(metadata, dict) and "tae_latent_channels" in metadata:
|
||||
self.latent_channels = metadata["tae_latent_channels"]
|
||||
else:
|
||||
self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
|
||||
self.first_stage_model = comfy.taesd.taesd.TAESD(latent_channels=self.latent_channels)
|
||||
elif "vquantizer.codebook.weight" in sd: #VQGan: stage a of stable cascade
|
||||
self.first_stage_model = StageA()
|
||||
@ -652,6 +656,17 @@ class VAE:
|
||||
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
|
||||
self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
|
||||
elif "decoder.conv_in.conv.weight" in sd and "decoder.mid_block.resnets.0.norm1.norm_layer.weight" in sd: # CogVideoX VAE
|
||||
self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
|
||||
self.upscale_index_formula = (4, 8, 8)
|
||||
self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
|
||||
self.downscale_index_formula = (4, 8, 8)
|
||||
self.latent_dim = 3
|
||||
self.latent_channels = sd["encoder.conv_out.conv.weight"].shape[0] // 2
|
||||
self.first_stage_model = comfy.ldm.cogvideo.vae.AutoencoderKLCogVideoX(latent_channels=self.latent_channels)
|
||||
self.memory_used_decode = lambda shape, dtype: (2800 * max(2, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype)
|
||||
self.memory_used_encode = lambda shape, dtype: (1400 * max(1, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
|
||||
self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
|
||||
elif "decoder.conv_in.conv.weight" in sd:
|
||||
ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
|
||||
ddconfig["conv3d"] = True
|
||||
|
||||
@ -27,6 +27,7 @@ import comfy.text_encoders.anima
|
||||
import comfy.text_encoders.ace15
|
||||
import comfy.text_encoders.longcat_image
|
||||
import comfy.text_encoders.ernie
|
||||
import comfy.text_encoders.cogvideo
|
||||
|
||||
from . import supported_models_base
|
||||
from . import latent_formats
|
||||
@ -1781,6 +1782,183 @@ class ErnieImage(supported_models_base.BASE):
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.ernie.ErnieTokenizer, comfy.text_encoders.ernie.te(**hunyuan_detect))
|
||||
|
||||
|
||||
models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage]
|
||||
class SAM3(supported_models_base.BASE):
|
||||
unet_config = {"image_model": "SAM3"}
|
||||
supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
|
||||
text_encoder_key_prefix = ["detector.backbone.language_backbone."]
|
||||
unet_extra_prefix = ""
|
||||
|
||||
models += [SVD_img2vid]
|
||||
def process_clip_state_dict(self, state_dict):
    """Build the text-encoder state dict from the keys stashed on this instance.

    The `state_dict` argument is not read; the source is `self._clip_stash`
    (an empty dict when it has not been set). The stashed keys go through the
    language-backbone prefix replacement and the CLIP text-transformer
    conversion, then entries whose key starts with "resizer." are dropped.
    """
    stashed = getattr(self, "_clip_stash", {})
    prefix_map = {"detector.backbone.language_backbone.": "", "backbone.language_backbone.": ""}
    stripped = utils.state_dict_prefix_replace(stashed, prefix_map, filter_keys=True)
    converted = utils.clip_text_transformers_convert(stripped, "encoder.", "sam3_clip.transformer.")
    kept = {}
    for key, value in converted.items():
        if key.startswith("resizer."):
            continue
        kept[key] = value
    return kept
|
||||
|
||||
def process_unet_state_dict(self, state_dict):
    """Rewrite a SAM3 checkpoint state dict in place and return it.

    Steps, in order:
      1. Move every key containing "language_backbone" (but not "resizer")
         out of `state_dict` into `self._clip_stash`.
      2. Rename "tracker.model.*" keys to "tracker.*" (SAM3.1).
      3. Drop keys containing ".attn.freqs_cis" (SAM3.1; computed dynamically).
      4. Split each fused ".in_proj_weight" / ".in_proj_bias" entry into three
         equal slices along dim 0, stored as q_proj / k_proj / v_proj.
      5. Rename mlp.lin1 / mlp.lin2 / norm_final_attn inside
         "sam_mask_decoder.transformer." keys to match sam.py TwoWayTransformer.
    """
    # 1. Stash the text-encoder weights.
    stash = {}
    for name in list(state_dict.keys()):
        if "language_backbone" in name and "resizer" not in name:
            stash[name] = state_dict.pop(name)
    self._clip_stash = stash

    # 2. SAM3.1: remap tracker.model.* -> tracker.*
    old_prefix = "tracker.model."
    for name in list(state_dict.keys()):
        if not name.startswith(old_prefix):
            continue
        value = state_dict.pop(name)
        state_dict["tracker." + name[len(old_prefix):]] = value

    # 3. SAM3.1: remove per-block freqs_cis buffers (computed dynamically)
    for name in list(state_dict.keys()):
        if ".attn.freqs_cis" in name:
            del state_dict[name]

    # 4. Split fused QKV projections
    fused_suffixes = (".in_proj_weight", ".in_proj_bias")
    for name in [n for n in list(state_dict.keys()) if n.endswith(fused_suffixes)]:
        fused = state_dict.pop(name)
        base, kind = name.rsplit(".in_proj_", 1)
        # `kind` is "weight" or "bias" because of the endswith filter above.
        tail = "." + kind
        third = fused.shape[0] // 3
        state_dict[base + ".q_proj" + tail] = fused[:third]
        state_dict[base + ".k_proj" + tail] = fused[third:2 * third]
        state_dict[base + ".v_proj" + tail] = fused[2 * third:]

    # 5. Remap tracker SAM decoder transformer key names to match sam.py TwoWayTransformer
    renames = ((".mlp.lin1.", ".mlp.0."), (".mlp.lin2.", ".mlp.2."), (".norm_final_attn.", ".norm_final."))
    for name in list(state_dict.keys()):
        if "sam_mask_decoder.transformer." in name:
            renamed = name
            for old, new in renames:
                renamed = renamed.replace(old, new)
            if renamed != name:
                state_dict[renamed] = state_dict.pop(name)
    return state_dict
|
||||
|
||||
def get_model(self, state_dict, prefix="", device=None):
|
||||
return model_base.SAM3(self, device=device)
|
||||
|
||||
def clip_target(self, state_dict={}):
|
||||
import comfy.text_encoders.sam3_clip
|
||||
return supported_models_base.ClipTarget(comfy.text_encoders.sam3_clip.SAM3TokenizerWrapper, comfy.text_encoders.sam3_clip.SAM3ClipModelWrapper)
|
||||
|
||||
|
||||
class SAM31(SAM3):
|
||||
unet_config = {"image_model": "SAM31"}
|
||||
|
||||
|
||||
class CogVideoX_T2V(supported_models_base.BASE):
    """Supported-model entry for CogVideoX text-to-video checkpoints."""

    unet_config = {"image_model": "cogvideox"}

    sampling_settings = {
        "linear_start": 0.00085,
        "linear_end": 0.012,
        "beta_schedule": "linear",
        "zsnr": True,
    }

    unet_extra_config = {}
    latent_format = latent_formats.CogVideoX

    supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]

    vae_key_prefix = ["vae."]
    text_encoder_key_prefix = ["text_encoders."]

    def get_model(self, state_dict, prefix="", device=None):
        """Instantiate the CogVideoX base model for this config.

        When the detected config carries a non-None `patch_size_t`, the
        sample_height / sample_width / sample_frames entries are filled in
        (only if absent) before the model is built.
        """
        # CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
        if self.unet_config.get("patch_size_t") is not None:
            for key, value in (("sample_height", 96), ("sample_width", 170), ("sample_frames", 81)):
                self.unet_config.setdefault(key, value)
        return model_base.CogVideoX(self, device=device)

    def clip_target(self, state_dict={}):
        """Pair the CogVideoX T5 tokenizer with the T5-XXL model from sd3_clip."""
        tokenizer_cls = comfy.text_encoders.cogvideo.CogVideoXT5Tokenizer
        encoder_cls = comfy.text_encoders.sd3_clip.T5XXLModel
        return supported_models_base.ClipTarget(tokenizer_cls, encoder_cls)
|
||||
|
||||
class CogVideoX_I2V(CogVideoX_T2V):
    """Supported-model entry for CogVideoX image-to-video checkpoints (in_channels=32)."""

    unet_config = {"image_model": "cogvideox", "in_channels": 32}

    def get_model(self, state_dict, prefix="", device=None):
        """Instantiate the CogVideoX base model with `image_to_video=True`.

        When the detected config carries a non-None `patch_size_t`, the
        sample_height / sample_width / sample_frames entries are filled in
        (only if absent) before the model is built.
        """
        if self.unet_config.get("patch_size_t") is not None:
            for key, value in (("sample_height", 96), ("sample_width", 170), ("sample_frames", 81)):
                self.unet_config.setdefault(key, value)
        return model_base.CogVideoX(self, image_to_video=True, device=device)
|
||||
|
||||
|
||||
models = [
|
||||
LotusD,
|
||||
Stable_Zero123,
|
||||
SD15_instructpix2pix,
|
||||
SD15,
|
||||
SD20,
|
||||
SD21UnclipL,
|
||||
SD21UnclipH,
|
||||
SDXL_instructpix2pix,
|
||||
SDXLRefiner,
|
||||
SDXL,
|
||||
SSD1B,
|
||||
KOALA_700M,
|
||||
KOALA_1B,
|
||||
Segmind_Vega,
|
||||
SD_X4Upscaler,
|
||||
Stable_Cascade_C,
|
||||
Stable_Cascade_B,
|
||||
SV3D_u,
|
||||
SV3D_p,
|
||||
SD3,
|
||||
StableAudio,
|
||||
AuraFlow,
|
||||
PixArtAlpha,
|
||||
PixArtSigma,
|
||||
HunyuanDiT,
|
||||
HunyuanDiT1,
|
||||
FluxInpaint,
|
||||
Flux,
|
||||
LongCatImage,
|
||||
FluxSchnell,
|
||||
GenmoMochi,
|
||||
LTXV,
|
||||
LTXAV,
|
||||
HunyuanVideo15_SR_Distilled,
|
||||
HunyuanVideo15,
|
||||
HunyuanImage21Refiner,
|
||||
HunyuanImage21,
|
||||
HunyuanVideoSkyreelsI2V,
|
||||
HunyuanVideoI2V,
|
||||
HunyuanVideo,
|
||||
CosmosT2V,
|
||||
CosmosI2V,
|
||||
CosmosT2IPredict2,
|
||||
CosmosI2VPredict2,
|
||||
ZImagePixelSpace,
|
||||
ZImage,
|
||||
Lumina2,
|
||||
WAN22_T2V,
|
||||
WAN21_T2V,
|
||||
WAN21_I2V,
|
||||
WAN21_FunControl2V,
|
||||
WAN21_Vace,
|
||||
WAN21_Camera,
|
||||
WAN22_Camera,
|
||||
WAN22_S2V,
|
||||
WAN21_HuMo,
|
||||
WAN22_Animate,
|
||||
WAN21_FlowRVS,
|
||||
WAN21_SCAIL,
|
||||
Hunyuan3Dv2mini,
|
||||
Hunyuan3Dv2,
|
||||
Hunyuan3Dv2_1,
|
||||
HiDream,
|
||||
Chroma,
|
||||
ChromaRadiance,
|
||||
ACEStep,
|
||||
ACEStep15,
|
||||
Omnigen2,
|
||||
QwenImage,
|
||||
Flux2,
|
||||
Kandinsky5Image,
|
||||
Kandinsky5,
|
||||
Anima,
|
||||
RT_DETR_v4,
|
||||
ErnieImage,
|
||||
SAM3,
|
||||
SAM31,
|
||||
CogVideoX_I2V,
|
||||
CogVideoX_T2V,
|
||||
SVD_img2vid,
|
||||
]
|
||||
|
||||
@ -7,6 +7,7 @@ from tqdm.auto import tqdm
|
||||
from collections import namedtuple, deque
|
||||
|
||||
import comfy.ops
|
||||
import comfy.model_management
|
||||
operations=comfy.ops.disable_weight_init
|
||||
|
||||
DecoderResult = namedtuple("DecoderResult", ("frame", "memory"))
|
||||
@ -47,11 +48,14 @@ class TGrow(nn.Module):
|
||||
x = self.conv(x)
|
||||
return x.reshape(-1, C, H, W)
|
||||
|
||||
def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
|
||||
def apply_model_with_memblocks(model, x, parallel, show_progress_bar, output_device=None,
|
||||
patch_size=1, decode=False):
|
||||
|
||||
B, T, C, H, W = x.shape
|
||||
if parallel:
|
||||
x = x.reshape(B*T, C, H, W)
|
||||
if not decode and patch_size > 1:
|
||||
x = F.pixel_unshuffle(x, patch_size)
|
||||
# parallel over input timesteps, iterate over blocks
|
||||
for b in tqdm(model, disable=not show_progress_bar):
|
||||
if isinstance(b, MemBlock):
|
||||
@ -62,20 +66,27 @@ def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
|
||||
x = b(x, mem)
|
||||
else:
|
||||
x = b(x)
|
||||
BT, C, H, W = x.shape
|
||||
T = BT // B
|
||||
x = x.view(B, T, C, H, W)
|
||||
if decode and patch_size > 1:
|
||||
x = F.pixel_shuffle(x, patch_size)
|
||||
x = x.view(B, x.shape[0] // B, *x.shape[1:])
|
||||
x = x.to(output_device)
|
||||
else:
|
||||
out = []
|
||||
work_queue = deque([TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(B, T * C, H, W).chunk(T, dim=1))])
|
||||
# Chunk along the time dim directly (chunks are [B,1,C,H,W] views, squeeze to [B,C,H,W] views).
|
||||
# Avoids forcing a contiguous copy when x is non-contiguous (e.g. after movedim in encode/decode).
|
||||
work_queue = deque([TWorkItem(xt.squeeze(1), 0) for xt in x.chunk(T, dim=1)])
|
||||
progress_bar = tqdm(range(T), disable=not show_progress_bar)
|
||||
mem = [None] * len(model)
|
||||
while work_queue:
|
||||
xt, i = work_queue.popleft()
|
||||
if i == 0:
|
||||
progress_bar.update(1)
|
||||
if not decode and patch_size > 1:
|
||||
xt = F.pixel_unshuffle(xt, patch_size)
|
||||
if i == len(model):
|
||||
out.append(xt)
|
||||
if decode and patch_size > 1:
|
||||
xt = F.pixel_shuffle(xt, patch_size)
|
||||
out.append(xt.to(output_device))
|
||||
del xt
|
||||
else:
|
||||
b = model[i]
|
||||
@ -165,24 +176,20 @@ class TAEHV(nn.Module):
|
||||
|
||||
def encode(self, x, **kwargs):
|
||||
x = x.movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
|
||||
if self.patch_size > 1:
|
||||
B, T, C, H, W = x.shape
|
||||
x = x.reshape(B * T, C, H, W)
|
||||
x = F.pixel_unshuffle(x, self.patch_size)
|
||||
x = x.reshape(B, T, C * self.patch_size ** 2, H // self.patch_size, W // self.patch_size)
|
||||
if x.shape[1] % self.t_downscale != 0:
|
||||
# pad at end to multiple of t_downscale
|
||||
n_pad = self.t_downscale - x.shape[1] % self.t_downscale
|
||||
padding = x[:, -1:].repeat_interleave(n_pad, dim=1)
|
||||
x = torch.cat([x, padding], 1)
|
||||
x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
|
||||
x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar,
|
||||
patch_size=self.patch_size).movedim(2, 1)
|
||||
return self.process_out(x)
|
||||
|
||||
def decode(self, x, **kwargs):
|
||||
x = x.unsqueeze(0) if x.ndim == 4 else x # [T, C, H, W] -> [1, T, C, H, W]
|
||||
x = x.movedim(1, 2) if x.shape[1] != self.latent_channels else x # [B, T, C, H, W] or [B, C, T, H, W]
|
||||
x = self.process_in(x).movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
|
||||
x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
|
||||
if self.patch_size > 1:
|
||||
x = F.pixel_shuffle(x, self.patch_size)
|
||||
x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar,
|
||||
output_device=comfy.model_management.intermediate_device(),
|
||||
patch_size=self.patch_size, decode=True)
|
||||
return x[:, self.frames_to_trim:].movedim(2, 1)
|
||||
|
||||
@ -17,32 +17,79 @@ class Clamp(nn.Module):
|
||||
return torch.tanh(x / 3) * 3
|
||||
|
||||
class Block(nn.Module):
|
||||
def __init__(self, n_in, n_out):
|
||||
def __init__(self, n_in: int, n_out: int, use_midblock_gn: bool = False):
|
||||
super().__init__()
|
||||
self.conv = nn.Sequential(conv(n_in, n_out), nn.ReLU(), conv(n_out, n_out), nn.ReLU(), conv(n_out, n_out))
|
||||
self.skip = comfy.ops.disable_weight_init.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
|
||||
self.fuse = nn.ReLU()
|
||||
def forward(self, x):
|
||||
if not use_midblock_gn:
|
||||
self.pool = None
|
||||
return
|
||||
n_gn = n_in * 4
|
||||
self.pool = nn.Sequential(
|
||||
comfy.ops.disable_weight_init.Conv2d(n_in, n_gn, 1, bias=False),
|
||||
comfy.ops.disable_weight_init.GroupNorm(4, n_gn),
|
||||
nn.ReLU(inplace=True),
|
||||
comfy.ops.disable_weight_init.Conv2d(n_gn, n_in, 1, bias=False),
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
if self.pool is not None:
|
||||
x = x + self.pool(x)
|
||||
return self.fuse(self.conv(x) + self.skip(x))
|
||||
|
||||
def Encoder(latent_channels=4):
|
||||
return nn.Sequential(
|
||||
conv(3, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, latent_channels),
|
||||
)
|
||||
class Encoder(nn.Sequential):
|
||||
def __init__(self, latent_channels: int = 4, use_gn: bool = False):
|
||||
super().__init__(
|
||||
conv(3, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
|
||||
conv(64, 64, stride=2, bias=False), Block(64, 64, use_gn), Block(64, 64, use_gn), Block(64, 64, use_gn),
|
||||
conv(64, latent_channels),
|
||||
)
|
||||
|
||||
class Decoder(nn.Sequential):
|
||||
def __init__(self, latent_channels: int = 4, use_gn: bool = False):
|
||||
super().__init__(
|
||||
Clamp(), conv(latent_channels, 64), nn.ReLU(),
|
||||
Block(64, 64, use_gn), Block(64, 64, use_gn), Block(64, 64, use_gn), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), conv(64, 3),
|
||||
)
|
||||
|
||||
class DecoderFlux2(Decoder):
|
||||
def __init__(self, latent_channels: int = 128, use_gn: bool = True):
|
||||
if latent_channels != 128 or not use_gn:
|
||||
raise ValueError("Unexpected parameters for Flux2 TAE module")
|
||||
super().__init__(latent_channels=32, use_gn=True)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
B, C, H, W = x.shape
|
||||
x = (
|
||||
x
|
||||
.reshape(B, 32, 2, 2, H, W)
|
||||
.permute(0, 1, 4, 2, 5, 3)
|
||||
.reshape(B, 32, H * 2, W * 2)
|
||||
)
|
||||
return super().forward(x)
|
||||
|
||||
class EncoderFlux2(Encoder):
|
||||
def __init__(self, latent_channels: int = 128, use_gn: bool = True):
|
||||
if latent_channels != 128 or not use_gn:
|
||||
raise ValueError("Unexpected parameters for Flux2 TAE module")
|
||||
super().__init__(latent_channels=32, use_gn=True)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
result = super().forward(x)
|
||||
B, C, H, W = result.shape
|
||||
return (
|
||||
result
|
||||
.reshape(B, C, H // 2, 2, W // 2, 2)
|
||||
.permute(0, 1, 3, 5, 2, 4)
|
||||
.reshape(B, 128, H // 2, W // 2)
|
||||
)
|
||||
|
||||
def Decoder(latent_channels=4):
|
||||
return nn.Sequential(
|
||||
Clamp(), conv(latent_channels, 64), nn.ReLU(),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
|
||||
Block(64, 64), conv(64, 3),
|
||||
)
|
||||
|
||||
class TAESD(nn.Module):
|
||||
latent_magnitude = 3
|
||||
@ -51,8 +98,15 @@ class TAESD(nn.Module):
|
||||
def __init__(self, encoder_path=None, decoder_path=None, latent_channels=4):
|
||||
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
|
||||
super().__init__()
|
||||
self.taesd_encoder = Encoder(latent_channels=latent_channels)
|
||||
self.taesd_decoder = Decoder(latent_channels=latent_channels)
|
||||
if latent_channels == 128:
|
||||
encoder_class = EncoderFlux2
|
||||
decoder_class = DecoderFlux2
|
||||
else:
|
||||
encoder_class = Encoder
|
||||
decoder_class = Decoder
|
||||
self.taesd_encoder = encoder_class(latent_channels=latent_channels)
|
||||
self.taesd_decoder = decoder_class(latent_channels=latent_channels)
|
||||
|
||||
self.vae_scale = torch.nn.Parameter(torch.tensor(1.0))
|
||||
self.vae_shift = torch.nn.Parameter(torch.tensor(0.0))
|
||||
if encoder_path is not None:
|
||||
@ -61,19 +115,19 @@ class TAESD(nn.Module):
|
||||
self.taesd_decoder.load_state_dict(comfy.utils.load_torch_file(decoder_path, safe_load=True))
|
||||
|
||||
@staticmethod
|
||||
def scale_latents(x):
|
||||
def scale_latents(x: torch.Tensor) -> torch.Tensor:
|
||||
"""raw latents -> [0, 1]"""
|
||||
return x.div(2 * TAESD.latent_magnitude).add(TAESD.latent_shift).clamp(0, 1)
|
||||
|
||||
@staticmethod
|
||||
def unscale_latents(x):
|
||||
def unscale_latents(x: torch.Tensor) -> torch.Tensor:
|
||||
"""[0, 1] -> raw latents"""
|
||||
return x.sub(TAESD.latent_shift).mul(2 * TAESD.latent_magnitude)
|
||||
|
||||
def decode(self, x):
|
||||
def decode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x_sample = self.taesd_decoder((x - self.vae_shift) * self.vae_scale)
|
||||
x_sample = x_sample.sub(0.5).mul(2)
|
||||
return x_sample
|
||||
|
||||
def encode(self, x):
|
||||
def encode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return (self.taesd_encoder(x * 0.5 + 0.5) / self.vae_scale) + self.vae_shift
|
||||
|
||||
6
comfy/text_encoders/cogvideo.py
Normal file
6
comfy/text_encoders/cogvideo.py
Normal file
@ -0,0 +1,6 @@
|
||||
import comfy.text_encoders.sd3_clip
|
||||
|
||||
|
||||
class CogVideoXT5Tokenizer(comfy.text_encoders.sd3_clip.T5XXLTokenizer):
    """T5-XXL tokenizer configured with `min_length=226` for CogVideoX."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            min_length=226,
        )
|
||||
97
comfy/text_encoders/sam3_clip.py
Normal file
97
comfy/text_encoders/sam3_clip.py
Normal file
@ -0,0 +1,97 @@
|
||||
import re
|
||||
from comfy import sd1_clip
|
||||
|
||||
SAM3_CLIP_CONFIG = {
|
||||
"architectures": ["CLIPTextModel"],
|
||||
"hidden_act": "quick_gelu",
|
||||
"hidden_size": 1024,
|
||||
"intermediate_size": 4096,
|
||||
"num_attention_heads": 16,
|
||||
"num_hidden_layers": 24,
|
||||
"max_position_embeddings": 32,
|
||||
"projection_dim": 512,
|
||||
"vocab_size": 49408,
|
||||
"layer_norm_eps": 1e-5,
|
||||
"eos_token_id": 49407,
|
||||
}
|
||||
|
||||
|
||||
class SAM3ClipModel(sd1_clip.SDClipModel):
    """SDClipModel built from SAM3_CLIP_CONFIG: 32-token context, last-layer output, attention masks enabled and returned."""

    def __init__(self, device="cpu", dtype=None, model_options={}):
        token_ids = {"start": 49406, "end": 49407, "pad": 0}
        super().__init__(
            device=device,
            dtype=dtype,
            max_length=32,
            layer="last",
            textmodel_json_config=SAM3_CLIP_CONFIG,
            special_tokens=token_ids,
            return_projected_pooled=False,
            return_attention_masks=True,
            enable_attention_masks=True,
            model_options=model_options,
        )
|
||||
|
||||
|
||||
class SAM3Tokenizer(sd1_clip.SDTokenizer):
    """SDTokenizer set up for SAM3: 32 tokens, padded with token 0 rather than the end token."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            max_length=32,
            pad_with_end=False,
            pad_token=0,
            embedding_directory=embedding_directory,
            embedding_size=1024,
            embedding_key="sam3_clip",
            tokenizer_data=tokenizer_data,
        )
        # Set after the base initializer so it overrides whatever the base class assigned.
        self.disable_weights = True
|
||||
|
||||
|
||||
def _parse_prompts(text):
|
||||
"""Split comma-separated prompts with optional :N max detections per category"""
|
||||
text = text.replace("(", "").replace(")", "")
|
||||
parts = [p.strip() for p in text.split(",") if p.strip()]
|
||||
result = []
|
||||
for part in parts:
|
||||
m = re.match(r'^(.+?)\s*:\s*([\d.]+)\s*$', part)
|
||||
if m:
|
||||
text_part = m.group(1).strip()
|
||||
val = m.group(2)
|
||||
max_det = max(1, round(float(val)))
|
||||
result.append((text_part, max_det))
|
||||
else:
|
||||
result.append((part, 1))
|
||||
return result
|
||||
|
||||
|
||||
class SAM3TokenizerWrapper(sd1_clip.SD1Tokenizer):
    """SD1Tokenizer wrapper that adds multi-prompt ("a, b:3") tokenization for SAM3."""

    def __init__(self, embedding_directory=None, tokenizer_data={}):
        super().__init__(
            embedding_directory=embedding_directory,
            tokenizer_data=tokenizer_data,
            clip_name="l",
            tokenizer=SAM3Tokenizer,
            name="sam3_clip",
        )

    def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
        """Tokenize `text`, keeping per-prompt token batches when it holds several prompts.

        A text that parses to nothing, or to a single prompt with a detection
        count of 1, is tokenized by the base class unchanged. Otherwise each
        parsed prompt is tokenized separately, and the result carries the first
        prompt's tokens under the clip name plus the full list of
        (batches, max_detections) pairs under "sam3_per_prompt".
        """
        prompts = _parse_prompts(text)
        is_plain = (not prompts) or (len(prompts) == 1 and prompts[0][1] == 1)
        if is_plain:
            return super().tokenize_with_weights(text, return_word_ids, **kwargs)

        # Tokenize each prompt part separately, store per-part batches and metadata
        inner = getattr(self, self.clip)
        per_prompt = [
            (inner.tokenize_with_weights(prompt_text, return_word_ids, **kwargs), max_det)
            for prompt_text, max_det in prompts
        ]
        # Main output uses first prompt's tokens (for compatibility)
        first_batches = per_prompt[0][0]
        return {self.clip_name: first_batches, "sam3_per_prompt": per_prompt}
|
||||
|
||||
|
||||
class SAM3ClipModelWrapper(sd1_clip.SD1ClipModel):
    """SD1ClipModel wrapper that encodes SAM3 multi-prompt token dicts."""

    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
        # Extra keyword arguments are accepted but not forwarded.
        super().__init__(device=device, dtype=dtype, model_options=model_options, clip_name="l", clip_model=SAM3ClipModel, name="sam3_clip")

    def encode_token_weights(self, token_weight_pairs):
        """Encode tokens, handling the optional "sam3_per_prompt" entry.

        Without that entry the call is delegated to the base class. With it,
        every (batches, max_detections) pair is encoded separately by the
        inner clip model.

        Returns a tuple (cond, pooled, extra) taken from the first prompt.
        `extra["sam3_multi_cond"]` lists one dict per prompt with keys "cond",
        "attention_mask" and "max_detections"; `extra["attention_mask"]` is
        set only when the first prompt produced one.
        """
        # Read without removing: popping here mutated the caller's token dict, so
        # encoding the same tokens a second time silently lost the multi-prompt data.
        per_prompt = token_weight_pairs.get("sam3_per_prompt")
        if per_prompt is None:
            return super().encode_token_weights(token_weight_pairs)

        # Encode each prompt separately, pack into extra dict
        inner = getattr(self, self.clip)
        multi_cond = []
        first_pooled = None
        for batches, max_det in per_prompt:
            out = inner.encode_token_weights(batches)
            cond, pooled = out[0], out[1]
            extra = out[2] if len(out) > 2 else {}
            if first_pooled is None:
                first_pooled = pooled
            multi_cond.append({
                "cond": cond,
                "attention_mask": extra.get("attention_mask"),
                "max_detections": max_det,
            })

        # Return first prompt as main (for non-SAM3 consumers), all prompts in metadata
        main = multi_cond[0]
        main_extra = {}
        if main["attention_mask"] is not None:
            main_extra["attention_mask"] = main["attention_mask"]
        main_extra["sam3_multi_cond"] = multi_cond
        return (main["cond"], first_pooled, main_extra)
|
||||
@ -9,6 +9,7 @@ from comfy_api.latest._input import (
|
||||
CurveInput,
|
||||
MonotoneCubicCurve,
|
||||
LinearCurve,
|
||||
RangeInput,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
@ -21,4 +22,5 @@ __all__ = [
|
||||
"CurveInput",
|
||||
"MonotoneCubicCurve",
|
||||
"LinearCurve",
|
||||
"RangeInput",
|
||||
]
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
from .basic_types import ImageInput, AudioInput, MaskInput, LatentInput
|
||||
from .curve_types import CurvePoint, CurveInput, MonotoneCubicCurve, LinearCurve
|
||||
from .range_types import RangeInput
|
||||
from .video_types import VideoInput
|
||||
|
||||
__all__ = [
|
||||
@ -12,4 +13,5 @@ __all__ = [
|
||||
"CurveInput",
|
||||
"MonotoneCubicCurve",
|
||||
"LinearCurve",
|
||||
"RangeInput",
|
||||
]
|
||||
|
||||
70
comfy_api/latest/_input/range_types.py
Normal file
70
comfy_api/latest/_input/range_types.py
Normal file
@ -0,0 +1,70 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RangeInput:
    """A levels/range adjustment: input range [min, max] plus an optional
    midpoint that acts as a gamma control.

    `to_lut` produces a 1D lookup table following GIMP's levels mapping:
      1. Normalize the input to [0, 1] using [min, max]
      2. Clamp to [0, 1]
      3. Apply gamma correction: pow(value, 1/gamma)

    `midpoint` is a position in [0, 1] saying where the midtone falls inside
    [min, max]. It is converted to gamma with
        gamma = -log2(midpoint)
    so midpoint=0.5 → gamma=1.0 (linear).
    """

    def __init__(self, min_val: float, max_val: float, midpoint: float | None = None):
        self.min_val = min_val
        self.max_val = max_val
        self.midpoint = midpoint

    @staticmethod
    def from_raw(data) -> RangeInput:
        """Coerce `data` to a RangeInput.

        A RangeInput is returned unchanged. A dict is read through its "min"
        (default 0.0), "max" (default 1.0) and optional "midpoint" keys, each
        converted with float(). Anything else raises TypeError.
        """
        if isinstance(data, RangeInput):
            return data
        if not isinstance(data, dict):
            raise TypeError(f"Cannot convert {type(data)} to RangeInput")
        raw_mid = data.get("midpoint")
        return RangeInput(
            min_val=float(data.get("min", 0.0)),
            max_val=float(data.get("max", 1.0)),
            midpoint=None if raw_mid is None else float(data["midpoint"]),
        )

    def to_lut(self, size: int = 256) -> np.ndarray:
        """Generate a float64 lookup table with `size` entries that maps
        normalized input values (0..1) to output values (0..1) through this
        levels adjustment.
        """
        grid = np.linspace(0.0, 1.0, size, dtype=np.float64)

        span = self.max_val - self.min_val
        if abs(span) < 1e-10:
            # Degenerate range: hard step at min_val.
            return np.where(grid >= self.min_val, 1.0, 0.0).astype(np.float64)

        # Normalize: map [min, max] → [0, 1]
        lut = np.clip((grid - self.min_val) / span, 0.0, 1.0)

        # Gamma correction from midpoint; skipped for a missing, non-positive
        # or exactly neutral (0.5) midpoint.
        mid = self.midpoint
        if mid is None or not mid > 0 or mid == 0.5:
            return lut

        # Gamma is floored at 0.001 so the exponent stays finite and positive.
        gamma = max(-math.log2(mid), 0.001)
        exponent = 1.0 / gamma
        positive = lut > 0
        lut[positive] = np.power(lut[positive], exponent)
        return lut

    def __repr__(self) -> str:
        mid = "" if self.midpoint is None else f", midpoint={self.midpoint}"
        return f"RangeInput(min={self.min_val}, max={self.max_val}{mid})"
|
||||
@ -12,6 +12,7 @@ import numpy as np
|
||||
import math
|
||||
import torch
|
||||
from .._util import VideoContainer, VideoCodec, VideoComponents
|
||||
import logging
|
||||
|
||||
|
||||
def container_to_output_format(container_format: str | None) -> str | None:
|
||||
@ -238,64 +239,125 @@ class VideoFromFile(VideoInput):
|
||||
start_time = max(self._get_raw_duration() + self.__start_time, 0)
|
||||
else:
|
||||
start_time = self.__start_time
|
||||
|
||||
# Get video frames
|
||||
frames = []
|
||||
audio_frames = []
|
||||
alphas = None
|
||||
start_pts = int(start_time / video_stream.time_base)
|
||||
end_pts = int((start_time + self.__duration) / video_stream.time_base)
|
||||
container.seek(start_pts, stream=video_stream)
|
||||
for frame in container.decode(video_stream):
|
||||
if frame.pts < start_pts:
|
||||
continue
|
||||
if self.__duration and frame.pts >= end_pts:
|
||||
break
|
||||
img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3)
|
||||
img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3)
|
||||
frames.append(img)
|
||||
|
||||
images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)
|
||||
if start_pts != 0:
|
||||
container.seek(start_pts, stream=video_stream)
|
||||
|
||||
image_format = 'gbrpf32le'
|
||||
process_image_format = lambda a: a
|
||||
audio = None
|
||||
|
||||
streams = [video_stream]
|
||||
has_first_audio_frame = False
|
||||
checked_alpha = False
|
||||
|
||||
# Default to False so we decode until EOF if duration is 0
|
||||
video_done = False
|
||||
audio_done = True
|
||||
|
||||
if len(container.streams.audio):
|
||||
audio_stream = container.streams.audio[-1]
|
||||
streams += [audio_stream]
|
||||
resampler = av.audio.resampler.AudioResampler(format='fltp')
|
||||
audio_done = False
|
||||
|
||||
for packet in container.demux(*streams):
|
||||
if video_done and audio_done:
|
||||
break
|
||||
|
||||
if packet.stream.type == "video":
|
||||
if video_done:
|
||||
continue
|
||||
try:
|
||||
for frame in packet.decode():
|
||||
if frame.pts < start_pts:
|
||||
continue
|
||||
if self.__duration and frame.pts >= end_pts:
|
||||
video_done = True
|
||||
break
|
||||
|
||||
if not checked_alpha:
|
||||
alpha_channel = False
|
||||
for comp in frame.format.components:
|
||||
if comp.is_alpha or frame.format.name == "pal8":
|
||||
alphas = []
|
||||
alpha_channel = True
|
||||
break
|
||||
if frame.format.name in ("yuvj420p", "yuvj422p", "yuvj444p", "rgb24", "rgba", "pal8"):
|
||||
process_image_format = lambda a: a.float() / 255.0
|
||||
if alpha_channel:
|
||||
image_format = 'rgba'
|
||||
else:
|
||||
image_format = 'rgb24'
|
||||
else:
|
||||
process_image_format = lambda a: a
|
||||
if alpha_channel:
|
||||
image_format = 'gbrapf32le'
|
||||
else:
|
||||
image_format = 'gbrpf32le'
|
||||
|
||||
checked_alpha = True
|
||||
|
||||
img = frame.to_ndarray(format=image_format) # shape: (H, W, 4)
|
||||
if frame.rotation != 0:
|
||||
k = int(round(frame.rotation // 90))
|
||||
img = np.rot90(img, k=k, axes=(0, 1)).copy()
|
||||
if alphas is None:
|
||||
frames.append(torch.from_numpy(img))
|
||||
else:
|
||||
frames.append(torch.from_numpy(img[..., :-1]))
|
||||
alphas.append(torch.from_numpy(img[..., -1:]))
|
||||
except av.error.InvalidDataError:
|
||||
logging.info("pyav decode error")
|
||||
|
||||
elif packet.stream.type == "audio":
|
||||
if audio_done:
|
||||
continue
|
||||
|
||||
aframes = itertools.chain.from_iterable(
|
||||
map(resampler.resample, packet.decode())
|
||||
)
|
||||
for frame in aframes:
|
||||
if self.__duration and frame.time > start_time + self.__duration:
|
||||
audio_done = True
|
||||
break
|
||||
|
||||
if not has_first_audio_frame:
|
||||
offset_seconds = start_time - frame.pts * audio_stream.time_base
|
||||
to_skip = max(0, int(offset_seconds * audio_stream.sample_rate))
|
||||
if to_skip < frame.samples:
|
||||
has_first_audio_frame = True
|
||||
audio_frames.append(frame.to_ndarray()[..., to_skip:])
|
||||
else:
|
||||
audio_frames.append(frame.to_ndarray())
|
||||
|
||||
images = process_image_format(torch.stack(frames)) if len(frames) > 0 else torch.zeros(0, 0, 0, 3)
|
||||
if alphas is not None:
|
||||
alphas = process_image_format(torch.stack(alphas)) if len(alphas) > 0 else torch.zeros(0, 0, 0, 1)
|
||||
|
||||
# Get frame rate
|
||||
frame_rate = Fraction(video_stream.average_rate) if video_stream.average_rate else Fraction(1)
|
||||
|
||||
# Get audio if available
|
||||
audio = None
|
||||
container.seek(start_pts, stream=video_stream)
|
||||
# Use last stream for consistency
|
||||
if len(container.streams.audio):
|
||||
audio_stream = container.streams.audio[-1]
|
||||
audio_frames = []
|
||||
resample = av.audio.resampler.AudioResampler(format='fltp').resample
|
||||
frames = itertools.chain.from_iterable(
|
||||
map(resample, container.decode(audio_stream))
|
||||
)
|
||||
if len(audio_frames) > 0:
|
||||
audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples)
|
||||
if self.__duration:
|
||||
audio_data = audio_data[..., :int(self.__duration * audio_stream.sample_rate)]
|
||||
|
||||
has_first_frame = False
|
||||
for frame in frames:
|
||||
offset_seconds = start_time - frame.pts * audio_stream.time_base
|
||||
to_skip = max(0, int(offset_seconds * audio_stream.sample_rate))
|
||||
if to_skip < frame.samples:
|
||||
has_first_frame = True
|
||||
break
|
||||
if has_first_frame:
|
||||
audio_frames.append(frame.to_ndarray()[..., to_skip:])
|
||||
|
||||
for frame in frames:
|
||||
if self.__duration and frame.time > start_time + self.__duration:
|
||||
break
|
||||
audio_frames.append(frame.to_ndarray()) # shape: (channels, samples)
|
||||
if len(audio_frames) > 0:
|
||||
audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples)
|
||||
if self.__duration:
|
||||
audio_data = audio_data[..., :int(self.__duration * audio_stream.sample_rate)]
|
||||
|
||||
audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples)
|
||||
audio = AudioInput({
|
||||
"waveform": audio_tensor,
|
||||
"sample_rate": int(audio_stream.sample_rate) if audio_stream.sample_rate else 1,
|
||||
})
|
||||
audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples)
|
||||
audio = AudioInput({
|
||||
"waveform": audio_tensor,
|
||||
"sample_rate": int(audio_stream.sample_rate) if audio_stream.sample_rate else 1,
|
||||
})
|
||||
|
||||
metadata = container.metadata
|
||||
return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)
|
||||
return VideoComponents(images=images, alpha=alphas, audio=audio, frame_rate=frame_rate, metadata=metadata)
|
||||
|
||||
def get_components(self) -> VideoComponents:
|
||||
if isinstance(self.__file, io.BytesIO):
|
||||
|
||||
@ -1266,6 +1266,43 @@ class Histogram(ComfyTypeIO):
|
||||
Type = list[int]
|
||||
|
||||
|
||||
# IO type registered under the "RANGE" io_type; values are typed as RangeInput for type checkers.
@comfytype(io_type="RANGE")
class Range(ComfyTypeIO):
    from comfy_api.input import RangeInput
    if TYPE_CHECKING:
        Type = RangeInput

    # Widget input for a range value, plus optional display hints that are
    # forwarded to the consumer of as_dict().
    class Input(WidgetInput):
        def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
                     socketless: bool=True, default: dict=None,
                     display: str=None,
                     gradient_stops: list=None,
                     show_midpoint: bool=None,
                     midpoint_scale: str=None,
                     value_min: float=None,
                     value_max: float=None,
                     advanced: bool=None):
            # Arguments are passed positionally; the meaning of the None placeholders
            # depends on WidgetInput.__init__, which is not visible here — TODO confirm.
            super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
            if default is None:
                # No default supplied: fall back to the 0.0..1.0 range.
                self.default = {"min": 0.0, "max": 1.0}
            self.display = display
            self.gradient_stops = gradient_stops
            self.show_midpoint = show_midpoint
            self.midpoint_scale = midpoint_scale
            self.value_min = value_min
            self.value_max = value_max

        def as_dict(self):
            # Base dict merged with the range-specific options; prune_dict presumably
            # drops the entries that were left unset (None) — confirm against its definition.
            return super().as_dict() | prune_dict({
                "display": self.display,
                "gradient_stops": self.gradient_stops,
                "show_midpoint": self.show_midpoint,
                "midpoint_scale": self.midpoint_scale,
                "value_min": self.value_min,
                "value_max": self.value_max,
            })
|
||||
|
||||
|
||||
DYNAMIC_INPUT_LOOKUP: dict[str, Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]] = {}
|
||||
def register_dynamic_input_func(io_type: str, func: Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]):
|
||||
DYNAMIC_INPUT_LOOKUP[io_type] = func
|
||||
@ -2276,5 +2313,6 @@ __all__ = [
|
||||
"BoundingBox",
|
||||
"Curve",
|
||||
"Histogram",
|
||||
"Range",
|
||||
"NodeReplace",
|
||||
]
|
||||
|
||||
@ -3,7 +3,7 @@ from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from fractions import Fraction
|
||||
from typing import Optional
|
||||
from .._input import ImageInput, AudioInput
|
||||
from .._input import ImageInput, AudioInput, MaskInput
|
||||
|
||||
class VideoCodec(str, Enum):
|
||||
AUTO = "auto"
|
||||
@ -48,5 +48,4 @@ class VideoComponents:
|
||||
frame_rate: Fraction
|
||||
audio: Optional[AudioInput] = None
|
||||
metadata: Optional[dict] = None
|
||||
|
||||
|
||||
alpha: Optional[MaskInput] = None
|
||||
|
||||
@ -122,6 +122,46 @@ class TaskStatusResponse(BaseModel):
|
||||
usage: TaskStatusUsage | None = Field(None)
|
||||
|
||||
|
||||
# Response body describing a single Seedance asset.
# Callers in the ByteDance nodes compare `status` against "Active" / "Failed"
# and `asset_type` against "Image" / "Video" / "Audio".
class GetAssetResponse(BaseModel):
    id: str = Field(...)
    name: str | None = Field(None)
    url: str | None = Field(None)
    asset_type: str = Field(...)
    group_id: str = Field(...)
    status: str = Field(...)
    # Optional error details; callers read its `code` and `message` when the asset is not Active.
    error: TaskStatusError | None = Field(None)
|
||||
|
||||
|
||||
# Response to creating a visual-validate (face verification) session.
class SeedanceCreateVisualValidateSessionResponse(BaseModel):
    session_id: str = Field(...)
    # Link the user is asked to open in a browser to complete the verification.
    h5_link: str = Field(...)
|
||||
|
||||
|
||||
# Polled state of a visual-validate session.
# The polling caller treats status "completed" as success and "failed" as failure.
class SeedanceGetVisualValidateSessionResponse(BaseModel):
    session_id: str = Field(...)
    status: str = Field(...)
    # Optional in the schema; the polling caller requires it once the session completes.
    group_id: str | None = Field(None)
    error_code: str | None = Field(None)
    error_message: str | None = Field(None)
|
||||
|
||||
|
||||
# Request body for registering an asset (by URL) inside an asset group.
class SeedanceCreateAssetRequest(BaseModel):
    group_id: str = Field(...)
    url: str = Field(...)
    asset_type: str = Field(...)
    # Optional display name, limited to 64 characters by validation.
    name: str | None = Field(None, max_length=64)
    project_name: str | None = Field(None)
|
||||
|
||||
|
||||
# Response to an asset creation request: carries only the id of the asset.
class SeedanceCreateAssetResponse(BaseModel):
    asset_id: str = Field(...)
|
||||
|
||||
|
||||
# Request body for adding an image to the virtual library; `hash` doubles as a
# dedup key (see the field descriptions below).
class SeedanceVirtualLibraryCreateAssetRequest(BaseModel):
    url: str = Field(..., description="Publicly accessible URL of the image asset to upload.")
    hash: str = Field(..., description="Dedup key. Re-submitting the same hash returns the existing asset id.")
|
||||
|
||||
|
||||
# Dollars per 1K tokens, keyed by (model_id, has_video_input).
|
||||
SEEDANCE2_PRICE_PER_1K_TOKENS = {
|
||||
("dreamina-seedance-2-0-260128", False): 0.007,
|
||||
|
||||
@ -118,7 +118,7 @@ class Wan27ReferenceVideoInputField(BaseModel):
|
||||
class Wan27ReferenceVideoParametersField(BaseModel):
|
||||
resolution: str = Field(...)
|
||||
ratio: str | None = Field(None)
|
||||
duration: int = Field(5, ge=2, le=10)
|
||||
duration: int = Field(5, ge=2, le=15)
|
||||
watermark: bool = Field(False)
|
||||
seed: int = Field(..., ge=0, le=2147483647)
|
||||
|
||||
@ -157,7 +157,7 @@ class Wan27VideoEditInputField(BaseModel):
|
||||
class Wan27VideoEditParametersField(BaseModel):
|
||||
resolution: str = Field(...)
|
||||
ratio: str | None = Field(None)
|
||||
duration: int = Field(0)
|
||||
duration: int | None = Field(0)
|
||||
audio_setting: str = Field("auto")
|
||||
watermark: bool = Field(False)
|
||||
seed: int = Field(..., ge=0, le=2147483647)
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import math
|
||||
import re
|
||||
|
||||
import torch
|
||||
from typing_extensions import override
|
||||
@ -11,9 +13,15 @@ from comfy_api_nodes.apis.bytedance import (
|
||||
SEEDANCE2_PRICE_PER_1K_TOKENS,
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS,
|
||||
VIDEO_TASKS_EXECUTION_TIME,
|
||||
GetAssetResponse,
|
||||
Image2VideoTaskCreationRequest,
|
||||
ImageTaskCreationResponse,
|
||||
Seedance2TaskCreationRequest,
|
||||
SeedanceCreateAssetRequest,
|
||||
SeedanceCreateAssetResponse,
|
||||
SeedanceCreateVisualValidateSessionResponse,
|
||||
SeedanceGetVisualValidateSessionResponse,
|
||||
SeedanceVirtualLibraryCreateAssetRequest,
|
||||
Seedream4Options,
|
||||
Seedream4TaskCreationRequest,
|
||||
TaskAudioContent,
|
||||
@ -44,10 +52,16 @@ from comfy_api_nodes.util import (
|
||||
validate_image_aspect_ratio,
|
||||
validate_image_dimensions,
|
||||
validate_string,
|
||||
validate_video_dimensions,
|
||||
validate_video_duration,
|
||||
)
|
||||
from server import PromptServer
|
||||
|
||||
BYTEPLUS_IMAGE_ENDPOINT = "/proxy/byteplus/api/v3/images/generations"
|
||||
|
||||
# Overall time budget and polling cadence (both in seconds) used when waiting
# for the visual-validate session to finish.
_VERIFICATION_POLL_TIMEOUT_SEC = 120
_VERIFICATION_POLL_INTERVAL_SEC = 3
|
||||
|
||||
SEEDREAM_MODELS = {
|
||||
"seedream 5.0 lite": "seedream-5-0-260128",
|
||||
"seedream-4-5-251128": "seedream-4-5-251128",
|
||||
@ -96,6 +110,193 @@ def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: st
|
||||
)
|
||||
|
||||
|
||||
async def _resolve_reference_assets(
|
||||
cls: type[IO.ComfyNode],
|
||||
asset_ids: list[str],
|
||||
) -> tuple[dict[str, str], dict[str, str], dict[str, str]]:
|
||||
"""Look up each asset, validate Active status, group by asset_type.
|
||||
|
||||
Returns (image_assets, video_assets, audio_assets), each mapping asset_id -> "asset://<asset_id>".
|
||||
"""
|
||||
image_assets: dict[str, str] = {}
|
||||
video_assets: dict[str, str] = {}
|
||||
audio_assets: dict[str, str] = {}
|
||||
for i, raw_id in enumerate(asset_ids, 1):
|
||||
asset_id = (raw_id or "").strip()
|
||||
if not asset_id:
|
||||
continue
|
||||
result = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path=f"/proxy/seedance/assets/{asset_id}"),
|
||||
response_model=GetAssetResponse,
|
||||
)
|
||||
if result.status != "Active":
|
||||
extra = f" {result.error.code}: {result.error.message}" if result.error else ""
|
||||
raise ValueError(f"Reference asset {i} (Id={asset_id}) is not Active (Status={result.status}).{extra}")
|
||||
asset_uri = f"asset://{asset_id}"
|
||||
if result.asset_type == "Image":
|
||||
image_assets[asset_id] = asset_uri
|
||||
elif result.asset_type == "Video":
|
||||
video_assets[asset_id] = asset_uri
|
||||
elif result.asset_type == "Audio":
|
||||
audio_assets[asset_id] = asset_uri
|
||||
return image_assets, video_assets, audio_assets
|
||||
|
||||
|
||||
# Whole-word, case-insensitive match of "asset" + optional single space + 1-2 digits
# (e.g. "asset1", "Asset 12"); the digits are captured in group 1.
# The underscore form "asset_1" is not matched.
_ASSET_REF_RE = re.compile(r"\basset ?(\d{1,2})\b", re.IGNORECASE)
|
||||
|
||||
|
||||
def _build_asset_labels(
|
||||
reference_assets: dict[str, str],
|
||||
image_asset_uris: dict[str, str],
|
||||
video_asset_uris: dict[str, str],
|
||||
audio_asset_uris: dict[str, str],
|
||||
n_reference_images: int,
|
||||
n_reference_videos: int,
|
||||
n_reference_audios: int,
|
||||
) -> dict[int, str]:
|
||||
"""Map asset slot number (from 'asset_N' keys) to its positional label.
|
||||
|
||||
Asset entries are appended to `content` after the reference_images/videos/audios,
|
||||
so their 1-indexed labels continue from the count of existing same-type refs:
|
||||
one reference_images entry + one Image-type asset -> asset labelled "Image 2".
|
||||
"""
|
||||
image_n = n_reference_images
|
||||
video_n = n_reference_videos
|
||||
audio_n = n_reference_audios
|
||||
labels: dict[int, str] = {}
|
||||
for slot_key, raw_id in reference_assets.items():
|
||||
asset_id = (raw_id or "").strip()
|
||||
if not asset_id:
|
||||
continue
|
||||
try:
|
||||
slot_num = int(slot_key.rsplit("_", 1)[-1])
|
||||
except ValueError:
|
||||
continue
|
||||
if asset_id in image_asset_uris:
|
||||
image_n += 1
|
||||
labels[slot_num] = f"Image {image_n}"
|
||||
elif asset_id in video_asset_uris:
|
||||
video_n += 1
|
||||
labels[slot_num] = f"Video {video_n}"
|
||||
elif asset_id in audio_asset_uris:
|
||||
audio_n += 1
|
||||
labels[slot_num] = f"Audio {audio_n}"
|
||||
return labels
|
||||
|
||||
|
||||
def _rewrite_asset_refs(prompt: str, labels: dict[int, str]) -> str:
|
||||
"""Case-insensitively replace 'assetNN' (1-2 digit) tokens with their labels."""
|
||||
if not labels:
|
||||
return prompt
|
||||
|
||||
def _sub(m: "re.Match[str]") -> str:
|
||||
return labels.get(int(m.group(1)), m.group(0))
|
||||
|
||||
return _ASSET_REF_RE.sub(_sub, prompt)
|
||||
|
||||
|
||||
async def _obtain_group_id_via_h5_auth(cls: type[IO.ComfyNode]) -> str:
    """Create a visual-validate session, wait for it to complete, return its group id.

    The verification link is logged and shown as the poll's extra text.
    Raises RuntimeError when the completed session carries no group_id.
    """
    created = await sync_op(
        cls,
        ApiEndpoint(path="/proxy/seedance/visual-validate/sessions", method="POST"),
        response_model=SeedanceCreateVisualValidateSessionResponse,
    )
    verification_link = created.h5_link
    logger.warning("Seedance authentication required. Open link: %s", verification_link)

    # One attempt (and one second) short of the overall timeout budget.
    attempts = (_VERIFICATION_POLL_TIMEOUT_SEC // _VERIFICATION_POLL_INTERVAL_SEC) - 1
    outcome = await poll_op(
        cls,
        ApiEndpoint(path=f"/proxy/seedance/visual-validate/sessions/{created.session_id}"),
        response_model=SeedanceGetVisualValidateSessionResponse,
        status_extractor=lambda r: r.status,
        completed_statuses=["completed"],
        failed_statuses=["failed"],
        poll_interval=_VERIFICATION_POLL_INTERVAL_SEC,
        max_poll_attempts=attempts,
        estimated_duration=_VERIFICATION_POLL_TIMEOUT_SEC - 1,
        extra_text=f"Open this link in your browser and complete face verification:\n\n{verification_link}",
    )

    new_group_id = outcome.group_id
    if not new_group_id:
        raise RuntimeError(f"Seedance session {created.session_id} completed without a group_id")

    logger.warning("Seedance authentication complete. New GroupId: %s", new_group_id)
    PromptServer.instance.send_progress_text(
        f"Authentication complete. New GroupId: {new_group_id}", cls.hidden.unique_id
    )
    return new_group_id
|
||||
|
||||
|
||||
async def _resolve_group_id(cls: type[IO.ComfyNode], group_id: str) -> str:
|
||||
if group_id and group_id.strip():
|
||||
return group_id.strip()
|
||||
return await _obtain_group_id_via_h5_auth(cls)
|
||||
|
||||
|
||||
async def _create_seedance_asset(
    cls: type[IO.ComfyNode],
    *,
    group_id: str,
    url: str,
    name: str,
    asset_type: str,
) -> str:
    """POST a new asset into `group_id` and return the id reported in the response.

    An empty `name` is sent as None.
    """
    endpoint = ApiEndpoint(path="/proxy/seedance/assets", method="POST")
    payload = SeedanceCreateAssetRequest(
        group_id=group_id,
        url=url,
        asset_type=asset_type,
        name=name or None,
    )
    created = await sync_op(
        cls,
        endpoint,
        response_model=SeedanceCreateAssetResponse,
        data=payload,
    )
    return created.asset_id
|
||||
|
||||
|
||||
async def _wait_for_asset_active(cls: type[IO.ComfyNode], asset_id: str, group_id: str) -> GetAssetResponse:
    """Poll the asset until it reports "Active" ("Failed" counts as failure).

    `group_id` is only used in the progress text shown while waiting.
    """
    progress_text = f"Waiting for asset pre-processing...\n\nasset_id: {asset_id}\n\ngroup_id: {group_id}"
    asset_endpoint = ApiEndpoint(path=f"/proxy/seedance/assets/{asset_id}")
    return await poll_op(
        cls,
        asset_endpoint,
        response_model=GetAssetResponse,
        status_extractor=lambda r: r.status,
        completed_statuses=["Active"],
        failed_statuses=["Failed"],
        poll_interval=5,
        max_poll_attempts=1200,
        extra_text=progress_text,
    )
|
||||
|
||||
|
||||
async def _seedance_virtual_library_upload_image_asset(
    cls: type[IO.ComfyNode],
    image: torch.Tensor,
    *,
    wait_label: str = "Uploading image",
) -> str:
    """Upload `image`, register it in the Seedance virtual library, wait until
    the asset is Active, and return its "asset://<asset_id>" reference.

    The hash sent with the request is a SHA-256 over the tensor's shape, a NUL
    separator, and the raw float32 bytes of the tensor.
    """
    public_url = await upload_image_to_comfyapi(cls, image, wait_label=wait_label)

    # Hash a detached, contiguous float32 CPU copy of the tensor.
    pixels = image.detach().cpu().contiguous().to(torch.float32)
    hasher = hashlib.sha256()
    for chunk in (str(tuple(pixels.shape)).encode("utf-8"), b"\0", pixels.numpy().tobytes()):
        hasher.update(chunk)

    request = SeedanceVirtualLibraryCreateAssetRequest(url=public_url, hash=hasher.hexdigest())
    created = await sync_op(
        cls,
        ApiEndpoint(path="/proxy/seedance/virtual-library/assets", method="POST"),
        response_model=SeedanceCreateAssetResponse,
        data=request,
    )
    new_asset_id = created.asset_id
    await _wait_for_asset_active(cls, new_asset_id, group_id="virtual-library")
    return f"asset://{new_asset_id}"
|
||||
|
||||
|
||||
def _seedance2_price_extractor(model_id: str, has_video_input: bool):
|
||||
"""Returns a price_extractor closure for Seedance 2.0 poll_op."""
|
||||
rate = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input))
|
||||
@ -1228,12 +1429,27 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
|
||||
IO.Image.Input(
|
||||
"first_frame",
|
||||
tooltip="First frame image for the video.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Image.Input(
|
||||
"last_frame",
|
||||
tooltip="Last frame image for the video.",
|
||||
optional=True,
|
||||
),
|
||||
IO.String.Input(
|
||||
"first_frame_asset_id",
|
||||
default="",
|
||||
tooltip="Seedance asset_id to use as the first frame. "
|
||||
"Mutually exclusive with the first_frame image input.",
|
||||
optional=True,
|
||||
),
|
||||
IO.String.Input(
|
||||
"last_frame_asset_id",
|
||||
default="",
|
||||
tooltip="Seedance asset_id to use as the last frame. "
|
||||
"Mutually exclusive with the last_frame image input.",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"seed",
|
||||
default=0,
|
||||
@ -1286,28 +1502,62 @@ class ByteDance2FirstLastFrameNode(IO.ComfyNode):
|
||||
async def execute(
|
||||
cls,
|
||||
model: dict,
|
||||
first_frame: Input.Image,
|
||||
seed: int,
|
||||
watermark: bool,
|
||||
first_frame: Input.Image | None = None,
|
||||
last_frame: Input.Image | None = None,
|
||||
first_frame_asset_id: str = "",
|
||||
last_frame_asset_id: str = "",
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(model["prompt"], strip_whitespace=True, min_length=1)
|
||||
model_id = SEEDANCE_MODELS[model["model"]]
|
||||
|
||||
first_frame_asset_id = first_frame_asset_id.strip()
|
||||
last_frame_asset_id = last_frame_asset_id.strip()
|
||||
|
||||
if first_frame is not None and first_frame_asset_id:
|
||||
raise ValueError("Provide only one of first_frame or first_frame_asset_id, not both.")
|
||||
if first_frame is None and not first_frame_asset_id:
|
||||
raise ValueError("Either first_frame or first_frame_asset_id is required.")
|
||||
if last_frame is not None and last_frame_asset_id:
|
||||
raise ValueError("Provide only one of last_frame or last_frame_asset_id, not both.")
|
||||
|
||||
asset_ids_to_resolve = [a for a in (first_frame_asset_id, last_frame_asset_id) if a]
|
||||
image_assets: dict[str, str] = {}
|
||||
if asset_ids_to_resolve:
|
||||
image_assets, _, _ = await _resolve_reference_assets(cls, asset_ids_to_resolve)
|
||||
for aid in asset_ids_to_resolve:
|
||||
if aid not in image_assets:
|
||||
raise ValueError(f"Asset {aid} is not an Image asset.")
|
||||
|
||||
if first_frame_asset_id:
|
||||
first_frame_url = image_assets[first_frame_asset_id]
|
||||
else:
|
||||
first_frame_url = await _seedance_virtual_library_upload_image_asset(
|
||||
cls, first_frame, wait_label="Uploading first frame."
|
||||
)
|
||||
|
||||
content: list[TaskTextContent | TaskImageContent] = [
|
||||
TaskTextContent(text=model["prompt"]),
|
||||
TaskImageContent(
|
||||
image_url=TaskImageContentUrl(
|
||||
url=await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame.")
|
||||
),
|
||||
image_url=TaskImageContentUrl(url=first_frame_url),
|
||||
role="first_frame",
|
||||
),
|
||||
]
|
||||
if last_frame is not None:
|
||||
if last_frame_asset_id:
|
||||
content.append(
|
||||
TaskImageContent(
|
||||
image_url=TaskImageContentUrl(url=image_assets[last_frame_asset_id]),
|
||||
role="last_frame",
|
||||
),
|
||||
)
|
||||
elif last_frame is not None:
|
||||
content.append(
|
||||
TaskImageContent(
|
||||
image_url=TaskImageContentUrl(
|
||||
url=await upload_image_to_comfyapi(cls, last_frame, wait_label="Uploading last frame.")
|
||||
url=await _seedance_virtual_library_upload_image_asset(
|
||||
cls, last_frame, wait_label="Uploading last frame."
|
||||
)
|
||||
),
|
||||
role="last_frame",
|
||||
),
|
||||
@ -1385,6 +1635,24 @@ def _seedance2_reference_inputs(resolutions: list[str]):
|
||||
tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
|
||||
"for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
|
||||
),
|
||||
IO.Autogrow.Input(
|
||||
"reference_assets",
|
||||
template=IO.Autogrow.TemplateNames(
|
||||
IO.String.Input("reference_asset"),
|
||||
names=[
|
||||
"asset_1",
|
||||
"asset_2",
|
||||
"asset_3",
|
||||
"asset_4",
|
||||
"asset_5",
|
||||
"asset_6",
|
||||
"asset_7",
|
||||
"asset_8",
|
||||
"asset_9",
|
||||
],
|
||||
min=0,
|
||||
),
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@ -1486,24 +1754,42 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
reference_images = model.get("reference_images", {})
|
||||
reference_videos = model.get("reference_videos", {})
|
||||
reference_audios = model.get("reference_audios", {})
|
||||
reference_assets = model.get("reference_assets", {})
|
||||
|
||||
if not reference_images and not reference_videos:
|
||||
raise ValueError("At least one reference image or video is required.")
|
||||
reference_image_assets, reference_video_assets, reference_audio_assets = await _resolve_reference_assets(
|
||||
cls, list(reference_assets.values())
|
||||
)
|
||||
|
||||
if not reference_images and not reference_videos and not reference_image_assets and not reference_video_assets:
|
||||
raise ValueError("At least one reference image or video or asset is required.")
|
||||
|
||||
total_images = len(reference_images) + len(reference_image_assets)
|
||||
if total_images > 9:
|
||||
raise ValueError(
|
||||
f"Too many reference images: {total_images} "
|
||||
f"(images={len(reference_images)}, image assets={len(reference_image_assets)}). Maximum is 9."
|
||||
)
|
||||
total_videos = len(reference_videos) + len(reference_video_assets)
|
||||
if total_videos > 3:
|
||||
raise ValueError(
|
||||
f"Too many reference videos: {total_videos} "
|
||||
f"(videos={len(reference_videos)}, video assets={len(reference_video_assets)}). Maximum is 3."
|
||||
)
|
||||
total_audios = len(reference_audios) + len(reference_audio_assets)
|
||||
if total_audios > 3:
|
||||
raise ValueError(
|
||||
f"Too many reference audios: {total_audios} "
|
||||
f"(audios={len(reference_audios)}, audio assets={len(reference_audio_assets)}). Maximum is 3."
|
||||
)
|
||||
|
||||
model_id = SEEDANCE_MODELS[model["model"]]
|
||||
has_video_input = len(reference_videos) > 0
|
||||
has_video_input = total_videos > 0
|
||||
|
||||
if model.get("auto_downscale") and reference_videos:
|
||||
max_px = (
|
||||
SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {})
|
||||
.get(model["resolution"], {})
|
||||
.get("max")
|
||||
)
|
||||
max_px = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}).get(model["resolution"], {}).get("max")
|
||||
if max_px:
|
||||
for key in reference_videos:
|
||||
reference_videos[key] = resize_video_to_pixel_budget(
|
||||
reference_videos[key], max_px
|
||||
)
|
||||
reference_videos[key] = resize_video_to_pixel_budget(reference_videos[key], max_px)
|
||||
|
||||
total_video_duration = 0.0
|
||||
for i, key in enumerate(reference_videos, 1):
|
||||
@ -1531,16 +1817,27 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
if total_audio_duration > 15.1:
|
||||
raise ValueError(f"Total reference audio duration is {total_audio_duration:.1f}s. Maximum is 15.1 seconds.")
|
||||
|
||||
asset_labels = _build_asset_labels(
|
||||
reference_assets,
|
||||
reference_image_assets,
|
||||
reference_video_assets,
|
||||
reference_audio_assets,
|
||||
len(reference_images),
|
||||
len(reference_videos),
|
||||
len(reference_audios),
|
||||
)
|
||||
prompt_text = _rewrite_asset_refs(model["prompt"], asset_labels)
|
||||
|
||||
content: list[TaskTextContent | TaskImageContent | TaskVideoContent | TaskAudioContent] = [
|
||||
TaskTextContent(text=model["prompt"]),
|
||||
TaskTextContent(text=prompt_text),
|
||||
]
|
||||
for i, key in enumerate(reference_images, 1):
|
||||
content.append(
|
||||
TaskImageContent(
|
||||
image_url=TaskImageContentUrl(
|
||||
url=await upload_image_to_comfyapi(
|
||||
url=await _seedance_virtual_library_upload_image_asset(
|
||||
cls,
|
||||
image=reference_images[key],
|
||||
reference_images[key],
|
||||
wait_label=f"Uploading image {i}",
|
||||
),
|
||||
),
|
||||
@ -1573,6 +1870,21 @@ class ByteDance2ReferenceNode(IO.ComfyNode):
|
||||
),
|
||||
),
|
||||
)
|
||||
for url in reference_image_assets.values():
|
||||
content.append(
|
||||
TaskImageContent(
|
||||
image_url=TaskImageContentUrl(url=url),
|
||||
role="reference_image",
|
||||
),
|
||||
)
|
||||
for url in reference_video_assets.values():
|
||||
content.append(
|
||||
TaskVideoContent(video_url=TaskVideoContentUrl(url=url)),
|
||||
)
|
||||
for url in reference_audio_assets.values():
|
||||
content.append(
|
||||
TaskAudioContent(audio_url=TaskAudioContentUrl(url=url)),
|
||||
)
|
||||
initial_response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
|
||||
@ -1627,6 +1939,156 @@ async def process_video_task(
|
||||
return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
|
||||
|
||||
|
||||
class ByteDanceCreateImageAsset(IO.ComfyNode):
    """Node that registers an input image as a Seedance 2.0 personal image asset.

    Outputs the created ``asset_id`` together with the ``group_id`` the asset
    was stored in, so that both can be reused by later runs.
    """

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Build the node schema: one image input, an optional group id, two string outputs."""
        image_input = IO.Image.Input("image", tooltip="Image to register as a personal asset.")
        group_id_input = IO.String.Input(
            "group_id",
            default="",
            tooltip="Reuse an existing Seedance asset group ID to skip repeated human verification for the "
            "same person. Leave empty to run real-person authentication in the browser and create a new group.",
        )
        # An asset "name" input (up to 64 characters) is currently not exposed;
        # execute() always sends an empty name.
        return IO.Schema(
            node_id="ByteDanceCreateImageAsset",
            display_name="ByteDance Create Image Asset",
            category="api node/image/ByteDance",
            description=(
                "Create a Seedance 2.0 personal image asset. Uploads the input image and "
                "registers it in the given asset group. If group_id is empty, runs a real-person "
                "H5 authentication flow to create a new group before adding the asset."
            ),
            inputs=[image_input, group_id_input],
            outputs=[
                IO.String.Output(display_name="asset_id"),
                IO.String.Output(display_name="group_id"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            # NOTE(review): is_api_node=True is not set for this node - confirm whether that is intended.
        )

    @classmethod
    async def execute(
        cls,
        image: Input.Image,
        group_id: str = "",
    ) -> IO.NodeOutput:
        """Validate, upload and register ``image`` as a Seedance image asset.

        Args:
            image: Image to register. It is checked against a 300-6000 px range
                per side and a width/height ratio between 0.4:1 and 2.5:1.
            group_id: Existing asset group id; passed to ``_resolve_group_id``,
                which (per the node description) creates a new group when empty.

        Returns:
            ``IO.NodeOutput(asset_id, group_id)`` with the resolved group id.
        """
        validate_image_dimensions(image, min_width=300, max_width=6000, min_height=300, max_height=6000)
        validate_image_aspect_ratio(image, min_ratio=(0.4, 1), max_ratio=(2.5, 1))

        # Resolve the group first, then upload, so the call order matches the
        # original argument evaluation order.
        target_group = await _resolve_group_id(cls, group_id)
        uploaded_url = await upload_image_to_comfyapi(cls, image)
        new_asset_id = await _create_seedance_asset(
            cls,
            group_id=target_group,
            url=uploaded_url,
            name="",
            asset_type="Image",
        )
        await _wait_for_asset_active(cls, new_asset_id, target_group)

        # Show both ids on the node so the user can copy them for reuse.
        reminder = (
            f"Please save the asset_id and group_id for reuse.\n\nasset_id: {new_asset_id}\n\n"
            f"group_id: {target_group}"
        )
        PromptServer.instance.send_progress_text(reminder, cls.hidden.unique_id)
        return IO.NodeOutput(new_asset_id, target_group)
|
||||
|
||||
|
||||
class ByteDanceCreateVideoAsset(IO.ComfyNode):
    """Node that registers an input video as a Seedance 2.0 personal video asset.

    Outputs the created ``asset_id`` together with the ``group_id`` the asset
    was stored in, so that both can be reused by later runs.
    """

    @classmethod
    def define_schema(cls) -> IO.Schema:
        """Build the node schema: one video input, an optional group id, two string outputs."""
        video_input = IO.Video.Input("video", tooltip="Video to register as a personal asset.")
        group_id_input = IO.String.Input(
            "group_id",
            default="",
            tooltip="Reuse an existing Seedance asset group ID to skip repeated human verification for the "
            "same person. Leave empty to run real-person authentication in the browser and create a new group.",
        )
        # An asset "name" input (up to 64 characters) is currently not exposed;
        # execute() always sends an empty name.
        return IO.Schema(
            node_id="ByteDanceCreateVideoAsset",
            display_name="ByteDance Create Video Asset",
            category="api node/video/ByteDance",
            description=(
                "Create a Seedance 2.0 personal video asset. Uploads the input video and "
                "registers it in the given asset group. If group_id is empty, runs a real-person "
                "H5 authentication flow to create a new group before adding the asset."
            ),
            inputs=[video_input, group_id_input],
            outputs=[
                IO.String.Output(display_name="asset_id"),
                IO.String.Output(display_name="group_id"),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            # NOTE(review): is_api_node=True is not set for this node - confirm whether that is intended.
        )

    @classmethod
    async def execute(
        cls,
        video: Input.Video,
        group_id: str = "",
    ) -> IO.NodeOutput:
        """Validate, upload and register ``video`` as a Seedance video asset.

        Args:
            video: Video to register. Checked for duration (2-15), per-side
                size (300-6000), W/H ratio (0.4-2.5), total pixels
                (409600-927408) and frame rate (24-60).
            group_id: Existing asset group id; passed to ``_resolve_group_id``,
                which (per the node description) creates a new group when empty.

        Returns:
            ``IO.NodeOutput(asset_id, group_id)`` with the resolved group id.

        Raises:
            ValueError: If the aspect ratio, pixel count or FPS is out of range.
        """
        validate_video_duration(video, min_duration=2, max_duration=15)
        validate_video_dimensions(video, min_width=300, max_width=6000, min_height=300, max_height=6000)

        width, height = video.get_dimensions()
        if height > 0:  # avoid dividing by zero when computing the ratio
            aspect = width / height
            if aspect < 0.4 or aspect > 2.5:
                raise ValueError(
                    f"Asset video aspect ratio (W/H) must be in [0.4, 2.5], got {aspect:.3f} ({width}x{height})."
                )
        pixel_count = width * height
        if pixel_count < 409_600 or pixel_count > 927_408:
            raise ValueError(
                f"Asset video total pixels (W×H) must be in [409600, 927408], got {pixel_count:,} ({width}x{height})."
            )

        frame_rate = float(video.get_frame_rate())
        if not 24 <= frame_rate <= 60:
            raise ValueError(f"Asset video FPS must be in [24, 60], got {frame_rate:.2f}.")

        # Resolve the group first, then upload, so the call order matches the
        # original argument evaluation order.
        target_group = await _resolve_group_id(cls, group_id)
        uploaded_url = await upload_video_to_comfyapi(cls, video)
        new_asset_id = await _create_seedance_asset(
            cls,
            group_id=target_group,
            url=uploaded_url,
            name="",
            asset_type="Video",
        )
        await _wait_for_asset_active(cls, new_asset_id, target_group)

        # Show both ids on the node so the user can copy them for reuse.
        reminder = (
            f"Please save the asset_id and group_id for reuse.\n\nasset_id: {new_asset_id}\n\n"
            f"group_id: {target_group}"
        )
        PromptServer.instance.send_progress_text(reminder, cls.hidden.unique_id)
        return IO.NodeOutput(new_asset_id, target_group)
|
||||
|
||||
|
||||
class ByteDanceExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
@ -1640,6 +2102,8 @@ class ByteDanceExtension(ComfyExtension):
|
||||
ByteDance2TextToVideoNode,
|
||||
ByteDance2FirstLastFrameNode,
|
||||
ByteDance2ReferenceNode,
|
||||
ByteDanceCreateImageAsset,
|
||||
ByteDanceCreateVideoAsset,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -276,6 +276,7 @@ async def finish_omni_video_task(cls: type[IO.ComfyNode], response: TaskStatusRe
|
||||
cls,
|
||||
ApiEndpoint(path=f"/proxy/kling/v1/videos/omni-video/{response.data.task_id}"),
|
||||
response_model=TaskStatusResponse,
|
||||
max_poll_attempts=280,
|
||||
status_extractor=lambda r: (r.data.task_status if r.data else None),
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
|
||||
@ -862,7 +863,7 @@ class OmniProTextToVideoNode(IO.ComfyNode):
|
||||
),
|
||||
IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
|
||||
IO.Int.Input("duration", default=5, min=3, max=15, display_mode=IO.NumberDisplay.slider),
|
||||
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
|
||||
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
|
||||
IO.DynamicCombo.Input(
|
||||
"storyboards",
|
||||
options=[
|
||||
@ -904,12 +905,13 @@ class OmniProTextToVideoNode(IO.ComfyNode):
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
|
||||
expr="""
|
||||
(
|
||||
$mode := (widgets.resolution = "720p") ? "std" : "pro";
|
||||
$res := widgets.resolution;
|
||||
$mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
|
||||
$isV3 := $contains(widgets.model_name, "v3");
|
||||
$audio := $isV3 and widgets.generate_audio;
|
||||
$rates := $audio
|
||||
? {"std": 0.112, "pro": 0.14}
|
||||
: {"std": 0.084, "pro": 0.112};
|
||||
? {"std": 0.112, "pro": 0.14, "4k": 0.42}
|
||||
: {"std": 0.084, "pro": 0.112, "4k": 0.42};
|
||||
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
|
||||
)
|
||||
""",
|
||||
@ -934,6 +936,8 @@ class OmniProTextToVideoNode(IO.ComfyNode):
|
||||
raise ValueError("kling-video-o1 only supports durations of 5 or 10 seconds.")
|
||||
if generate_audio:
|
||||
raise ValueError("kling-video-o1 does not support audio generation.")
|
||||
if resolution == "4k":
|
||||
raise ValueError("kling-video-o1 does not support 4k resolution.")
|
||||
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
|
||||
if stories_enabled and model_name == "kling-video-o1":
|
||||
raise ValueError("kling-video-o1 does not support storyboards.")
|
||||
@ -963,6 +967,12 @@ class OmniProTextToVideoNode(IO.ComfyNode):
|
||||
f"must equal the global duration ({duration}s)."
|
||||
)
|
||||
|
||||
if resolution == "4k":
|
||||
mode = "4k"
|
||||
elif resolution == "1080p":
|
||||
mode = "pro"
|
||||
else:
|
||||
mode = "std"
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
|
||||
@ -972,7 +982,7 @@ class OmniProTextToVideoNode(IO.ComfyNode):
|
||||
prompt=prompt,
|
||||
aspect_ratio=aspect_ratio,
|
||||
duration=str(duration),
|
||||
mode="pro" if resolution == "1080p" else "std",
|
||||
mode=mode,
|
||||
multi_shot=multi_shot,
|
||||
multi_prompt=multi_prompt_list,
|
||||
shot_type="customize" if multi_shot else None,
|
||||
@ -1014,7 +1024,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
|
||||
optional=True,
|
||||
tooltip="Up to 6 additional reference images.",
|
||||
),
|
||||
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
|
||||
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
|
||||
IO.DynamicCombo.Input(
|
||||
"storyboards",
|
||||
options=[
|
||||
@ -1061,12 +1071,13 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
|
||||
expr="""
|
||||
(
|
||||
$mode := (widgets.resolution = "720p") ? "std" : "pro";
|
||||
$res := widgets.resolution;
|
||||
$mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
|
||||
$isV3 := $contains(widgets.model_name, "v3");
|
||||
$audio := $isV3 and widgets.generate_audio;
|
||||
$rates := $audio
|
||||
? {"std": 0.112, "pro": 0.14}
|
||||
: {"std": 0.084, "pro": 0.112};
|
||||
? {"std": 0.112, "pro": 0.14, "4k": 0.42}
|
||||
: {"std": 0.084, "pro": 0.112, "4k": 0.42};
|
||||
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
|
||||
)
|
||||
""",
|
||||
@ -1093,6 +1104,8 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
|
||||
raise ValueError("kling-video-o1 does not support durations greater than 10 seconds.")
|
||||
if generate_audio:
|
||||
raise ValueError("kling-video-o1 does not support audio generation.")
|
||||
if resolution == "4k":
|
||||
raise ValueError("kling-video-o1 does not support 4k resolution.")
|
||||
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
|
||||
if stories_enabled and model_name == "kling-video-o1":
|
||||
raise ValueError("kling-video-o1 does not support storyboards.")
|
||||
@ -1161,6 +1174,12 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
|
||||
validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
|
||||
for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference frame(s)"):
|
||||
image_list.append(OmniParamImage(image_url=i))
|
||||
if resolution == "4k":
|
||||
mode = "4k"
|
||||
elif resolution == "1080p":
|
||||
mode = "pro"
|
||||
else:
|
||||
mode = "std"
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
|
||||
@ -1170,7 +1189,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
|
||||
prompt=prompt,
|
||||
duration=str(duration),
|
||||
image_list=image_list,
|
||||
mode="pro" if resolution == "1080p" else "std",
|
||||
mode=mode,
|
||||
sound="on" if generate_audio else "off",
|
||||
multi_shot=multi_shot,
|
||||
multi_prompt=multi_prompt_list,
|
||||
@ -1204,7 +1223,7 @@ class OmniProImageToVideoNode(IO.ComfyNode):
|
||||
"reference_images",
|
||||
tooltip="Up to 7 reference images.",
|
||||
),
|
||||
IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
|
||||
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
|
||||
IO.DynamicCombo.Input(
|
||||
"storyboards",
|
||||
options=[
|
||||
@ -1251,12 +1270,13 @@ class OmniProImageToVideoNode(IO.ComfyNode):
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
|
||||
expr="""
|
||||
(
|
||||
$mode := (widgets.resolution = "720p") ? "std" : "pro";
|
||||
$res := widgets.resolution;
|
||||
$mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
|
||||
$isV3 := $contains(widgets.model_name, "v3");
|
||||
$audio := $isV3 and widgets.generate_audio;
|
||||
$rates := $audio
|
||||
? {"std": 0.112, "pro": 0.14}
|
||||
: {"std": 0.084, "pro": 0.112};
|
||||
? {"std": 0.112, "pro": 0.14, "4k": 0.42}
|
||||
: {"std": 0.084, "pro": 0.112, "4k": 0.42};
|
||||
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
|
||||
)
|
||||
""",
|
||||
@ -1282,6 +1302,8 @@ class OmniProImageToVideoNode(IO.ComfyNode):
|
||||
raise ValueError("kling-video-o1 does not support durations greater than 10 seconds.")
|
||||
if generate_audio:
|
||||
raise ValueError("kling-video-o1 does not support audio generation.")
|
||||
if resolution == "4k":
|
||||
raise ValueError("kling-video-o1 does not support 4k resolution.")
|
||||
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
|
||||
if stories_enabled and model_name == "kling-video-o1":
|
||||
raise ValueError("kling-video-o1 does not support storyboards.")
|
||||
@ -1320,6 +1342,12 @@ class OmniProImageToVideoNode(IO.ComfyNode):
|
||||
image_list: list[OmniParamImage] = []
|
||||
for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"):
|
||||
image_list.append(OmniParamImage(image_url=i))
|
||||
if resolution == "4k":
|
||||
mode = "4k"
|
||||
elif resolution == "1080p":
|
||||
mode = "pro"
|
||||
else:
|
||||
mode = "std"
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
|
||||
@ -1330,7 +1358,7 @@ class OmniProImageToVideoNode(IO.ComfyNode):
|
||||
aspect_ratio=aspect_ratio,
|
||||
duration=str(duration),
|
||||
image_list=image_list,
|
||||
mode="pro" if resolution == "1080p" else "std",
|
||||
mode=mode,
|
||||
sound="on" if generate_audio else "off",
|
||||
multi_shot=multi_shot,
|
||||
multi_prompt=multi_prompt_list,
|
||||
@ -2860,7 +2888,7 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
IO.DynamicCombo.Option(
|
||||
"kling-v3",
|
||||
[
|
||||
IO.Combo.Input("resolution", options=["1080p", "720p"]),
|
||||
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p"),
|
||||
IO.Combo.Input(
|
||||
"aspect_ratio",
|
||||
options=["16:9", "9:16", "1:1"],
|
||||
@ -2913,7 +2941,11 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$rates := {"1080p": {"off": 0.112, "on": 0.168}, "720p": {"off": 0.084, "on": 0.126}};
|
||||
$rates := {
|
||||
"4k": {"off": 0.42, "on": 0.42},
|
||||
"1080p": {"off": 0.112, "on": 0.168},
|
||||
"720p": {"off": 0.084, "on": 0.126}
|
||||
};
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$audio := widgets.generate_audio ? "on" : "off";
|
||||
$rate := $lookup($lookup($rates, $res), $audio);
|
||||
@ -2943,7 +2975,12 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
start_frame: Input.Image | None = None,
|
||||
) -> IO.NodeOutput:
|
||||
_ = seed
|
||||
mode = "pro" if model["resolution"] == "1080p" else "std"
|
||||
if model["resolution"] == "4k":
|
||||
mode = "4k"
|
||||
elif model["resolution"] == "1080p":
|
||||
mode = "pro"
|
||||
else:
|
||||
mode = "std"
|
||||
custom_multi_shot = False
|
||||
if multi_shot["multi_shot"] == "disabled":
|
||||
shot_type = None
|
||||
@ -3025,6 +3062,7 @@ class KlingVideoNode(IO.ComfyNode):
|
||||
cls,
|
||||
ApiEndpoint(path=poll_path),
|
||||
response_model=TaskStatusResponse,
|
||||
max_poll_attempts=280,
|
||||
status_extractor=lambda r: (r.data.task_status if r.data else None),
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
|
||||
@ -3057,7 +3095,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
IO.DynamicCombo.Option(
|
||||
"kling-v3",
|
||||
[
|
||||
IO.Combo.Input("resolution", options=["1080p", "720p"]),
|
||||
IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p"),
|
||||
],
|
||||
),
|
||||
],
|
||||
@ -3089,7 +3127,11 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
),
|
||||
expr="""
|
||||
(
|
||||
$rates := {"1080p": {"off": 0.112, "on": 0.168}, "720p": {"off": 0.084, "on": 0.126}};
|
||||
$rates := {
|
||||
"4k": {"off": 0.42, "on": 0.42},
|
||||
"1080p": {"off": 0.112, "on": 0.168},
|
||||
"720p": {"off": 0.084, "on": 0.126}
|
||||
};
|
||||
$res := $lookup(widgets, "model.resolution");
|
||||
$audio := widgets.generate_audio ? "on" : "off";
|
||||
$rate := $lookup($lookup($rates, $res), $audio);
|
||||
@ -3118,6 +3160,12 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
validate_image_aspect_ratio(end_frame, (1, 2.5), (2.5, 1))
|
||||
image_url = await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame")
|
||||
image_tail_url = await upload_image_to_comfyapi(cls, end_frame, wait_label="Uploading end frame")
|
||||
if model["resolution"] == "4k":
|
||||
mode = "4k"
|
||||
elif model["resolution"] == "1080p":
|
||||
mode = "pro"
|
||||
else:
|
||||
mode = "std"
|
||||
response = await sync_op(
|
||||
cls,
|
||||
ApiEndpoint(path="/proxy/kling/v1/videos/image2video", method="POST"),
|
||||
@ -3127,7 +3175,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
image=image_url,
|
||||
image_tail=image_tail_url,
|
||||
prompt=prompt,
|
||||
mode="pro" if model["resolution"] == "1080p" else "std",
|
||||
mode=mode,
|
||||
duration=str(duration),
|
||||
sound="on" if generate_audio else "off",
|
||||
),
|
||||
@ -3140,6 +3188,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
|
||||
cls,
|
||||
ApiEndpoint(path=f"/proxy/kling/v1/videos/image2video/{response.data.task_id}"),
|
||||
response_model=TaskStatusResponse,
|
||||
max_poll_attempts=280,
|
||||
status_extractor=lambda r: (r.data.task_status if r.data else None),
|
||||
)
|
||||
return IO.NodeOutput(await download_url_to_video_output(final_response.data.task_result.videos[0].url))
|
||||
|
||||
@ -357,6 +357,10 @@ def calculate_tokens_price_image_1_5(response: OpenAIImageGenerationResponse) ->
|
||||
return ((response.usage.input_tokens * 8.0) + (response.usage.output_tokens * 32.0)) / 1_000_000.0
|
||||
|
||||
|
||||
def calculate_tokens_price_image_2_0(response: OpenAIImageGenerationResponse) -> float | None:
    """Compute the price of a GPT Image 2 response from its token usage.

    The price is ``input_tokens * 8.0 + output_tokens * 30.0`` divided by
    one million.

    Args:
        response: Image generation response carrying a ``usage`` object with
            ``input_tokens`` and ``output_tokens``.

    Returns:
        The computed price, or ``None`` when the response carries no usage
        data (the declared return type already allows ``None``).
    """
    usage = response.usage
    if usage is None:
        # Without token counts no price can be derived; report "unknown"
        # instead of failing with AttributeError.
        return None
    return ((usage.input_tokens * 8.0) + (usage.output_tokens * 30.0)) / 1_000_000.0
|
||||
|
||||
|
||||
class OpenAIGPTImage1(IO.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
@ -401,8 +405,19 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
IO.Combo.Input(
|
||||
"size",
|
||||
default="auto",
|
||||
options=["auto", "1024x1024", "1024x1536", "1536x1024"],
|
||||
tooltip="Image size",
|
||||
options=[
|
||||
"auto",
|
||||
"1024x1024",
|
||||
"1024x1536",
|
||||
"1536x1024",
|
||||
"2048x2048",
|
||||
"2048x1152",
|
||||
"1152x2048",
|
||||
"3840x2160",
|
||||
"2160x3840",
|
||||
"Custom",
|
||||
],
|
||||
tooltip="Image size. Select 'Custom' to use the custom width and height (GPT Image 2 only).",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
@ -427,10 +442,28 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
),
|
||||
IO.Combo.Input(
|
||||
"model",
|
||||
options=["gpt-image-1", "gpt-image-1.5", 'gpt-image-2'],
|
||||
options=["gpt-image-1", "gpt-image-1.5", "gpt-image-2"],
|
||||
default="gpt-image-2",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_width",
|
||||
default=1024,
|
||||
min=1024,
|
||||
max=3840,
|
||||
step=16,
|
||||
tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16 (GPT Image 2 only).",
|
||||
optional=True,
|
||||
),
|
||||
IO.Int.Input(
|
||||
"custom_height",
|
||||
default=1024,
|
||||
min=1024,
|
||||
max=3840,
|
||||
step=16,
|
||||
tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16 (GPT Image 2 only).",
|
||||
optional=True,
|
||||
),
|
||||
],
|
||||
outputs=[
|
||||
IO.Image.Output(),
|
||||
@ -442,23 +475,36 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
],
|
||||
is_api_node=True,
|
||||
price_badge=IO.PriceBadge(
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["quality", "n"]),
|
||||
depends_on=IO.PriceBadgeDepends(widgets=["quality", "n", "model"]),
|
||||
expr="""
|
||||
(
|
||||
$ranges := {
|
||||
"low": [0.011, 0.02],
|
||||
"medium": [0.046, 0.07],
|
||||
"high": [0.167, 0.3]
|
||||
"gpt-image-1": {
|
||||
"low": [0.011, 0.02],
|
||||
"medium": [0.042, 0.07],
|
||||
"high": [0.167, 0.25]
|
||||
},
|
||||
"gpt-image-1.5": {
|
||||
"low": [0.009, 0.02],
|
||||
"medium": [0.034, 0.062],
|
||||
"high": [0.133, 0.22]
|
||||
},
|
||||
"gpt-image-2": {
|
||||
"low": [0.0048, 0.019],
|
||||
"medium": [0.041, 0.168],
|
||||
"high": [0.165, 0.67]
|
||||
}
|
||||
};
|
||||
$range := $lookup($ranges, widgets.quality);
|
||||
$n := widgets.n;
|
||||
$range := $lookup($lookup($ranges, widgets.model), widgets.quality);
|
||||
$nRaw := widgets.n;
|
||||
$n := ($nRaw != null and $nRaw != 0) ? $nRaw : 1;
|
||||
($n = 1)
|
||||
? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1]}
|
||||
? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1], "format": {"approximate": true}}
|
||||
: {
|
||||
"type":"range_usd",
|
||||
"min_usd": $range[0],
|
||||
"max_usd": $range[1],
|
||||
"format": { "suffix": " x " & $string($n) & "/Run" }
|
||||
"min_usd": $range[0] * $n,
|
||||
"max_usd": $range[1] * $n,
|
||||
"format": { "suffix": "/Run", "approximate": true }
|
||||
}
|
||||
)
|
||||
""",
|
||||
@ -476,6 +522,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
mask: Input.Image | None = None,
|
||||
n: int = 1,
|
||||
size: str = "1024x1024",
|
||||
custom_width: int = 1024,
|
||||
custom_height: int = 1024,
|
||||
model: str = "gpt-image-1",
|
||||
) -> IO.NodeOutput:
|
||||
validate_string(prompt, strip_whitespace=False)
|
||||
@ -483,12 +531,36 @@ class OpenAIGPTImage1(IO.ComfyNode):
|
||||
if mask is not None and image is None:
|
||||
raise ValueError("Cannot use a mask without an input image")
|
||||
|
||||
if size == "Custom":
|
||||
if model != "gpt-image-2":
|
||||
raise ValueError("Custom resolution is only supported by GPT Image 2 model")
|
||||
if custom_width % 16 != 0 or custom_height % 16 != 0:
|
||||
raise ValueError(f"Custom width and height must be multiples of 16, got {custom_width}x{custom_height}")
|
||||
if max(custom_width, custom_height) > 3840:
|
||||
raise ValueError(f"Custom resolution max edge must be <= 3840, got {custom_width}x{custom_height}")
|
||||
ratio = max(custom_width, custom_height) / min(custom_width, custom_height)
|
||||
if ratio > 3:
|
||||
raise ValueError(
|
||||
f"Custom resolution aspect ratio must not exceed 3:1, got {custom_width}x{custom_height}"
|
||||
)
|
||||
total_pixels = custom_width * custom_height
|
||||
if not 655_360 <= total_pixels <= 8_294_400:
|
||||
raise ValueError(
|
||||
f"Custom resolution total pixels must be between 655,360 and 8,294,400, got {total_pixels}"
|
||||
)
|
||||
size = f"{custom_width}x{custom_height}"
|
||||
elif model in ("gpt-image-1", "gpt-image-1.5"):
|
||||
if size not in ("auto", "1024x1024", "1024x1536", "1536x1024"):
|
||||
raise ValueError(f"Resolution {size} is only supported by GPT Image 2 model")
|
||||
|
||||
if model == "gpt-image-1":
|
||||
price_extractor = calculate_tokens_price_image_1
|
||||
elif model == "gpt-image-1.5":
|
||||
price_extractor = calculate_tokens_price_image_1_5
|
||||
elif model == "gpt-image-2":
|
||||
price_extractor = calculate_tokens_price_image_1_5
|
||||
price_extractor = calculate_tokens_price_image_2_0
|
||||
if background == "transparent":
|
||||
raise ValueError("Transparent background is not supported for GPT Image 2 model")
|
||||
else:
|
||||
raise ValueError(f"Unknown model: {model}")
|
||||
|
||||
|
||||
@ -33,9 +33,13 @@ class OpenAIVideoSora2(IO.ComfyNode):
|
||||
def define_schema(cls):
|
||||
return IO.Schema(
|
||||
node_id="OpenAIVideoSora2",
|
||||
display_name="OpenAI Sora - Video",
|
||||
display_name="OpenAI Sora - Video (Deprecated)",
|
||||
category="api node/video/Sora",
|
||||
description="OpenAI video and audio generation.",
|
||||
description=(
|
||||
"OpenAI video and audio generation.\n\n"
|
||||
"DEPRECATION NOTICE: OpenAI will stop serving the Sora v2 API in September 2026. "
|
||||
"This node will be removed from ComfyUI at that time."
|
||||
),
|
||||
inputs=[
|
||||
IO.Combo.Input(
|
||||
"model",
|
||||
|
||||
@ -1646,6 +1646,557 @@ class Wan2ReferenceVideoApi(IO.ComfyNode):
|
||||
return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))
|
||||
|
||||
|
||||
class HappyHorseTextToVideoApi(IO.ComfyNode):
    """API node that generates a video from a text prompt with the HappyHorse model.

    The task is created through the Wan proxy video-synthesis endpoint and then
    polled via ``poll_op`` until a result is available.
    """

    @classmethod
    def define_schema(cls):
        """Describe the node: model widgets, seed/watermark inputs, output and price badge."""
        # Widgets nested under the single selectable model option.
        t2v_widgets = [
            IO.String.Input(
                "prompt",
                multiline=True,
                default="",
                tooltip="Prompt describing the elements and visual features. "
                "Supports English and Chinese.",
            ),
            IO.Combo.Input(
                "resolution",
                options=["720P", "1080P"],
            ),
            IO.Combo.Input(
                "ratio",
                options=["16:9", "9:16", "1:1", "4:3", "3:4"],
            ),
            IO.Int.Input(
                "duration",
                default=5,
                min=3,
                max=15,
                step=1,
                display_mode=IO.NumberDisplay.number,
            ),
        ]
        model_input = IO.DynamicCombo.Input(
            "model",
            options=[
                IO.DynamicCombo.Option("happyhorse-1.0-t2v", t2v_widgets),
            ],
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=2147483647,
            step=1,
            display_mode=IO.NumberDisplay.number,
            control_after_generate=True,
            tooltip="Seed to use for generation.",
        )
        watermark_input = IO.Boolean.Input(
            "watermark",
            default=False,
            tooltip="Whether to add an AI-generated watermark to the result.",
            advanced=True,
        )
        # Price = per-resolution rate multiplied by the duration widget.
        # NOTE(review): the rate table uses lower-case keys while the combo
        # options are upper-case; presumably widget values are normalised
        # before the lookup - confirm.
        badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
            expr="""
            (
                $res := $lookup(widgets, "model.resolution");
                $dur := $lookup(widgets, "model.duration");
                $ppsTable := { "720p": 0.14, "1080p": 0.24 };
                $pps := $lookup($ppsTable, $res);
                { "type": "usd", "usd": $pps * $dur }
            )
            """,
        )
        return IO.Schema(
            node_id="HappyHorseTextToVideoApi",
            display_name="HappyHorse Text to Video",
            category="api node/video/Wan",
            description="Generates a video based on a text prompt using the HappyHorse model.",
            inputs=[model_input, seed_input, watermark_input],
            outputs=[
                IO.Video.Output(),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=badge,
        )

    @classmethod
    async def execute(
        cls,
        model: dict,
        seed: int,
        watermark: bool,
    ):
        """Create a text-to-video task, wait for it and return the video.

        Args:
            model: Selected model option; read keys are ``model``, ``prompt``,
                ``resolution``, ``ratio`` and ``duration``.
            seed: Seed forwarded to the request parameters.
            watermark: Whether the service should watermark the result.

        Returns:
            ``IO.NodeOutput`` wrapping the downloaded video.

        Raises:
            Exception: If the task-creation response has no ``output``.
        """
        validate_string(model["prompt"], strip_whitespace=False, min_length=1)

        create_endpoint = ApiEndpoint(
            path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
            method="POST",
        )
        payload = Wan27Text2VideoTaskCreationRequest(
            model=model["model"],
            input=Text2VideoInputField(
                prompt=model["prompt"],
                negative_prompt=None,
            ),
            parameters=Wan27Text2VideoParametersField(
                resolution=model["resolution"],
                ratio=model["ratio"],
                duration=model["duration"],
                seed=seed,
                watermark=watermark,
            ),
        )
        created = await sync_op(
            cls,
            create_endpoint,
            response_model=TaskCreationResponse,
            data=payload,
        )
        if not created.output:
            raise Exception(f"An unknown error occurred: {created.code} - {created.message}")

        finished = await poll_op(
            cls,
            ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{created.output.task_id}"),
            response_model=VideoTaskStatusResponse,
            status_extractor=lambda status: status.output.task_status,
            poll_interval=7,
        )
        video = await download_url_to_video_output(finished.output.video_url)
        return IO.NodeOutput(video)
|
||||
|
||||
|
||||
class HappyHorseImageToVideoApi(IO.ComfyNode):
    """API node that generates a video from a first-frame image with the HappyHorse model.

    The task is created through the Wan proxy video-synthesis endpoint and then
    polled via ``poll_op`` until a result is available.
    """

    @classmethod
    def define_schema(cls):
        """Describe the node: model widgets, first frame, seed/watermark inputs and price badge."""
        # Widgets nested under the single selectable model option.
        i2v_widgets = [
            IO.String.Input(
                "prompt",
                multiline=True,
                default="",
                tooltip="Prompt describing the elements and visual features. "
                "Supports English and Chinese.",
            ),
            IO.Combo.Input(
                "resolution",
                options=["720P", "1080P"],
            ),
            IO.Int.Input(
                "duration",
                default=5,
                min=3,
                max=15,
                step=1,
                display_mode=IO.NumberDisplay.number,
            ),
        ]
        model_input = IO.DynamicCombo.Input(
            "model",
            options=[
                IO.DynamicCombo.Option("happyhorse-1.0-i2v", i2v_widgets),
            ],
        )
        first_frame_input = IO.Image.Input(
            "first_frame",
            tooltip="First frame image. The output aspect ratio is derived from this image.",
        )
        seed_input = IO.Int.Input(
            "seed",
            default=0,
            min=0,
            max=2147483647,
            step=1,
            display_mode=IO.NumberDisplay.number,
            control_after_generate=True,
            tooltip="Seed to use for generation.",
        )
        watermark_input = IO.Boolean.Input(
            "watermark",
            default=False,
            tooltip="Whether to add an AI-generated watermark to the result.",
            advanced=True,
        )
        # Price = per-resolution rate multiplied by the duration widget.
        # NOTE(review): the rate table uses lower-case keys while the combo
        # options are upper-case; presumably widget values are normalised
        # before the lookup - confirm.
        badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
            expr="""
            (
                $res := $lookup(widgets, "model.resolution");
                $dur := $lookup(widgets, "model.duration");
                $ppsTable := { "720p": 0.14, "1080p": 0.24 };
                $pps := $lookup($ppsTable, $res);
                { "type": "usd", "usd": $pps * $dur }
            )
            """,
        )
        return IO.Schema(
            node_id="HappyHorseImageToVideoApi",
            display_name="HappyHorse Image to Video",
            category="api node/video/Wan",
            description="Generate a video from a first-frame image using the HappyHorse model.",
            inputs=[model_input, first_frame_input, seed_input, watermark_input],
            outputs=[
                IO.Video.Output(),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=badge,
        )

    @classmethod
    async def execute(
        cls,
        model: dict,
        first_frame: Input.Image,
        seed: int,
        watermark: bool,
    ):
        """Upload the first frame, create an image-to-video task and return the video.

        Args:
            model: Selected model option; read keys are ``model``, ``prompt``,
                ``resolution`` and ``duration``.
            first_frame: Image uploaded and sent as the ``first_frame`` media item.
            seed: Seed forwarded to the request parameters.
            watermark: Whether the service should watermark the result.

        Returns:
            ``IO.NodeOutput`` wrapping the downloaded video.

        Raises:
            Exception: If the task-creation response has no ``output``.
        """
        first_frame_url = await upload_image_to_comfyapi(cls, image=first_frame)
        media = [Wan27MediaItem(type="first_frame", url=first_frame_url)]

        create_endpoint = ApiEndpoint(
            path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
            method="POST",
        )
        payload = Wan27ImageToVideoTaskCreationRequest(
            model=model["model"],
            input=Wan27ImageToVideoInputField(
                # An empty prompt is sent as None rather than as "".
                prompt=model["prompt"] or None,
                negative_prompt=None,
                media=media,
            ),
            parameters=Wan27ImageToVideoParametersField(
                resolution=model["resolution"],
                duration=model["duration"],
                seed=seed,
                watermark=watermark,
            ),
        )
        created = await sync_op(
            cls,
            create_endpoint,
            response_model=TaskCreationResponse,
            data=payload,
        )
        if not created.output:
            raise Exception(f"An unknown error occurred: {created.code} - {created.message}")

        finished = await poll_op(
            cls,
            ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{created.output.task_id}"),
            response_model=VideoTaskStatusResponse,
            status_extractor=lambda status: status.output.task_status,
            poll_interval=7,
        )
        video = await download_url_to_video_output(finished.output.video_url)
        return IO.NodeOutput(video)
|
||||
|
||||
|
||||
class HappyHorseVideoEditApi(IO.ComfyNode):
    """API node that edits an input video with the HappyHorse video-edit model."""

    @classmethod
    def define_schema(cls):
        """Describe the node: model options, video/seed/watermark inputs and price badge."""
        # Widgets nested under the single "happyhorse-1.0-video-edit" model option.
        model_widgets = [
            IO.String.Input(
                "prompt",
                multiline=True,
                default="",
                tooltip="Editing instructions or style transfer requirements.",
            ),
            IO.Combo.Input("resolution", options=["720P", "1080P"]),
            IO.Combo.Input(
                "ratio",
                options=["16:9", "9:16", "1:1", "4:3", "3:4"],
                tooltip="Aspect ratio. If not changed, approximates the input video ratio.",
            ),
            IO.Autogrow.Input(
                "reference_images",
                template=IO.Autogrow.TemplateNames(
                    IO.Image.Input("reference_image"),
                    names=[f"image{index}" for index in range(1, 6)],  # image1 .. image5
                    min=0,
                ),
            ),
        ]
        # The badge shows a per-second rate keyed on resolution; no duration widget is read.
        price_badge = IO.PriceBadge(
            depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution"]),
            expr="""
            (
              $res := $lookup(widgets, "model.resolution");
              $ppsTable := { "720p": 0.14, "1080p": 0.24 };
              $pps := $lookup($ppsTable, $res);
              { "type": "usd", "usd": $pps, "format": { "suffix": "/second" } }
            )
            """,
        )
        return IO.Schema(
            node_id="HappyHorseVideoEditApi",
            display_name="HappyHorse Video Edit",
            category="api node/video/Wan",
            description="Edit a video using text instructions or reference images with the HappyHorse model. "
            "Output duration is 3-15s and matches the input video; inputs longer than 15s are truncated.",
            inputs=[
                IO.DynamicCombo.Input(
                    "model",
                    options=[IO.DynamicCombo.Option("happyhorse-1.0-video-edit", model_widgets)],
                ),
                IO.Video.Input("video", tooltip="The video to edit."),
                IO.Int.Input(
                    "seed",
                    default=0,
                    min=0,
                    max=2147483647,
                    step=1,
                    display_mode=IO.NumberDisplay.number,
                    control_after_generate=True,
                    tooltip="Seed to use for generation.",
                ),
                IO.Boolean.Input(
                    "watermark",
                    default=False,
                    tooltip="Whether to add an AI-generated watermark to the result.",
                    advanced=True,
                ),
            ],
            outputs=[IO.Video.Output()],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            price_badge=price_badge,
        )

    @classmethod
    async def execute(
        cls,
        model: dict,
        video: Input.Video,
        seed: int,
        watermark: bool,
    ):
        """Upload the video and optional reference images, then run the edit task.

        Args:
            model: Values of the dynamic "model" combo; read for "model", "prompt",
                "resolution", "ratio" and the optional "reference_images" mapping.
            video: Video to edit; its duration is validated against 3-60 seconds.
            seed: Generation seed forwarded to the API.
            watermark: Whether the API should watermark the result.

        Returns:
            IO.NodeOutput wrapping the video downloaded from the finished task.

        Raises:
            Exception: If the task-creation response carries no ``output``.
        """
        prompt = model["prompt"]
        validate_string(prompt, strip_whitespace=False, min_length=1)
        validate_video_duration(video, min_duration=3, max_duration=60)

        # The source video is always the first media item; reference images follow.
        media = [Wan27MediaItem(type="video", url=await upload_video_to_comfyapi(cls, video))]
        reference_images = model.get("reference_images", {})
        for slot in reference_images:
            image_url = await upload_image_to_comfyapi(cls, image=reference_images[slot])
            media.append(Wan27MediaItem(type="reference_image", url=image_url))

        create_endpoint = ApiEndpoint(
            path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
            method="POST",
        )
        request = Wan27VideoEditTaskCreationRequest(
            model=model["model"],
            input=Wan27VideoEditInputField(prompt=prompt, media=media),
            parameters=Wan27VideoEditParametersField(
                resolution=model["resolution"],
                ratio=model["ratio"],
                duration=None,
                watermark=watermark,
                seed=seed,
            ),
        )
        initial_response = await sync_op(
            cls,
            create_endpoint,
            response_model=TaskCreationResponse,
            data=request,
        )
        if not initial_response.output:
            raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}")

        status_endpoint = ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}")
        response = await poll_op(
            cls,
            status_endpoint,
            response_model=VideoTaskStatusResponse,
            status_extractor=lambda x: x.output.task_status,
            poll_interval=7,
        )
        edited_video = await download_url_to_video_output(response.output.video_url)
        return IO.NodeOutput(edited_video)
|
||||
|
||||
|
||||
class HappyHorseReferenceVideoApi(IO.ComfyNode):
    """API node that generates a video from reference images with the HappyHorse r2v model."""

    @classmethod
    def define_schema(cls):
        """Describe the node: model options, seed/watermark inputs and price badge."""
        return IO.Schema(
            node_id="HappyHorseReferenceVideoApi",
            display_name="HappyHorse Reference to Video",
            category="api node/video/Wan",
            description="Generate a video featuring a person or object from reference materials with the HappyHorse "
            "model. Supports single-character performances and multi-character interactions.",
            inputs=[
                IO.DynamicCombo.Input(
                    "model",
                    options=[
                        IO.DynamicCombo.Option(
                            "happyhorse-1.0-r2v",
                            [
                                IO.String.Input(
                                    "prompt",
                                    multiline=True,
                                    default="",
                                    tooltip="Prompt describing the video. Use identifiers such as 'character1' and "
                                    "'character2' to refer to the reference characters.",
                                ),
                                IO.Combo.Input(
                                    "resolution",
                                    options=["720P", "1080P"],
                                ),
                                IO.Combo.Input(
                                    "ratio",
                                    options=["16:9", "9:16", "1:1", "4:3", "3:4"],
                                ),
                                IO.Int.Input(
                                    "duration",
                                    default=5,
                                    min=3,
                                    max=15,
                                    step=1,
                                    display_mode=IO.NumberDisplay.number,
                                ),
                                IO.Autogrow.Input(
                                    "reference_images",
                                    template=IO.Autogrow.TemplateNames(
                                        IO.Image.Input("reference_image"),
                                        names=[
                                            "image1",
                                            "image2",
                                            "image3",
                                            "image4",
                                            "image5",
                                            "image6",
                                            "image7",
                                            "image8",
                                            "image9",
                                        ],
                                        min=1,
                                    ),
                                ),
                            ],
                        ),
                    ],
                ),
                IO.Int.Input(
                    "seed",
                    default=0,
                    min=0,
                    max=2147483647,
                    step=1,
                    display_mode=IO.NumberDisplay.number,
                    control_after_generate=True,
                    tooltip="Seed to use for generation.",
                ),
                IO.Boolean.Input(
                    "watermark",
                    default=False,
                    tooltip="Whether to add an AI-generated watermark to the result.",
                    advanced=True,
                ),
            ],
            outputs=[
                IO.Video.Output(),
            ],
            hidden=[
                IO.Hidden.auth_token_comfy_org,
                IO.Hidden.api_key_comfy_org,
                IO.Hidden.unique_id,
            ],
            is_api_node=True,
            # Total price = per-second rate for the chosen resolution times the duration widget.
            price_badge=IO.PriceBadge(
                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
                expr="""
                (
                  $res := $lookup(widgets, "model.resolution");
                  $dur := $lookup(widgets, "model.duration");
                  $ppsTable := { "720p": 0.14, "1080p": 0.24 };
                  $pps := $lookup($ppsTable, $res);
                  { "type": "usd", "usd": $pps * $dur }
                )
                """,
            ),
        )

    @classmethod
    async def execute(
        cls,
        model: dict,
        seed: int,
        watermark: bool,
    ):
        """Upload the reference images, create a reference-to-video task and wait for it.

        Args:
            model: Values of the dynamic "model" combo; read for "model", "prompt",
                "resolution", "ratio", "duration" and the "reference_images" mapping.
            seed: Generation seed forwarded to the API.
            watermark: Whether the API should watermark the result.

        Returns:
            IO.NodeOutput wrapping the video downloaded from the finished task.

        Raises:
            ValueError: If no reference image is supplied.
            Exception: If the task-creation response carries no ``output``.
        """
        validate_string(model["prompt"], strip_whitespace=False, min_length=1)

        # Reject a missing/empty (or None) reference set before any upload is attempted.
        reference_images = model.get("reference_images") or {}
        if not reference_images:
            raise ValueError("At least one reference image must be provided.")

        media = []
        for key in reference_images:
            media.append(
                Wan27MediaItem(
                    type="reference_image",
                    url=await upload_image_to_comfyapi(cls, image=reference_images[key]),
                )
            )

        initial_response = await sync_op(
            cls,
            ApiEndpoint(
                path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
                method="POST",
            ),
            response_model=TaskCreationResponse,
            data=Wan27ReferenceVideoTaskCreationRequest(
                model=model["model"],
                input=Wan27ReferenceVideoInputField(
                    prompt=model["prompt"],
                    negative_prompt=None,
                    media=media,
                ),
                parameters=Wan27ReferenceVideoParametersField(
                    resolution=model["resolution"],
                    ratio=model["ratio"],
                    duration=model["duration"],
                    watermark=watermark,
                    seed=seed,
                ),
            ),
        )
        if not initial_response.output:
            raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}")
        response = await poll_op(
            cls,
            ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"),
            response_model=VideoTaskStatusResponse,
            status_extractor=lambda x: x.output.task_status,
            poll_interval=7,
        )
        return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))
|
||||
|
||||
|
||||
class WanApiExtension(ComfyExtension):
|
||||
@override
|
||||
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
|
||||
@ -1660,6 +2211,10 @@ class WanApiExtension(ComfyExtension):
|
||||
Wan2VideoContinuationApi,
|
||||
Wan2VideoEditApi,
|
||||
Wan2ReferenceVideoApi,
|
||||
HappyHorseTextToVideoApi,
|
||||
HappyHorseImageToVideoApi,
|
||||
HappyHorseVideoEditApi,
|
||||
HappyHorseReferenceVideoApi,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@ -156,6 +156,7 @@ async def poll_op(
|
||||
estimated_duration: int | None = None,
|
||||
cancel_endpoint: ApiEndpoint | None = None,
|
||||
cancel_timeout: float = 10.0,
|
||||
extra_text: str | None = None,
|
||||
) -> M:
|
||||
raw = await poll_op_raw(
|
||||
cls,
|
||||
@ -176,6 +177,7 @@ async def poll_op(
|
||||
estimated_duration=estimated_duration,
|
||||
cancel_endpoint=cancel_endpoint,
|
||||
cancel_timeout=cancel_timeout,
|
||||
extra_text=extra_text,
|
||||
)
|
||||
if not isinstance(raw, dict):
|
||||
raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).")
|
||||
@ -260,6 +262,7 @@ async def poll_op_raw(
|
||||
estimated_duration: int | None = None,
|
||||
cancel_endpoint: ApiEndpoint | None = None,
|
||||
cancel_timeout: float = 10.0,
|
||||
extra_text: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Polls an endpoint until the task reaches a terminal state. Displays time while queued/processing,
|
||||
@ -299,6 +302,7 @@ async def poll_op_raw(
|
||||
price=state.price,
|
||||
is_queued=state.is_queued,
|
||||
processing_elapsed_seconds=int(proc_elapsed),
|
||||
extra_text=extra_text,
|
||||
)
|
||||
await asyncio.sleep(1.0)
|
||||
except Exception as exc:
|
||||
@ -389,6 +393,7 @@ async def poll_op_raw(
|
||||
price=state.price,
|
||||
is_queued=False,
|
||||
processing_elapsed_seconds=int(state.base_processing_elapsed),
|
||||
extra_text=extra_text,
|
||||
)
|
||||
return resp_json
|
||||
|
||||
@ -462,6 +467,7 @@ def _display_time_progress(
|
||||
price: float | None = None,
|
||||
is_queued: bool | None = None,
|
||||
processing_elapsed_seconds: int | None = None,
|
||||
extra_text: str | None = None,
|
||||
) -> None:
|
||||
if estimated_total is not None and estimated_total > 0 and is_queued is False:
|
||||
pe = processing_elapsed_seconds if processing_elapsed_seconds is not None else elapsed_seconds
|
||||
@ -469,7 +475,8 @@ def _display_time_progress(
|
||||
time_line = f"Time elapsed: {int(elapsed_seconds)}s (~{remaining}s remaining)"
|
||||
else:
|
||||
time_line = f"Time elapsed: {int(elapsed_seconds)}s"
|
||||
_display_text(node_cls, time_line, status=status, price=price)
|
||||
text = f"{time_line}\n\n{extra_text}" if extra_text else time_line
|
||||
_display_text(node_cls, text, status=status, price=price)
|
||||
|
||||
|
||||
async def _diagnose_connectivity() -> dict[str, bool]:
|
||||
|
||||
@ -5,6 +5,7 @@ import psutil
|
||||
import time
|
||||
import torch
|
||||
from typing import Sequence, Mapping, Dict
|
||||
from comfy.model_patcher import ModelPatcher
|
||||
from comfy_execution.graph import DynamicPrompt
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
@ -523,13 +524,15 @@ class RAMPressureCache(LRUCache):
|
||||
self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time()
|
||||
super().set_local(node_id, value)
|
||||
|
||||
def ram_release(self, target):
|
||||
def ram_release(self, target, free_active=False):
|
||||
if psutil.virtual_memory().available >= target:
|
||||
return
|
||||
|
||||
clean_list = []
|
||||
|
||||
for key, cache_entry in self.cache.items():
|
||||
if not free_active and self.used_generation[key] == self.generation:
|
||||
continue
|
||||
oom_score = RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER ** (self.generation - self.used_generation[key])
|
||||
|
||||
ram_usage = RAM_CACHE_DEFAULT_RAM_USAGE
|
||||
@ -542,6 +545,9 @@ class RAMPressureCache(LRUCache):
|
||||
scan_list_for_ram_usage(output)
|
||||
elif isinstance(output, torch.Tensor) and output.device.type == 'cpu':
|
||||
ram_usage += output.numel() * output.element_size()
|
||||
elif isinstance(output, ModelPatcher) and self.used_generation[key] != self.generation:
|
||||
#old ModelPatchers are the first to go
|
||||
ram_usage = 1e30
|
||||
scan_list_for_ram_usage(cache_entry.outputs)
|
||||
|
||||
oom_score *= ram_usage
|
||||
|
||||
@ -637,7 +637,7 @@ class SaveGLB(IO.ComfyNode):
|
||||
],
|
||||
tooltip="Mesh or 3D file to save",
|
||||
),
|
||||
IO.String.Input("filename_prefix", default="mesh/ComfyUI"),
|
||||
IO.String.Input("filename_prefix", default="3d/ComfyUI"),
|
||||
],
|
||||
hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo]
|
||||
)
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
import nodes
|
||||
import node_helpers
|
||||
import torch
|
||||
import torchaudio
|
||||
import comfy.model_management
|
||||
import comfy.model_sampling
|
||||
import comfy.samplers
|
||||
@ -711,7 +712,14 @@ class LTXVReferenceAudio(io.ComfyNode):
|
||||
@classmethod
|
||||
def execute(cls, model, positive, negative, reference_audio, audio_vae, identity_guidance_scale, start_percent, end_percent) -> io.NodeOutput:
|
||||
# Encode reference audio to latents and patchify
|
||||
audio_latents = audio_vae.encode(reference_audio)
|
||||
sample_rate = reference_audio["sample_rate"]
|
||||
vae_sample_rate = getattr(audio_vae, "audio_sample_rate", 44100)
|
||||
if vae_sample_rate != sample_rate:
|
||||
waveform = torchaudio.functional.resample(reference_audio["waveform"], sample_rate, vae_sample_rate)
|
||||
else:
|
||||
waveform = reference_audio["waveform"]
|
||||
|
||||
audio_latents = audio_vae.encode(waveform.movedim(1, -1))
|
||||
b, c, t, f = audio_latents.shape
|
||||
ref_tokens = audio_latents.permute(0, 2, 1, 3).reshape(b, t, c * f)
|
||||
ref_audio = {"tokens": ref_tokens}
|
||||
|
||||
@ -2,6 +2,7 @@ import numpy as np
|
||||
import scipy.ndimage
|
||||
import torch
|
||||
import comfy.utils
|
||||
import comfy.model_management
|
||||
import node_helpers
|
||||
from typing_extensions import override
|
||||
from comfy_api.latest import ComfyExtension, IO, UI
|
||||
@ -188,7 +189,7 @@ class SolidMask(IO.ComfyNode):
|
||||
|
||||
@classmethod
def execute(cls, value, width, height) -> IO.NodeOutput:
    """Create a single-mask batch of shape (1, height, width) filled with ``value``.

    The mask is float32 and allocated directly on
    ``comfy.model_management.intermediate_device()``; the earlier CPU allocation
    was a dead store that was overwritten immediately.
    """
    out = torch.full((1, height, width), value, dtype=torch.float32, device=comfy.model_management.intermediate_device())
    return IO.NodeOutput(out)
|
||||
|
||||
solid = execute # TODO: remove
|
||||
@ -262,6 +263,7 @@ class MaskComposite(IO.ComfyNode):
|
||||
def execute(cls, destination, source, x, y, operation) -> IO.NodeOutput:
|
||||
output = destination.reshape((-1, destination.shape[-2], destination.shape[-1])).clone()
|
||||
source = source.reshape((-1, source.shape[-2], source.shape[-1]))
|
||||
source = source.to(output.device)
|
||||
|
||||
left, top = (x, y,)
|
||||
right, bottom = (min(left + source.shape[-1], destination.shape[-1]), min(top + source.shape[-2], destination.shape[-2]))
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import json
|
||||
from comfy.comfy_types.node_typing import IO
|
||||
import torch
|
||||
|
||||
# Preview Any - original implement from
|
||||
# https://github.com/rgthree/rgthree-comfy/blob/main/py/display_any.py
|
||||
@ -19,6 +20,7 @@ class PreviewAny():
|
||||
SEARCH_ALIASES = ["show output", "inspect", "debug", "print value", "show text"]
|
||||
|
||||
def main(self, source=None):
|
||||
torch.set_printoptions(edgeitems=6)
|
||||
value = 'None'
|
||||
if isinstance(source, str):
|
||||
value = source
|
||||
@ -33,6 +35,7 @@ class PreviewAny():
|
||||
except Exception:
|
||||
value = 'source exists, but could not be serialized.'
|
||||
|
||||
torch.set_printoptions()
|
||||
return {"ui": {"text": (value,)}, "result": (value,)}
|
||||
|
||||
NODE_CLASS_MAPPINGS = {
|
||||
|
||||
529
comfy_extras/nodes_sam3.py
Normal file
529
comfy_extras/nodes_sam3.py
Normal file
@ -0,0 +1,529 @@
|
||||
"""
|
||||
SAM3 (Segment Anything 3) nodes for detection, segmentation, and video tracking.
|
||||
"""
|
||||
|
||||
from typing_extensions import override
|
||||
|
||||
import json
|
||||
import os
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import comfy.model_management
|
||||
import comfy.utils
|
||||
import folder_paths
|
||||
from comfy_api.latest import ComfyExtension, io, ui
|
||||
import av
|
||||
from fractions import Fraction
|
||||
|
||||
|
||||
def _extract_text_prompts(conditioning, device, dtype):
|
||||
"""Extract list of (text_embeddings, text_mask) from conditioning."""
|
||||
cond_meta = conditioning[0][1]
|
||||
multi = cond_meta.get("sam3_multi_cond")
|
||||
prompts = []
|
||||
if multi is not None:
|
||||
for entry in multi:
|
||||
emb = entry["cond"].to(device=device, dtype=dtype)
|
||||
mask = entry["attention_mask"].to(device) if entry["attention_mask"] is not None else None
|
||||
if mask is None:
|
||||
mask = torch.ones(emb.shape[0], emb.shape[1], dtype=torch.int64, device=device)
|
||||
prompts.append((emb, mask, entry.get("max_detections", 1)))
|
||||
else:
|
||||
emb = conditioning[0][0].to(device=device, dtype=dtype)
|
||||
mask = cond_meta.get("attention_mask")
|
||||
if mask is not None:
|
||||
mask = mask.to(device)
|
||||
else:
|
||||
mask = torch.ones(emb.shape[0], emb.shape[1], dtype=torch.int64, device=device)
|
||||
prompts.append((emb, mask, 1))
|
||||
return prompts
|
||||
|
||||
|
||||
def _refine_mask(sam3_model, orig_image_hwc, coarse_mask, box_xyxy, H, W, device, dtype, iterations):
|
||||
"""Refine a coarse detector mask via SAM decoder, cropping to the detection box.
|
||||
|
||||
Returns: [1, H, W] binary mask
|
||||
"""
|
||||
def _coarse_fallback():
|
||||
return (F.interpolate(coarse_mask.unsqueeze(0).unsqueeze(0), size=(H, W),
|
||||
mode="bilinear", align_corners=False)[0] > 0).float()
|
||||
|
||||
if iterations <= 0:
|
||||
return _coarse_fallback()
|
||||
|
||||
pad_frac = 0.1
|
||||
x1, y1, x2, y2 = box_xyxy.tolist()
|
||||
bw, bh = x2 - x1, y2 - y1
|
||||
cx1 = max(0, int(x1 - bw * pad_frac))
|
||||
cy1 = max(0, int(y1 - bh * pad_frac))
|
||||
cx2 = min(W, int(x2 + bw * pad_frac))
|
||||
cy2 = min(H, int(y2 + bh * pad_frac))
|
||||
if cx2 <= cx1 or cy2 <= cy1:
|
||||
return _coarse_fallback()
|
||||
|
||||
crop = orig_image_hwc[cy1:cy2, cx1:cx2, :3]
|
||||
crop_1008 = comfy.utils.common_upscale(crop.unsqueeze(0).movedim(-1, 1), 1008, 1008, "bilinear", crop="disabled")
|
||||
crop_frame = crop_1008.to(device=device, dtype=dtype)
|
||||
crop_h, crop_w = cy2 - cy1, cx2 - cx1
|
||||
|
||||
# Crop coarse mask and refine via SAM on the cropped image
|
||||
mask_h, mask_w = coarse_mask.shape[-2:]
|
||||
mx1, my1 = int(cx1 / W * mask_w), int(cy1 / H * mask_h)
|
||||
mx2, my2 = int(cx2 / W * mask_w), int(cy2 / H * mask_h)
|
||||
if mx2 <= mx1 or my2 <= my1:
|
||||
return _coarse_fallback()
|
||||
mask_logit = coarse_mask[..., my1:my2, mx1:mx2].unsqueeze(0).unsqueeze(0)
|
||||
for _ in range(iterations):
|
||||
coarse_input = F.interpolate(mask_logit, size=(1008, 1008), mode="bilinear", align_corners=False)
|
||||
mask_logit = sam3_model.forward_segment(crop_frame, mask_inputs=coarse_input)
|
||||
|
||||
refined_crop = F.interpolate(mask_logit, size=(crop_h, crop_w), mode="bilinear", align_corners=False)
|
||||
full_mask = torch.zeros(1, 1, H, W, device=device, dtype=dtype)
|
||||
full_mask[:, :, cy1:cy2, cx1:cx2] = refined_crop
|
||||
coarse_full = F.interpolate(coarse_mask.unsqueeze(0).unsqueeze(0), size=(H, W), mode="bilinear", align_corners=False)
|
||||
return ((full_mask[0] > 0) | (coarse_full[0] > 0)).float()
|
||||
|
||||
|
||||
|
||||
class SAM3_Detect(io.ComfyNode):
    """Open-vocabulary detection and segmentation using text, box, or point prompts."""

    @classmethod
    def define_schema(cls):
        """Declare the node's inputs (model, image, optional prompts, tuning knobs) and its mask/bbox outputs."""
        return io.Schema(
            node_id="SAM3_Detect",
            display_name="SAM3 Detect",
            category="detection/",
            search_aliases=["sam3", "segment anything", "open vocabulary", "text detection", "segment"],
            inputs=[
                io.Model.Input("model", display_name="model"),
                io.Image.Input("image", display_name="image"),
                io.Conditioning.Input("conditioning", display_name="conditioning", optional=True, tooltip="Text conditioning from CLIPTextEncode"),
                io.BoundingBox.Input("bboxes", display_name="bboxes", force_input=True, optional=True, tooltip="Bounding boxes to segment within"),
                io.String.Input("positive_coords", display_name="positive_coords", force_input=True, optional=True, tooltip="Positive point prompts as JSON [{\"x\": int, \"y\": int}, ...] (pixel coords)"),
                io.String.Input("negative_coords", display_name="negative_coords", force_input=True, optional=True, tooltip="Negative point prompts as JSON [{\"x\": int, \"y\": int}, ...] (pixel coords)"),
                io.Float.Input("threshold", display_name="threshold", default=0.5, min=0.0, max=1.0, step=0.01),
                io.Int.Input("refine_iterations", display_name="refine_iterations", default=2, min=0, max=5, tooltip="SAM decoder refinement passes (0=use raw detector masks)"),
                io.Boolean.Input("individual_masks", display_name="individual_masks", default=False, tooltip="Output per-object masks instead of union"),
            ],
            outputs=[
                io.Mask.Output("masks"),
                io.BoundingBox.Output("bboxes"),
            ],
        )

    @classmethod
    def execute(cls, model, image, conditioning=None, bboxes=None, positive_coords=None, negative_coords=None, threshold=0.5, refine_iterations=2, individual_masks=False) -> io.NodeOutput:
        """Segment every frame of ``image`` using whichever prompts were supplied.

        Three prompt paths run per frame and all append to the same mask list:
          * point prompts -> ``forward_segment`` with ``point_inputs``;
          * box prompts   -> ``forward_segment`` with ``box_inputs``, only when no
            text conditioning is present (with text, the boxes are passed to the
            detector call instead);
          * text prompts  -> detector call, score filtering, then ``_refine_mask``.

        Returns:
            io.NodeOutput(mask_out, all_bbox_dicts). With ``individual_masks`` the
            per-object masks of all frames are concatenated along dim 0; otherwise
            one union mask per frame is stacked. ``all_bbox_dicts`` holds one list
            per frame; only the text-prompt path adds entries to it.
        """
        B, H, W, C = image.shape
        # 1008x1008 is used for every model input below — presumably the SAM3 input resolution; confirm.
        image_in = comfy.utils.common_upscale(image[..., :3].movedim(-1, 1), 1008, 1008, "bilinear", crop="disabled")

        # Convert bboxes to normalized cxcywh format, per-frame list of [1, N, 4] tensors.
        # Supports: single dict (all frames), list[dict] (all frames), list[list[dict]] (per-frame).
        def _boxes_to_tensor(box_list):
            coords = []
            for d in box_list:
                cx = (d["x"] + d["width"] / 2) / W
                cy = (d["y"] + d["height"] / 2) / H
                coords.append([cx, cy, d["width"] / W, d["height"] / H])
            return torch.tensor([coords], dtype=torch.float32)  # [1, N, 4]

        per_frame_boxes = None
        if bboxes is not None:
            if isinstance(bboxes, dict):
                # Single box → same for all frames
                shared = _boxes_to_tensor([bboxes])
                per_frame_boxes = [shared] * B
            elif isinstance(bboxes, list) and len(bboxes) > 0 and isinstance(bboxes[0], list):
                # list[list[dict]] → per-frame boxes
                per_frame_boxes = [_boxes_to_tensor(frame_boxes) if frame_boxes else None for frame_boxes in bboxes]
                # Pad to B if fewer frames provided
                while len(per_frame_boxes) < B:
                    per_frame_boxes.append(per_frame_boxes[-1] if per_frame_boxes else None)
            elif isinstance(bboxes, list) and len(bboxes) > 0:
                # list[dict] → same boxes for all frames
                shared = _boxes_to_tensor(bboxes)
                per_frame_boxes = [shared] * B

        # Parse point prompts from JSON (KJNodes PointsEditor format: [{"x": int, "y": int}, ...])
        pos_pts = json.loads(positive_coords) if positive_coords else []
        neg_pts = json.loads(negative_coords) if negative_coords else []
        has_points = len(pos_pts) > 0 or len(neg_pts) > 0

        comfy.model_management.load_model_gpu(model)
        device = comfy.model_management.get_torch_device()
        dtype = model.model.get_dtype()
        sam3_model = model.model.diffusion_model

        # Build point inputs for tracker SAM decoder path
        point_inputs = None
        if has_points:
            # Pixel coordinates are rescaled from the original image size into the 1008x1008 input space.
            all_coords = [[p["x"] / W * 1008, p["y"] / H * 1008] for p in pos_pts] + \
                         [[p["x"] / W * 1008, p["y"] / H * 1008] for p in neg_pts]
            # Label 1 = positive point, 0 = negative point, in the same order as all_coords.
            all_labels = [1] * len(pos_pts) + [0] * len(neg_pts)
            point_inputs = {
                "point_coords": torch.tensor([all_coords], dtype=dtype, device=device),
                "point_labels": torch.tensor([all_labels], dtype=torch.int32, device=device),
            }

        cond_list = _extract_text_prompts(conditioning, device, dtype) if conditioning is not None and len(conditioning) > 0 else []
        has_text = len(cond_list) > 0

        # Run per-image through detector (text/boxes) and/or tracker (points)
        all_bbox_dicts = []
        all_masks = []
        pbar = comfy.utils.ProgressBar(B)

        for b in range(B):
            frame = image_in[b:b+1].to(device=device, dtype=dtype)
            b_boxes = None
            if per_frame_boxes is not None and per_frame_boxes[b] is not None:
                b_boxes = per_frame_boxes[b].to(device=device, dtype=dtype)

            frame_bbox_dicts = []
            frame_masks = []

            # Point prompts: tracker SAM decoder path with iterative refinement
            if point_inputs is not None:
                mask_logit = sam3_model.forward_segment(frame, point_inputs=point_inputs)
                # The first pass above counts as one iteration, so only refine_iterations - 1 extra passes run.
                for _ in range(max(0, refine_iterations - 1)):
                    mask_logit = sam3_model.forward_segment(frame, mask_inputs=mask_logit)
                mask = F.interpolate(mask_logit, size=(H, W), mode="bilinear", align_corners=False)
                frame_masks.append((mask[0] > 0).float())

            # Box prompts: SAM decoder path (segment inside each box)
            if b_boxes is not None and not has_text:
                for box_cxcywh in b_boxes[0]:
                    cx, cy, bw, bh = box_cxcywh.tolist()
                    # Convert cxcywh normalized → xyxy in 1008 space → [1, 2, 2] corners
                    sam_box = torch.tensor([[[(cx - bw/2) * 1008, (cy - bh/2) * 1008],
                                             [(cx + bw/2) * 1008, (cy + bh/2) * 1008]]],
                                           device=device, dtype=dtype)
                    mask_logit = sam3_model.forward_segment(frame, box_inputs=sam_box)
                    for _ in range(max(0, refine_iterations - 1)):
                        mask_logit = sam3_model.forward_segment(frame, mask_inputs=mask_logit)
                    mask = F.interpolate(mask_logit, size=(H, W), mode="bilinear", align_corners=False)
                    frame_masks.append((mask[0] > 0).float())

            # Text prompts: run detector per text prompt (each detects one category)
            for text_embeddings, text_mask, max_det in cond_list:
                results = sam3_model(
                    frame, text_embeddings=text_embeddings, text_mask=text_mask,
                    boxes=b_boxes, threshold=threshold, orig_size=(H, W))

                pred_boxes = results["boxes"][0]
                scores = results["scores"][0]
                masks = results["masks"][0]

                # Scores are treated as logits: sigmoid, then keep detections above the threshold.
                probs = scores.sigmoid()
                keep = probs > threshold
                kept_boxes = pred_boxes[keep].cpu()
                kept_scores = probs[keep].cpu()
                kept_masks = masks[keep]

                # Highest-scoring detections first, capped at this prompt's max_det.
                order = kept_scores.argsort(descending=True)[:max_det]
                kept_boxes = kept_boxes[order]
                kept_scores = kept_scores[order]
                kept_masks = kept_masks[order]

                # Boxes are read as xyxy here and emitted as x/y/width/height dicts.
                for box, score in zip(kept_boxes, kept_scores):
                    frame_bbox_dicts.append({
                        "x": float(box[0]), "y": float(box[1]),
                        "width": float(box[2] - box[0]), "height": float(box[3] - box[1]),
                        "score": float(score),
                    })
                for m, box in zip(kept_masks, kept_boxes):
                    frame_masks.append(_refine_mask(
                        sam3_model, image[b], m, box, H, W, device, dtype, refine_iterations))

            all_bbox_dicts.append(frame_bbox_dicts)
            if len(frame_masks) > 0:
                combined = torch.cat(frame_masks, dim=0)  # [N_obj, H, W]
                if individual_masks:
                    all_masks.append(combined)
                else:
                    # Union of all object masks for this frame.
                    all_masks.append((combined > 0).any(dim=0).float())
            else:
                # Nothing found for this frame: emit an empty stack or an all-zero mask.
                if individual_masks:
                    all_masks.append(torch.zeros(0, H, W, device=comfy.model_management.intermediate_device()))
                else:
                    all_masks.append(torch.zeros(H, W, device=comfy.model_management.intermediate_device()))
            pbar.update(1)

        idev = comfy.model_management.intermediate_device()
        all_masks = [m.to(idev) for m in all_masks]
        # Individual masks are concatenated across frames; union masks are stacked one per frame.
        mask_out = torch.cat(all_masks, dim=0) if individual_masks else torch.stack(all_masks)
        return io.NodeOutput(mask_out, all_bbox_dicts)
|
||||
|
||||
|
||||
# Custom socket type: produced by SAM3_VideoTrack ("track_data" output) and
# consumed by SAM3_TrackPreview ("track_data" input).
SAM3TrackData = io.Custom("SAM3_TRACK_DATA")
|
||||
|
||||
class SAM3_VideoTrack(io.ComfyNode):
    """Track objects across video frames using SAM3's memory-based tracker."""

    @classmethod
    def define_schema(cls):
        """Declare the node's inputs (frames, model, optional mask/text prompts, tuning knobs) and its track-data output."""
        return io.Schema(
            node_id="SAM3_VideoTrack",
            display_name="SAM3 Video Track",
            category="detection/",
            search_aliases=["sam3", "video", "track", "propagate"],
            inputs=[
                io.Image.Input("images", display_name="images", tooltip="Video frames as batched images"),
                io.Model.Input("model", display_name="model"),
                io.Mask.Input("initial_mask", display_name="initial_mask", optional=True, tooltip="Mask(s) for the first frame to track (one per object)"),
                io.Conditioning.Input("conditioning", display_name="conditioning", optional=True, tooltip="Text conditioning for detecting new objects during tracking"),
                io.Float.Input("detection_threshold", display_name="detection_threshold", default=0.5, min=0.0, max=1.0, step=0.01, tooltip="Score threshold for text-prompted detection"),
                io.Int.Input("max_objects", display_name="max_objects", default=0, min=0, tooltip="Max tracked objects (0=unlimited). Initial masks count toward this limit."),
                io.Int.Input("detect_interval", display_name="detect_interval", default=1, min=1, tooltip="Run detection every N frames (1=every frame). Higher values save compute."),
            ],
            outputs=[
                SAM3TrackData.Output("track_data", display_name="track_data"),
            ],
        )

    @classmethod
    def execute(cls, images, model, initial_mask=None, conditioning=None, detection_threshold=0.5, max_objects=0, detect_interval=1) -> io.NodeOutput:
        """Run ``forward_video`` over the frames and return its result as track data.

        At least one of ``initial_mask`` or a non-empty ``conditioning`` is
        required. The result returned by ``forward_video`` is annotated with
        ``"orig_size" = (H, W)`` (the input frame size) before being returned.

        Raises:
            ValueError: If neither ``initial_mask`` nor non-empty ``conditioning``
                is provided. This is checked before the model is loaded or any
                frame is resized, so a misconfigured node fails immediately.
        """
        N, H, W, C = images.shape

        has_text = conditioning is not None and len(conditioning) > 0
        if not has_text and initial_mask is None:
            raise ValueError("Either initial_mask or conditioning must be provided")

        comfy.model_management.load_model_gpu(model)
        device = comfy.model_management.get_torch_device()
        dtype = model.model.get_dtype()
        sam3_model = model.model.diffusion_model

        # Drop any alpha channel, go channels-first and resize to the 1008x1008 model input.
        frames = images[..., :3].movedim(-1, 1)
        frames_in = comfy.utils.common_upscale(frames, 1008, 1008, "bilinear", crop="disabled").to(device=device, dtype=dtype)

        init_masks = None
        if initial_mask is not None:
            # Insert a singleton dim at position 1 before handing the masks to the model.
            init_masks = initial_mask.unsqueeze(1).to(device=device, dtype=dtype)

        pbar = comfy.utils.ProgressBar(N)

        text_prompts = None
        if has_text:
            # The per-prompt max_detections value is not used by the tracker call.
            text_prompts = [(emb, mask) for emb, mask, _ in _extract_text_prompts(conditioning, device, dtype)]

        result = sam3_model.forward_video(
            images=frames_in, initial_masks=init_masks, pbar=pbar, text_prompts=text_prompts,
            new_det_thresh=detection_threshold, max_objects=max_objects,
            detect_interval=detect_interval)
        result["orig_size"] = (H, W)
        return io.NodeOutput(result)
|
||||
|
||||
|
||||
class SAM3_TrackPreview(io.ComfyNode):
    """Visualize tracked objects with distinct colors as a video preview. No tensor output — saves to temp video."""

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="SAM3_TrackPreview",
            display_name="SAM3 Track Preview",
            category="detection/",
            inputs=[
                SAM3TrackData.Input("track_data", display_name="track_data"),
                io.Image.Input("images", display_name="images", optional=True),
                io.Float.Input("opacity", display_name="opacity", default=0.5, min=0.0, max=1.0, step=0.05),
                io.Float.Input("fps", display_name="fps", default=24.0, min=1.0, max=120.0, step=1.0),
            ],
            is_output_node=True,
        )

    # Per-object overlay colors (RGB, 0-1); reused cyclically via modulo.
    COLORS = [
        (0.12, 0.47, 0.71), (1.0, 0.5, 0.05), (0.17, 0.63, 0.17), (0.84, 0.15, 0.16),
        (0.58, 0.4, 0.74), (0.55, 0.34, 0.29), (0.89, 0.47, 0.76), (0.5, 0.5, 0.5),
        (0.74, 0.74, 0.13), (0.09, 0.75, 0.81), (0.94, 0.76, 0.06), (0.42, 0.68, 0.84),
    ]

    _glyph_cache = {}  # (device, scale) -> (glyphs, outlines, gh, gw, oh, ow)

    @staticmethod
    def _get_glyphs(device, scale=3):
        """Return cached digit glyph and outline masks for ``device`` at ``scale``.

        Returns a tuple ``(glyphs, outlines, gh, gw, oh, ow)``: two lists of ten
        bool masks (digits 0-9) plus the glyph and outline mask sizes. Each
        outline mask is the glyph padded by one pixel per side and dilated 3x3.
        """
        key = (device, scale)
        if key in SAM3_TrackPreview._glyph_cache:
            return SAM3_TrackPreview._glyph_cache[key]
        # 5x3 bitmap font atlas for digits 0-9 [10, 5, 3]
        atlas = torch.tensor([
            [[1,1,1],[1,0,1],[1,0,1],[1,0,1],[1,1,1]],
            [[0,1,0],[1,1,0],[0,1,0],[0,1,0],[1,1,1]],
            [[1,1,1],[0,0,1],[1,1,1],[1,0,0],[1,1,1]],
            [[1,1,1],[0,0,1],[1,1,1],[0,0,1],[1,1,1]],
            [[1,0,1],[1,0,1],[1,1,1],[0,0,1],[0,0,1]],
            [[1,1,1],[1,0,0],[1,1,1],[0,0,1],[1,1,1]],
            [[1,1,1],[1,0,0],[1,1,1],[1,0,1],[1,1,1]],
            [[1,1,1],[0,0,1],[0,0,1],[0,0,1],[0,0,1]],
            [[1,1,1],[1,0,1],[1,1,1],[1,0,1],[1,1,1]],
            [[1,1,1],[1,0,1],[1,1,1],[0,0,1],[1,1,1]],
        ], dtype=torch.bool)
        glyphs, outlines = [], []
        for d in range(10):
            # Nearest-neighbour upscale of the 5x3 bitmap by `scale`.
            g = atlas[d].repeat_interleave(scale, 0).repeat_interleave(scale, 1)
            # Pad by 1px, then 3x3 max-pool to grow the glyph into its outline.
            padded = F.pad(g.float().unsqueeze(0).unsqueeze(0), (1,1,1,1))
            o = (F.max_pool2d(padded, 3, stride=1, padding=1)[0, 0] > 0)
            glyphs.append(g.to(device))
            outlines.append(o.to(device))
        gh, gw = glyphs[0].shape
        oh, ow = outlines[0].shape
        SAM3_TrackPreview._glyph_cache[key] = (glyphs, outlines, gh, gw, oh, ow)
        return SAM3_TrackPreview._glyph_cache[key]

    @staticmethod
    def _draw_number_gpu(frame, number, cx, cy, color, scale=3):
        """Draw a number on a GPU tensor [H, W, 3] float 0-1 at (cx, cy) with outline.

        The frame is modified in place. Digits falling partly outside the frame
        are clipped. Negative values are clamped to 0, since the bitmap font has
        no minus sign.
        """
        H, W = frame.shape[:2]
        device = frame.device
        glyphs, outlines, gh, gw, oh, ow = SAM3_TrackPreview._get_glyphs(device, scale)
        color_t = torch.tensor(color, device=device, dtype=frame.dtype)
        digs = [int(d) for d in str(max(int(number), 0))]
        # Digits are separated by a `scale`-pixel gap; center the run on (cx, cy).
        total_w = len(digs) * (gw + scale) - scale
        x0 = cx - total_w // 2
        y0 = cy - gh // 2
        for i, d in enumerate(digs):
            dx = x0 + i * (gw + scale)
            # Black outline (mask is 1px larger than the glyph on every side)
            oy0, ox0 = y0 - 1, dx - 1
            osy1, osx1 = max(0, -oy0), max(0, -ox0)
            osy2, osx2 = min(oh, H - oy0), min(ow, W - ox0)
            if osy2 > osy1 and osx2 > osx1:
                fy1, fx1 = oy0 + osy1, ox0 + osx1
                frame[fy1:fy1+(osy2-osy1), fx1:fx1+(osx2-osx1)][outlines[d][osy1:osy2, osx1:osx2]] = 0
            # Colored fill
            sy1, sx1 = max(0, -y0), max(0, -dx)
            sy2, sx2 = min(gh, H - y0), min(gw, W - dx)
            if sy2 > sy1 and sx2 > sx1:
                fy1, fx1 = y0 + sy1, dx + sx1
                frame[fy1:fy1+(sy2-sy1), fx1:fx1+(sx2-sx1)][glyphs[d][sy1:sy2, sx1:sx2]] = color_t

    @classmethod
    def execute(cls, track_data, images=None, opacity=0.5, fps=24.0) -> io.NodeOutput:
        """Render the tracked masks as a colored overlay into a temporary MP4.

        Args:
            track_data: Dict read for ``"packed_masks"``, ``"orig_size"``,
                ``"n_frames"`` (when there are no masks) and optional ``"scores"``.
            images: Optional background frames; when given, their height/width
                define the output size. Missing frames are rendered black.
            opacity: Blend weight of the overlay color inside masked pixels.
            fps: Frame rate of the encoded video.

        Returns:
            io.NodeOutput carrying a PreviewVideo UI entry for the saved file.
        """
        from comfy.ldm.sam3.tracker import unpack_masks
        import uuid

        packed = track_data["packed_masks"]
        H, W = track_data["orig_size"]
        if images is not None:
            H, W = images.shape[1], images.shape[2]
        if packed is None:
            N, N_obj = track_data["n_frames"], 0
        else:
            N, N_obj = packed.shape[0], packed.shape[1]
        # Loop-invariant: look the scores up once instead of once per frame.
        scores = track_data.get("scores", [])

        gpu = comfy.model_management.get_torch_device()
        temp_dir = folder_paths.get_temp_directory()
        filename = f"sam3_track_preview_{uuid.uuid4().hex[:8]}.mp4"
        filepath = os.path.join(temp_dir, filename)
        with av.open(filepath, mode='w') as output:
            stream = output.add_stream('h264', rate=Fraction(round(fps * 1000), 1000))
            stream.width = W
            stream.height = H
            stream.pix_fmt = 'yuv420p'

            # Reusable CPU staging buffer; frame_np shares its memory.
            frame_cpu = torch.empty(H, W, 3, dtype=torch.uint8)
            frame_np = frame_cpu.numpy()
            if N_obj > 0:
                colors_t = torch.tensor([cls.COLORS[i % len(cls.COLORS)] for i in range(N_obj)],
                                        device=gpu, dtype=torch.float32)
                grid_y = torch.arange(H, device=gpu).view(1, H, 1)
                grid_x = torch.arange(W, device=gpu).view(1, 1, W)
            for t in range(N):
                if images is not None and t < images.shape[0]:
                    # Keep only the first three channels (as SAM3_VideoTrack
                    # does) so extra channels such as alpha do not break the
                    # 3-channel blend and staging buffer.
                    frame = images[t][..., :3].clone()
                else:
                    frame = torch.zeros(H, W, 3)

                if N_obj > 0:
                    frame_binary = unpack_masks(packed[t:t+1].to(gpu))  # [1, N_obj, H, W] bool
                    frame_masks = F.interpolate(frame_binary.float(), size=(H, W), mode="nearest")[0]
                    frame_gpu = frame.to(gpu)
                    bool_masks = frame_masks > 0.5
                    any_mask = bool_masks.any(dim=0)
                    if any_mask.any():
                        # Where masks overlap, argmax picks the lowest object index.
                        obj_idx_map = bool_masks.to(torch.uint8).argmax(dim=0)
                        color_overlay = colors_t[obj_idx_map]
                        mask_3d = any_mask.unsqueeze(-1)
                        frame_gpu = torch.where(mask_3d, frame_gpu * (1 - opacity) + color_overlay * opacity, frame_gpu)
                        # Per-object centroid; area is clamped to avoid division by zero.
                        area = bool_masks.sum(dim=(-1, -2)).clamp_(min=1)
                        cy = ((bool_masks * grid_y).sum(dim=(-1, -2)) // area).tolist()
                        cx = ((bool_masks * grid_x).sum(dim=(-1, -2)) // area).tolist()
                        # Only objects covering more than one pixel get a label.
                        has = (area > 1).tolist()
                        for obj_idx in range(N_obj):
                            if not has[obj_idx]:
                                continue
                            _cx, _cy = cx[obj_idx], cy[obj_idx]
                            color = cls.COLORS[obj_idx % len(cls.COLORS)]
                            SAM3_TrackPreview._draw_number_gpu(frame_gpu, obj_idx, _cx, _cy, color)
                            if obj_idx < len(scores) and scores[obj_idx] < 1.0:
                                # Score as a percentage, drawn below the index label.
                                SAM3_TrackPreview._draw_number_gpu(frame_gpu, int(scores[obj_idx] * 100),
                                                                   _cx, _cy + 5 * 3 + 3, color, scale=2)
                    frame_cpu.copy_(frame_gpu.clamp_(0, 1).mul_(255).byte())
                else:
                    frame_cpu.copy_(frame.clamp_(0, 1).mul_(255).byte())

                vframe = av.VideoFrame.from_ndarray(frame_np, format='rgb24')
                output.mux(stream.encode(vframe.reformat(format='yuv420p')))
            # Flush frames still buffered in the encoder.
            output.mux(stream.encode(None))
        return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(filename, "", io.FolderType.temp)]))
|
||||
|
||||
|
||||
class SAM3_TrackToMask(io.ComfyNode):
    """Select tracked objects by index and output as mask."""

    @classmethod
    def define_schema(cls):
        return io.Schema(
            node_id="SAM3_TrackToMask",
            display_name="SAM3 Track to Mask",
            category="detection/",
            inputs=[
                SAM3TrackData.Input("track_data", display_name="track_data"),
                io.String.Input("object_indices", display_name="object_indices", default="",
                                tooltip="Comma-separated object indices to include (e.g. '0,2,3'). Empty = all objects."),
            ],
            outputs=[
                io.Mask.Output("masks", display_name="masks"),
            ],
        )

    @classmethod
    def execute(cls, track_data, object_indices="") -> io.NodeOutput:
        """Merge the selected objects' masks into one mask per frame.

        Args:
            track_data: Dict read for ``"packed_masks"``, ``"orig_size"`` and,
                when there are no masks, ``"n_frames"``.
            object_indices: Comma-separated object indices; empty selects all.
                Non-numeric and out-of-range entries are ignored.

        Returns:
            io.NodeOutput with an (N, H, W) mask at the original frame size.
            The mask is all zeros when there is no track data or no valid
            index remains.
        """
        from comfy.ldm.sam3.tracker import unpack_masks
        packed = track_data["packed_masks"]
        H, W = track_data["orig_size"]

        def empty_mask(n_frames):
            return io.NodeOutput(torch.zeros(n_frames, H, W, device=comfy.model_management.intermediate_device()))

        if packed is None:
            return empty_mask(track_data["n_frames"])

        N, N_obj = packed.shape[0], packed.shape[1]

        if object_indices.strip():
            # isdecimal() matches exactly what int() can parse; isdigit() also
            # accepts characters such as superscript digits, which make int()
            # raise ValueError.
            tokens = (tok.strip() for tok in object_indices.split(","))
            indices = [int(tok) for tok in tokens if tok.isdecimal()]
            indices = [i for i in indices if 0 <= i < N_obj]
        else:
            indices = list(range(N_obj))

        if not indices:
            return empty_mask(N)

        selected = packed[:, indices]
        binary = unpack_masks(selected)  # [N, len(indices), Hm, Wm] bool
        # A pixel is set if any selected object covers it.
        union = binary.any(dim=1, keepdim=True).float()
        mask_out = F.interpolate(union, size=(H, W), mode="bilinear", align_corners=False)[:, 0]
        return io.NodeOutput(mask_out)
|
||||
|
||||
|
||||
class SAM3Extension(ComfyExtension):
    """Extension that exposes the SAM3 node classes."""

    @override
    async def get_node_list(self) -> list[type[io.ComfyNode]]:
        """Return the node classes provided by this extension."""
        node_classes = [SAM3_Detect, SAM3_VideoTrack, SAM3_TrackPreview, SAM3_TrackToMask]
        return node_classes
|
||||
|
||||
|
||||
async def comfy_entrypoint() -> SAM3Extension:
    """Create and return the SAM3 extension instance."""
    extension = SAM3Extension()
    return extension
|
||||
@ -54,7 +54,7 @@ class EmptySD3LatentImage(io.ComfyNode):
|
||||
|
||||
@classmethod
|
||||
def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
|
||||
latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device())
|
||||
latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
|
||||
return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 8})
|
||||
|
||||
generate = execute # TODO: remove
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
# This file is automatically generated by the build process when version is
|
||||
# updated in pyproject.toml.
|
||||
__version__ = "0.19.3"
|
||||
__version__ = "0.20.1"
|
||||
|
||||
40
execution.py
40
execution.py
@ -779,7 +779,7 @@ class PromptExecutor:
|
||||
|
||||
if self.cache_type == CacheType.RAM_PRESSURE:
|
||||
comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom)
|
||||
comfy.memory_management.extra_ram_release(ram_headroom)
|
||||
ram_release_callback(ram_headroom, free_active=True)
|
||||
else:
|
||||
# Only execute when the while-loop ends without break
|
||||
# Send cached UI for intermediate output nodes that weren't executed
|
||||
@ -811,11 +811,30 @@ class PromptExecutor:
|
||||
self._notify_prompt_lifecycle("end", prompt_id)
|
||||
|
||||
|
||||
async def validate_inputs(prompt_id, prompt, item, validated):
|
||||
async def validate_inputs(prompt_id, prompt, item, validated, visiting=None):
|
||||
if visiting is None:
|
||||
visiting = []
|
||||
|
||||
unique_id = item
|
||||
if unique_id in validated:
|
||||
return validated[unique_id]
|
||||
|
||||
if unique_id in visiting:
|
||||
cycle_path_nodes = visiting[visiting.index(unique_id):] + [unique_id]
|
||||
cycle_nodes = list(dict.fromkeys(cycle_path_nodes))
|
||||
cycle_path = " -> ".join(f"{node_id} ({prompt[node_id]['class_type']})" for node_id in cycle_path_nodes)
|
||||
for node_id in cycle_nodes:
|
||||
validated[node_id] = (False, [{
|
||||
"type": "dependency_cycle",
|
||||
"message": "Dependency cycle detected",
|
||||
"details": cycle_path,
|
||||
"extra_info": {
|
||||
"node_id": node_id,
|
||||
"cycle_nodes": cycle_nodes,
|
||||
}
|
||||
}], node_id)
|
||||
return validated[unique_id]
|
||||
|
||||
inputs = prompt[unique_id]['inputs']
|
||||
class_type = prompt[unique_id]['class_type']
|
||||
obj_class = nodes.NODE_CLASS_MAPPINGS[class_type]
|
||||
@ -899,7 +918,11 @@ async def validate_inputs(prompt_id, prompt, item, validated):
|
||||
errors.append(error)
|
||||
continue
|
||||
try:
|
||||
r = await validate_inputs(prompt_id, prompt, o_id, validated)
|
||||
visiting.append(unique_id)
|
||||
try:
|
||||
r = await validate_inputs(prompt_id, prompt, o_id, validated, visiting)
|
||||
finally:
|
||||
visiting.pop()
|
||||
if r[0] is False:
|
||||
# `r` will be set in `validated[o_id]` already
|
||||
valid = False
|
||||
@ -1048,10 +1071,13 @@ async def validate_inputs(prompt_id, prompt, item, validated):
|
||||
errors.append(error)
|
||||
continue
|
||||
|
||||
if len(errors) > 0 or valid is not True:
|
||||
ret = (False, errors, unique_id)
|
||||
else:
|
||||
ret = (True, [], unique_id)
|
||||
ret = validated.get(unique_id, (True, [], unique_id))
|
||||
# Recursive cycle detection may have already populated an error on us. Join it.
|
||||
ret = (
|
||||
ret[0] and valid is True and not errors,
|
||||
ret[1] + [error for error in errors if error not in ret[1]],
|
||||
unique_id,
|
||||
)
|
||||
|
||||
validated[unique_id] = ret
|
||||
return ret
|
||||
|
||||
@ -1 +1 @@
|
||||
comfyui_manager==4.1
|
||||
comfyui_manager==4.2.1
|
||||
|
||||
60
nodes.py
60
nodes.py
@ -32,7 +32,7 @@ import comfy.controlnet
|
||||
from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator
|
||||
from comfy_api.internal import register_versions, ComfyAPIWithVersion
|
||||
from comfy_api.version_list import supported_versions
|
||||
from comfy_api.latest import io, ComfyExtension
|
||||
from comfy_api.latest import io, ComfyExtension, InputImpl
|
||||
|
||||
import comfy.clip_vision
|
||||
|
||||
@ -728,50 +728,26 @@ class LoraLoaderModelOnly(LoraLoader):
|
||||
|
||||
class VAELoader:
|
||||
video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5", "taeltx_2"]
|
||||
image_taes = ["taesd", "taesdxl", "taesd3", "taef1"]
|
||||
image_taes = ["taesd", "taesdxl", "taesd3", "taef1", "taef2"]
|
||||
|
||||
@staticmethod
|
||||
def vae_list(s):
|
||||
vaes = folder_paths.get_filename_list("vae")
|
||||
approx_vaes = folder_paths.get_filename_list("vae_approx")
|
||||
sdxl_taesd_enc = False
|
||||
sdxl_taesd_dec = False
|
||||
sd1_taesd_enc = False
|
||||
sd1_taesd_dec = False
|
||||
sd3_taesd_enc = False
|
||||
sd3_taesd_dec = False
|
||||
f1_taesd_enc = False
|
||||
f1_taesd_dec = False
|
||||
|
||||
have_img_encoder, have_img_decoder = set(), set()
|
||||
for v in approx_vaes:
|
||||
if v.startswith("taesd_decoder."):
|
||||
sd1_taesd_dec = True
|
||||
elif v.startswith("taesd_encoder."):
|
||||
sd1_taesd_enc = True
|
||||
elif v.startswith("taesdxl_decoder."):
|
||||
sdxl_taesd_dec = True
|
||||
elif v.startswith("taesdxl_encoder."):
|
||||
sdxl_taesd_enc = True
|
||||
elif v.startswith("taesd3_decoder."):
|
||||
sd3_taesd_dec = True
|
||||
elif v.startswith("taesd3_encoder."):
|
||||
sd3_taesd_enc = True
|
||||
elif v.startswith("taef1_encoder."):
|
||||
f1_taesd_dec = True
|
||||
elif v.startswith("taef1_decoder."):
|
||||
f1_taesd_enc = True
|
||||
else:
|
||||
parts = v.split("_", 1)
|
||||
if len(parts) != 2 or parts[0] not in s.image_taes:
|
||||
for tae in s.video_taes:
|
||||
if v.startswith(tae):
|
||||
vaes.append(v)
|
||||
|
||||
if sd1_taesd_dec and sd1_taesd_enc:
|
||||
vaes.append("taesd")
|
||||
if sdxl_taesd_dec and sdxl_taesd_enc:
|
||||
vaes.append("taesdxl")
|
||||
if sd3_taesd_dec and sd3_taesd_enc:
|
||||
vaes.append("taesd3")
|
||||
if f1_taesd_dec and f1_taesd_enc:
|
||||
vaes.append("taef1")
|
||||
break
|
||||
continue
|
||||
if parts[1].startswith("encoder."):
|
||||
have_img_encoder.add(parts[0])
|
||||
elif parts[1].startswith("decoder."):
|
||||
have_img_decoder.add(parts[0])
|
||||
vaes += [k for k in have_img_decoder if k in have_img_encoder]
|
||||
vaes.append("pixel_space")
|
||||
return vaes
|
||||
|
||||
@ -827,6 +803,11 @@ class VAELoader:
|
||||
else:
|
||||
vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
|
||||
sd, metadata = comfy.utils.load_torch_file(vae_path, return_metadata=True)
|
||||
if vae_name == "taef2":
|
||||
if metadata is None:
|
||||
metadata = {"tae_latent_channels": 128}
|
||||
else:
|
||||
metadata["tae_latent_channels"] = 128
|
||||
vae = comfy.sd.VAE(sd=sd, metadata=metadata)
|
||||
vae.throw_exception_if_invalid()
|
||||
return (vae,)
|
||||
@ -1716,6 +1697,10 @@ class LoadImage:
|
||||
def load_image(self, image):
|
||||
image_path = folder_paths.get_annotated_filepath(image)
|
||||
|
||||
components = InputImpl.VideoFromFile(image_path).get_components()
|
||||
if components.images.shape[0] > 0:
|
||||
return (components.images, 1.0 - components.alpha[..., -1] if components.alpha is not None else torch.zeros((components.images.shape[0], 64, 64), dtype=torch.float32, device="cpu"))
|
||||
|
||||
img = node_helpers.pillow(Image.open, image_path)
|
||||
|
||||
output_images = []
|
||||
@ -2459,6 +2444,7 @@ async def init_builtin_extra_nodes():
|
||||
"nodes_curve.py",
|
||||
"nodes_rtdetr.py",
|
||||
"nodes_frame_interpolation.py",
|
||||
"nodes_sam3.py",
|
||||
]
|
||||
|
||||
import_failed = []
|
||||
|
||||
3231
openapi.yaml
Normal file
3231
openapi.yaml
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
[project]
|
||||
name = "ComfyUI"
|
||||
version = "0.19.3"
|
||||
version = "0.20.1"
|
||||
readme = "README.md"
|
||||
license = { file = "LICENSE" }
|
||||
requires-python = ">=3.10"
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
comfyui-frontend-package==1.42.14
|
||||
comfyui-workflow-templates==0.9.59
|
||||
comfyui-embedded-docs==0.4.3
|
||||
comfyui-frontend-package==1.42.15
|
||||
comfyui-workflow-templates==0.9.65
|
||||
comfyui-embedded-docs==0.4.4
|
||||
torch
|
||||
torchsde
|
||||
torchvision
|
||||
@ -19,11 +19,11 @@ scipy
|
||||
tqdm
|
||||
psutil
|
||||
alembic
|
||||
SQLAlchemy>=2.0
|
||||
SQLAlchemy>=2.0.0
|
||||
filelock
|
||||
av>=14.2.0
|
||||
comfy-kitchen>=0.2.8
|
||||
comfy-aimdo>=0.2.12
|
||||
comfy-aimdo==0.3.0
|
||||
requests
|
||||
simpleeval>=1.0.0
|
||||
blake3
|
||||
|
||||
@ -39,7 +39,7 @@ def get_required_packages_versions():
|
||||
if len(s) == 2:
|
||||
version_str = s[-1]
|
||||
if not is_valid_version(version_str):
|
||||
logging.error(f"Invalid version format in requirements.txt: {version_str}")
|
||||
logging.debug(f"Invalid version format for {s[0]} in requirements.txt: {version_str}")
|
||||
continue
|
||||
out[s[0]] = version_str
|
||||
return out.copy()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user