diff --git a/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat b/.ci/windows_amd_base_files/run_amd_gpu_enable_dynamic_vram.bat
similarity index 66%
rename from .ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat
rename to .ci/windows_amd_base_files/run_amd_gpu_enable_dynamic_vram.bat
index cece0aeb2..94ad31942 100755
--- a/.ci/windows_amd_base_files/run_amd_gpu_disable_smart_memory.bat
+++ b/.ci/windows_amd_base_files/run_amd_gpu_enable_dynamic_vram.bat
@@ -1,2 +1,2 @@
-.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --disable-smart-memory
+.\python_embeded\python.exe -s ComfyUI\main.py --windows-standalone-build --enable-dynamic-vram
pause
diff --git a/.github/workflows/backport_release.yaml b/.github/workflows/backport_release.yaml
new file mode 100644
index 000000000..ede6bde33
--- /dev/null
+++ b/.github/workflows/backport_release.yaml
@@ -0,0 +1,519 @@
+name: Backport Release
+
+on:
+ workflow_dispatch:
+ inputs:
+ commit:
+ description: 'Full 40-char SHA of the tip commit of the backport source branch (the PR head commit that passed tests). The branch is resolved from this SHA and must be unique.'
+ required: true
+ type: string
+
+permissions:
+ contents: read
+ pull-requests: read
+ checks: read
+
+jobs:
+ backport-release:
+ name: Create backport release
+ runs-on: ubuntu-latest
+ environment: backport release
+
+ steps:
+ - name: Generate GitHub App token
+ id: app-token
+ uses: actions/create-github-app-token@bcd2ba49218906704ab6c1aa796996da409d3eb1
+ with:
+ app-id: ${{ secrets.FEN_RELEASE_APP_ID }}
+ private-key: ${{ secrets.FEN_RELEASE_PRIVATE_KEY }}
+
+ - name: Checkout repository
+ uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
+ with:
+ token: ${{ steps.app-token.outputs.token }}
+ fetch-depth: 0
+ fetch-tags: true
+
+ - name: Configure git
+ run: |
+ git config user.name "fen-release[bot]"
+ git config user.email "fen-release[bot]@users.noreply.github.com"
+
+ # Maps the operator-supplied SHA to the single origin branch whose tip is that
+ # SHA, rejects the default branch, and publishes the branch name as the
+ # `source_branch` step output.
+ - name: Resolve source branch from commit SHA
+ id: resolve
+ env:
+ SOURCE_COMMIT: ${{ inputs.commit }}
+ DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+ run: |
+ set -euo pipefail
+
+ # Require a full 40-char lowercase-hex SHA. Short SHAs are ambiguous
+ # and we will be comparing this value against API responses (PR head
+ # SHA, ref tips) that always return the full form.
+ if [[ ! "${SOURCE_COMMIT}" =~ ^[0-9a-f]{40}$ ]]; then
+ echo "::error::Input commit '${SOURCE_COMMIT}' is not a full 40-char lowercase hex SHA."
+ exit 1
+ fi
+
+ # Fetch all remote branches so we can search for which one(s) point
+ # at this SHA. `actions/checkout` with fetch-depth: 0 fetches full
+ # history of the checked-out ref but does not necessarily populate
+ # every refs/remotes/origin/*, so do it explicitly.
+ git fetch --prune origin '+refs/heads/*:refs/remotes/origin/*'
+
+ # Verify the commit actually exists in this repo's object DB.
+ if ! git cat-file -e "${SOURCE_COMMIT}^{commit}" 2>/dev/null; then
+ echo "::error::Commit ${SOURCE_COMMIT} was not found in the repository."
+ exit 1
+ fi
+
+ # Find every remote branch whose tip == SOURCE_COMMIT. Exactly one
+ # branch must point at it. If zero, the commit isn't anyone's tip
+ # (likely stale, force-pushed past, or never the PR head). If more
+ # than one, the (branch -> SHA) mapping is ambiguous and we refuse
+ # to guess — the operator must give us a unique branch to release.
+ #
+ # `refname:strip=3` drops the leading refs/remotes/origin/ so only the
+ # branch name is left; the symbolic origin/HEAD entry is filtered out.
+ # The trailing `|| true` keeps the pipeline from reporting failure when
+ # grep prints nothing (grep exits 1 on no output); the empty result is
+ # handled by the explicit count check below instead.
+ mapfile -t matching_branches < <(
+ git for-each-ref \
+ --format='%(refname:strip=3)' \
+ --points-at="${SOURCE_COMMIT}" \
+ refs/remotes/origin/ \
+ | grep -vx 'HEAD' || true
+ )
+
+ if [[ "${#matching_branches[@]}" -eq 0 ]]; then
+ echo "::error::No branch on origin has ${SOURCE_COMMIT} as its tip."
+ echo "::error::Either the branch was updated after you copied this SHA, or this commit was never the head of a branch."
+ exit 1
+ fi
+
+ if [[ "${#matching_branches[@]}" -gt 1 ]]; then
+ echo "::error::More than one branch on origin has ${SOURCE_COMMIT} as its tip; cannot pick one:"
+ for b in "${matching_branches[@]}"; do
+ echo "::error:: - ${b}"
+ done
+ echo "::error::Refusing to proceed with an ambiguous source branch."
+ exit 1
+ fi
+
+ source_branch="${matching_branches[0]}"
+
+ # The default branch can never be a backport source.
+ if [[ "${source_branch}" == "${DEFAULT_BRANCH}" ]]; then
+ echo "::error::Source branch must not be the default branch ('${DEFAULT_BRANCH}')."
+ exit 1
+ fi
+
+ echo "Resolved commit ${SOURCE_COMMIT} to branch '${source_branch}'."
+ echo "source_branch=${source_branch}" >> "$GITHUB_OUTPUT"
+
+
+ # Picks the highest vMAJOR.MINOR.PATCH tag and derives the next patch version
+ # and the release branch name from it. Only local git data is read (the
+ # checkout step fetches tags), so no API token is passed to this step.
+ #
+ # Outputs: latest_tag, latest_sha, major, minor, patch, new_version,
+ # new_version_no_v, release_branch.
+ - name: Determine latest stable release
+   id: latest
+   run: |
+     set -euo pipefail
+
+     # Keep only tags that are exactly vMAJOR.MINOR.PATCH, zero-pad each
+     # component and sort numerically so the last line is the highest
+     # version. The `|| true` on grep matters: with no matching tag grep
+     # exits 1, and under pipefail + errexit that would kill the step before
+     # the explicit error below could be printed.
+     latest_tag="$(
+       git tag --list 'v[0-9]*.[0-9]*.[0-9]*' \
+         | { grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' || true; } \
+         | awk -F'[v.]' '{ printf "%010d %010d %010d %s\n", $2, $3, $4, $0 }' \
+         | sort -k1,1n -k2,2n -k3,3n \
+         | tail -n1 \
+         | awk '{print $4}'
+     )"
+
+     if [[ -z "${latest_tag}" ]]; then
+       echo "::error::No stable release tags (vMAJOR.MINOR.PATCH) were found."
+       exit 1
+     fi
+
+     # Split "vMAJOR.MINOR.PATCH" into its components.
+     ver="${latest_tag#v}"
+     major="${ver%%.*}"
+     rest="${ver#*.}"
+     minor="${rest%%.*}"
+     # Force base 10: bash arithmetic reads a zero-padded value such as "08"
+     # as octal and aborts, and the tag filter above does not reject leading
+     # zeros.
+     patch="$((10#${rest#*.}))"
+
+     new_patch=$((patch + 1))
+     new_version="v${major}.${minor}.${new_patch}"
+     release_branch="release/v${major}.${minor}"
+
+     # Commit the tag points at (rev-list peels annotated tags).
+     latest_sha="$(git rev-list -n 1 "refs/tags/${latest_tag}")"
+
+     {
+       echo "latest_tag=${latest_tag}"
+       echo "latest_sha=${latest_sha}"
+       echo "major=${major}"
+       echo "minor=${minor}"
+       echo "patch=${patch}"
+       echo "new_version=${new_version}"
+       echo "new_version_no_v=${major}.${minor}.${new_patch}"
+       echo "release_branch=${release_branch}"
+     } >> "$GITHUB_OUTPUT"
+
+     echo "Latest stable release: ${latest_tag} (${latest_sha})"
+     echo "New version will be: ${new_version}"
+     echo "Release branch: ${release_branch}"
+
+ # Proves the commits about to be released form a straight first-parent line
+ # on top of the latest stable tag: the tag must be on the source commit's
+ # first-parent chain, and nothing else may be reachable from the source
+ # commit that is not on that chain.
+ - name: Validate source branch is cut directly from the latest stable release
+ env:
+ SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
+ SOURCE_COMMIT: ${{ inputs.commit }}
+ LATEST_TAG_SHA: ${{ steps.latest.outputs.latest_sha }}
+ LATEST_TAG: ${{ steps.latest.outputs.latest_tag }}
+ run: |
+ set -euo pipefail
+
+ # Use the user-provided SHA directly rather than re-resolving the branch
+ # tip — the resolve step already proved the branch tip equals SOURCE_COMMIT,
+ # and pinning to the SHA here makes the rest of the job TOCTOU-safe against
+ # someone pushing to the branch mid-run.
+ source_sha="${SOURCE_COMMIT}"
+
+ # Walking first-parent from the source tip must reach LATEST_TAG_SHA.
+ # We capture rev-list into a variable and grep against a here-string
+ # rather than piping `rev-list | grep -q`: under `set -o pipefail`,
+ # `grep -q` would exit on first match and SIGPIPE the still-streaming
+ # `rev-list`, propagating exit 141 as a spurious "not found".
+ first_parent_chain="$(git rev-list --first-parent "${source_sha}")"
+ if ! grep -Fxq "${LATEST_TAG_SHA}" <<< "${first_parent_chain}"; then
+ echo "::error::Source branch '${SOURCE_BRANCH}' is not cut from '${LATEST_TAG}'."
+ echo "::error::Its first-parent history does not include ${LATEST_TAG_SHA}."
+ exit 1
+ fi
+
+ # Additionally, every commit added on top of the tag (the set we are
+ # about to publish) must itself be a descendant of the tag along
+ # first-parent — i.e. no sibling commits from master sneak in via a
+ # non-first-parent path. Enforce by requiring that the symmetric
+ # difference is empty in one direction: commits in source that are
+ # NOT first-parent-reachable from source starting at the tag.
+ # We do this by intersecting:
+ # A = commits reachable from source but not from tag (full DAG)
+ # B = commits on the first-parent chain from source down to tag
+ # and requiring A == B.
+ # Both lists are sorted so they can be compared as plain strings and fed
+ # to `comm`, which needs sorted input.
+ all_added="$(git rev-list "${LATEST_TAG_SHA}..${source_sha}" | sort)"
+ first_parent_added="$(
+ git rev-list --first-parent "${LATEST_TAG_SHA}..${source_sha}" | sort
+ )"
+
+ if [[ "${all_added}" != "${first_parent_added}" ]]; then
+ echo "::error::Source branch '${SOURCE_BRANCH}' contains commits not on its first-parent chain from '${LATEST_TAG}'."
+ echo "::error::This usually means the branch was cut from master (not from the tag) or contains a merge from master."
+ echo "Commits reachable but not on first-parent chain:"
+ # `comm -23` keeps only the lines unique to the first input, i.e. the
+ # commits in A that are missing from B.
+ comm -23 <(printf '%s\n' "${all_added}") <(printf '%s\n' "${first_parent_added}") \
+ | while read -r sha; do
+ echo " $(git log -1 --format='%h %s' "${sha}")"
+ done
+ exit 1
+ fi
+
+ # `grep -c .` counts non-empty lines; `|| true` is needed because grep
+ # exits 1 when that count is zero, which would otherwise abort the step.
+ added_count="$(printf '%s\n' "${all_added}" | grep -c . || true)"
+ echo "Source branch is cut directly from ${LATEST_TAG} with ${added_count} commit(s) on top."
+
+
+ # Authorization gate for the release: requires an open PR from the source
+ # branch titled "ComfyUI backport release <new version>", whose head commit is
+ # exactly the operator-supplied SHA and whose check runs have all completed
+ # with a passing conclusion.
+ - name: Validate PR exists, is open, named correctly, has latest commit, and checks pass
+ env:
+ GH_TOKEN: ${{ steps.app-token.outputs.token }}
+ SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
+ SOURCE_COMMIT: ${{ inputs.commit }}
+ NEW_VERSION: ${{ steps.latest.outputs.new_version }}
+ REPO: ${{ github.repository }}
+ run: |
+ set -euo pipefail
+
+ expected_title="ComfyUI backport release ${NEW_VERSION}"
+
+ # Find open PRs from this branch into master. The --state open filter
+ # is load-bearing: a closed/merged PR with passing checks must not be
+ # accepted as authorization for a new release.
+ # NOTE(review): the base branch is hard-coded to `master` here, while
+ # other steps read the default branch from the event payload — confirm
+ # this is intentional. The `state` field is requested but never read.
+ pr_json="$(
+ gh pr list \
+ --repo "${REPO}" \
+ --state open \
+ --head "${SOURCE_BRANCH}" \
+ --base master \
+ --json number,title,headRefOid,state \
+ --limit 10
+ )"
+
+ pr_count="$(echo "${pr_json}" | jq 'length')"
+ if [[ "${pr_count}" -eq 0 ]]; then
+ echo "::error::No open PR found from '${SOURCE_BRANCH}' into 'master'. The PR must exist and be open."
+ exit 1
+ fi
+
+ # Pick the PR matching the expected title
+ # (the first match wins if several PRs share the title; `// empty`
+ # turns "no match" into an empty string rather than the text "null").
+ pr_number="$(echo "${pr_json}" | jq -r --arg t "${expected_title}" '
+ map(select(.title == $t)) | .[0].number // empty
+ ')"
+ pr_head_sha="$(echo "${pr_json}" | jq -r --arg t "${expected_title}" '
+ map(select(.title == $t)) | .[0].headRefOid // empty
+ ')"
+
+ if [[ -z "${pr_number}" ]]; then
+ echo "::error::No open PR from '${SOURCE_BRANCH}' into 'master' is titled '${expected_title}'."
+ echo "Found PRs:"
+ echo "${pr_json}" | jq -r '.[] | " #\(.number): \(.title)"'
+ exit 1
+ fi
+
+ # The PR's current head commit must equal the SHA the operator gave us.
+ # This is what closes the door on releasing stale code: if anyone has
+ # pushed to the branch since the operator validated tests passed, the
+ # PR head will have advanced past SOURCE_COMMIT and we abort. (The
+ # resolve step already proved the branch tip == SOURCE_COMMIT; this
+ # ties that same SHA to the PR that authorizes the release.)
+ if [[ "${pr_head_sha}" != "${SOURCE_COMMIT}" ]]; then
+ echo "::error::PR #${pr_number} head commit is ${pr_head_sha}, but the operator-provided commit is ${SOURCE_COMMIT}."
+ echo "::error::The PR has new commits since this release was authorized. Re-run with the new head SHA after verifying its checks."
+ exit 1
+ fi
+
+ echo "Found open PR #${pr_number} titled '${expected_title}' at head ${pr_head_sha} (matches operator-provided commit)."
+
+ # Verify all check runs on the head commit have completed successfully.
+ # A check is considered passing if conclusion is success, neutral, or skipped.
+ # The --jq filter emits one JSON object per check run (across all pages),
+ # which is why later consumers slurp the stream with `jq -s`.
+ checks_json="$(
+ gh api \
+ --paginate \
+ "repos/${REPO}/commits/${pr_head_sha}/check-runs" \
+ --jq '.check_runs[] | {name: .name, status: .status, conclusion: .conclusion}'
+ )"
+
+ # No check runs at all is treated as a failure, not as "nothing failed".
+ if [[ -z "${checks_json}" ]]; then
+ echo "::error::No check runs found on PR head commit ${pr_head_sha}."
+ exit 1
+ fi
+
+ echo "Check runs on ${pr_head_sha}:"
+ echo "${checks_json}" | jq -s '.'
+
+ # Collect every run that is not finished or whose conclusion is outside
+ # the allow-list (`index` yields null for a missing value, so `not` is true).
+ failing="$(echo "${checks_json}" | jq -s '
+ map(select(
+ .status != "completed"
+ or (.conclusion as $c
+ | ["success","neutral","skipped"]
+ | index($c) | not)
+ ))
+ ')"
+
+ failing_count="$(echo "${failing}" | jq 'length')"
+ if [[ "${failing_count}" -gt 0 ]]; then
+ echo "::error::One or more checks have not passed on PR head commit ${pr_head_sha}:"
+ echo "${failing}" | jq -r '.[] | " - \(.name): status=\(.status) conclusion=\(.conclusion)"'
+ exit 1
+ fi
+
+ echo "All checks have passed on ${pr_head_sha}."
+
+
+ # Checks out the release branch locally so later steps can fast-forward it.
+ # If the branch exists on origin its tip must be the latest stable tag; if
+ # it does not, it is created from that tag, which is only legal when the
+ # latest tag is a .0 release. Sets the `branch_existed` output.
+ - name: Prepare release branch
+   id: prepare
+   env:
+     RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
+     LATEST_TAG: ${{ steps.latest.outputs.latest_tag }}
+     LATEST_TAG_SHA: ${{ steps.latest.outputs.latest_sha }}
+     PATCH: ${{ steps.latest.outputs.patch }}
+   run: |
+     set -euo pipefail
+
+     # `git ls-remote --exit-code` exits 0 when the ref exists and 2 when no
+     # ref matches. Any other status is a transport/auth failure and must
+     # not be mistaken for "branch does not exist", so stderr is left
+     # visible and the status is inspected explicitly. The full ref name is
+     # used so that only this exact branch can match.
+     ls_status=0
+     git ls-remote --exit-code origin "refs/heads/${RELEASE_BRANCH}" >/dev/null || ls_status=$?
+
+     if [[ "${ls_status}" -eq 0 ]]; then
+       echo "Release branch '${RELEASE_BRANCH}' already exists on origin."
+       git fetch origin "refs/heads/${RELEASE_BRANCH}:refs/remotes/origin/${RELEASE_BRANCH}"
+       git checkout -B "${RELEASE_BRANCH}" "refs/remotes/origin/${RELEASE_BRANCH}"
+
+       # An existing release branch must sit exactly on the previous release.
+       current_tip="$(git rev-parse HEAD)"
+       if [[ "${current_tip}" != "${LATEST_TAG_SHA}" ]]; then
+         echo "::error::Release branch '${RELEASE_BRANCH}' tip (${current_tip}) is not at the latest stable release '${LATEST_TAG}' (${LATEST_TAG_SHA})."
+         echo "::error::Refusing to release on top of a divergent branch."
+         exit 1
+       fi
+       echo "branch_existed=true" >> "$GITHUB_OUTPUT"
+     elif [[ "${ls_status}" -eq 2 ]]; then
+       # No branch yet: only consistent if no patch release has been cut.
+       if [[ "${PATCH}" != "0" ]]; then
+         echo "::error::Release branch '${RELEASE_BRANCH}' does not exist on origin, but the latest stable release '${LATEST_TAG}' has patch=${PATCH} (>0). This is inconsistent."
+         exit 1
+       fi
+       echo "Release branch '${RELEASE_BRANCH}' does not exist. Creating from ${LATEST_TAG}."
+       git checkout -B "${RELEASE_BRANCH}" "refs/tags/${LATEST_TAG}"
+       echo "branch_existed=false" >> "$GITHUB_OUTPUT"
+     else
+       echo "::error::Could not query origin for release branch '${RELEASE_BRANCH}' (git ls-remote exited with ${ls_status})."
+       exit 1
+     fi
+
+
+ # Advances the currently checked-out branch (the release branch prepared by
+ # the previous step) to the operator-supplied commit, without ever creating
+ # a merge commit.
+ - name: Fast-forward merge source branch into release branch
+ env:
+ SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
+ SOURCE_COMMIT: ${{ inputs.commit }}
+ RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
+ run: |
+ set -euo pipefail
+
+ # --ff-only guarantees no merge commit is created. If a fast-forward is
+ # not possible (i.e. the release branch has commits the source branch
+ # doesn't), the merge will fail and we abort. Because we already validated
+ # that the source branch is rooted on the latest stable tag, and the
+ # release branch tip equals that same tag, this fast-forward should
+ # always succeed for a well-formed backport branch.
+ #
+ # We merge the operator-provided SHA, not the branch ref, so a push to
+ # the branch in the window between resolve and now cannot smuggle new
+ # commits into the release.
+ #
+ # SOURCE_BRANCH and RELEASE_BRANCH are used for log messages only; the
+ # merge itself acts on HEAD and SOURCE_COMMIT.
+ if ! git merge --ff-only "${SOURCE_COMMIT}"; then
+ echo "::error::Cannot fast-forward '${RELEASE_BRANCH}' to ${SOURCE_COMMIT} (tip of '${SOURCE_BRANCH}'). A merge commit would be required. Aborting."
+ exit 1
+ fi
+
+ echo "Fast-forwarded '${RELEASE_BRANCH}' to ${SOURCE_COMMIT} (tip of '${SOURCE_BRANCH}')."
+
+
+ # Rewrites the version string in comfyui_version.py and pyproject.toml to the
+ # new release version (without the leading "v") in the working tree. Nothing
+ # is committed here; the step ends by printing the resulting diff.
+ - name: Bump version files
+ env:
+ NEW_VERSION_NO_V: ${{ steps.latest.outputs.new_version_no_v }}
+ run: |
+ set -euo pipefail
+
+ # Fail early with a clear message if either file is missing.
+ if [[ ! -f comfyui_version.py ]]; then
+ echo "::error::comfyui_version.py not found in repo root."
+ exit 1
+ fi
+ if [[ ! -f pyproject.toml ]]; then
+ echo "::error::pyproject.toml not found in repo root."
+ exit 1
+ fi
+
+ # Replace the version string in comfyui_version.py.
+ # Expected format: __version__ = "X.Y.Z"
+ #
+ # The heredoc delimiter is quoted ('PY'), so the shell performs no
+ # expansion inside the Python source; the new version reaches the script
+ # only as argv[1]. Each substitution replaces the first match only and
+ # the script exits non-zero if a file has no match.
+ # NOTE(review): the pyproject.toml pattern matches the first line that
+ # starts with `version = "` regardless of which table it is in.
+ python3 - "$NEW_VERSION_NO_V" <<'PY'
+ import re, sys, pathlib
+ new = sys.argv[1]
+
+ p = pathlib.Path("comfyui_version.py")
+ src = p.read_text()
+ new_src, n = re.subn(
+ r'(__version__\s*=\s*[\'"])[^\'"]+([\'"])',
+ lambda m: f'{m.group(1)}{new}{m.group(2)}',
+ src,
+ count=1,
+ )
+ if n != 1:
+ sys.exit("Could not find __version__ assignment in comfyui_version.py")
+ p.write_text(new_src)
+
+ p = pathlib.Path("pyproject.toml")
+ src = p.read_text()
+ # Replace the first `version = "..."` inside [project] or [tool.poetry].
+ new_src, n = re.subn(
+ r'(?m)^(version\s*=\s*")[^"]+(")',
+ lambda m: f'{m.group(1)}{new}{m.group(2)}',
+ src,
+ count=1,
+ )
+ if n != 1:
+ sys.exit("Could not find version assignment in pyproject.toml")
+ p.write_text(new_src)
+ PY
+
+ echo "Updated version to ${NEW_VERSION_NO_V} in comfyui_version.py and pyproject.toml."
+ git --no-pager diff -- comfyui_version.py pyproject.toml
+
+
+ # Commits the two bumped version files on the current branch and creates a
+ # lightweight local tag named after the new version. Nothing is pushed here.
+ - name: Commit version bump and tag release
+ env:
+ NEW_VERSION: ${{ steps.latest.outputs.new_version }}
+ run: |
+ set -euo pipefail
+
+ git add comfyui_version.py pyproject.toml
+ git commit -m "ComfyUI ${NEW_VERSION}"
+
+ # Refuse to reuse a tag name that is already known locally. This check
+ # runs after the commit has been created, but before anything is pushed.
+ if git rev-parse -q --verify "refs/tags/${NEW_VERSION}" >/dev/null; then
+ echo "::error::Tag ${NEW_VERSION} already exists locally."
+ exit 1
+ fi
+ git tag "${NEW_VERSION}"
+
+
+ # Last guard before publishing: the new version tag must not exist on origin.
+ # The check fails closed — if origin cannot be queried, the job stops rather
+ # than assuming the tag is absent.
+ - name: Verify tag does not already exist on origin
+   env:
+     NEW_VERSION: ${{ steps.latest.outputs.new_version }}
+   run: |
+     set -euo pipefail
+
+     # `git ls-remote --exit-code` exits 0 when the ref exists and 2 when no
+     # ref matches; any other status is a transport/auth failure.
+     ls_status=0
+     git ls-remote --exit-code --tags origin "refs/tags/${NEW_VERSION}" >/dev/null || ls_status=$?
+
+     if [[ "${ls_status}" -eq 0 ]]; then
+       echo "::error::Tag ${NEW_VERSION} already exists on origin. Aborting."
+       exit 1
+     fi
+     if [[ "${ls_status}" -ne 2 ]]; then
+       echo "::error::Could not verify whether tag ${NEW_VERSION} exists on origin (git ls-remote exited with ${ls_status}). Aborting."
+       exit 1
+     fi
+
+
+ # Publishes the release: the updated release branch and the new version tag
+ # are sent to origin in a single atomic push.
+ - name: Push release branch and tag
+   env:
+     RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
+     NEW_VERSION: ${{ steps.latest.outputs.new_version }}
+   run: |
+     set -euo pipefail
+
+     # --atomic asks the remote to apply both ref updates or neither, so a
+     # rejected tag can no longer leave the release branch advanced without
+     # its tag (which two sequential pushes could).
+     git push --atomic origin \
+       "refs/heads/${RELEASE_BRANCH}:refs/heads/${RELEASE_BRANCH}" \
+       "refs/tags/${NEW_VERSION}:refs/tags/${NEW_VERSION}"
+
+     echo "Released ${NEW_VERSION} on ${RELEASE_BRANCH}."
+
+
+ # Cleans up the backport source branch once the release is published, but
+ # only while the branch still points at the commit that was released.
+ - name: Delete remote source branch
+   env:
+     SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
+     SOURCE_COMMIT: ${{ inputs.commit }}
+     RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
+     DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
+   run: |
+     set -euo pipefail
+
+     # Belt-and-braces: the resolve step already refuses the default branch,
+     # but never delete the default or the release branch under any
+     # circumstances.
+     if [[ "${SOURCE_BRANCH}" == "${DEFAULT_BRANCH}" || "${SOURCE_BRANCH}" == "${RELEASE_BRANCH}" ]]; then
+       echo "::error::Refusing to delete '${SOURCE_BRANCH}' (matches default or release branch)."
+       exit 1
+     fi
+
+     # Look at the branch first so the common "already gone" and "moved on"
+     # cases end with a clear message instead of a push error. Commits
+     # pushed after the release was cut must not be silently lost.
+     current_tip="$(git ls-remote origin "refs/heads/${SOURCE_BRANCH}" | awk '{print $1}')"
+     if [[ -z "${current_tip}" ]]; then
+       echo "Source branch '${SOURCE_BRANCH}' no longer exists on origin; nothing to delete."
+       exit 0
+     fi
+     if [[ "${current_tip}" != "${SOURCE_COMMIT}" ]]; then
+       echo "::warning::Source branch '${SOURCE_BRANCH}' tip (${current_tip}) no longer matches released commit (${SOURCE_COMMIT}). Leaving it in place."
+       exit 0
+     fi
+
+     # The check above and the delete below are separate round-trips, so a
+     # push could land in between. --force-with-lease makes the deletion
+     # conditional on the remote ref still being SOURCE_COMMIT; otherwise
+     # the push is rejected and the step fails without deleting anything.
+     git push \
+       --force-with-lease="refs/heads/${SOURCE_BRANCH}:${SOURCE_COMMIT}" \
+       origin --delete "refs/heads/${SOURCE_BRANCH}"
+     echo "Deleted remote branch '${SOURCE_BRANCH}'."
+
+
+ # Always runs, even after a failure, and appends a small Markdown table
+ # describing the attempted release to the job summary.
+ - name: Summary
+   if: always()
+   env:
+     NEW_VERSION: ${{ steps.latest.outputs.new_version }}
+     RELEASE_BRANCH: ${{ steps.latest.outputs.release_branch }}
+     LATEST_TAG: ${{ steps.latest.outputs.latest_tag }}
+     SOURCE_BRANCH: ${{ steps.resolve.outputs.source_branch }}
+     SOURCE_COMMIT: ${{ inputs.commit }}
+   run: |
+     # When the job stopped before the branch was resolved, SOURCE_BRANCH is
+     # empty; substitute a placeholder so the table keeps its shape.
+     shown_branch="${SOURCE_BRANCH:-(unresolved)}"
+
+     # Unquoted heredoc: variables expand, and \` yields a literal backtick.
+     cat >> "$GITHUB_STEP_SUMMARY" <<EOF
+     ## Backport release
+
+     | Field | Value |
+     |---|---|
+     | Source commit | \`${SOURCE_COMMIT}\` |
+     | Source branch | \`${shown_branch}\` |
+     | Previous stable | \`${LATEST_TAG}\` |
+     | New version | \`${NEW_VERSION}\` |
+     | Release branch | \`${RELEASE_BRANCH}\` |
+     EOF
diff --git a/.github/workflows/openapi-lint.yml b/.github/workflows/openapi-lint.yml
new file mode 100644
index 000000000..be949de2a
--- /dev/null
+++ b/.github/workflows/openapi-lint.yml
@@ -0,0 +1,31 @@
+name: OpenAPI Lint
+
+# Runs only for pull requests that touch the spec, the Spectral ruleset, or
+# this workflow file.
+on:
+  pull_request:
+    paths:
+      - "openapi.yaml"
+      - ".spectral.yaml"
+      - ".github/workflows/openapi-lint.yml"
+
+# Linting only needs to read the repository.
+permissions:
+  contents: read
+
+jobs:
+  spectral:
+    runs-on: ubuntu-latest
+    name: Run Spectral
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: "20"
+
+      # The Spectral CLI is installed globally from the 6.x line.
+      - name: Install Spectral
+        run: npm install -g @stoplight/spectral-cli@6
+
+      # Only findings of severity `error` fail the job.
+      - name: Lint openapi.yaml
+        run: spectral lint openapi.yaml --ruleset .spectral.yaml --fail-severity=error
diff --git a/.github/workflows/stable-release.yml b/.github/workflows/stable-release.yml
index f501b7b31..bc64ed74d 100644
--- a/.github/workflows/stable-release.yml
+++ b/.github/workflows/stable-release.yml
@@ -145,6 +145,8 @@ jobs:
cp -r ComfyUI/.ci/windows_${{ inputs.rel_name }}_base_files/* ./
cp ../update_comfyui_and_python_dependencies.bat ./update/
+ echo 'local-portable' > ComfyUI/.comfy_environment
+
cd ..
"C:\Program Files\7-Zip\7z.exe" a -t7z -m0=lzma2 -mx=9 -mfb=128 -md=768m -ms=on -mf=BCJ2 ComfyUI_windows_portable.7z ComfyUI_windows_portable
diff --git a/.github/workflows/tag-dispatch-cloud.yml b/.github/workflows/tag-dispatch-cloud.yml
new file mode 100644
index 000000000..53a0e91d6
--- /dev/null
+++ b/.github/workflows/tag-dispatch-cloud.yml
@@ -0,0 +1,45 @@
+name: Tag Dispatch to Cloud
+
+# On every pushed tag that starts with "v", send a `repository_dispatch`
+# event carrying the tag name and its release URL to Comfy-Org/cloud.
+on:
+  push:
+    tags:
+      - 'v*'
+
+# The job authenticates with its own dispatch token and never uses the
+# default GITHUB_TOKEN, so that token is given no permissions at all.
+permissions: {}
+
+jobs:
+  dispatch-cloud:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Send repository dispatch to cloud
+        env:
+          DISPATCH_TOKEN: ${{ secrets.CLOUD_REPO_DISPATCH_TOKEN }}
+          RELEASE_TAG: ${{ github.ref_name }}
+        run: |
+          set -euo pipefail
+
+          if [ -z "${DISPATCH_TOKEN:-}" ]; then
+            echo "::error::CLOUD_REPO_DISPATCH_TOKEN is required but not set."
+            exit 1
+          fi
+
+          # Context values are read from environment variables rather than
+          # being templated into the script text, so they are always handled
+          # as data by the shell. GITHUB_REPOSITORY is provided by the runner.
+          RELEASE_URL="https://github.com/${GITHUB_REPOSITORY}/releases/tag/${RELEASE_TAG}"
+
+          # Build the JSON body with jq so both values are escaped correctly.
+          PAYLOAD="$(jq -n \
+            --arg release_tag "$RELEASE_TAG" \
+            --arg release_url "$RELEASE_URL" \
+            '{
+              event_type: "comfyui_tag_pushed",
+              client_payload: {
+                release_tag: $release_tag,
+                release_url: $release_url
+              }
+            }')"
+
+          # -f makes curl exit non-zero on an HTTP error status, which fails
+          # the step under `set -e`.
+          curl -fsSL \
+            -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Content-Type: application/json" \
+            -H "Authorization: Bearer ${DISPATCH_TOKEN}" \
+            https://api.github.com/repos/Comfy-Org/cloud/dispatches \
+            -d "$PAYLOAD"
+
+          echo "✅ Dispatched ComfyUI tag ${RELEASE_TAG} to Comfy-Org/cloud"
diff --git a/.gitignore b/.gitignore
index 2700ad5c2..fc426eda4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,6 @@ venv*/
*.log
web_custom_versions/
.DS_Store
-openapi.yaml
filtered-openapi.yaml
uv.lock
+.comfy_environment
diff --git a/.spectral.yaml b/.spectral.yaml
new file mode 100644
index 000000000..a4b137628
--- /dev/null
+++ b/.spectral.yaml
@@ -0,0 +1,100 @@
+extends:
+ - spectral:oas
+
+# Severity levels: error, warn, info, hint, off
+# Rules from the built-in "spectral:oas" ruleset are active by default.
+# Below we tune severity and add custom rules for our conventions.
+#
+# This ruleset mirrors Comfy-Org/cloud/.spectral.yaml so specs across the
+# organization are linted against a single consistent standard.
+
+rules:
+ # -----------------------------------------------------------------------
+ # Built-in rule severity overrides
+ # -----------------------------------------------------------------------
+ operation-operationId: error
+ operation-description: warn
+ operation-tag-defined: error
+ info-contact: off
+ info-description: warn
+ no-eval-in-markdown: error
+ no-$ref-siblings: error
+
+ # -----------------------------------------------------------------------
+ # Custom rules: naming conventions
+ # -----------------------------------------------------------------------
+
+ # Property names should be snake_case
+ property-name-snake-case:
+ description: Property names must be snake_case
+ severity: warn
+ given: "$.components.schemas.*.properties[*]~"
+ then:
+ function: pattern
+ functionOptions:
+ match: "^[a-z][a-z0-9]*(_[a-z0-9]+)*$"
+
+ # Operation IDs should be camelCase
+ operation-id-camel-case:
+ description: Operation IDs must be camelCase
+ severity: warn
+ given: "$.paths.*.*.operationId"
+ then:
+ function: pattern
+ functionOptions:
+ match: "^[a-z][a-zA-Z0-9]*$"
+
+ # -----------------------------------------------------------------------
+ # Custom rules: response conventions
+ # -----------------------------------------------------------------------
+
+ # Error responses (4xx, 5xx) should use a consistent shape
+ error-response-schema:
+ description: Error responses should reference a standard error schema
+ severity: hint
+ given: "$.paths.*.*.responses[?(@property >= '400' && @property < '600')].content['application/json'].schema"
+ then:
+ field: "$ref"
+ function: truthy
+
+ # All 2xx responses with JSON body should have a schema
+ response-schema-defined:
+ description: Success responses with JSON content should define a schema
+ severity: warn
+ given: "$.paths.*.*.responses[?(@property >= '200' && @property < '300')].content['application/json']"
+ then:
+ field: schema
+ function: truthy
+
+ # -----------------------------------------------------------------------
+ # Custom rules: best practices
+ # -----------------------------------------------------------------------
+
+ # Path parameters must have a description
+ path-param-description:
+ description: Path parameters should have a description
+ severity: warn
+ given:
+ - "$.paths.*.parameters[?(@.in == 'path')]"
+ - "$.paths.*.*.parameters[?(@.in == 'path')]"
+ then:
+ field: description
+ function: truthy
+
+ # Schemas should have a description
+ schema-description:
+ description: Component schemas should have a description
+ severity: hint
+ given: "$.components.schemas.*"
+ then:
+ field: description
+ function: truthy
+
+overrides:
+ # /ws uses HTTP 101 (Switching Protocols) — a legitimate response for a
+ # WebSocket upgrade, but not a 2xx, so operation-success-response fires
+ # as a false positive. OpenAPI 3.x has no native WebSocket support.
+ - files:
+ - "openapi.yaml#/paths/~1ws"
+ rules:
+ operation-success-response: off
diff --git a/CODEOWNERS b/CODEOWNERS
index 4d5448636..043c0ec75 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,2 +1,5 @@
-# Admins
-* @comfyanonymous @kosinkadink @guill
+* @comfyanonymous @kosinkadink @guill @alexisrolland @rattus128 @kijai
+
+/CODEOWNERS @comfyanonymous
+/.ci/ @comfyanonymous
+/.github/ @comfyanonymous
diff --git a/QUANTIZATION.md b/QUANTIZATION.md
index 1693e13f3..300822029 100644
--- a/QUANTIZATION.md
+++ b/QUANTIZATION.md
@@ -139,9 +139,9 @@ Example:
"_quantization_metadata": {
"format_version": "1.0",
"layers": {
- "model.layers.0.mlp.up_proj": "float8_e4m3fn",
- "model.layers.0.mlp.down_proj": "float8_e4m3fn",
- "model.layers.1.mlp.up_proj": "float8_e4m3fn"
+ "model.layers.0.mlp.up_proj": {"format": "float8_e4m3fn"},
+ "model.layers.0.mlp.down_proj": {"format": "float8_e4m3fn"},
+ "model.layers.1.mlp.up_proj": {"format": "float8_e4m3fn"}
}
}
}
@@ -165,4 +165,4 @@ Activation quantization (e.g., for FP8 Tensor Core operations) requires `input_s
3. **Compute scales**: Derive `input_scale` from collected statistics
4. **Store in checkpoint**: Save `input_scale` parameters alongside weights
-The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
\ No newline at end of file
+The calibration dataset should be representative of your target use case. For diffusion models, this typically means a diverse set of prompts and generation parameters.
diff --git a/README.md b/README.md
index 1eeb810de..dc2389266 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
# ComfyUI
-**The most powerful and modular visual AI engine and application.**
+**The most powerful and modular AI engine for content creation.**
[![Website][website-shield]][website-url]
@@ -20,7 +20,7 @@
[website-url]: https://www.comfy.org/
[discord-shield]: https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fdiscord.com%2Fapi%2Finvites%2Fcomfyorg%3Fwith_counts%3Dtrue&query=%24.approximate_member_count&logo=discord&logoColor=white&label=Discord&color=green&suffix=%20total
-[discord-url]: https://www.comfy.org/discord
+[discord-url]: https://discord.com/invite/comfyorg
[twitter-shield]: https://img.shields.io/twitter/follow/ComfyUI
[twitter-url]: https://x.com/ComfyUI
@@ -31,10 +31,16 @@
[github-downloads-latest-shield]: https://img.shields.io/github/downloads/comfyanonymous/ComfyUI/latest/total?style=flat&label=downloads%40latest
[github-downloads-link]: https://github.com/comfyanonymous/ComfyUI/releases
-
+
+
-ComfyUI lets you design and execute advanced stable diffusion pipelines using a graph/nodes/flowchart based interface. Available on Windows, Linux, and macOS.
+ComfyUI is the AI creation engine for visual professionals who demand control over every model, every parameter, and every output. Its powerful and modular node graph interface empowers creatives to generate images, videos, 3D models, audio, and more...
+- ComfyUI natively supports the latest open-source state-of-the-art models.
+- API nodes provide access to the best closed-source models such as Nano Banana, Seedance, Hunyuan3D, etc.
+- It is available on Windows, Linux, and macOS, locally with our [desktop application](https://www.comfy.org/download), our [portable install](#installing) or on our [cloud](https://www.comfy.org/cloud).
+- The most sophisticated workflows can be exposed through a simple UI thanks to App Mode.
+- It integrates seamlessly into production pipelines with our API endpoints.
## Get Started
@@ -77,6 +83,7 @@ See what ComfyUI can do with the [newer template workflows](https://comfy.org/wo
- [Hunyuan Image 2.1](https://comfyanonymous.github.io/ComfyUI_examples/hunyuan_image/)
- [Flux 2](https://comfyanonymous.github.io/ComfyUI_examples/flux2/)
- [Z Image](https://comfyanonymous.github.io/ComfyUI_examples/z_image/)
+ - Ernie Image
- Image Editing Models
- [Omnigen 2](https://comfyanonymous.github.io/ComfyUI_examples/omnigen/)
- [Flux Kontext](https://comfyanonymous.github.io/ComfyUI_examples/flux/#flux-kontext-image-editing-model)
@@ -126,7 +133,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
ComfyUI follows a weekly release cycle targeting Monday but this regularly changes because of model releases or large changes to the codebase. There are three interconnected repositories:
1. **[ComfyUI Core](https://github.com/comfyanonymous/ComfyUI)**
- - Releases a new stable version (e.g., v0.7.0) roughly every week.
+ - Releases a new major stable version (e.g., v0.7.0) roughly every 2 weeks.
- Starting from v0.4.0 patch versions will be used for fixes backported onto the current stable release.
- Minor versions will be used for releases off the master branch.
- Patch versions may still be used for releases on the master branch in cases where a backport would not make sense.
@@ -193,11 +200,15 @@ If you have trouble extracting it, right click the file -> properties -> unblock
The portable above currently comes with python 3.13 and pytorch cuda 13.0. Update your Nvidia drivers if it doesn't start.
-#### Alternative Downloads:
+#### All Official Portable Downloads:
-[Experimental portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
+[Portable for AMD GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_amd.7z)
-[Portable with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
+[Portable for Intel GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_intel.7z)
+
+[Portable for Nvidia GPUs](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia.7z) (supports Nvidia 20 series and newer GPUs).
+
+[Portable for Nvidia GPUs with pytorch cuda 12.6 and python 3.12](https://github.com/comfyanonymous/ComfyUI/releases/latest/download/ComfyUI_windows_portable_nvidia_cu126.7z) (Supports Nvidia 10 series and older GPUs).
#### How do I share models between another UI and ComfyUI?
@@ -418,9 +429,11 @@ Use `--tls-keyfile key.pem --tls-certfile cert.pem` to enable TLS/SSL, the app w
See also: [https://www.comfy.org/](https://www.comfy.org/)
+> _psst — we're hiring!_ Help build ComfyUI: [comfy.org/careers](https://www.comfy.org/careers)
+
## Frontend Development
-As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). This repository now hosts the compiled JS (from TS/Vue) under the `web/` directory.
+As of August 15, 2024, we have transitioned to a new frontend, which is now hosted in a separate repository: [ComfyUI Frontend](https://github.com/Comfy-Org/ComfyUI_frontend). The compiled JS files (from TS/Vue) are published to [PyPI](https://pypi.org/project/comfyui-frontend-package) and installed as a dependency of ComfyUI.
### Reporting Issues and Requesting Features
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 000000000..299b0067b
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,44 @@
+# Security Policy
+
+## Scope
+
+ComfyUI is designed to run locally. By default, the server binds to `127.0.0.1`, meaning only the user's own machine can reach it. Our threat model assumes:
+
+- The user installed ComfyUI through a supported channel: the desktop application, the portable build, or a manual install following the README.
+- The user has not installed untrusted custom nodes. Custom nodes are arbitrary Python code and are trusted as much as any other software the user chooses to install.
+- Anyone with access to the ComfyUI URL is trusted (a direct consequence of the localhost-only default).
+- PyTorch and other dependencies are at the versions we ship or recommend in the README.
+
+A report is in scope only if it affects a user operating within this threat model.
+
+## What We Consider a Vulnerability
+
+We want to hear about issues where a **reasonable user** — someone who does not install random untrusted nodes and who reads UI prompts and warnings before clicking through them — can be harmed by ComfyUI itself.
+
+The clearest example: a workflow file that such a user might plausibly load and run, using only built-in nodes, that results in **untrusted code execution, arbitrary file read/write outside expected directories, or credential/data exfiltration**.
+
+When submitting a report, please include a clear description of *why this is a problem for a typical local ComfyUI user*. Reports without this context are difficult to act on.
+
+## What We Do Not Consider a Security Vulnerability
+
+Please report the following through our regular [GitHub issues](https://github.com/comfyanonymous/ComfyUI/issues) instead. Filing them as security reports will likely cause them to be deprioritized or closed.
+
+- **Issues requiring `--listen` or any non-default network exposure.** ComfyUI binds to localhost by default. If a remote attacker needs to reach the server for the attack to work, the user has chosen to expose it and is responsible for securing that deployment (firewall, reverse proxy, authentication, etc.). These are bugs, not vulnerabilities.
+- **`torch.load` and related deserialization issues in old PyTorch versions.** These are upstream PyTorch issues. Our distributions ship with — and our documentation recommends — recent PyTorch versions where these are addressed.
+- **Vulnerabilities that depend on outdated library versions** that we neither ship nor recommend (e.g., requiring PyTorch 2.6 or older).
+- **Issues that require a specific custom node to be installed.** Custom nodes are third-party code. Report these to the maintainer of that node.
+- **Crashes, hangs, or resource exhaustion from a loaded workflow.** Annoying, but not a security issue in our model. File a regular bug.
+- **Social-engineering scenarios** where the user is expected to ignore an explicit UI warning or prompt.
+
+## Reporting
+
+If you believe you have found an issue that falls within the scope above, please report it privately via GitHub's [Report a vulnerability](https://github.com/comfyanonymous/ComfyUI/security/advisories/new) feature rather than opening a public issue.
+
+Please include:
+
+1. A description of the vulnerability and the affected component.
+2. Reproduction steps, ideally with a minimal workflow file or proof-of-concept.
+3. The ComfyUI version, install method (desktop / portable / manual), and OS.
+4. An explanation of how this affects a typical local user as described in the threat model.
+
+We will acknowledge valid reports and coordinate a fix and disclosure timeline with you.
diff --git a/api_server/routes/internal/internal_routes.py b/api_server/routes/internal/internal_routes.py
index b224306da..1477afa01 100644
--- a/api_server/routes/internal/internal_routes.py
+++ b/api_server/routes/internal/internal_routes.py
@@ -67,7 +67,7 @@ class InternalRoutes:
(entry for entry in os.scandir(directory) if is_visible_file(entry)),
key=lambda entry: -entry.stat().st_mtime
)
- return web.json_response([entry.name for entry in sorted_files], status=200)
+ return web.json_response([f"{entry.name} [{directory_type}]" for entry in sorted_files], status=200)
def get_app(self):
diff --git a/app/assets/api/routes.py b/app/assets/api/routes.py
index 68126b6a5..6555974e9 100644
--- a/app/assets/api/routes.py
+++ b/app/assets/api/routes.py
@@ -160,10 +160,12 @@ def _build_asset_response(result: schemas.AssetDetailResult | schemas.UploadResu
preview_url = None
else:
preview_url = _build_preview_url_from_view(result.tags, result.ref.user_metadata)
+ asset_content_hash = result.asset.hash if result.asset else None
return schemas_out.Asset(
id=result.ref.id,
name=result.ref.name,
- asset_hash=result.asset.hash if result.asset else None,
+ hash=asset_content_hash,
+ asset_hash=asset_content_hash,
size=int(result.asset.size_bytes) if result.asset else None,
mime_type=result.asset.mime_type if result.asset else None,
tags=result.tags,
diff --git a/app/assets/api/schemas_out.py b/app/assets/api/schemas_out.py
index d99b1098d..0e748b907 100644
--- a/app/assets/api/schemas_out.py
+++ b/app/assets/api/schemas_out.py
@@ -10,6 +10,7 @@ class Asset(BaseModel):
id: str
name: str
+ hash: str | None = None
asset_hash: str | None = None
size: int | None = None
mime_type: str | None = None
diff --git a/app/assets/services/metadata_extract.py b/app/assets/services/metadata_extract.py
index a004929bc..bdfe60218 100644
--- a/app/assets/services/metadata_extract.py
+++ b/app/assets/services/metadata_extract.py
@@ -4,7 +4,6 @@ Tier 1: Filesystem metadata (zero parsing)
Tier 2: Safetensors header metadata (fast JSON read only)
"""
-from __future__ import annotations
import json
import logging
diff --git a/app/custom_node_manager.py b/app/custom_node_manager.py
index 281febca9..738af2abd 100644
--- a/app/custom_node_manager.py
+++ b/app/custom_node_manager.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
import os
import folder_paths
import glob
diff --git a/app/frontend_management.py b/app/frontend_management.py
index f753ef0de..8e84e8dd9 100644
--- a/app/frontend_management.py
+++ b/app/frontend_management.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
import argparse
import logging
import os
@@ -27,7 +26,7 @@ def frontend_install_warning_message():
return f"""
{get_missing_requirements_message()}
-This error is happening because the ComfyUI frontend is no longer shipped as part of the main repo but as a pip package instead.
+The ComfyUI frontend is shipped in a pip package so it needs to be updated separately from the ComfyUI code.
""".strip()
def parse_version(version: str) -> tuple[int, int, int]:
@@ -38,40 +37,63 @@ def is_valid_version(version: str) -> bool:
pattern = r"^(\d+)\.(\d+)\.(\d+)$"
return bool(re.match(pattern, version))
-def get_installed_frontend_version():
- """Get the currently installed frontend package version."""
- frontend_version_str = version("comfyui-frontend-package")
- return frontend_version_str
-
-
def get_required_frontend_version():
return get_required_packages_versions().get("comfyui-frontend-package", None)
-def check_frontend_version():
- """Check if the frontend version is up to date."""
+COMFY_PACKAGE_VERSIONS = []
+def get_comfy_package_versions():
+ """List installed/required versions for every comfy* package in requirements.txt."""
+ if COMFY_PACKAGE_VERSIONS:
+ return COMFY_PACKAGE_VERSIONS.copy()
+ out = COMFY_PACKAGE_VERSIONS
+ for name, required in (get_required_packages_versions() or {}).items():
+ if not name.startswith("comfy"):
+ continue
+ try:
+ installed = version(name)
+ except Exception:
+ installed = None
+ out.append({"name": name, "installed": installed, "required": required})
+ return out.copy()
- try:
- frontend_version_str = get_installed_frontend_version()
- frontend_version = parse_version(frontend_version_str)
- required_frontend_str = get_required_frontend_version()
- required_frontend = parse_version(required_frontend_str)
- if frontend_version < required_frontend:
- app.logger.log_startup_warning(
- f"""
+
+def check_comfy_packages_versions():
+ """Warn for every comfy* package whose installed version is below requirements.txt."""
+ from packaging.version import InvalidVersion, parse as parse_pep440
+ outdated_packages = []
+
+ for pkg in get_comfy_package_versions():
+ installed_str = pkg["installed"]
+ required_str = pkg["required"]
+ if not installed_str or not required_str:
+ continue
+ try:
+ outdated = parse_pep440(installed_str) < parse_pep440(required_str)
+ except InvalidVersion as e:
+ logging.error(f"Failed to check {pkg['name']} version: {e}")
+ continue
+ if outdated:
+ outdated_packages.append((pkg["name"], installed_str, required_str))
+ else:
+ logging.info("{} version: {}".format(pkg["name"], installed_str))
+
+ if outdated_packages:
+ package_warnings = "\n".join(
+ f"Installed {name} version {installed} is lower than the recommended version {required}."
+ for name, installed, required in outdated_packages
+ )
+ app.logger.log_startup_warning(
+ f"""
________________________________________________________________________
WARNING WARNING WARNING WARNING WARNING
-Installed frontend version {".".join(map(str, frontend_version))} is lower than the recommended version {".".join(map(str, required_frontend))}.
+{package_warnings}
-{frontend_install_warning_message()}
+{get_missing_requirements_message()}
________________________________________________________________________
""".strip()
- )
- else:
- logging.info("ComfyUI frontend version: {}".format(frontend_version_str))
- except Exception as e:
- logging.error(f"Failed to check frontend version: {e}")
+ )
REQUEST_TIMEOUT = 10 # seconds
@@ -201,6 +223,11 @@ class FrontendManager:
def get_required_templates_version(cls) -> str:
return get_required_packages_versions().get("comfyui-workflow-templates", None)
+ @classmethod
+ def get_comfy_package_versions(cls):
+ """List installed/required versions for every comfy* package in requirements.txt."""
+ return get_comfy_package_versions()
+
@classmethod
def default_frontend_path(cls) -> str:
try:
@@ -341,7 +368,7 @@ comfyui-workflow-templates is not installed.
main error source might be request timeout or invalid URL.
"""
if version_string == DEFAULT_VERSION_STRING:
- check_frontend_version()
+ check_comfy_packages_versions()
return cls.default_frontend_path()
repo_owner, repo_name, version = cls.parse_version_string(version_string)
@@ -403,7 +430,7 @@ comfyui-workflow-templates is not installed.
except Exception as e:
logging.error("Failed to initialize frontend: %s", e)
logging.info("Falling back to the default frontend.")
- check_frontend_version()
+ check_comfy_packages_versions()
return cls.default_frontend_path()
@classmethod
def template_asset_handler(cls):
diff --git a/app/logger.py b/app/logger.py
index 3d26d98fe..bde815822 100644
--- a/app/logger.py
+++ b/app/logger.py
@@ -5,6 +5,40 @@ import logging
import sys
import threading
+ANSI_NAMED_COLORS = {
+ 'black': '\033[30m',
+ 'red': '\033[31m',
+ 'green': '\033[32m',
+ 'yellow': '\033[33m',
+ 'blue': '\033[34m',
+ 'magenta': '\033[35m',
+ 'cyan': '\033[36m',
+ 'white': '\033[37m',
+}
+
+ANSI_LEVEL_COLORS = {
+ 'DEBUG': ANSI_NAMED_COLORS['cyan'],
+ 'INFO': ANSI_NAMED_COLORS['green'],
+ 'WARNING': ANSI_NAMED_COLORS['yellow'],
+ 'ERROR': ANSI_NAMED_COLORS['red'],
+ 'CRITICAL': ANSI_NAMED_COLORS['magenta'],
+}
+
+ANSI_RESET = '\033[0m'
+ANSI_BOLD = '\033[1m'
+
+
+class ColoredFormatter(logging.Formatter):
+ def format(self, record):
+ color = ANSI_LEVEL_COLORS.get(record.levelname, '')
+ bold = ANSI_BOLD if record.levelno >= logging.WARNING else ''
+ level_tag = f"{bold}{color}[{record.levelname}]{ANSI_RESET} "
+ message = super().format(record)
+ line_color = ANSI_NAMED_COLORS.get(getattr(record, 'color', ''), '')
+ if line_color:
+ return f"{level_tag}{line_color}{message}{ANSI_RESET}"
+ return level_tag + message
+
logs = None
stdout_interceptor = None
stderr_interceptor = None
@@ -68,8 +102,10 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
logger = logging.getLogger()
logger.setLevel(log_level)
+ formatter = ColoredFormatter("%(message)s")
+
stream_handler = logging.StreamHandler()
- stream_handler.setFormatter(logging.Formatter("%(message)s"))
+ stream_handler.setFormatter(formatter)
if use_stdout:
# Only errors and critical to stderr
@@ -77,7 +113,7 @@ def setup_logger(log_level: str = 'INFO', capacity: int = 300, use_stdout: bool
# Lesser to stdout
stdout_handler = logging.StreamHandler(sys.stdout)
- stdout_handler.setFormatter(logging.Formatter("%(message)s"))
+ stdout_handler.setFormatter(formatter)
stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR)
logger.addHandler(stdout_handler)
diff --git a/app/model_manager.py b/app/model_manager.py
index f124d1117..8f6e34b33 100644
--- a/app/model_manager.py
+++ b/app/model_manager.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
import os
import base64
import json
diff --git a/app/node_replace_manager.py b/app/node_replace_manager.py
index d9aab5b22..72e8ac2b1 100644
--- a/app/node_replace_manager.py
+++ b/app/node_replace_manager.py
@@ -1,5 +1,7 @@
from __future__ import annotations
+import logging
+
from aiohttp import web
from typing import TYPE_CHECKING, TypedDict
@@ -31,8 +33,22 @@ class NodeReplaceManager:
self._replacements: dict[str, list[NodeReplace]] = {}
def register(self, node_replace: NodeReplace):
- """Register a node replacement mapping."""
- self._replacements.setdefault(node_replace.old_node_id, []).append(node_replace)
+ """Register a node replacement mapping.
+
+ Idempotent: if a replacement with the same (old_node_id, new_node_id)
+ is already registered, the duplicate is ignored. This prevents stale
+ entries from accumulating when custom nodes are reloaded in the same
+ process (e.g. via ComfyUI-Manager).
+ """
+ existing = self._replacements.setdefault(node_replace.old_node_id, [])
+ for entry in existing:
+ if entry.new_node_id == node_replace.new_node_id:
+ logging.debug(
+ "Node replacement %s -> %s already registered, ignoring duplicate.",
+ node_replace.old_node_id, node_replace.new_node_id,
+ )
+ return
+ existing.append(node_replace)
def get_replacement(self, old_node_id: str) -> list[NodeReplace] | None:
"""Get replacements for an old node ID."""
diff --git a/app/user_manager.py b/app/user_manager.py
index e18afb71b..7b11e381c 100644
--- a/app/user_manager.py
+++ b/app/user_manager.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
import json
import os
import re
@@ -28,8 +27,8 @@ def get_file_info(path: str, relative_to: str) -> FileInfo:
return {
"path": os.path.relpath(path, relative_to).replace(os.sep, '/'),
"size": os.path.getsize(path),
- "modified": os.path.getmtime(path),
- "created": os.path.getctime(path)
+ "modified": int(os.path.getmtime(path) * 1000),
+ "created": int(os.path.getctime(path) * 1000),
}
diff --git a/blueprints/.glsl/Glow_30.frag b/blueprints/.glsl/Glow_30.frag
index 0ee152628..f3c85a212 100644
--- a/blueprints/.glsl/Glow_30.frag
+++ b/blueprints/.glsl/Glow_30.frag
@@ -2,7 +2,6 @@
precision mediump float;
uniform sampler2D u_image0;
-uniform vec2 u_resolution;
uniform int u_int0; // Blend mode
uniform int u_int1; // Color tint
uniform float u_float0; // Intensity
@@ -75,7 +74,7 @@ void main() {
float t0 = threshold - 0.15;
float t1 = threshold + 0.15;
- vec2 texelSize = 1.0 / u_resolution;
+ vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));
float radius2 = radius * radius;
float sampleScale = clamp(radius * 0.75, 0.35, 1.0);
diff --git a/blueprints/.glsl/Image_Blur_1.frag b/blueprints/.glsl/Image_Blur_1.frag
index 83238111d..1819e1695 100644
--- a/blueprints/.glsl/Image_Blur_1.frag
+++ b/blueprints/.glsl/Image_Blur_1.frag
@@ -12,7 +12,6 @@ const int RADIAL_SAMPLES = 12;
const float RADIAL_STRENGTH = 0.0003;
uniform sampler2D u_image0;
-uniform vec2 u_resolution;
uniform int u_int0; // Blur type (BLUR_GAUSSIAN, BLUR_BOX, BLUR_RADIAL)
uniform float u_float0; // Blur radius/amount
uniform int u_pass; // Pass index (0 = horizontal, 1 = vertical)
@@ -25,7 +24,7 @@ float gaussian(float x, float sigma) {
}
void main() {
- vec2 texelSize = 1.0 / u_resolution;
+ vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));
float radius = max(u_float0, 0.0);
// Radial (angular) blur - single pass, doesn't use separable
diff --git a/blueprints/.glsl/Sharpen_23.frag b/blueprints/.glsl/Sharpen_23.frag
index c03f94b66..e7463a329 100644
--- a/blueprints/.glsl/Sharpen_23.frag
+++ b/blueprints/.glsl/Sharpen_23.frag
@@ -2,14 +2,13 @@
precision highp float;
uniform sampler2D u_image0;
-uniform vec2 u_resolution;
uniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0
in vec2 v_texCoord;
layout(location = 0) out vec4 fragColor0;
void main() {
- vec2 texel = 1.0 / u_resolution;
+ vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));
// Sample center and neighbors
vec4 center = texture(u_image0, v_texCoord);
diff --git a/blueprints/.glsl/Unsharp_Mask_26.frag b/blueprints/.glsl/Unsharp_Mask_26.frag
index f5990cb4a..d968c9c03 100644
--- a/blueprints/.glsl/Unsharp_Mask_26.frag
+++ b/blueprints/.glsl/Unsharp_Mask_26.frag
@@ -2,7 +2,6 @@
precision highp float;
uniform sampler2D u_image0;
-uniform vec2 u_resolution;
uniform float u_float0; // amount [0.0 - 3.0] typical: 0.5-1.5
uniform float u_float1; // radius [0.5 - 10.0] blur radius in pixels
uniform float u_float2; // threshold [0.0 - 0.1] min difference to sharpen
@@ -19,7 +18,7 @@ float getLuminance(vec3 color) {
}
void main() {
- vec2 texel = 1.0 / u_resolution;
+ vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));
float radius = max(u_float1, 0.5);
float amount = u_float0;
float threshold = u_float2;
diff --git a/blueprints/Audio Generation (Stable Audio 3 Medium Base).json b/blueprints/Audio Generation (Stable Audio 3 Medium Base).json
new file mode 100644
index 000000000..e561fe634
--- /dev/null
+++ b/blueprints/Audio Generation (Stable Audio 3 Medium Base).json
@@ -0,0 +1,2091 @@
+{
+ "revision": 0,
+ "last_node_id": 52,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 52,
+ "type": "8b66c757-fe2f-4184-91f3-479a19deb565",
+ "pos": [
+ 370,
+ 1120
+ ],
+ "size": [
+ 420,
+ 450
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "user_input",
+ "name": "user_input",
+ "type": "STRING",
+ "widget": {
+ "name": "user_input"
+ },
+ "link": null
+ },
+ {
+ "label": "duration",
+ "name": "duration",
+ "type": "FLOAT",
+ "widget": {
+ "name": "duration"
+ },
+ "link": null
+ },
+ {
+ "label": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "label": "use_reprompt",
+ "name": "use_reprompt",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "use_reprompt"
+ },
+ "link": null
+ },
+ {
+ "label": "reprompt_category",
+ "name": "category",
+ "type": "COMBO",
+ "widget": {
+ "name": "category"
+ },
+ "link": null
+ },
+ {
+ "label": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ },
+ {
+ "label": "sa_clip",
+ "name": "sa_clip",
+ "type": "COMBO",
+ "widget": {
+ "name": "sa_clip"
+ },
+ "link": null
+ },
+ {
+ "label": "qwen_clip",
+ "name": "qwen_clip",
+ "type": "COMBO",
+ "widget": {
+ "name": "qwen_clip"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "AUDIO",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "31",
+ "value"
+ ],
+ [
+ "36",
+ "value"
+ ],
+ [
+ "3",
+ "seed"
+ ],
+ [
+ "35",
+ "value"
+ ],
+ [
+ "43",
+ "choice"
+ ],
+ [
+ "25",
+ "ckpt_name"
+ ],
+ [
+ "26",
+ "clip_name"
+ ],
+ [
+ "29",
+ "clip_name"
+ ]
+ ]
+ },
+ "widgets_values": [],
+ "title": "Audio Generation (Stable Audio 3 Medium Base)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "8b66c757-fe2f-4184-91f3-479a19deb565",
+ "version": 1,
+ "state": {
+ "lastGroupId": 8,
+ "lastNodeId": 56,
+ "lastLinkId": 84,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Audio Generation (Stable Audio 3 Medium Base)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -810,
+ 400,
+ 155.953125,
+ 208
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1750,
+ 1041,
+ 128,
+ 68
+ ]
+ },
+ "inputs": [
+ {
+ "id": "78ae2515-114b-494a-becc-43c7b6c2dc2f",
+ "name": "user_input",
+ "type": "STRING",
+ "linkIds": [
+ 68
+ ],
+ "label": "user_input",
+ "pos": [
+ -678.046875,
+ 424
+ ]
+ },
+ {
+ "id": "5ca95030-aff4-4544-b545-f0d814e0e49a",
+ "name": "duration",
+ "type": "FLOAT",
+ "linkIds": [
+ 82
+ ],
+ "label": "duration",
+ "pos": [
+ -678.046875,
+ 444
+ ]
+ },
+ {
+ "id": "718eb10f-da1a-4cea-a9c7-3040f98fe960",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 76
+ ],
+ "label": "seed",
+ "pos": [
+ -678.046875,
+ 464
+ ]
+ },
+ {
+ "id": "dc020099-39e6-4009-9937-408409d71736",
+ "name": "use_reprompt",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 83
+ ],
+ "label": "use_reprompt",
+ "pos": [
+ -678.046875,
+ 484
+ ]
+ },
+ {
+ "id": "edae394c-6324-44d6-8ac5-d8caa5ae2169",
+ "name": "category",
+ "type": "COMBO",
+ "linkIds": [
+ 78
+ ],
+ "label": "reprompt_category",
+ "pos": [
+ -678.046875,
+ 504
+ ]
+ },
+ {
+ "id": "be19b747-6a47-4028-9c30-d52f54a712ea",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 79
+ ],
+ "label": "ckpt_name",
+ "pos": [
+ -678.046875,
+ 524
+ ]
+ },
+ {
+ "id": "bc9241a2-bc20-4c5d-8cb1-f2958f598642",
+ "name": "sa_clip",
+ "type": "COMBO",
+ "linkIds": [
+ 80
+ ],
+ "label": "sa_clip",
+ "pos": [
+ -678.046875,
+ 544
+ ]
+ },
+ {
+ "id": "a33a2468-6d6d-4cb6-937c-3510bf16ebac",
+ "name": "qwen_clip",
+ "type": "COMBO",
+ "linkIds": [
+ 81
+ ],
+ "label": "qwen_clip",
+ "pos": [
+ -678.046875,
+ 564
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "bbe988dd-5c03-44fd-a965-c712f9204988",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "linkIds": [
+ 27
+ ],
+ "localized_name": "AUDIO",
+ "pos": [
+ 1774,
+ 1065
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 7,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 620,
+ 420
+ ],
+ "size": [
+ 440,
+ 140
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 35
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 6
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#223",
+ "bgcolor": "#335"
+ },
+ {
+ "id": 12,
+ "type": "VAEDecodeAudio",
+ "pos": [
+ 1450,
+ 110
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 13
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 39
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "AUDIO",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "slot_index": 0,
+ "links": [
+ 27
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecodeAudio",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 11,
+ "type": "EmptyLatentAudio",
+ "pos": [
+ 630,
+ 610
+ ],
+ "size": [
+ 430,
+ 140
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "seconds",
+ "name": "seconds",
+ "type": "FLOAT",
+ "widget": {
+ "name": "seconds"
+ },
+ "link": 50
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 12
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentAudio",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 60,
+ 1
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1100,
+ 100
+ ],
+ "size": [
+ 320,
+ 350
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 30
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 4
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 6
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 12
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 76
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 13
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ "randomize",
+ 50,
+ 7,
+ "lcm",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 29,
+ "type": "CLIPLoader",
+ "pos": [
+ 690,
+ 1580
+ ],
+ "size": [
+ 430,
+ 170
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "showAdvanced": false,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 81
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 40
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "models": [
+ {
+ "name": "qwen3.5_2b_bf16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen3.5/resolve/main/text_encoders/qwen3.5_2b_bf16.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen3.5_2b_bf16.safetensors",
+ "stable_diffusion",
+ "default"
+ ]
+ },
+ {
+ "id": 6,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 610,
+ 130
+ ],
+ "size": [
+ 450,
+ 240
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 34
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 49
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 4
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 34,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 210,
+ 610
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 47
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 46
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 48
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 49
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 41,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 1370,
+ 1360
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 56
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 57
+ ]
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a"
+ ]
+ },
+ {
+ "id": 42,
+ "type": "PreviewAny",
+ "pos": [
+ 1370,
+ 1310
+ ],
+ "size": [
+ 230,
+ 40
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 57
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 58
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewAny"
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ },
+ {
+ "id": 39,
+ "type": "StringReplace",
+ "pos": [
+ 1040,
+ 900
+ ],
+ "size": [
+ 270,
+ 280
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 52
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 53
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 59
+ ]
+ }
+ ],
+ "title": "Text Replace (USER INPUT)",
+ "properties": {
+ "Node name for S&R": "StringReplace"
+ },
+ "widgets_values": [
+ "",
+ "USER_INPUT",
+ ""
+ ]
+ },
+ {
+ "id": 28,
+ "type": "TextGenerate",
+ "pos": [
+ 1200,
+ 1580
+ ],
+ "size": [
+ 430,
+ 420
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 40
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "video",
+ "name": "video",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": null
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": 60
+ },
+ {
+ "localized_name": "max_length",
+ "name": "max_length",
+ "type": "INT",
+ "widget": {
+ "name": "max_length"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampling_mode",
+ "name": "sampling_mode",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "sampling_mode"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temperature",
+ "name": "sampling_mode.temperature",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.temperature"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "top_k",
+ "name": "sampling_mode.top_k",
+ "type": "INT",
+ "widget": {
+ "name": "sampling_mode.top_k"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "top_p",
+ "name": "sampling_mode.top_p",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.top_p"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "min_p",
+ "name": "sampling_mode.min_p",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.min_p"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "repetition_penalty",
+ "name": "sampling_mode.repetition_penalty",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.repetition_penalty"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "seed",
+ "name": "sampling_mode.seed",
+ "type": "INT",
+ "widget": {
+ "name": "sampling_mode.seed"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "presence_penalty",
+ "name": "sampling_mode.presence_penalty",
+ "shape": 7,
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.presence_penalty"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "thinking",
+ "name": "thinking",
+ "shape": 7,
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "thinking"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "use_default_template",
+ "name": "use_default_template",
+ "shape": 7,
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "use_default_template"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "generated_text",
+ "name": "generated_text",
+ "type": "STRING",
+ "links": [
+ 46,
+ 84
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TextGenerate",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "",
+ 256,
+ "on",
+ 0.7,
+ 64,
+ 0.95,
+ 0.05,
+ 1.05,
+ 0,
+ 0,
+ false,
+ true
+ ]
+ },
+ {
+ "id": 31,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ -390,
+ 160
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": 68
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 47,
+ 53
+ ]
+ }
+ ],
+ "title": "User: short description (USER_INPUT in template)",
+ "properties": {
+ "Node name for S&R": "PrimitiveStringMultiline"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 43,
+ "type": "CustomCombo",
+ "pos": [
+ 140,
+ 910
+ ],
+ "size": [
+ 550,
+ 320
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "choice",
+ "name": "choice",
+ "type": "COMBO",
+ "widget": {
+ "name": "choice"
+ },
+ "link": 78
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 65
+ ]
+ },
+ {
+ "localized_name": "INDEX",
+ "name": "INDEX",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "title": "Custom Combo (Category index)",
+ "properties": {
+ "Node name for S&R": "CustomCombo"
+ },
+ "widgets_values": [
+ "Music",
+ 0,
+ "Music",
+ "Instrument",
+ "SFX",
+ "One-shot",
+ ""
+ ]
+ },
+ {
+ "id": 49,
+ "type": "JsonExtractString",
+ "pos": [
+ 720,
+ 1200
+ ],
+ "size": [
+ 300,
+ 180
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "json_string",
+ "name": "json_string",
+ "type": "STRING",
+ "widget": {
+ "name": "json_string"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "key",
+ "name": "key",
+ "type": "STRING",
+ "widget": {
+ "name": "key"
+ },
+ "link": 65
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 66
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "JsonExtractString"
+ },
+ "widgets_values": [
+ "{\n \"Music\": \"You are an expert musician and musicologist and prompt engineer. Transform the user's input into a detailed, vivid music prompt for a full instrumental track.\\n\\n1. Start with the genre or style and optional adjectives (e.g., upbeat, dreamy, aggressive).\\n2. List the main instruments that define the track.\\n3. Add supporting elements or layers such as pads, harmonics, effects, or field recordings.\\n4. Include rhythm or percussion elements like drums, hi-hats, congas, brushes, or polyrhythms.\\n5. Integrate mood and energy naturally in the sentence (e.g., \\\"creating suspenseful tension\\\" or \\\"bright and uplifting\\\").\\n6. Specify the BPM.\\n7. Specify the track length as an integer in seconds. Use ranges: energetic/dance 120-180s, pop/rock 180-210s, cinematic/ambient 240-300s.\\n8. Combine all elements into one natural, fluid sentence. Avoid semicolons.\\n\\nTemplate:\\nGenre/Style with main instruments, supporting instruments/layers, and rhythm/percussion creating mood/energy. BPM: X. Length: Y seconds\\n\\nExamples:\\n- Jazz ballad with smooth saxophone lead, piano chords, upright bass, brushed drums, and soft strings that swing gently for a warm and cozy evening. BPM: 85. Length: 180 seconds\\n- EDM festival track with pulsing synth leads, plucked arpeggios, layered pads, side-chained bass, punchy kick and snare, and hi-hat rolls creating bright, energetic, and uplifting dance energy. BPM: 128. Length: 150 seconds\\n- Lo-fi hip-hop chill track with mellow electric piano, soft vinyl crackle, subtle synth pads, low-pass filtered drums, percussion loops, and soft plucked bass for a relaxed, dreamy vibe. BPM: 75. Length: 150 seconds\\n- Heavy metal anthem with distorted electric guitars, bass guitar, double bass drums, and cymbal crashes with fast palm-muted riffs creating intense, aggressive energy. BPM: 160. 
Length: 180 seconds\\n- Melancholic piano piece with soft piano lead, string pads, subtle atmospheric synths, and minimal brush percussion evoking a reflective rainy-day feeling. BPM: 60. Length: 240 seconds\\n- Suspenseful electronic thriller with pulsing bass synth, arpeggiated lead synth, cinematic pads, glitchy percussion, and high string stabs creating dark and tense energy. BPM: 100. Length: 200 seconds\\n- Dreamy ambient soundscape with layered pads, soft bell textures, gentle drones, and wind and water field recordings for ethereal and spacious meditation. BPM: 40. Length: 300 seconds\\n- Fingerpicking acoustic guitar solo with harmonics, subtle reverb, occasional shaker and soft stomp percussion, and soft pad layers for warm intimate storytelling. BPM: 70. Length: 120 seconds\\n- Synthwave 80s retro track with arpeggiated synth leads, analog pads, electric bass, punchy electronic drums, gated reverb snares, and atmospheric FX for nostalgic and vibrant energy. BPM: 110. Length: 180 seconds\\n- Tribal percussion ensemble with congas, djembes, bongos, shakers, and frame drums layered with deep synthetic sub-bass in complex polyrhythms. BPM: 100. Length: 140 seconds\\n- 1920s swing jazz with brass section, upright bass, piano, brushed drums, banjo, clarinet, and soft strings that swing lively for energetic dance vibes. BPM: 110. Length: 180 seconds\\n- Futuristic electronic sci-fi track with pulsing bass synth, evolving lead synths, layered pads, glitch percussion, robotic FX, and sub-bass for tense cinematic energy. BPM: 125. Length: 200 seconds\\n- Ambient underwater soundscape with flowing water textures, soft piano motifs, synth drones, distant bells, and underwater reverb for spacious meditative immersion. BPM: 45. Length: 300 seconds\\n- Horror cinematic track with dissonant strings, eerie piano stabs, cinematic percussion including taiko and low toms, and synth FX producing suspenseful creepy tension. BPM: 90. 
Length: 240 seconds\\n- Reggae track with offbeat guitar, warm basslines, snare, kick, congas, and horn stabs giving laid-back groovy energy. BPM: 85. Length: 150 seconds\\n- Blues track with soulful electric guitar solos, walking bass, piano, and shuffle drums creating expressive and emotive storytelling. BPM: 90. Length: 180 seconds\\n- Latin salsa with congas, timbales, horns, piano montunos, bass, and layered percussion for vibrant danceable energy. BPM: 120. Length: 210 seconds\\n- Afrobeat track with electric guitar stabs, horns, layered percussion, congas, shakers, bass groove, and synth pads for vibrant rhythmic energy. BPM: 105. Length: 200 seconds\\n- Indie rock track with electric guitar riffs, bass, live drum kit, layered synths, and subtle strings for energetic yet emotional feel. BPM: 110. Length: 180 seconds\\n- Funk groove with slap bass, electric guitar chords, brass stabs, drums, congas, and rhythmic keyboards creating high-energy danceable rhythm. BPM: 105. Length: 180 seconds\\n- Drum and bass track with fast breakbeat drums, deep sub-bass, sharp synth leads, pads, and atmospheric FX for high-energy club motion. BPM: 175. Length: 150 seconds\\n- Dark ambient track with drones, distant bells, low rumbles, soft wind textures, and synth pads producing eerie immersive tension. BPM: 50. Length: 300 seconds\\n- Tropical house track with marimba, steel drums, soft synths, smooth bass, layered percussion, and light piano riffs for sunny chill dance vibes. BPM: 110. Length: 180 seconds\\n- Progressive rock track with electric guitar leads, organ, bass, drum kit, synth layers, and occasional strings for epic layered energy. BPM: 100. Length: 220 seconds\\n- Music box melody with delicate metallic tones and soft resonance, lullaby style, with gentle ambient reverb. BPM: 60. Length: 20 seconds\\n- Soft piano arpeggio with warm felted tone and slow attack, lullaby style, with intimate room ambience. BPM: 60. 
Length: 30 seconds\\n- Harp gentle plucked pattern with airy resonance, lullaby style, with dreamy reverb tail. BPM: 65. Length: 25 seconds\\n- Acoustic guitar fingerstyle pattern with warm nylon strings and soft dynamics, lullaby style, with subtle room resonance. BPM: 60. Length: 30 seconds\\n- Ambient synth pad with smooth evolving texture and soft harmonics, lullaby style, with wide stereo ambience. BPM: 50. Length: 40 seconds\\n- Early rock piano with walking left-hand bass line, shuffle rhythms, and blues scale improvisations in energetic 1950s boogie-woogie style. BPM: 160. Length: 180 seconds\\n- Trip Hop track with jazzy sampled vibraphone, mid-tempo breakbeat drums, harp, Latin ethnic percussion, and sweeping cinematic strings creating airy, relaxing, soulful lounge vibes. BPM: 90. Length: 180 seconds\\n- Country outlaw cinematic instrumental with blues pedal steel guitar, rustic mandolin, fiddle call-and-response, tape-driven rattly drum kit, autoharp, and soaring accordion solo for raw, emotional southern blues expression. BPM: 85. Length: 200 seconds\\n- Neo Classical track with sweeping string section, elegant horns, and delicate piano creating soothing, hypnotic, modern, soft, and classic mood. BPM: 70. Length: 180 seconds\\n- Art Rock desert track with desolate piano chords, western-themed rhythm guitars, unique lead guitars, rattly vintage drum kit, and supporting bass creating lonely, expansive, beautiful, and strange atmospheres. BPM: 95. Length: 180 seconds\\n- Cinematic Sci-Fi score with dramatic horn section, building marcato strings, gliding bassoon, thunderous cymbals, subdued timpani, and subtle synth drones producing awe-inspiring, uplifting, epic intergalactic energy. BPM: 100. Length: 220 seconds\\n- West Coast Hip Hop instrumental with cascading harp melodies, smooth Rhodes piano chops, vintage boom bap drums, and walking double bass producing raw, street, and soulful block-party vibes. BPM: 92. 
Length: 180 seconds\\n- Synthwave futuristic track with pulsating synth bass, exciting chords, soaring leads, and reverberating drum machine patterns creating gritty, pounding, and cool energy. BPM: 110. Length: 180 seconds\\n- Breakbeat track with complex percussion, intricate breakbeats, gritty synths, lush pads, and 808 bassline producing fresh, modern, futuristic, and rave-ready energy. BPM: 140. Length: 160 seconds\\n- Lounge Jazz 1960s smooth track with laid-back drums, piano chords, double bass, soft electric piano, subtle flute, and unique percussion creating beautiful, atmospheric, eclectic, retro, and chill vibes. BPM: 85. Length: 180 seconds\\n- Latin Jazz 1950s blissful track with laid-back Latin drums, euphoric piano chords, double bass, orchestral accompaniment, acoustic guitar, and vibraphone producing nostalgic, beautiful, atmospheric, cinematic, and chill mood. BPM: 95. Length: 180 seconds\\n- Acid Jazz 1970s summertime track with smooth electric piano, trippy synth leads, laid-back vintage drum kit, fuzzy electric bass, and uplifting violin producing retro, psychedelic, jazzy, relaxing energy. BPM: 100. Length: 180 seconds\\n- Progressive Soul 1970s track with feel-good piano, psychedelic organ, groovy vintage drum kit with percussion, fuzzy electric bass, and synth strings producing retro, raw, soulful, joyous atmosphere. BPM: 90. Length: 180 seconds\\n- Discotheque 1970s French-inspired track with sultry piano, psychedelic guitars, groovy drum kit, fuzzy electric bass, and melancholic organ producing retro, raw, laid-back, and relaxing mood. BPM: 105. Length: 180 seconds\\n- Soul Jazz 1970s track with expressive saxophone, smooth piano, groovy drum kit, rhythmic upright bass, sweeping strings, and minimal vibraphone producing retro, raw, laid-back, and epic energy. BPM: 95. 
Length: 180 seconds\\n- Vintage R&B 1970s live studio track with subtle brass, smooth piano, sweeping strings, and minimal drums producing retro, beautiful, uplifting, nostalgic mood. BPM: 85. Length: 180 seconds\\n- 50s Pop track with Latin influence, string section, bold brass, vibraphone, acoustic guitar, flute, ethnic percussion, and brushed drums creating sexy, epic, vintage, retro, melancholic, jazzy, dramatic energy. BPM: 100. Length: 180 seconds\\n- A piece of calm, quiet, mellow, serene music perfect for a peaceful film score, featuring soft modulating piano, ambient sfx and foley, beautiful vibraphone, and subtle synthesizer drones. The mood is cinematic, thoughtful, serene and nostalgic. BPM: 55. Length: 300 seconds\",\n \"Instrument\": \"You are a music metadata expert. Given an instrument, generate a descriptive prompt for a generative audio model.\\n\\n1. Identify the instrument.\\n2. Add playing style or technique.\\n3. Include details about material, timbre, or texture.\\n4. Add musical style or mood. Specify the genre, context, or emotional character.\\n5. Add spatial or production qualities.\\n6. Specify BPM: Always include a BPM appropriate to the style and context.\\n7. Specify length: Provide an integer in seconds (6–20 s for loops, 20–180 s for stems).\\n\\nExamples:\\n- Synth arpeggio loop with bright detuned oscillators. BPM: 120. Length: 8 seconds\\n- Chord stab loop with sharp percussive attack. BPM: 90. Length: 6 seconds\\n- Guitar muted strum loop with tight rhythmic feel. BPM: 100. Length: 8 seconds\\n- Pluck sequence loop with bright resonant tone. BPM: 128. Length: 10 seconds\\n- Marimba and vibraphone percussive loop with resonant wooden and metallic tones. BPM: 110. Length: 12 seconds\\n- Drum loop with deep muffled kick on beat one, snappy rimshot snare on beats two and four with rolling ghost note fills, and tight closed hi-hats with subtle open accents. BPM: 85. 
Length: 10 seconds\\n- Drum groove loop with brushed snare swinging on the ride, soft feathered kick on downbeats, and light closed hi-hat taps on the upbeats. BPM: 130. Length: 12 seconds\\n- Kick and hi-hat loop with four-on-the-floor punchy kick, tight closed hi-hats on every eighth note, and a sharp dry snare on beats two and four. BPM: 130. Length: 15 seconds\\n- Vinyl crackle drum loop with warm low-pass filtered kick, dusty snare with tape saturation, and shuffled closed hi-hats with subtle vinyl crackle ambiance. BPM: 80. Length: 10 seconds\\n- Ambient pad loop with evolving texture. BPM: 80. Length: 12 seconds\\n- Melodic synth bass groove loop with pumping sidechain feel. BPM: 122. Length: 10 seconds\\n- Melodic Bass slap and pop rhythm loop. BPM: 100. Length: 8 seconds\\n- Acoustic bass walking line loop with natural wooden resonance. BPM: 120. Length: 12 seconds\\n- String pizzicato motif loop, suspenseful, with tight string texture. BPM: 90. Length: 8 seconds\\n- Brass staccato riff loop with sharp bright attack. BPM: 130. Length: 10 seconds\\n- Flute airy melodic loop with wooden headjoint resonance. BPM: 100. Length: 6 seconds\\n- Pan flute ambient loop with breathy timbre. BPM: 75. Length: 8 seconds\\n- Clarinet riff loop with warm smooth reed tone. BPM: 120. Length: 10 seconds\\n- Oboe motif loop, orchestral, with rich double reed resonance. BPM: 80. Length: 8 seconds\\n- Recorder Renaissance motif loop with soft wooden timbre. BPM: 100. Length: 6 seconds\\n- Electric sitar riff loop with buzzing resonant tone. BPM: 90. Length: 10 seconds\\n- Koto plucked motif loop with resonant wooden strings. BPM: 90. Length: 8 seconds\\n- Shamisen folk melody loop with percussive twang. BPM: 100. Length: 8 seconds\\n- Banjo fingerpicking loop with metallic string resonance. BPM: 110. Length: 10 seconds\\n- Mandolin tremolo loop with crisp wooden body tone. BPM: 120. Length: 10 seconds\\n- Acoustic guitar chord vamp loop with natural room resonance. BPM: 110. 
Length: 12 seconds\\n- Nylon string guitar arpeggio loop with warm, soft timbre. BPM: 90. Length: 15 seconds\\n- Electric guitar riff loop with driven distorted tone. BPM: 130. Length: 10 seconds\\n- Slide guitar melody loop with warm resonant glide. BPM: 100. Length: 12 seconds\\n- Steel guitar slide loop with bright pedal steel tone. BPM: 95. Length: 12 seconds\\n- Harpsichord arpeggio loop with crisp plucked attack. BPM: 120. Length: 10 seconds\\n- Rhodes chord vamp loop with warm electric piano tone. BPM: 100. Length: 12 seconds\\n- Clavinet funky rhythm loop. BPM: 105. Length: 10 seconds\\n- Organ chord vamp loop with full drawbar warmth. BPM: 90. Length: 12 seconds\\n- Drum loop with booming 808 kick on beat one, crisp snare on beat three, and rapid triplet hi-hat rolls with open hat accents for aggressive high-energy feel. BPM: 140. Length: 8 seconds\\n- Breakbeat drum loop with chopped Amen-style snare flurries, driving kick on the one, fast sixteenth-note closed hi-hats, and syncopated open hat accents. BPM: 170. Length: 10 seconds\\n- Glitch percussion loop with stuttered kick transients, randomised snare hits processed with bit-crushing, and erratic hi-hat patterns with pitch-shifted metallic ticks. BPM: 120. Length: 12 seconds\\n- Metallic hits loop with distorted kick impacts, processed metal-plate snare slams, and grinding hi-hat noise bursts for aggressive mechanical texture. BPM: 120. Length: 10 seconds\\n- Timpani hits loop, cinematic, with deep resonant kick-like timpani strikes on beat one, rolling snare-style timpani fills, and no hi-hats for a grand orchestral feel. BPM: 70. Length: 8 seconds\\n- Snare roll loop, dramatic, with accelerating snare drum rolls building from soft to crashing, deep supporting kick pulses, and no hi-hats for maximum impact. BPM: 100. Length: 8 seconds\\n- Accordion motif loop with bright reedy bellows tone. BPM: 100. Length: 10 seconds\\n- Harmonica blues riff loop with expressive reed timbre. BPM: 90. 
Length: 10 seconds\\n- Trombone riff loop with warm sliding brass tone. BPM: 120. Length: 10 seconds\\n- French horn melodic loop, cinematic. BPM: 80. Length: 12 seconds\\n- Soprano sax ballad loop. BPM: 70. Length: 12 seconds\\n- Alto sax bebop riff loop. BPM: 200. Length: 10 seconds\\n- Electric violin melodic loop with reverb. BPM: 90. Length: 10 seconds\\n- String pad loop with cinematic texture. BPM: 70. Length: 15 seconds\\n- Granular synth evolving texture loop. BPM: 90. Length: 15 seconds\\n- Piano motif loop with soft felt hammer tone. BPM: 80. Length: 10 seconds\\n- Pad and synth loop with lush detuned shimmer. BPM: 85. Length: 12 seconds\\n- Synth lead loop with sidechain pumping compression. BPM: 128. Length: 10 seconds\\n- Analog synth bassline loop with deep warm low-end. BPM: 122. Length: 12 seconds\\n- FM synth lead motif loop with bright metallic shimmer. BPM: 110. Length: 10 seconds\\n- Bass groove loop with tight rhythmic two-bar pattern. BPM: 100. Length: 16 seconds\\n- Acoustic guitar fingerstyle motif loop with warm wood resonance. BPM: 90. Length: 45 seconds\\n- Sombre acoustic guitar motif loop with cavernous reverb, delicate fingerpicking, and expressive melancholic tone. BPM: 70. Length: 45 seconds\\n- Electric guitar rock riff motif loop. BPM: 130. Length: 40 seconds\\n- Vintage electric guitar motif loop, live-recorded in a vintage studio, with expressive and dynamic solo performance. BPM: 90. Length: 40 seconds\\n- Piano chord progression motif loop with rich harmonic movement. BPM: 120. Length: 60 seconds\\n- String ensemble cinematic motif loop with rich wooden resonance. BPM: 80. Length: 120 seconds\\n- Brass ensemble cinematic motif loop with bright metallic timbre. BPM: 90. Length: 90 seconds\\n- Ethnic percussion ensemble motif loop with deep resonant djembe kick tones, slapped snare-like rim hits on congas, and layered shakers and bells providing hi-hat-like rhythmic texture with polyrhythmic patterns. BPM: 100. 
Length: 90 seconds\\n- Synth ambient motif loop with evolving textures. BPM: 80. Length: 180 seconds\\n- Motif loop with warm dusty vinyl crackle and tape saturation. BPM: 80. Length: 60 seconds\\n- Synth lead and bass motif loop with bright punchy energy. BPM: 128. Length: 90 seconds\\n- Funk band motif loop: bass, drums, guitar. BPM: 100. Length: 90 seconds\\n- Ethnic flute motif for cinematic use. BPM: 80. Length: 30 seconds\\n- Steel drum melodic motif loop with bright metallic resonance. BPM: 110. Length: 20 seconds\\n- Marimba percussive motif loop with resonant wooden tone. BPM: 100. Length: 20 seconds\\n- Vibraphone melodic motif loop with metallic shimmer. BPM: 90. Length: 25 seconds\\n- Piano cinematic motif loop with resonant wooden tone. BPM: 80. Length: 30 seconds\\n- Violin expressive cinematic motif loop with rich wooden resonance. BPM: 75. Length: 25 seconds\\n- Cello expressive motif loop with deep wooden resonance. BPM: 70. Length: 30 seconds\\n- Trumpet expressive motif loop with brassy overtones. BPM: 100. Length: 25 seconds\\n- Sax expressive motif loop with warm reed timbre. BPM: 95. Length: 25 seconds\\n- Ethnic drum ensemble motif loop with booming natural-skin bass drum kicks, sharp hand-slap snare accents on djembes and talking drums, and layered wooden and metal percussion providing rhythmic hi-hat-like patterns. BPM: 95. Length: 30 seconds\\n- Ambient drone motif loop. BPM: 60. Length: 180 seconds\\n- Orchestral tension motif loop. BPM: 90. Length: 150 seconds\\n- Electronic track motif loop with drums, bass, synth. BPM: 128. Length: 180 seconds\",\n \"SFX\": \"You are a professional sound design expert. 
Convert the user's input into a precise, vivid sound effects description suitable for generative audio models.\\n\\nDescribe clearly:\\n- Sound source\\n- Physical character (texture, timbre, material: metal, wood, glass, concrete, etc.)\\n- Spatial qualities (indoor/outdoor, cave/open field/underwater, dry/reverberant, close-up/distant, echoing/muffled)\\n- Temporal evolution (attack, decay, movement, transitions over time)\\n- Include motion or spatial movement if applicable (passing, approaching, stereo movement)\\n\\nAudio length rules:\\n- Very short sounds (impacts, clicks, gunshots): 1–3 seconds\\n- Medium actions (footsteps, object movement, transitions): 3–6 seconds\\n- Ambience / environments: 6–15 seconds\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nOutput constraints:\\n- Length: 1–2 dense sentences maximum\\n- Output ONLY the final rewritten prompt\\n- No explanations, no formatting, no quotes\\n- Use concise but dense technical language\\n- Focus strictly on sound effects or ambience\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nQuality guidelines:\\n- Be specific and avoid vague terms\\n- Prioritize clarity and realism\\n- Combine elements into one coherent scene\\n- Avoid redundancy\\n\\nExamples:\\n- Heavy rain hitting a metal roof during a thunderstorm, distant thunder rumbles, stereo, realistic ambience. Length: 45 seconds\\n- Quiet forest at dawn with birds chirping, soft wind through leaves, distant stream flowing. Length: 60 seconds\\n- Busy city street at night, cars passing, muffled conversations, occasional horn, urban ambience. Length: 50 seconds\\n- Ocean waves crashing against rocky cliffs, strong wind, dramatic and cinematic. Length: 70 seconds\\n- Wooden door creaking open slowly in an old house, echoing interior, eerie tone. Length: 3 seconds\\n- Glass bottle shattering on concrete, sharp impact, scattered fragments. 
Length: 2 seconds\\n- Footsteps on gravel, steady walking pace, close perspective. Length: 8 seconds\\n- Typing rapidly on a mechanical keyboard, crisp tactile clicks. Length: 5 seconds\\n- Punch impact with deep bass hit, cinematic trailer style. Length: 2 seconds\\n- Car speeding past at high velocity, doppler effect, realistic whoosh. Length: 3 seconds\\n- Object falling from height and hitting ground with a heavy thud. Length: 2 seconds\\n- Sword swing whooshing through air, fast motion, clean metallic tone. Length: 2 seconds\\n- Futuristic laser blast, clean energy pulse, high-tech sound design. Length: 1 second\\n- Spaceship engine humming, low frequency rumble, interior perspective. Length: 90 seconds\\n- Magical spell casting, shimmering particles, rising tonal energy. Length: 8 seconds\\n- Teleportation effect, glitchy digital distortion with a soft whoosh. Length: 5 seconds\\n- Dark eerie drone with distant whispers, creepy, slow build tension. Length: 120 seconds\\n- Sudden horror jump scare sting, sharp violin hit, cinematic. Length: 1 second\\n- Metal scraping slowly in a dark tunnel, echoing and ominous. Length: 20 seconds\\n- Explosion with debris scattering, deep bass, cinematic realism. Length: 4 seconds\\n- Building collapsing, rumbling concrete, dust and debris falling. Length: 25 seconds\\n- Fire crackling intensely, wood burning, close-up detail. Length: 80 seconds\\n- Gunshot in a large empty warehouse, loud echo decay. Length: 2 seconds\\n- Retro arcade coin insert sound, 8-bit style. Length: 1 second\\n- Level up chime, bright, rewarding, fantasy RPG style. Length: 2 seconds\\n- Error buzzer, short, digital, UI feedback. Length: 1 second\\n- Menu navigation clicks, soft futuristic interface sounds. Length: 3 seconds\\n- Layered soundscape: rain, thunder, footsteps, and distant sirens all blending naturally. Length: 90 seconds\\n- Rapid sequence of three impacts: metal hit, glass break, wood crack, spaced evenly. 
Length: 4 seconds\\n- Sound moving from left to right stereo field: passing motorcycle. Length: 5 seconds\\n- Close vs far perspective transition: footsteps approaching then fading away. Length: 6 seconds\\n- Tape stop sub drop, a massive sub-bass note that mimics a vinyl record or tape machine being turned off, the pitch and speed drop simultaneously, causing the high-end harmonics to smear and thicken as the sound grinds to a halt at a sub-sonic frequency. Length: 11 seconds\\n- Gravel and leaves footsteps, the sound of a hard boot stepping onto dry leaves or gravel, crisp and natural with detailed texture. Length: 11 seconds\\n- Ghostship moan, a massive, deep wooden groan with a low-frequency moan, like heavy timber under immense structural tension, swaying slowly, processed with long, dark wooden room reverb for a sense of scale. Length: 11 seconds\\n- Bicycle chain, a continuous metallic whirring sound of a chain moving over sprockets, with individual teeth catching the links, processed with resonant band-pass filter to emphasize metallic singing. Length: 11 seconds\\n- Warp drive, a sound that starts with a massive suck-back of ambient noise, followed by a supersonic crack and high-pitched zing that disappears into the distance, giving the sense of stretching space-time. Length: 11 seconds\\n- Ice cubes, high-pitched musical clinking of hard ice hitting a thin glass, bright resonant ring with subtle liquid sloshing around the edges. Length: 11 seconds\\n- Paper shuffle, the sound of a thick stack of heavy bond paper being squared up on a desk, dry papery thud with a quick fanning sound as air moves between the pages. Length: 11 seconds\\n- Drawer slam, a blunt, powerful thud made by slamming a wooden desk drawer shut, pronounced low-mid body, slightly distorted for aggressive character. Length: 3 seconds\",\n \"One-shot\": \"You are a music metadata expert. 
Given an instrument or sound, generate a descriptive prompt for a short, isolated one-shot audio sample for music production.\\n\\n1. Identify the instrument or sound source.\\n2. Describe the playing technique or hit type (e.g., pluck, slam, tap, stab).\\n3. Include details about material, timbre, or texture.\\n4. Add spatial or production qualities (dry/wet, room, close-mic).\\n5. Specify length: short integer in seconds (1–11 s).\\n\\nExamples:\\n- Piano key hit with bright percussive attack and resonant wooden body. Length: 2 seconds\\n- Kick drum punchy low-end hit with warm skin resonance. Length: 2 seconds\\n- Snare drum rimshot accent with crisp snare wires. Length: 2 seconds\\n- Acoustic guitar fingerstyle note with warm spruce tone. Length: 3 seconds\\n- Bass pluck with jazzy tone and resonant wooden body. Length: 3 seconds\\n- Electric guitar power chord with distortion. Length: 3 seconds\\n- Metallic glitch percussion hit with sharp metallic texture. Length: 2 seconds\\n- Tabla resonant tone hit with natural skin timbre. Length: 2 seconds\\n- Djembe slap accent with dry wooden resonance. Length: 2 seconds\\n- Synth stab with reverb tail. Length: 3 seconds\\n- Violin expressive note with vibrato and rich wooden resonance. Length: 3 seconds\\n- Cello legato note, cinematic, with warm resonant body. Length: 3 seconds\\n- Trumpet bright accent with slightly brassy overtones. Length: 2 seconds\\n- Melodic saxophone jazz riff with smooth reed timbre and a slight vibrato bend. Length: 3 seconds\\n- Harp pluck with airy tone and resonant strings. Length: 2 seconds\\n- Glockenspiel bell-like note with bright metallic clarity. Length: 2 seconds\\n- Metallic clang sound design hit. Length: 2 seconds\\n- Granular texture hit. Length: 3 seconds\\n- Reversed piano hit. Length: 2 seconds\\n- Synth riser effect. Length: 6 seconds\\n- Percussion impact hit. Length: 2 seconds\\n- Cinematic hit. 
Length: 2 seconds\\n- Dry clap, a crisp, natural single hand clap recorded in a dead room with an extremely sharp transient and no room reflections. Length: 1 second\\n- Studio hat, a classic, natural recording of 14-inch hi-hats played tightly closed, zero ring, very fast decay. Length: 1 second\\n- Disco open hat, bright 14-inch open hi-hat with long, shimmering decay, perfect for disco or dance grooves. Length: 1 second\\n- Pillow kick, acoustic kick drum muffled with a heavy blanket, producing a short, dry \\\"thump\\\" with almost zero resonance. Length: 1 second\\n- Short 808, punchy 808 kick with sharp, distorted transient and fast-decaying sub-tail. Length: 1 second\\n- Egg shaker, classic plastic egg shaker recorded with a small-diaphragm condenser mic, producing a light, consistent \\\"tick\\\" with very short sustain. Length: 1 second\\n- African drums, dynamic African drums and percussion ensemble with natural acoustic textures. Length: 3 seconds\\n- Latin drums, dynamic Latin drums and percussion ensemble featuring authentic rhythmic patterns. Length: 3 seconds\\n- String quartet, euphoric string quartet with dynamic and emotional playing, full of expressive harmonies and movement. Length: 3 seconds\\n- Piano, nostalgic, atmospheric piano piece with dynamic and emotional performance, intimate and resonant. Length: 3 seconds\\n- Analogue drift pad, warm polyphonic pad with three detuned oscillators (saw + triangle), subtle pitch drift, and lush bucket-brigade chorus for wide, nostalgic stereo image. Length: 11 seconds\\n- Phase distortion bass, Casio CZ-style phase-distorted sine wave warped into a jagged sawtooth for retro synth bass tone. Length: 11 seconds\\n- Vibrato saxophone, bright lyrical alto sax with fast fluttery vibrato, reedy vintage tone, captured with ribbon mic for warm nostalgic sound. 
Length: 11 seconds\\n- Lofi upright bass, upright bass recorded with ribbon mic in a wooden room, natural air with slightly boxy resonance, tape-saturated for dusty 1950s jazz feel. Length: 2 seconds\"\n}",
+ "Music"
+ ]
+ },
+ {
+ "id": 40,
+ "type": "StringReplace",
+ "pos": [
+ 1350,
+ 900
+ ],
+ "size": [
+ 260,
+ 280
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 59
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 58
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 60
+ ]
+ }
+ ],
+ "title": "Text Replace (AUDIO LENGTH)",
+ "properties": {
+ "Node name for S&R": "StringReplace"
+ },
+ "widgets_values": [
+ "",
+ "AUDIO_LENGTH",
+ ""
+ ]
+ },
+ {
+ "id": 38,
+ "type": "StringReplace",
+ "pos": [
+ 720,
+ 900
+ ],
+ "size": [
+ 290,
+ 280
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 66
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 52
+ ]
+ }
+ ],
+ "title": "Text Replace (PROMPT TEMPLATE)",
+ "properties": {
+ "Node name for S&R": "StringReplace"
+ },
+ "widgets_values": [
+ "SYSTEM_PROMPTS\n\nInput: USER_INPUT\nTarget audio length: AUDIO_LENGTH seconds.\nOutput:",
+ "SYSTEM_PROMPTS",
+ ""
+ ]
+ },
+ {
+ "id": 35,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -390,
+ 570
+ ],
+ "size": [
+ 400,
+ 100
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 83
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 48
+ ]
+ }
+ ],
+ "title": "Boolean (Enable_Reprompt)",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean"
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 36,
+ "type": "PrimitiveFloat",
+ "pos": [
+ -390,
+ 410
+ ],
+ "size": [
+ 400,
+ 110
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 82
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 50,
+ 56
+ ]
+ }
+ ],
+ "title": "Float (Duration)",
+ "properties": {
+ "Node name for S&R": "PrimitiveFloat"
+ },
+ "widgets_values": [
+ 150
+ ]
+ },
+ {
+ "id": 25,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 100,
+ 130
+ ],
+ "size": [
+ 440,
+ 190
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 79
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 30
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 39
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "models": [
+ {
+ "name": "stable_audio_3_medium_base.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/checkpoints/stable_audio_3_medium_base.safetensors",
+ "directory": "checkpoints"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "stable_audio_3_medium_base.safetensors"
+ ]
+ },
+ {
+ "id": 26,
+ "type": "CLIPLoader",
+ "pos": [
+ 100,
+ 390
+ ],
+ "size": [
+ 440,
+ 170
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 80
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 34,
+ 35
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "models": [
+ {
+ "name": "t5gemma_b_b_ul2.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/text_encoders/t5gemma_b_b_ul2.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "t5gemma_b_b_ul2.safetensors",
+ "stable_audio",
+ "default"
+ ]
+ },
+ {
+ "id": 54,
+ "type": "PreviewAny",
+ "pos": [
+ 1720,
+ 1580
+ ],
+ "size": [
+ 420,
+ 550
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 4,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 84
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewAny"
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Loaders: checkpoint & CLIP",
+ "bounding": [
+ 80,
+ 50,
+ 485.721654232725,
+ 527.2848777754299
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "CLIP encode: conditioning",
+ "bounding": [
+ 600,
+ 60,
+ 470,
+ 510
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "User inputs: prompt & duration",
+ "bounding": [
+ -400,
+ 10,
+ 430,
+ 740
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Reprompt: full branch (template + LLM)",
+ "bounding": [
+ 60,
+ 780,
+ 1630,
+ 1360
+ ],
+ "color": "#444",
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Reprompt: JSON extract & template fills",
+ "bounding": [
+ 120,
+ 820,
+ 1520,
+ 650
+ ],
+ "color": "#444",
+ "flags": {}
+ },
+ {
+ "id": 5,
+ "title": "Helpers: duration to string",
+ "bounding": [
+ 1340,
+ 1180,
+ 280,
+ 250
+ ],
+ "color": "#444",
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "Reprompt: Qwen TextGenerate",
+ "bounding": [
+ 680,
+ 1510,
+ 960,
+ 614.65625
+ ],
+ "color": "#444",
+ "flags": {}
+ },
+ {
+ "id": 8,
+ "title": "Audio generation: Stable Audio",
+ "bounding": [
+ 60,
+ 10,
+ 1627.3616782294932,
+ 737.0545987464304
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 35,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 7,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 13,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 12,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 39,
+ "origin_id": 25,
+ "origin_slot": 2,
+ "target_id": 12,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 50,
+ "origin_id": 36,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 30,
+ "origin_id": 25,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 4,
+ "origin_id": 6,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 6,
+ "origin_id": 7,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 12,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 34,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 49,
+ "origin_id": 34,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 47,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 34,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 46,
+ "origin_id": 28,
+ "origin_slot": 0,
+ "target_id": 34,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 48,
+ "origin_id": 35,
+ "origin_slot": 0,
+ "target_id": 34,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 56,
+ "origin_id": 36,
+ "origin_slot": 0,
+ "target_id": 41,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 57,
+ "origin_id": 41,
+ "origin_slot": 1,
+ "target_id": 42,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 52,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 39,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 53,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 39,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 40,
+ "origin_id": 29,
+ "origin_slot": 0,
+ "target_id": 28,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 60,
+ "origin_id": 40,
+ "origin_slot": 0,
+ "target_id": 28,
+ "target_slot": 4,
+ "type": "STRING"
+ },
+ {
+ "id": 65,
+ "origin_id": 43,
+ "origin_slot": 0,
+ "target_id": 49,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 59,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 40,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 58,
+ "origin_id": 42,
+ "origin_slot": 0,
+ "target_id": 40,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 66,
+ "origin_id": 49,
+ "origin_slot": 0,
+ "target_id": 38,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 27,
+ "origin_id": 12,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "AUDIO"
+ },
+ {
+ "id": 68,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 76,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 3,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 78,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 43,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 79,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 25,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 80,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 26,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 81,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 29,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 82,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 36,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 83,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 35,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 84,
+ "origin_id": 28,
+ "origin_slot": 0,
+ "target_id": 54,
+ "target_slot": 0,
+ "type": "STRING"
+ }
+ ],
+ "extra": {},
+ "category": "Audio/Music generation",
+ "description": "Generates music, instrument loops, sound effects, and one-shots from text using the Stable Audio 3 Medium base checkpoint, with optional Qwen 3.5 category-based prompt expansion (Music, Instrument, SFX, One-shot)."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Audio Generation (Stable Audio 3 Medium).json b/blueprints/Audio Generation (Stable Audio 3 Medium).json
new file mode 100644
index 000000000..30add5b05
--- /dev/null
+++ b/blueprints/Audio Generation (Stable Audio 3 Medium).json
@@ -0,0 +1,2091 @@
+{
+ "revision": 0,
+ "last_node_id": 52,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 52,
+ "type": "8b66c757-fe2f-4184-91f3-479a19deb565",
+ "pos": [
+ 370,
+ 1120
+ ],
+ "size": [
+ 420,
+ 450
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "user_input",
+ "name": "user_input",
+ "type": "STRING",
+ "widget": {
+ "name": "user_input"
+ },
+ "link": null
+ },
+ {
+ "label": "duration",
+ "name": "duration",
+ "type": "FLOAT",
+ "widget": {
+ "name": "duration"
+ },
+ "link": null
+ },
+ {
+ "label": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "label": "use_reprompt",
+ "name": "use_reprompt",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "use_reprompt"
+ },
+ "link": null
+ },
+ {
+ "label": "reprompt_category",
+ "name": "category",
+ "type": "COMBO",
+ "widget": {
+ "name": "category"
+ },
+ "link": null
+ },
+ {
+ "label": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ },
+ {
+ "label": "sa_clip",
+ "name": "sa_clip",
+ "type": "COMBO",
+ "widget": {
+ "name": "sa_clip"
+ },
+ "link": null
+ },
+ {
+ "label": "qwen_clip",
+ "name": "qwen_clip",
+ "type": "COMBO",
+ "widget": {
+ "name": "qwen_clip"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "AUDIO",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "links": []
+ }
+ ],
+ "title": "Audio Generation (Stable Audio 3 Medium)",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "31",
+ "value"
+ ],
+ [
+ "36",
+ "value"
+ ],
+ [
+ "3",
+ "seed"
+ ],
+ [
+ "35",
+ "value"
+ ],
+ [
+ "43",
+ "choice"
+ ],
+ [
+ "25",
+ "ckpt_name"
+ ],
+ [
+ "26",
+ "clip_name"
+ ],
+ [
+ "29",
+ "clip_name"
+ ]
+ ]
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "8b66c757-fe2f-4184-91f3-479a19deb565",
+ "version": 1,
+ "state": {
+ "lastGroupId": 8,
+ "lastNodeId": 56,
+ "lastLinkId": 84,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Audio Generation (Stable Audio 3 Medium)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -810,
+ 400,
+ 155.953125,
+ 208
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1750,
+ 1041,
+ 128,
+ 68
+ ]
+ },
+ "inputs": [
+ {
+ "id": "78ae2515-114b-494a-becc-43c7b6c2dc2f",
+ "name": "user_input",
+ "type": "STRING",
+ "linkIds": [
+ 68
+ ],
+ "label": "user_input",
+ "pos": [
+ -678.046875,
+ 424
+ ]
+ },
+ {
+ "id": "5ca95030-aff4-4544-b545-f0d814e0e49a",
+ "name": "duration",
+ "type": "FLOAT",
+ "linkIds": [
+ 82
+ ],
+ "label": "duration",
+ "pos": [
+ -678.046875,
+ 444
+ ]
+ },
+ {
+ "id": "718eb10f-da1a-4cea-a9c7-3040f98fe960",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 76
+ ],
+ "label": "seed",
+ "pos": [
+ -678.046875,
+ 464
+ ]
+ },
+ {
+ "id": "dc020099-39e6-4009-9937-408409d71736",
+ "name": "use_reprompt",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 83
+ ],
+ "label": "use_reprompt",
+ "pos": [
+ -678.046875,
+ 484
+ ]
+ },
+ {
+ "id": "edae394c-6324-44d6-8ac5-d8caa5ae2169",
+ "name": "category",
+ "type": "COMBO",
+ "linkIds": [
+ 78
+ ],
+ "label": "reprompt_category",
+ "pos": [
+ -678.046875,
+ 504
+ ]
+ },
+ {
+ "id": "be19b747-6a47-4028-9c30-d52f54a712ea",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 79
+ ],
+ "label": "ckpt_name",
+ "pos": [
+ -678.046875,
+ 524
+ ]
+ },
+ {
+ "id": "bc9241a2-bc20-4c5d-8cb1-f2958f598642",
+ "name": "sa_clip",
+ "type": "COMBO",
+ "linkIds": [
+ 80
+ ],
+ "label": "sa_clip",
+ "pos": [
+ -678.046875,
+ 544
+ ]
+ },
+ {
+ "id": "a33a2468-6d6d-4cb6-937c-3510bf16ebac",
+ "name": "qwen_clip",
+ "type": "COMBO",
+ "linkIds": [
+ 81
+ ],
+ "label": "qwen_clip",
+ "pos": [
+ -678.046875,
+ 564
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "bbe988dd-5c03-44fd-a965-c712f9204988",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "linkIds": [
+ 27
+ ],
+ "localized_name": "AUDIO",
+ "pos": [
+ 1774,
+ 1065
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 7,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 620,
+ 420
+ ],
+ "size": [
+ 440,
+ 140
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 35
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 6
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#223",
+ "bgcolor": "#335"
+ },
+ {
+ "id": 12,
+ "type": "VAEDecodeAudio",
+ "pos": [
+ 1450,
+ 110
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 13
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 39
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "AUDIO",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "slot_index": 0,
+ "links": [
+ 27
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecodeAudio",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 11,
+ "type": "EmptyLatentAudio",
+ "pos": [
+ 630,
+ 610
+ ],
+ "size": [
+ 430,
+ 140
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "seconds",
+ "name": "seconds",
+ "type": "FLOAT",
+ "widget": {
+ "name": "seconds"
+ },
+ "link": 50
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 12
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentAudio",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 60,
+ 1
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1100,
+ 100
+ ],
+ "size": [
+ 320,
+ 350
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 30
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 4
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 6
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 12
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 76
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 13
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ "randomize",
+ 8,
+ 1,
+ "lcm",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 29,
+ "type": "CLIPLoader",
+ "pos": [
+ 690,
+ 1580
+ ],
+ "size": [
+ 430,
+ 170
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "showAdvanced": false,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 81
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 40
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "models": [
+ {
+ "name": "qwen3.5_2b_bf16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen3.5/resolve/main/text_encoders/qwen3.5_2b_bf16.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen3.5_2b_bf16.safetensors",
+ "stable_diffusion",
+ "default"
+ ]
+ },
+ {
+ "id": 6,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 610,
+ 130
+ ],
+ "size": [
+ 450,
+ 240
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 34
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 49
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 4
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 34,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 210,
+ 610
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 47
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 46
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 48
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 49
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 41,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 1370,
+ 1360
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 56
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 57
+ ]
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a"
+ ]
+ },
+ {
+ "id": 42,
+ "type": "PreviewAny",
+ "pos": [
+ 1370,
+ 1310
+ ],
+ "size": [
+ 230,
+ 40
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 57
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 58
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewAny"
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ },
+ {
+ "id": 39,
+ "type": "StringReplace",
+ "pos": [
+ 1040,
+ 900
+ ],
+ "size": [
+ 270,
+ 280
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 52
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 53
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 59
+ ]
+ }
+ ],
+ "title": "Text Replace (USER INPUT)",
+ "properties": {
+ "Node name for S&R": "StringReplace"
+ },
+ "widgets_values": [
+ "",
+ "USER_INPUT",
+ ""
+ ]
+ },
+ {
+ "id": 28,
+ "type": "TextGenerate",
+ "pos": [
+ 1200,
+ 1580
+ ],
+ "size": [
+ 430,
+ 420
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 40
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "video",
+ "name": "video",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": null
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": 60
+ },
+ {
+ "localized_name": "max_length",
+ "name": "max_length",
+ "type": "INT",
+ "widget": {
+ "name": "max_length"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampling_mode",
+ "name": "sampling_mode",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "sampling_mode"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temperature",
+ "name": "sampling_mode.temperature",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.temperature"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "top_k",
+ "name": "sampling_mode.top_k",
+ "type": "INT",
+ "widget": {
+ "name": "sampling_mode.top_k"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "top_p",
+ "name": "sampling_mode.top_p",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.top_p"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "min_p",
+ "name": "sampling_mode.min_p",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.min_p"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "repetition_penalty",
+ "name": "sampling_mode.repetition_penalty",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.repetition_penalty"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "seed",
+ "name": "sampling_mode.seed",
+ "type": "INT",
+ "widget": {
+ "name": "sampling_mode.seed"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "presence_penalty",
+ "name": "sampling_mode.presence_penalty",
+ "shape": 7,
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.presence_penalty"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "thinking",
+ "name": "thinking",
+ "shape": 7,
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "thinking"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "use_default_template",
+ "name": "use_default_template",
+ "shape": 7,
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "use_default_template"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "generated_text",
+ "name": "generated_text",
+ "type": "STRING",
+ "links": [
+ 46,
+ 84
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TextGenerate",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "",
+ 256,
+ "on",
+ 0.7,
+ 64,
+ 0.95,
+ 0.05,
+ 1.05,
+ 0,
+ 0,
+ false,
+ true
+ ]
+ },
+ {
+ "id": 31,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ -390,
+ 160
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": 68
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 47,
+ 53
+ ]
+ }
+ ],
+ "title": "User: short description (USER_INPUT in template)",
+ "properties": {
+ "Node name for S&R": "PrimitiveStringMultiline"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 43,
+ "type": "CustomCombo",
+ "pos": [
+ 140,
+ 910
+ ],
+ "size": [
+ 550,
+ 320
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "choice",
+ "name": "choice",
+ "type": "COMBO",
+ "widget": {
+ "name": "choice"
+ },
+ "link": 78
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 65
+ ]
+ },
+ {
+ "localized_name": "INDEX",
+ "name": "INDEX",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "title": "Custom Combo (Category index)",
+ "properties": {
+ "Node name for S&R": "CustomCombo"
+ },
+ "widgets_values": [
+ "Music",
+ 0,
+ "Music",
+ "Instrument",
+ "SFX",
+ "One-shot",
+ ""
+ ]
+ },
+ {
+ "id": 49,
+ "type": "JsonExtractString",
+ "pos": [
+ 720,
+ 1200
+ ],
+ "size": [
+ 300,
+ 180
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "json_string",
+ "name": "json_string",
+ "type": "STRING",
+ "widget": {
+ "name": "json_string"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "key",
+ "name": "key",
+ "type": "STRING",
+ "widget": {
+ "name": "key"
+ },
+ "link": 65
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 66
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "JsonExtractString"
+ },
+ "widgets_values": [
+ "{\n \"Music\": \"You are an expert musician and musicologist and prompt engineer. Transform the user's input into a detailed, vivid music prompt for a full instrumental track.\\n\\n1. Start with the genre or style and optional adjectives (e.g., upbeat, dreamy, aggressive).\\n2. List the main instruments that define the track.\\n3. Add supporting elements or layers such as pads, harmonics, effects, or field recordings.\\n4. Include rhythm or percussion elements like drums, hi-hats, congas, brushes, or polyrhythms.\\n5. Integrate mood and energy naturally in the sentence (e.g., \\\"creating suspenseful tension\\\" or \\\"bright and uplifting\\\").\\n6. Specify the BPM.\\n7. Specify the track length as an integer in seconds. Use ranges: energetic/dance 120-180s, pop/rock 180-210s, cinematic/ambient 240-300s.\\n8. Combine all elements into one natural, fluid sentence. Avoid semicolons.\\n\\nTemplate:\\nGenre/Style with main instruments, supporting instruments/layers, and rhythm/percussion creating mood/energy. BPM: X. Length: Y seconds\\n\\nExamples:\\n- Jazz ballad with smooth saxophone lead, piano chords, upright bass, brushed drums, and soft strings that swing gently for a warm and cozy evening. BPM: 85. Length: 180 seconds\\n- EDM festival track with pulsing synth leads, plucked arpeggios, layered pads, side-chained bass, punchy kick and snare, and hi-hat rolls creating bright, energetic, and uplifting dance energy. BPM: 128. Length: 150 seconds\\n- Lo-fi hip-hop chill track with mellow electric piano, soft vinyl crackle, subtle synth pads, low-pass filtered drums, percussion loops, and soft plucked bass for a relaxed, dreamy vibe. BPM: 75. Length: 150 seconds\\n- Heavy metal anthem with distorted electric guitars, bass guitar, double bass drums, and cymbal crashes with fast palm-muted riffs creating intense, aggressive energy. BPM: 160. 
Length: 180 seconds\\n- Melancholic piano piece with soft piano lead, string pads, subtle atmospheric synths, and minimal brush percussion evoking a reflective rainy-day feeling. BPM: 60. Length: 240 seconds\\n- Suspenseful electronic thriller with pulsing bass synth, arpeggiated lead synth, cinematic pads, glitchy percussion, and high string stabs creating dark and tense energy. BPM: 100. Length: 200 seconds\\n- Dreamy ambient soundscape with layered pads, soft bell textures, gentle drones, and wind and water field recordings for ethereal and spacious meditation. BPM: 40. Length: 300 seconds\\n- Fingerpicking acoustic guitar solo with harmonics, subtle reverb, occasional shaker and soft stomp percussion, and soft pad layers for warm intimate storytelling. BPM: 70. Length: 120 seconds\\n- Synthwave 80s retro track with arpeggiated synth leads, analog pads, electric bass, punchy electronic drums, gated reverb snares, and atmospheric FX for nostalgic and vibrant energy. BPM: 110. Length: 180 seconds\\n- Tribal percussion ensemble with congas, djembes, bongos, shakers, and frame drums layered with deep synthetic sub-bass in complex polyrhythms. BPM: 100. Length: 140 seconds\\n- 1920s swing jazz with brass section, upright bass, piano, brushed drums, banjo, clarinet, and soft strings that swing lively for energetic dance vibes. BPM: 110. Length: 180 seconds\\n- Futuristic electronic sci-fi track with pulsing bass synth, evolving lead synths, layered pads, glitch percussion, robotic FX, and sub-bass for tense cinematic energy. BPM: 125. Length: 200 seconds\\n- Ambient underwater soundscape with flowing water textures, soft piano motifs, synth drones, distant bells, and underwater reverb for spacious meditative immersion. BPM: 45. Length: 300 seconds\\n- Horror cinematic track with dissonant strings, eerie piano stabs, cinematic percussion including taiko and low toms, and synth FX producing suspenseful creepy tension. BPM: 90. 
Length: 240 seconds\\n- Reggae track with offbeat guitar, warm basslines, snare, kick, congas, and horn stabs giving laid-back groovy energy. BPM: 85. Length: 150 seconds\\n- Blues track with soulful electric guitar solos, walking bass, piano, and shuffle drums creating expressive and emotive storytelling. BPM: 90. Length: 180 seconds\\n- Latin salsa with congas, timbales, horns, piano montunos, bass, and layered percussion for vibrant danceable energy. BPM: 120. Length: 210 seconds\\n- Afrobeat track with electric guitar stabs, horns, layered percussion, congas, shakers, bass groove, and synth pads for vibrant rhythmic energy. BPM: 105. Length: 200 seconds\\n- Indie rock track with electric guitar riffs, bass, live drum kit, layered synths, and subtle strings for energetic yet emotional feel. BPM: 110. Length: 180 seconds\\n- Funk groove with slap bass, electric guitar chords, brass stabs, drums, congas, and rhythmic keyboards creating high-energy danceable rhythm. BPM: 105. Length: 180 seconds\\n- Drum and bass track with fast breakbeat drums, deep sub-bass, sharp synth leads, pads, and atmospheric FX for high-energy club motion. BPM: 175. Length: 150 seconds\\n- Dark ambient track with drones, distant bells, low rumbles, soft wind textures, and synth pads producing eerie immersive tension. BPM: 50. Length: 300 seconds\\n- Tropical house track with marimba, steel drums, soft synths, smooth bass, layered percussion, and light piano riffs for sunny chill dance vibes. BPM: 110. Length: 180 seconds\\n- Progressive rock track with electric guitar leads, organ, bass, drum kit, synth layers, and occasional strings for epic layered energy. BPM: 100. Length: 220 seconds\\n- Music box melody with delicate metallic tones and soft resonance, lullaby style, with gentle ambient reverb. BPM: 60. Length: 20 seconds\\n- Soft piano arpeggio with warm felted tone and slow attack, lullaby style, with intimate room ambience. BPM: 60. 
Length: 30 seconds\\n- Harp gentle plucked pattern with airy resonance, lullaby style, with dreamy reverb tail. BPM: 65. Length: 25 seconds\\n- Acoustic guitar fingerstyle pattern with warm nylon strings and soft dynamics, lullaby style, with subtle room resonance. BPM: 60. Length: 30 seconds\\n- Ambient synth pad with smooth evolving texture and soft harmonics, lullaby style, with wide stereo ambience. BPM: 50. Length: 40 seconds\\n- Early rock piano with walking left-hand bass line, shuffle rhythms, and blues scale improvisations in energetic 1950s boogie-woogie style. BPM: 160. Length: 180 seconds\\n- Trip Hop track with jazzy sampled vibraphone, mid-tempo breakbeat drums, harp, Latin ethnic percussion, and sweeping cinematic strings creating airy, relaxing, soulful lounge vibes. BPM: 90. Length: 180 seconds\\n- Country outlaw cinematic instrumental with blues pedal steel guitar, rustic mandolin, fiddle call-and-response, tape-driven rattly drum kit, autoharp, and soaring accordion solo for raw, emotional southern blues expression. BPM: 85. Length: 200 seconds\\n- Neo Classical track with sweeping string section, elegant horns, and delicate piano creating soothing, hypnotic, modern, soft, and classic mood. BPM: 70. Length: 180 seconds\\n- Art Rock desert track with desolate piano chords, western-themed rhythm guitars, unique lead guitars, rattly vintage drum kit, and supporting bass creating lonely, expansive, beautiful, and strange atmospheres. BPM: 95. Length: 180 seconds\\n- Cinematic Sci-Fi score with dramatic horn section, building marcato strings, gliding bassoon, thunderous cymbals, subdued timpani, and subtle synth drones producing awe-inspiring, uplifting, epic intergalactic energy. BPM: 100. Length: 220 seconds\\n- West Coast Hip Hop instrumental with cascading harp melodies, smooth Rhodes piano chops, vintage boom bap drums, and walking double bass producing raw, street, and soulful block-party vibes. BPM: 92. 
Length: 180 seconds\\n- Synthwave futuristic track with pulsating synth bass, exciting chords, soaring leads, and reverberating drum machine patterns creating gritty, pounding, and cool energy. BPM: 110. Length: 180 seconds\\n- Breakbeat track with complex percussion, intricate breakbeats, gritty synths, lush pads, and 808 bassline producing fresh, modern, futuristic, and rave-ready energy. BPM: 140. Length: 160 seconds\\n- Lounge Jazz 1960s smooth track with laid-back drums, piano chords, double bass, soft electric piano, subtle flute, and unique percussion creating beautiful, atmospheric, eclectic, retro, and chill vibes. BPM: 85. Length: 180 seconds\\n- Latin Jazz 1950s blissful track with laid-back Latin drums, euphoric piano chords, double bass, orchestral accompaniment, acoustic guitar, and vibraphone producing nostalgic, beautiful, atmospheric, cinematic, and chill mood. BPM: 95. Length: 180 seconds\\n- Acid Jazz 1970s summertime track with smooth electric piano, trippy synth leads, laid-back vintage drum kit, fuzzy electric bass, and uplifting violin producing retro, psychedelic, jazzy, relaxing energy. BPM: 100. Length: 180 seconds\\n- Progressive Soul 1970s track with feel-good piano, psychedelic organ, groovy vintage drum kit with percussion, fuzzy electric bass, and synth strings producing retro, raw, soulful, joyous atmosphere. BPM: 90. Length: 180 seconds\\n- Discotheque 1970s French-inspired track with sultry piano, psychedelic guitars, groovy drum kit, fuzzy electric bass, and melancholic organ producing retro, raw, laid-back, and relaxing mood. BPM: 105. Length: 180 seconds\\n- Soul Jazz 1970s track with expressive saxophone, smooth piano, groovy drum kit, rhythmic upright bass, sweeping strings, and minimal vibraphone producing retro, raw, laid-back, and epic energy. BPM: 95. 
Length: 180 seconds\\n- Vintage R&B 1970s live studio track with subtle brass, smooth piano, sweeping strings, and minimal drums producing retro, beautiful, uplifting, nostalgic mood. BPM: 85. Length: 180 seconds\\n- 50s Pop track with Latin influence, string section, bold brass, vibraphone, acoustic guitar, flute, ethnic percussion, and brushed drums creating sexy, epic, vintage, retro, melancholic, jazzy, dramatic energy. BPM: 100. Length: 180 seconds\\n- A piece of calm, quiet, mellow, serene music perfect for a peaceful film score, featuring soft modulating piano, ambient sfx and foley, beautiful vibraphone, and subtle synthesizer drones. The mood is cinematic, thoughtful, serene and nostalgic. BPM: 55. Length: 300 seconds\",\n \"Instrument\": \"You are a music metadata expert. Given an instrument, generate a descriptive prompt for a generative audio model.\\n\\n1. Identify the instrument.\\n2. Add playing style or technique.\\n3. Include details about material, timbre, or texture.\\n4. Add musical style or mood. Specify the genre, context, or emotional character.\\n5. Add spatial or production qualities.\\n6. Specify BPM: Always include a BPM appropriate to the style and context.\\n7. Specify length: Provide an integer in seconds (6–20 s for loops, 20–180 s for stems).\\n\\nExamples:\\n- Synth arpeggio loop with bright detuned oscillators. BPM: 120. Length: 8 seconds\\n- Chord stab loop with sharp percussive attack. BPM: 90. Length: 6 seconds\\n- Guitar muted strum loop with tight rhythmic feel. BPM: 100. Length: 8 seconds\\n- Pluck sequence loop with bright resonant tone. BPM: 128. Length: 10 seconds\\n- Marimba and vibraphone percussive loop with resonant wooden and metallic tones. BPM: 110. Length: 12 seconds\\n- Drum loop with deep muffled kick on beat one, snappy rimshot snare on beats two and four with rolling ghost note fills, and tight closed hi-hats with subtle open accents. BPM: 85. 
Length: 10 seconds\\n- Drum groove loop with brushed snare swinging on the ride, soft feathered kick on downbeats, and light closed hi-hat taps on the upbeats. BPM: 130. Length: 12 seconds\\n- Kick and hi-hat loop with four-on-the-floor punchy kick, tight closed hi-hats on every eighth note, and a sharp dry snare on beats two and four. BPM: 130. Length: 15 seconds\\n- Vinyl crackle drum loop with warm low-pass filtered kick, dusty snare with tape saturation, and shuffled closed hi-hats with subtle vinyl crackle ambiance. BPM: 80. Length: 10 seconds\\n- Ambient pad loop with evolving texture. BPM: 80. Length: 12 seconds\\n- Melodic synth bass groove loop with pumping sidechain feel. BPM: 122. Length: 10 seconds\\n- Melodic Bass slap and pop rhythm loop. BPM: 100. Length: 8 seconds\\n- Acoustic bass walking line loop with natural wooden resonance. BPM: 120. Length: 12 seconds\\n- String pizzicato motif loop, suspenseful, with tight string texture. BPM: 90. Length: 8 seconds\\n- Brass staccato riff loop with sharp bright attack. BPM: 130. Length: 10 seconds\\n- Flute airy melodic loop with wooden headjoint resonance. BPM: 100. Length: 6 seconds\\n- Pan flute ambient loop with breathy timbre. BPM: 75. Length: 8 seconds\\n- Clarinet riff loop with warm smooth reed tone. BPM: 120. Length: 10 seconds\\n- Oboe motif loop, orchestral, with rich double reed resonance. BPM: 80. Length: 8 seconds\\n- Recorder Renaissance motif loop with soft wooden timbre. BPM: 100. Length: 6 seconds\\n- Electric sitar riff loop with buzzing resonant tone. BPM: 90. Length: 10 seconds\\n- Koto plucked motif loop with resonant wooden strings. BPM: 90. Length: 8 seconds\\n- Shamisen folk melody loop with percussive twang. BPM: 100. Length: 8 seconds\\n- Banjo fingerpicking loop with metallic string resonance. BPM: 110. Length: 10 seconds\\n- Mandolin tremolo loop with crisp wooden body tone. BPM: 120. Length: 10 seconds\\n- Acoustic guitar chord vamp loop with natural room resonance. BPM: 110. 
Length: 12 seconds\\n- Nylon string guitar arpeggio loop with warm, soft timbre. BPM: 90. Length: 15 seconds\\n- Electric guitar riff loop with driven distorted tone. BPM: 130. Length: 10 seconds\\n- Slide guitar melody loop with warm resonant glide. BPM: 100. Length: 12 seconds\\n- Steel guitar slide loop with bright pedal steel tone. BPM: 95. Length: 12 seconds\\n- Harpsichord arpeggio loop with crisp plucked attack. BPM: 120. Length: 10 seconds\\n- Rhodes chord vamp loop with warm electric piano tone. BPM: 100. Length: 12 seconds\\n- Clavinet funky rhythm loop. BPM: 105. Length: 10 seconds\\n- Organ chord vamp loop with full drawbar warmth. BPM: 90. Length: 12 seconds\\n- Drum loop with booming 808 kick on beat one, crisp snare on beat three, and rapid triplet hi-hat rolls with open hat accents for aggressive high-energy feel. BPM: 140. Length: 8 seconds\\n- Breakbeat drum loop with chopped Amen-style snare flurries, driving kick on the one, fast sixteenth-note closed hi-hats, and syncopated open hat accents. BPM: 170. Length: 10 seconds\\n- Glitch percussion loop with stuttered kick transients, randomised snare hits processed with bit-crushing, and erratic hi-hat patterns with pitch-shifted metallic ticks. BPM: 120. Length: 12 seconds\\n- Metallic hits loop with distorted kick impacts, processed metal-plate snare slams, and grinding hi-hat noise bursts for aggressive mechanical texture. BPM: 120. Length: 10 seconds\\n- Timpani hits loop, cinematic, with deep resonant kick-like timpani strikes on beat one, rolling snare-style timpani fills, and no hi-hats for a grand orchestral feel. BPM: 70. Length: 8 seconds\\n- Snare roll loop, dramatic, with accelerating snare drum rolls building from soft to crashing, deep supporting kick pulses, and no hi-hats for maximum impact. BPM: 100. Length: 8 seconds\\n- Accordion motif loop with bright reedy bellows tone. BPM: 100. Length: 10 seconds\\n- Harmonica blues riff loop with expressive reed timbre. BPM: 90. 
Length: 10 seconds\\n- Trombone riff loop with warm sliding brass tone. BPM: 120. Length: 10 seconds\\n- French horn melodic loop, cinematic. BPM: 80. Length: 12 seconds\\n- Soprano sax ballad loop. BPM: 70. Length: 12 seconds\\n- Alto sax bebop riff loop. BPM: 200. Length: 10 seconds\\n- Electric violin melodic loop with reverb. BPM: 90. Length: 10 seconds\\n- String pad loop with cinematic texture. BPM: 70. Length: 15 seconds\\n- Granular synth evolving texture loop. BPM: 90. Length: 15 seconds\\n- Piano motif loop with soft felt hammer tone. BPM: 80. Length: 10 seconds\\n- Pad and synth loop with lush detuned shimmer. BPM: 85. Length: 12 seconds\\n- Synth lead loop with sidechain pumping compression. BPM: 128. Length: 10 seconds\\n- Analog synth bassline loop with deep warm low-end. BPM: 122. Length: 12 seconds\\n- FM synth lead motif loop with bright metallic shimmer. BPM: 110. Length: 10 seconds\\n- Bass groove loop with tight rhythmic two-bar pattern. BPM: 100. Length: 16 seconds\\n- Acoustic guitar fingerstyle motif loop with warm wood resonance. BPM: 90. Length: 45 seconds\\n- Sombre acoustic guitar motif loop with cavernous reverb, delicate fingerpicking, and expressive melancholic tone. BPM: 70. Length: 45 seconds\\n- Electric guitar rock riff motif loop. BPM: 130. Length: 40 seconds\\n- Vintage electric guitar motif loop, live-recorded in a vintage studio, with expressive and dynamic solo performance. BPM: 90. Length: 40 seconds\\n- Piano chord progression motif loop with rich harmonic movement. BPM: 120. Length: 60 seconds\\n- String ensemble cinematic motif loop with rich wooden resonance. BPM: 80. Length: 120 seconds\\n- Brass ensemble cinematic motif loop with bright metallic timbre. BPM: 90. Length: 90 seconds\\n- Ethnic percussion ensemble motif loop with deep resonant djembe kick tones, slapped snare-like rim hits on congas, and layered shakers and bells providing hi-hat-like rhythmic texture with polyrhythmic patterns. BPM: 100. 
Length: 90 seconds\\n- Synth ambient motif loop with evolving textures. BPM: 80. Length: 180 seconds\\n- Motif loop with warm dusty vinyl crackle and tape saturation. BPM: 80. Length: 60 seconds\\n- Synth lead and bass motif loop with bright punchy energy. BPM: 128. Length: 90 seconds\\n- Funk band motif loop: bass, drums, guitar. BPM: 100. Length: 90 seconds\\n- Ethnic flute motif for cinematic use. BPM: 80. Length: 30 seconds\\n- Steel drum melodic motif loop with bright metallic resonance. BPM: 110. Length: 20 seconds\\n- Marimba percussive motif loop with resonant wooden tone. BPM: 100. Length: 20 seconds\\n- Vibraphone melodic motif loop with metallic shimmer. BPM: 90. Length: 25 seconds\\n- Piano cinematic motif loop with resonant wooden tone. BPM: 80. Length: 30 seconds\\n- Violin expressive cinematic motif loop with rich wooden resonance. BPM: 75. Length: 25 seconds\\n- Cello expressive motif loop with deep wooden resonance. BPM: 70. Length: 30 seconds\\n- Trumpet expressive motif loop with brassy overtones. BPM: 100. Length: 25 seconds\\n- Sax expressive motif loop with warm reed timbre. BPM: 95. Length: 25 seconds\\n- Ethnic drum ensemble motif loop with booming natural-skin bass drum kicks, sharp hand-slap snare accents on djembes and talking drums, and layered wooden and metal percussion providing rhythmic hi-hat-like patterns. BPM: 95. Length: 30 seconds\\n- Ambient drone motif loop. BPM: 60. Length: 180 seconds\\n- Orchestral tension motif loop. BPM: 90. Length: 150 seconds\\n- Electronic track motif loop with drums, bass, synth. BPM: 128. Length: 180 seconds\",\n \"SFX\": \"You are a professional sound design expert. 
Convert the user's input into a precise, vivid sound effects description suitable for generative audio models.\\n\\nDescribe clearly:\\n- Sound source\\n- Physical character (texture, timbre, material: metal, wood, glass, concrete, etc.)\\n- Spatial qualities (indoor/outdoor, cave/open field/underwater, dry/reverberant, close-up/distant, echoing/muffled)\\n- Temporal evolution (attack, decay, movement, transitions over time)\\n- Include motion or spatial movement if applicable (passing, approaching, stereo movement)\\n\\nAudio length rules:\\n- Very short sounds (impacts, clicks, gunshots): 1–3 seconds\\n- Medium actions (footsteps, object movement, transitions): 3–6 seconds\\n- Ambience / environments: 6–15 seconds\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nOutput constraints:\\n- Length: 1–2 dense sentences maximum\\n- Output ONLY the final rewritten prompt\\n- No explanations, no formatting, no quotes\\n- Use concise but dense technical language\\n- Focus strictly on sound effects or ambience\\n- Always append: Length: X seconds (integer only, no decimals).\\n\\nQuality guidelines:\\n- Be specific and avoid vague terms\\n- Prioritize clarity and realism\\n- Combine elements into one coherent scene\\n- Avoid redundancy\\n\\nExamples:\\n- Heavy rain hitting a metal roof during a thunderstorm, distant thunder rumbles, stereo, realistic ambience. Length: 45 seconds\\n- Quiet forest at dawn with birds chirping, soft wind through leaves, distant stream flowing. Length: 60 seconds\\n- Busy city street at night, cars passing, muffled conversations, occasional horn, urban ambience. Length: 50 seconds\\n- Ocean waves crashing against rocky cliffs, strong wind, dramatic and cinematic. Length: 70 seconds\\n- Wooden door creaking open slowly in an old house, echoing interior, eerie tone. Length: 3 seconds\\n- Glass bottle shattering on concrete, sharp impact, scattered fragments. 
Length: 2 seconds\\n- Footsteps on gravel, steady walking pace, close perspective. Length: 8 seconds\\n- Typing rapidly on a mechanical keyboard, crisp tactile clicks. Length: 5 seconds\\n- Punch impact with deep bass hit, cinematic trailer style. Length: 2 seconds\\n- Car speeding past at high velocity, doppler effect, realistic whoosh. Length: 3 seconds\\n- Object falling from height and hitting ground with a heavy thud. Length: 2 seconds\\n- Sword swing whooshing through air, fast motion, clean metallic tone. Length: 2 seconds\\n- Futuristic laser blast, clean energy pulse, high-tech sound design. Length: 1 seconds\\n- Spaceship engine humming, low frequency rumble, interior perspective. Length: 90 seconds\\n- Magical spell casting, shimmering particles, rising tonal energy. Length: 8 seconds\\n- Teleportation effect, glitchy digital distortion with a soft whoosh. Length: 5 seconds\\n- Dark eerie drone with distant whispers, creepy, slow build tension. Length: 120 seconds\\n- Sudden horror jump scare sting, sharp violin hit, cinematic. Length: 1 second\\n- Metal scraping slowly in a dark tunnel, echoing and ominous. Length: 20 seconds\\n- Explosion with debris scattering, deep bass, cinematic realism. Length: 4 seconds\\n- Building collapsing, rumbling concrete, dust and debris falling. Length: 25 seconds\\n- Fire crackling intensely, wood burning, close-up detail. Length: 80 seconds\\n- Gunshot in a large empty warehouse, loud echo decay. Length: 2 seconds\\n- Retro arcade coin insert sound, 8-bit style. Length: 1 second\\n- Level up chime, bright, rewarding, fantasy RPG style. Length: 2 seconds\\n- Error buzzer, short, digital, UI feedback. Length: 1 second\\n- Menu navigation clicks, soft futuristic interface sounds. Length: 3 seconds\\n- Layered soundscape: rain, thunder, footsteps, and distant sirens all blending naturally. Length: 90 seconds\\n- Rapid sequence of three impacts: metal hit, glass break, wood crack, spaced evenly. 
Length: 4 seconds\\n- Sound moving from left to right stereo field: passing motorcycle. Length: 5 seconds\\n- Close vs far perspective transition: footsteps approaching then fading away. Length: 6 seconds\\n- Tape stop sub drop, a massive sub-bass note that mimics a vinyl record or tape machine being turned off, the pitch and speed drop simultaneously, causing the high-end harmonics to smear and thicken as the sound grinds to a halt at a sub-sonic frequency. Length: 11 seconds\\n- Gravel and leaves footsteps, the sound of a hard boot stepping onto dry leaves or gravel, crisp and natural with detailed texture. Length: 11 seconds\\n- Ghostship moan, a massive, deep wooden groan with a low-frequency moan, like heavy timber under immense structural tension, swaying slowly, processed with long, dark wooden room reverb for a sense of scale. Length: 11 seconds\\n- Bicycle chain, a continuous metallic whirring sound of a chain moving over sprockets, with individual teeth catching the links, processed with resonant band-pass filter to emphasize metallic singing. Length: 11 seconds\\n- Warp drive, a sound that starts with a massive suck-back of ambient noise, followed by a supersonic crack and high-pitched zing that disappears into the distance, giving the sense of stretching space-time. Length: 11 seconds\\n- Ice cubes, high-pitched musical clinking of hard ice hitting a thin glass, bright resonant ring with subtle liquid sloshing around the edges. Length: 11 seconds\\n- Paper shuffle, the sound of a thick stack of heavy bond paper being squared up on a desk, dry papery thud with a quick fanning sound as air moves between the pages. Length: 11 seconds\\n- Drawer slam, a blunt, powerful thud made by slamming a wooden desk drawer shut, pronounced low-mid body, slightly distorted for aggressive character. Length: 3 seconds\",\n \"One-shot\": \"You are a music metadata expert. 
Given an instrument or sound, generate a descriptive prompt for a short, isolated one-shot audio sample for music production.\\n\\n1. Identify the instrument or sound source.\\n2. Describe the playing technique or hit type (e.g., pluck, slam, tap, stab).\\n3. Include details about material, timbre, or texture.\\n4. Add spatial or production qualities (dry/wet, room, close-mic).\\n5. Specify length: short integer in seconds (1–11 s).\\n\\nExamples:\\n- Piano key hit with bright percussive attack and resonant wooden body. Length: 2 seconds\\n- Kick drum punchy low-end hit with warm skin resonance. Length: 2 seconds\\n- Snare drum rimshot accent with crisp snare wires. Length: 2 seconds\\n- Acoustic guitar fingerstyle note with warm spruce tone. Length: 3 seconds\\n- Bass pluck with jazzy tone and resonant wooden body. Length: 3 seconds\\n- Electric guitar power chord with distortion. Length: 3 seconds\\n- Metallic glitch percussion hit with sharp metallic texture. Length: 2 seconds\\n- Tabla resonant tone hit with natural skin timbre. Length: 2 seconds\\n- Djembe slap accent with dry wooden resonance. Length: 2 seconds\\n- Synth stab with reverb tail. Length: 3 seconds\\n- Violin expressive note with vibrato and rich wooden resonance. Length: 3 seconds\\n- Cello legato note, cinematic, with warm resonant body. Length: 3 seconds\\n- Trumpet bright accent with slightly brassy overtones. Length: 2 seconds\\n- Melodic saxophone jazz riff with smooth reed timbre and a slight vibrato bend. Length: 3 seconds\\n- Harp pluck with airy tone and resonant strings. Length: 2 seconds\\n- Glockenspiel bell-like note with bright metallic clarity. Length: 2 seconds\\n- Metallic clang sound design hit. Length: 2 seconds\\n- Granular texture hit. Length: 3 seconds\\n- Reversed piano hit. Length: 2 seconds\\n- Synth riser effect. Length: 6 seconds\\n- Percussion impact hit. Length: 2 seconds\\n- Cinematic hit. 
Length: 2 seconds\\n- Dry clap, a crisp, natural single hand clap recorded in a dead room with an extremely sharp transient and no room reflections. Length: 1 second\\n- Studio hat, a classic, natural recording of 14-inch hi-hats played tightly closed, zero ring, very fast decay. Length: 1 second\\n- Disco open hat, bright 14-inch open hi-hat with long, shimmering decay, perfect for disco or dance grooves. Length: 1 second\\n- Pillow kick, acoustic kick drum muffled with a heavy blanket, producing a short, dry \\\"thump\\\" with almost zero resonance. Length: 1 second\\n- Short 808, punchy 808 kick with sharp, distorted transient and fast-decaying sub-tail. Length: 1 second\\n- Egg shaker, classic plastic egg shaker recorded with a small-diaphragm condenser mic, producing a light, consistent \\\"tick\\\" with very short sustain. Length: 1 second\\n- African drums, dynamic African drums and percussion ensemble with natural acoustic textures. Length: 3 seconds\\n- Latin drums, dynamic Latin drums and percussion ensemble featuring authentic rhythmic patterns. Length: 3 seconds\\n- String quartet, euphoric string quartet with dynamic and emotional playing, full of expressive harmonies and movement. Length: 3 seconds\\n- Piano, nostalgic, atmospheric piano piece with dynamic and emotional performance, intimate and resonant. Length: 3 seconds\\n- Analogue drift pad, warm polyphonic pad with three detuned oscillators (saw + triangle), subtle pitch drift, and lush bucket-brigade chorus for wide, nostalgic stereo image. Length: 11 seconds\\n- Phase distortion bass, Casio CZ-style phase-distorted sine wave warped into a jagged sawtooth for retro synth bass tone. Length: 11 seconds\\n- Vibrato saxophone, bright lyrical alto sax with fast fluttery vibrato, reedy vintage tone, captured with ribbon mic for warm nostalgic sound. 
Length: 11 seconds\\n- Lofi upright bass, upright bass recorded with ribbon mic in a wooden room, natural air with slightly boxy resonance, tape-saturated for dusty 1950s jazz feel. Length: 2 seconds\"\n}",
+ "Music"
+ ]
+ },
+ {
+ "id": 40,
+ "type": "StringReplace",
+ "pos": [
+ 1350,
+ 900
+ ],
+ "size": [
+ 260,
+ 280
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 59
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 58
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 60
+ ]
+ }
+ ],
+ "title": "Text Replace (AUDIO LENGTH)",
+ "properties": {
+ "Node name for S&R": "StringReplace"
+ },
+ "widgets_values": [
+ "",
+ "AUDIO_LENGTH",
+ ""
+ ]
+ },
+ {
+ "id": 38,
+ "type": "StringReplace",
+ "pos": [
+ 720,
+ 900
+ ],
+ "size": [
+ 290,
+ 280
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 66
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 52
+ ]
+ }
+ ],
+ "title": "Text Replace (PROMPT TEMPLATE)",
+ "properties": {
+ "Node name for S&R": "StringReplace"
+ },
+ "widgets_values": [
+ "SYSTEM_PROMPTS\n\nInput: USER_INPUT\nTarget audio length: AUDIO_LENGTH seconds.\nOutput:",
+ "SYSTEM_PROMPTS",
+ ""
+ ]
+ },
+ {
+ "id": 35,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -390,
+ 570
+ ],
+ "size": [
+ 400,
+ 100
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 83
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 48
+ ]
+ }
+ ],
+ "title": "Boolean (Enable_Reprompt)",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean"
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 36,
+ "type": "PrimitiveFloat",
+ "pos": [
+ -390,
+ 410
+ ],
+ "size": [
+ 400,
+ 110
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 82
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 50,
+ 56
+ ]
+ }
+ ],
+ "title": "Float (Duration)",
+ "properties": {
+ "Node name for S&R": "PrimitiveFloat"
+ },
+ "widgets_values": [
+ 150
+ ]
+ },
+ {
+ "id": 25,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 100,
+ 130
+ ],
+ "size": [
+ 440,
+ 190
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 79
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 30
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 39
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "models": [
+ {
+ "name": "stable_audio_3_medium.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/checkpoints/stable_audio_3_medium.safetensors",
+ "directory": "checkpoints"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "stable_audio_3_medium.safetensors"
+ ]
+ },
+ {
+ "id": 26,
+ "type": "CLIPLoader",
+ "pos": [
+ 100,
+ 390
+ ],
+ "size": [
+ 440,
+ 170
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 80
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 34,
+ 35
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "models": [
+ {
+ "name": "t5gemma_b_b_ul2.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/stable-audio-3/resolve/main/text_encoders/t5gemma_b_b_ul2.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "t5gemma_b_b_ul2.safetensors",
+ "stable_audio",
+ "default"
+ ]
+ },
+ {
+ "id": 54,
+ "type": "PreviewAny",
+ "pos": [
+ 1720,
+ 1580
+ ],
+ "size": [
+ 420,
+ 550
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 4,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 84
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewAny"
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Loaders: checkpoint & CLIP",
+ "bounding": [
+ 80,
+ 50,
+ 485.721654232725,
+ 527.2848777754299
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "CLIP encode: conditioning",
+ "bounding": [
+ 600,
+ 60,
+ 470,
+ 510
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "User inputs: prompt & duration",
+ "bounding": [
+ -400,
+ 10,
+ 430,
+ 740
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Reprompt: full branch (template + LLM)",
+ "bounding": [
+ 60,
+ 780,
+ 1630,
+ 1360
+ ],
+ "color": "#444",
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Reprompt: JSON extract & template fills",
+ "bounding": [
+ 120,
+ 820,
+ 1520,
+ 650
+ ],
+ "color": "#444",
+ "flags": {}
+ },
+ {
+ "id": 5,
+ "title": "Helpers: duration to string",
+ "bounding": [
+ 1340,
+ 1180,
+ 280,
+ 250
+ ],
+ "color": "#444",
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "Reprompt: Qwen TextGenerate",
+ "bounding": [
+ 680,
+ 1510,
+ 960,
+ 614.65625
+ ],
+ "color": "#444",
+ "flags": {}
+ },
+ {
+ "id": 8,
+ "title": "Audio generation: Stable Audio",
+ "bounding": [
+ 60,
+ 10,
+ 1627.3616782294932,
+ 737.0545987464304
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 35,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 7,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 13,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 12,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 39,
+ "origin_id": 25,
+ "origin_slot": 2,
+ "target_id": 12,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 50,
+ "origin_id": 36,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 30,
+ "origin_id": 25,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 4,
+ "origin_id": 6,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 6,
+ "origin_id": 7,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 12,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 34,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 49,
+ "origin_id": 34,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 47,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 34,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 46,
+ "origin_id": 28,
+ "origin_slot": 0,
+ "target_id": 34,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 48,
+ "origin_id": 35,
+ "origin_slot": 0,
+ "target_id": 34,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 56,
+ "origin_id": 36,
+ "origin_slot": 0,
+ "target_id": 41,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 57,
+ "origin_id": 41,
+ "origin_slot": 1,
+ "target_id": 42,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 52,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 39,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 53,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 39,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 40,
+ "origin_id": 29,
+ "origin_slot": 0,
+ "target_id": 28,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 60,
+ "origin_id": 40,
+ "origin_slot": 0,
+ "target_id": 28,
+ "target_slot": 4,
+ "type": "STRING"
+ },
+ {
+ "id": 65,
+ "origin_id": 43,
+ "origin_slot": 0,
+ "target_id": 49,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 59,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 40,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 58,
+ "origin_id": 42,
+ "origin_slot": 0,
+ "target_id": 40,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 66,
+ "origin_id": 49,
+ "origin_slot": 0,
+ "target_id": 38,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 27,
+ "origin_id": 12,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "AUDIO"
+ },
+ {
+ "id": 68,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 76,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 3,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 78,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 43,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 79,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 25,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 80,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 26,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 81,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 29,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 82,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 36,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 83,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 35,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 84,
+ "origin_id": 28,
+ "origin_slot": 0,
+ "target_id": 54,
+ "target_slot": 0,
+ "type": "STRING"
+ }
+ ],
+ "extra": {},
+ "category": "Audio/Music generation",
+ "description": "Generates music, instrument loops, sound effects, and one-shots from text using Stable Audio 3 Medium, with optional Qwen 3.5 category-based prompt expansion (Music, Instrument, SFX, One-shot)."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Brightness and Contrast.json b/blueprints/Brightness and Contrast.json
index 90bfe999d..78fc52f29 100644
--- a/blueprints/Brightness and Contrast.json
+++ b/blueprints/Brightness and Contrast.json
@@ -431,9 +431,10 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Adjusts image brightness and contrast using a real-time GPU fragment shader."
}
]
},
"extra": {}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Canny to Image (Z-Image-Turbo).json b/blueprints/Canny to Image (Z-Image-Turbo).json
index ff9717308..903d372b1 100644
--- a/blueprints/Canny to Image (Z-Image-Turbo).json
+++ b/blueprints/Canny to Image (Z-Image-Turbo).json
@@ -162,7 +162,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Canny to Image (Z-Image-Turbo)",
+ "name": "Canny to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1553,7 +1553,8 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
- "category": "Image generation and editing/Canny to image"
+ "category": "Image generation and editing/Conditioned",
+ "description": "Generates an image from a Canny edge map using Z-Image-Turbo, with text conditioning."
}
]
},
@@ -1574,4 +1575,4 @@
}
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Canny to Video (LTX 2.0).json b/blueprints/Canny to Video (LTX 2.0).json
index fae8321b9..ed602b521 100644
--- a/blueprints/Canny to Video (LTX 2.0).json
+++ b/blueprints/Canny to Video (LTX 2.0).json
@@ -192,7 +192,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Canny to Video (LTX 2.0)",
+ "name": "Canny to Video (LTX 2.0)",
"inputNode": {
"id": -10,
"bounding": [
@@ -3600,7 +3600,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Video generation and editing/Canny to video"
+ "category": "Video generation and editing/Conditioned",
+ "description": "Generates video from Canny edge maps using LTX-2, with optional synchronized audio."
}
]
},
@@ -3616,4 +3617,4 @@
}
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Chromatic Aberration.json b/blueprints/Chromatic Aberration.json
index ae8037b1b..893fb1190 100644
--- a/blueprints/Chromatic Aberration.json
+++ b/blueprints/Chromatic Aberration.json
@@ -377,8 +377,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Adds lens-style chromatic aberration (color fringing) using a real-time GPU fragment shader."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Color Adjustment.json b/blueprints/Color Adjustment.json
index 622bf28af..5abbf8baa 100644
--- a/blueprints/Color Adjustment.json
+++ b/blueprints/Color Adjustment.json
@@ -596,7 +596,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Adjusts saturation, temperature, tint, and vibrance using a real-time GPU fragment shader."
}
]
}
diff --git a/blueprints/Color Balance.json b/blueprints/Color Balance.json
index 21d6319ed..d921eab37 100644
--- a/blueprints/Color Balance.json
+++ b/blueprints/Color Balance.json
@@ -1129,7 +1129,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Balances colors across shadows, midtones, and highlights using a real-time GPU fragment shader."
}
]
}
diff --git a/blueprints/Color Curves.json b/blueprints/Color Curves.json
index 1461cf396..b9bfb7029 100644
--- a/blueprints/Color Curves.json
+++ b/blueprints/Color Curves.json
@@ -608,7 +608,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Fine-tunes tone and color with per-channel curve adjustments using a real-time GPU fragment shader."
}
]
}
diff --git a/blueprints/ControlNet (Z-Image-Turbo).json b/blueprints/ControlNet (Z-Image-Turbo).json
new file mode 100644
index 000000000..160ee11e2
--- /dev/null
+++ b/blueprints/ControlNet (Z-Image-Turbo).json
@@ -0,0 +1,1412 @@
+{
+ "revision": 0,
+ "last_node_id": 85,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 85,
+ "type": "d2e76ecf-6e84-4b8c-8913-48efc09ec1c4",
+ "pos": [
+ 440,
+ 1220
+ ],
+ "size": [
+ 480,
+ 0
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "control_image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ },
+ {
+ "label": "patch_model",
+ "name": "name",
+ "type": "COMBO",
+ "widget": {
+ "name": "name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "title": "ControlNet (Z-Image-Turbo)",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "83",
+ "text"
+ ],
+ [
+ "79",
+ "seed"
+ ],
+ [
+ "74",
+ "unet_name"
+ ],
+ [
+ "73",
+ "clip_name"
+ ],
+ [
+ "75",
+ "vae_name"
+ ],
+ [
+ "76",
+ "name"
+ ],
+ [
+ "79",
+ "control_after_generate"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "d2e76ecf-6e84-4b8c-8913-48efc09ec1c4",
+ "version": 1,
+ "state": {
+ "lastGroupId": 9,
+ "lastNodeId": 85,
+ "lastLinkId": 87,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "ControlNet (Z-Image-Turbo)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -500,
+ 620,
+ 120,
+ 180
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1390,
+ 1100,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "fbbb968e-d3cf-40e4-b3ce-7abb074e5bd8",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 65,
+ 80
+ ],
+ "localized_name": "image",
+ "label": "control_image",
+ "pos": [
+ -400,
+ 640
+ ]
+ },
+ {
+ "id": "c1b19877-5417-4580-aea1-44439c70c1dd",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 81
+ ],
+ "pos": [
+ -400,
+ 660
+ ]
+ },
+ {
+ "id": "b5671515-bc7a-4be5-b1e7-d4f0f68907d6",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 83
+ ],
+ "pos": [
+ -400,
+ 680
+ ]
+ },
+ {
+ "id": "2838be23-8034-4f16-87a5-d29d790e8391",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 84
+ ],
+ "pos": [
+ -400,
+ 700
+ ]
+ },
+ {
+ "id": "8a6643b5-8f78-41ff-bbc6-e87b95459706",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 85
+ ],
+ "pos": [
+ -400,
+ 720
+ ]
+ },
+ {
+ "id": "b103dc94-8ca7-456b-a809-414d7e341a1b",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 86
+ ],
+ "pos": [
+ -400,
+ 740
+ ]
+ },
+ {
+ "id": "4a7d65af-f0fd-4a5c-832a-bdc0d15b1f30",
+ "name": "name",
+ "type": "COMBO",
+ "linkIds": [
+ 87
+ ],
+ "label": "patch_model",
+ "pos": [
+ -400,
+ 760
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ccb7fa39-4a3d-4eb2-8fd2-91d08fad9570",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 45
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1410,
+ 1120
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 73,
+ "type": "CLIPLoader",
+ "pos": [
+ 20,
+ 500
+ ],
+ "size": [
+ 270,
+ 150
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 85
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 44
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "qwen_3_4b.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/text_encoders/qwen_3_4b.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "qwen_3_4b.safetensors",
+ "lumina2",
+ "default"
+ ]
+ },
+ {
+ "id": 74,
+ "type": "UNETLoader",
+ "pos": [
+ 20,
+ 320
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 84
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 79
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "UNETLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "z_image_turbo_bf16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/diffusion_models/z_image_turbo_bf16.safetensors",
+ "directory": "diffusion_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "z_image_turbo_bf16.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 75,
+ "type": "VAELoader",
+ "pos": [
+ 20,
+ 760
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 86
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 39,
+ 70
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/vae/ae.safetensors",
+ "directory": "vae"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ae.safetensors"
+ ]
+ },
+ {
+ "id": 76,
+ "type": "ModelPatchLoader",
+ "pos": [
+ 20,
+ 940
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "name",
+ "name": "name",
+ "type": "COMBO",
+ "widget": {
+ "name": "name"
+ },
+ "link": 87
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL_PATCH",
+ "name": "MODEL_PATCH",
+ "type": "MODEL_PATCH",
+ "links": [
+ 74
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.51",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ModelPatchLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "Z-Image-Turbo-Fun-Controlnet-Union.safetensors",
+ "url": "https://huggingface.co/alibaba-pai/Z-Image-Turbo-Fun-Controlnet-Union/resolve/main/Z-Image-Turbo-Fun-Controlnet-Union.safetensors",
+ "directory": "model_patches"
+ }
+ ]
+ },
+ "widgets_values": [
+ "Z-Image-Turbo-Fun-Controlnet-Union.safetensors"
+ ]
+ },
+ {
+ "id": 77,
+ "type": "VAEDecode",
+ "pos": [
+ 940,
+ 1100
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 38
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 39
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 45
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 78,
+ "type": "ModelSamplingAuraFlow",
+ "pos": [
+ 910,
+ 270
+ ],
+ "size": [
+ 290,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 69
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 40
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ModelSamplingAuraFlow",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 3
+ ]
+ },
+ {
+ "id": 79,
+ "type": "KSampler",
+ "pos": [
+ 910,
+ 430
+ ],
+ "size": [
+ 300,
+ 570
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 40
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 41
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 42
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 78
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 83
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 38
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "KSampler",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 729703840979498,
+ "randomize",
+ 8,
+ 1,
+ "res_multistep",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 80,
+ "type": "ConditioningZeroOut",
+ "pos": [
+ 610,
+ 830
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 36
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 42
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ConditioningZeroOut",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 81,
+ "type": "QwenImageDiffsynthControlnet",
+ "pos": [
+ 490,
+ 970
+ ],
+ "size": [
+ 290,
+ 200
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 79
+ },
+ {
+ "localized_name": "model_patch",
+ "name": "model_patch",
+ "type": "MODEL_PATCH",
+ "link": 74
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 70
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 65
+ },
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "shape": 7,
+ "type": "MASK",
+ "link": null
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 69
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.76",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "QwenImageDiffsynthControlnet",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 82,
+ "type": "EmptySD3LatentImage",
+ "pos": [
+ 40,
+ 1200
+ ],
+ "size": [
+ 260,
+ 170
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 76
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 77
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 78
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "EmptySD3LatentImage",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 83,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 430,
+ 310
+ ],
+ "size": [
+ 400,
+ 440
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 44
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 81
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 36,
+ 41
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 84,
+ "type": "GetImageSize",
+ "pos": [
+ 50,
+ 1410
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 80
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 76
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 77
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.76",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "GetImageSize",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ }
+ ],
+ "groups": [
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ 410,
+ 230,
+ 440,
+ 630
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Model",
+ "bounding": [
+ -50,
+ 230,
+ 430,
+ 840
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 8,
+ "title": "Apply ControlNet",
+ "bounding": [
+ 410,
+ 890,
+ 440,
+ 330
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 9,
+ "title": "Image Size",
+ "bounding": [
+ -50,
+ 1100,
+ 430,
+ 350
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 38,
+ "origin_id": 79,
+ "origin_slot": 0,
+ "target_id": 77,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 39,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": 77,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 69,
+ "origin_id": 81,
+ "origin_slot": 0,
+ "target_id": 78,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 40,
+ "origin_id": 78,
+ "origin_slot": 0,
+ "target_id": 79,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 41,
+ "origin_id": 83,
+ "origin_slot": 0,
+ "target_id": 79,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 42,
+ "origin_id": 80,
+ "origin_slot": 0,
+ "target_id": 79,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 78,
+ "origin_id": 82,
+ "origin_slot": 0,
+ "target_id": 79,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 36,
+ "origin_id": 83,
+ "origin_slot": 0,
+ "target_id": 80,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 79,
+ "origin_id": 74,
+ "origin_slot": 0,
+ "target_id": 81,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 74,
+ "origin_id": 76,
+ "origin_slot": 0,
+ "target_id": 81,
+ "target_slot": 1,
+ "type": "MODEL_PATCH"
+ },
+ {
+ "id": 70,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": 81,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 76,
+ "origin_id": 84,
+ "origin_slot": 0,
+ "target_id": 82,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 77,
+ "origin_id": 84,
+ "origin_slot": 1,
+ "target_id": 82,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 44,
+ "origin_id": 73,
+ "origin_slot": 0,
+ "target_id": 83,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 65,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 81,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 80,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 84,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 45,
+ "origin_id": 77,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 81,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 83,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 83,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 79,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 84,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 74,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 85,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 73,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 86,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 75,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 87,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 76,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Conditioned",
+ "description": "Generates images from a text prompt and ControlNet conditioning (e.g. depth, canny) using Z-Image-Turbo."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Crop Images 2x2.json b/blueprints/Crop Images 2x2.json
new file mode 100644
index 000000000..99b89b608
--- /dev/null
+++ b/blueprints/Crop Images 2x2.json
@@ -0,0 +1,1621 @@
+{
+ "revision": 0,
+ "last_node_id": 139,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 135,
+ "type": "3b5ed000-6ab3-4458-91f7-8d6d366b0b40",
+ "pos": [
+ -2479.9999801712506,
+ 2019.9999372732784
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "label": "top_left",
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "bottom_left",
+ "localized_name": "IMAGE_1",
+ "name": "IMAGE_1",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "top_right",
+ "localized_name": "IMAGE_2",
+ "name": "IMAGE_2",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "bottom_right",
+ "localized_name": "IMAGE_3",
+ "name": "IMAGE_3",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "images",
+ "name": "IMAGE_4",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1"
+ },
+ "widgets_values": [],
+ "title": "Crop Images 2x2"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "3b5ed000-6ab3-4458-91f7-8d6d366b0b40",
+ "version": 1,
+ "state": {
+ "lastGroupId": 3,
+ "lastNodeId": 142,
+ "lastLinkId": 245,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Crop Images 2x2",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -10,
+ 1570,
+ 120,
+ 60
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 2919.9998608196274,
+ 1435,
+ 120,
+ 140
+ ]
+ },
+ "inputs": [
+ {
+ "id": "741854dd-bfb1-4700-ba8c-3b9dea59d021",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 2,
+ 11,
+ 13,
+ 30,
+ 32
+ ],
+ "localized_name": "image",
+ "pos": [
+ 90,
+ 1590
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "0eaca6d4-679a-433e-9703-bfa6dceacb18",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 41
+ ],
+ "localized_name": "IMAGE",
+ "label": "top_left",
+ "pos": [
+ 2939.9998608196274,
+ 1455
+ ]
+ },
+ {
+ "id": "fff5a1ad-3a74-4c87-938c-ee0fff55f840",
+ "name": "IMAGE_1",
+ "type": "IMAGE",
+ "linkIds": [
+ 42
+ ],
+ "localized_name": "IMAGE_1",
+ "label": "bottom_left",
+ "pos": [
+ 2939.9998608196274,
+ 1475
+ ]
+ },
+ {
+ "id": "08f40978-fb25-4d98-b716-b61e43b16043",
+ "name": "IMAGE_2",
+ "type": "IMAGE",
+ "linkIds": [
+ 43
+ ],
+ "localized_name": "IMAGE_2",
+ "label": "top_right",
+ "pos": [
+ 2939.9998608196274,
+ 1495
+ ]
+ },
+ {
+ "id": "17b9416f-3369-43c1-b62f-3e31fc2a7e32",
+ "name": "IMAGE_3",
+ "type": "IMAGE",
+ "linkIds": [
+ 44
+ ],
+ "localized_name": "IMAGE_3",
+ "label": "bottom_right",
+ "pos": [
+ 2939.9998608196274,
+ 1515
+ ]
+ },
+ {
+ "id": "430e2f3b-c617-4549-9daf-3ebf5be423a3",
+ "name": "IMAGE_4",
+ "type": "IMAGE",
+ "linkIds": [
+ 240
+ ],
+ "label": "images",
+ "pos": [
+ 2939.9998608196274,
+ 1535
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 7,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 740,
+ 1390
+ ],
+ "size": [
+ 370,
+ 190
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 3
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 4
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 7,
+ 14,
+ 28,
+ 40,
+ 242
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "max(1, int(a/b))"
+ ]
+ },
+ {
+ "id": 8,
+ "type": "GetImageSize",
+ "pos": [
+ 390,
+ 1450
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 2
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 3,
+ 241
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 5,
+ 245
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "GetImageSize"
+ }
+ },
+ {
+ "id": 9,
+ "type": "PrimitiveInt",
+ "pos": [
+ 390,
+ 1650
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 4,
+ 6
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveInt"
+ },
+ "widgets_values": [
+ 2,
+ "fixed"
+ ]
+ },
+ {
+ "id": 10,
+ "type": "ImageCropV2",
+ "pos": [
+ 1710,
+ 430
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 11
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 9
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 41,
+ 236
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 12,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1370,
+ 570
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 7
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 9
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 13,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 750,
+ 1650
+ ],
+ "size": [
+ 370,
+ 190
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 5
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 6
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 8,
+ 23,
+ 27,
+ 39,
+ 246
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "max(1, int(a/b))"
+ ]
+ },
+ {
+ "id": 138,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 1170,
+ 1210
+ ],
+ "size": [
+ 420,
+ 190
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 241
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 242
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 243,
+ 244
+ ]
+ }
+ ],
+ "title": "Math Expression (Right Width)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "max(1, a - b)"
+ ]
+ },
+ {
+ "id": 139,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 1170,
+ 1860
+ ],
+ "size": [
+ 420,
+ 190
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 245
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 246
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 247,
+ 248
+ ]
+ }
+ ],
+ "title": "Math Expression (Bottom Height)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "max(1, a - b)"
+ ]
+ },
+ {
+ "id": 15,
+ "type": "ImageCropV2",
+ "pos": [
+ 1740,
+ 1600
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 13
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 12
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 42,
+ 238
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 16,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1350,
+ 1780
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": 23
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 14
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 247
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 12
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 25,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1350,
+ 1200
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": 28
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 243
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 27
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 29
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 6,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 26,
+ "type": "ImageCropV2",
+ "pos": [
+ 1720,
+ 1050
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 30
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 29
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 43,
+ 237
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 30,
+ "type": "ImageCropV2",
+ "pos": [
+ 1740,
+ 2130
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 32
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 35
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 44,
+ 239
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 32,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1370,
+ 2280
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": 40
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": 39
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 244
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 248
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 35
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 6,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 137,
+ "type": "BatchImagesNode",
+ "pos": [
+ 2520,
+ 1540
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image0",
+ "localized_name": "images.image0",
+ "name": "images.image0",
+ "type": "IMAGE",
+ "link": 236
+ },
+ {
+ "label": "image1",
+ "localized_name": "images.image1",
+ "name": "images.image1",
+ "type": "IMAGE",
+ "link": 237
+ },
+ {
+ "label": "image2",
+ "localized_name": "images.image2",
+ "name": "images.image2",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 238
+ },
+ {
+ "label": "image3",
+ "localized_name": "images.image3",
+ "name": "images.image3",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 239
+ },
+ {
+ "label": "image4",
+ "localized_name": "images.image4",
+ "name": "images.image4",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 240
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "BatchImagesNode"
+ }
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Crop Images 2x2",
+ "bounding": [
+ 380,
+ 360,
+ 1710,
+ 2270
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 3,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": 7,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 4,
+ "origin_id": 9,
+ "origin_slot": 0,
+ "target_id": 7,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 9,
+ "origin_id": 12,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 7,
+ "origin_id": 7,
+ "origin_slot": 1,
+ "target_id": 12,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 8,
+ "origin_id": 13,
+ "origin_slot": 1,
+ "target_id": 12,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 5,
+ "origin_id": 8,
+ "origin_slot": 1,
+ "target_id": 13,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 6,
+ "origin_id": 9,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 12,
+ "origin_id": 16,
+ "origin_slot": 0,
+ "target_id": 15,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 23,
+ "origin_id": 13,
+ "origin_slot": 1,
+ "target_id": 16,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 14,
+ "origin_id": 7,
+ "origin_slot": 1,
+ "target_id": 16,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 247,
+ "origin_id": 139,
+ "origin_slot": 1,
+ "target_id": 16,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 28,
+ "origin_id": 7,
+ "origin_slot": 1,
+ "target_id": 25,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 243,
+ "origin_id": 138,
+ "origin_slot": 1,
+ "target_id": 25,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 27,
+ "origin_id": 13,
+ "origin_slot": 1,
+ "target_id": 25,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 29,
+ "origin_id": 25,
+ "origin_slot": 0,
+ "target_id": 26,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 35,
+ "origin_id": 32,
+ "origin_slot": 0,
+ "target_id": 30,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 40,
+ "origin_id": 7,
+ "origin_slot": 1,
+ "target_id": 32,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 39,
+ "origin_id": 13,
+ "origin_slot": 1,
+ "target_id": 32,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 244,
+ "origin_id": 138,
+ "origin_slot": 1,
+ "target_id": 32,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 248,
+ "origin_id": 139,
+ "origin_slot": 1,
+ "target_id": 32,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 241,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": 138,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 242,
+ "origin_id": 7,
+ "origin_slot": 1,
+ "target_id": 138,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 245,
+ "origin_id": 8,
+ "origin_slot": 1,
+ "target_id": 139,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 246,
+ "origin_id": 13,
+ "origin_slot": 1,
+ "target_id": 139,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 2,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 11,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 13,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 15,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 30,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 26,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 32,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 30,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 41,
+ "origin_id": 10,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 42,
+ "origin_id": 15,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 43,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 44,
+ "origin_id": 30,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 236,
+ "origin_id": 10,
+ "origin_slot": 0,
+ "target_id": 137,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 237,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 137,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 238,
+ "origin_id": 15,
+ "origin_slot": 0,
+ "target_id": 137,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 239,
+ "origin_id": 30,
+ "origin_slot": 0,
+ "target_id": 137,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 240,
+ "origin_id": 137,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 4,
+ "type": "IMAGE"
+ }
+ ],
+ "extra": {},
+ "category": "Image Tools/Crop",
+ "description": "Splits an image into a 2×2 grid of four tiles, output individually and as a batch. For odd dimensions, the right and bottom tiles are one pixel larger."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": [],
+ "links_added_by_ue": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Crop Images 3x3.json b/blueprints/Crop Images 3x3.json
new file mode 100644
index 000000000..6ac636da4
--- /dev/null
+++ b/blueprints/Crop Images 3x3.json
@@ -0,0 +1,2958 @@
+{
+ "revision": 0,
+ "last_node_id": 141,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 134,
+ "type": "7fd47bca-ff89-476c-a98d-ca6f7cf756fe",
+ "pos": [
+ -2620,
+ 1620
+ ],
+ "size": [
+ 230,
+ 290
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "label": "top_left",
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "top_center",
+ "name": "IMAGE_1",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "top_right",
+ "name": "IMAGE_2",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "middle_left",
+ "name": "IMAGE_3",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "middle_center",
+ "name": "IMAGE_4",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "middle_right",
+ "name": "IMAGE_5",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "bottom_left",
+ "name": "IMAGE_6",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "bottom_center",
+ "name": "IMAGE_7",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "bottom_right",
+ "name": "IMAGE_8",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "label": "images",
+ "name": "IMAGE_9",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1"
+ },
+ "widgets_values": [],
+ "title": "Crop Images 3x3"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "7fd47bca-ff89-476c-a98d-ca6f7cf756fe",
+ "version": 1,
+ "state": {
+ "lastGroupId": 3,
+ "lastNodeId": 142,
+ "lastLinkId": 245,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Crop Images 3x3",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -710,
+ 5440,
+ 120,
+ 60
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 3430,
+ 5270,
+ 121.720703125,
+ 240
+ ]
+ },
+ "inputs": [
+ {
+ "id": "e54e8e8b-6ce6-4f80-a38f-87a77d990efc",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 74,
+ 75,
+ 82,
+ 91,
+ 94,
+ 117,
+ 129,
+ 137,
+ 148,
+ 157
+ ],
+ "localized_name": "image",
+ "pos": [
+ -610,
+ 5460
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "3dd8abe2-a7da-4052-a556-9ae157ff3cf4",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 101
+ ],
+ "localized_name": "IMAGE",
+ "label": "top_left",
+ "pos": [
+ 3450,
+ 5290
+ ]
+ },
+ {
+ "id": "aa220733-759b-474e-9d29-634a3a23c5da",
+ "name": "IMAGE_1",
+ "type": "IMAGE",
+ "linkIds": [
+ 192
+ ],
+ "label": "top_center",
+ "pos": [
+ 3450,
+ 5310
+ ]
+ },
+ {
+ "id": "f1911df1-d50c-4bf8-9623-5e581d2a8902",
+ "name": "IMAGE_2",
+ "type": "IMAGE",
+ "linkIds": [
+ 193
+ ],
+ "label": "top_right",
+ "pos": [
+ 3450,
+ 5330
+ ]
+ },
+ {
+ "id": "71ebb807-e7e9-438f-990d-511e0745d10d",
+ "name": "IMAGE_3",
+ "type": "IMAGE",
+ "linkIds": [
+ 194
+ ],
+ "label": "middle_left",
+ "pos": [
+ 3450,
+ 5350
+ ]
+ },
+ {
+ "id": "4fb9c99c-3340-4de5-ba2d-51a653aab0b3",
+ "name": "IMAGE_4",
+ "type": "IMAGE",
+ "linkIds": [
+ 195
+ ],
+ "label": "middle_center",
+ "pos": [
+ 3450,
+ 5370
+ ]
+ },
+ {
+ "id": "398643e8-e349-4d59-9c68-6403b7a2772d",
+ "name": "IMAGE_5",
+ "type": "IMAGE",
+ "linkIds": [
+ 196
+ ],
+ "label": "middle_right",
+ "pos": [
+ 3450,
+ 5390
+ ]
+ },
+ {
+ "id": "5b11949c-f4cc-4525-86ae-690e30d3dada",
+ "name": "IMAGE_6",
+ "type": "IMAGE",
+ "linkIds": [
+ 197
+ ],
+ "label": "bottom_left",
+ "pos": [
+ 3450,
+ 5410
+ ]
+ },
+ {
+ "id": "82c69fd9-de36-4c8f-8311-a9e49159640b",
+ "name": "IMAGE_7",
+ "type": "IMAGE",
+ "linkIds": [
+ 198
+ ],
+ "label": "bottom_center",
+ "pos": [
+ 3450,
+ 5430
+ ]
+ },
+ {
+ "id": "aef678db-20aa-47d4-be8a-978065f078c6",
+ "name": "IMAGE_8",
+ "type": "IMAGE",
+ "linkIds": [
+ 199
+ ],
+ "label": "bottom_right",
+ "pos": [
+ 3450,
+ 5450
+ ]
+ },
+ {
+ "id": "77574277-edde-439c-8720-7daa849f4f27",
+ "name": "IMAGE_9",
+ "type": "IMAGE",
+ "linkIds": [
+ 226
+ ],
+ "label": "images",
+ "pos": [
+ 3450,
+ 5470
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 50,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 770,
+ 5310
+ ],
+ "size": [
+ 370,
+ 190
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 73
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 108
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 77,
+ 85,
+ 89,
+ 97,
+ 99,
+ 127,
+ 142,
+ 146,
+ 152,
+ 300
+ ]
+ }
+ ],
+ "title": "Math Expression (Width)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "max(1, int(a/b))"
+ ]
+ },
+ {
+ "id": 51,
+ "type": "GetImageSize",
+ "pos": [
+ 440,
+ 5390
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 74
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 73,
+ 300
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 79,
+ 305
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "GetImageSize"
+ }
+ },
+ {
+ "id": 52,
+ "type": "PrimitiveInt",
+ "pos": [
+ 440,
+ 5590
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 80,
+ 108
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveInt"
+ },
+ "widgets_values": [
+ 3,
+ "fixed"
+ ]
+ },
+ {
+ "id": 53,
+ "type": "ImageCropV2",
+ "pos": [
+ 2080,
+ 3020
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 75
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 76
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 101,
+ 227
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 54,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1740,
+ 3160
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 77
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 78
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 76
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 55,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 780,
+ 5570
+ ],
+ "size": [
+ 370,
+ 190
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 79
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 80
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 78,
+ 84,
+ 86,
+ 88,
+ 90,
+ 98,
+ 100,
+ 121,
+ 123,
+ 126,
+ 161
+ ]
+ }
+ ],
+ "title": "Math Expression(Height)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "max(1, int(a/b))"
+ ]
+ },
+ {
+ "id": 57,
+ "type": "ImageCropV2",
+ "pos": [
+ 2080,
+ 4700
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 82
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 83
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 194,
+ 230
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 58,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1740,
+ 4830
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": 84
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 85
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 86
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 83
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 60,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1740,
+ 3700
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": 88
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 89
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 90
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 92
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 6,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 61,
+ "type": "ImageCropV2",
+ "pos": [
+ 2100,
+ 3570
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 91
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 92
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 192,
+ 228
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 63,
+ "type": "ImageCropV2",
+ "pos": [
+ 2080,
+ 5310
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 94
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 95
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 195,
+ 231
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 65,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1750,
+ 5330
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": 97
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": 98
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 99
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 100
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 95
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 6,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 71,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 780,
+ 6090
+ ],
+ "size": [
+ 400,
+ 190
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 126
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 136,
+ 147,
+ 156,
+ 306
+ ]
+ }
+ ],
+ "title": "Math Expression(height)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "2 * a"
+ ]
+ },
+ {
+ "id": 75,
+ "type": "ImageCropV2",
+ "pos": [
+ 2100,
+ 5900
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 117
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 118
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 196,
+ 232
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 77,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1750,
+ 5970
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": 128
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": 121
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 302
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 123
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 118
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 6,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 78,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 780,
+ 5820
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 127
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 128,
+ 132,
+ 163,
+ 301
+ ]
+ }
+ ],
+ "title": "Math Expression(width)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "2 * a"
+ ]
+ },
+ {
+ "id": 140,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 1240,
+ 5640
+ ],
+ "size": [
+ 420,
+ 190
+ ],
+ "flags": {},
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 300
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 301
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 302,
+ 303,
+ 304
+ ]
+ }
+ ],
+ "title": "Math Expression (Right Width)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "max(1, a - b)"
+ ]
+ },
+ {
+ "id": 141,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 1230,
+ 6340
+ ],
+ "size": [
+ 420,
+ 190
+ ],
+ "flags": {},
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 305
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 306
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 307,
+ 308,
+ 309
+ ]
+ }
+ ],
+ "title": "Math Expression (Bottom Height)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "max(1, a - b)"
+ ]
+ },
+ {
+ "id": 79,
+ "type": "ImageCropV2",
+ "pos": [
+ 2120,
+ 7580
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 129
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 130
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 199,
+ 235
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 81,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1720,
+ 7620
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": 132
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": 136
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 303
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 307
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 130
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 6,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 82,
+ "type": "ImageCropV2",
+ "pos": [
+ 2120,
+ 7040
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 137
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 138
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 198,
+ 234
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 84,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1720,
+ 7080
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": 146
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": 147
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 142
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 308
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 138
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 6,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 85,
+ "type": "ImageCropV2",
+ "pos": [
+ 2110,
+ 6480
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 148
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 149
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 197,
+ 233
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 86,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1670,
+ 6570
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": 156
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 152
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 309
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 149
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 88,
+ "type": "ImageCropV2",
+ "pos": [
+ 2060,
+ 4140
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 157
+ },
+ {
+ "localized_name": "crop_region",
+ "name": "crop_region",
+ "type": "BOUNDING_BOX",
+ "widget": {
+ "name": "crop_region"
+ },
+ "link": 158
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 193,
+ 229
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ImageCropV2"
+ },
+ "widgets_values": [
+ {
+ "x": 0,
+ "y": 0,
+ "width": 512,
+ "height": 512
+ },
+ 0,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 89,
+ "type": "PrimitiveBoundingBox",
+ "pos": [
+ 1720,
+ 4150
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": 163
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 304
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 161
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOUNDING_BOX",
+ "name": "BOUNDING_BOX",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 158
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "PrimitiveBoundingBox"
+ },
+ "widgets_values": [
+ 6,
+ 0,
+ 512,
+ 512
+ ]
+ },
+ {
+ "id": 136,
+ "type": "BatchImagesNode",
+ "pos": [
+ 3170,
+ 5640
+ ],
+ "size": [
+ 230,
+ 290
+ ],
+ "flags": {},
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image0",
+ "localized_name": "images.image0",
+ "name": "images.image0",
+ "type": "IMAGE",
+ "link": 227
+ },
+ {
+ "label": "image1",
+ "localized_name": "images.image1",
+ "name": "images.image1",
+ "type": "IMAGE",
+ "link": 228
+ },
+ {
+ "label": "image2",
+ "localized_name": "images.image2",
+ "name": "images.image2",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 229
+ },
+ {
+ "label": "image3",
+ "localized_name": "images.image3",
+ "name": "images.image3",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 230
+ },
+ {
+ "label": "image4",
+ "localized_name": "images.image4",
+ "name": "images.image4",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 231
+ },
+ {
+ "label": "image5",
+ "localized_name": "images.image5",
+ "name": "images.image5",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 232
+ },
+ {
+ "label": "image6",
+ "localized_name": "images.image6",
+ "name": "images.image6",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 233
+ },
+ {
+ "label": "image7",
+ "localized_name": "images.image7",
+ "name": "images.image7",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 234
+ },
+ {
+ "label": "image8",
+ "localized_name": "images.image8",
+ "name": "images.image8",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 235
+ },
+ {
+ "label": "image9",
+ "localized_name": "images.image9",
+ "name": "images.image9",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 226
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "BatchImagesNode"
+ }
+ }
+ ],
+ "groups": [
+ {
+ "id": 3,
+ "title": "Crop Images 3x3",
+ "bounding": [
+ 100,
+ 2700,
+ 2640,
+ 5480
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 73,
+ "origin_id": 51,
+ "origin_slot": 0,
+ "target_id": 50,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 108,
+ "origin_id": 52,
+ "origin_slot": 0,
+ "target_id": 50,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 76,
+ "origin_id": 54,
+ "origin_slot": 0,
+ "target_id": 53,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 77,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 54,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 78,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 54,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 79,
+ "origin_id": 51,
+ "origin_slot": 1,
+ "target_id": 55,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 80,
+ "origin_id": 52,
+ "origin_slot": 0,
+ "target_id": 55,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 83,
+ "origin_id": 58,
+ "origin_slot": 0,
+ "target_id": 57,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 84,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 58,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 85,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 58,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 86,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 58,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 88,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 60,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 89,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 60,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 90,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 60,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 92,
+ "origin_id": 60,
+ "origin_slot": 0,
+ "target_id": 61,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 95,
+ "origin_id": 65,
+ "origin_slot": 0,
+ "target_id": 63,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 97,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 65,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 98,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 65,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 99,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 65,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 100,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 65,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 126,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 71,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 118,
+ "origin_id": 77,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 128,
+ "origin_id": 78,
+ "origin_slot": 1,
+ "target_id": 77,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 121,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 77,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 302,
+ "origin_id": 140,
+ "origin_slot": 1,
+ "target_id": 77,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 123,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 77,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 127,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 78,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 130,
+ "origin_id": 81,
+ "origin_slot": 0,
+ "target_id": 79,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 132,
+ "origin_id": 78,
+ "origin_slot": 1,
+ "target_id": 81,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 136,
+ "origin_id": 71,
+ "origin_slot": 1,
+ "target_id": 81,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 303,
+ "origin_id": 140,
+ "origin_slot": 1,
+ "target_id": 81,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 307,
+ "origin_id": 141,
+ "origin_slot": 1,
+ "target_id": 81,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 138,
+ "origin_id": 84,
+ "origin_slot": 0,
+ "target_id": 82,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 146,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 84,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 147,
+ "origin_id": 71,
+ "origin_slot": 1,
+ "target_id": 84,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 142,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 84,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 308,
+ "origin_id": 141,
+ "origin_slot": 1,
+ "target_id": 84,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 149,
+ "origin_id": 86,
+ "origin_slot": 0,
+ "target_id": 85,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 156,
+ "origin_id": 71,
+ "origin_slot": 1,
+ "target_id": 86,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 152,
+ "origin_id": 50,
+ "origin_slot": 1,
+ "target_id": 86,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 309,
+ "origin_id": 141,
+ "origin_slot": 1,
+ "target_id": 86,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 158,
+ "origin_id": 89,
+ "origin_slot": 0,
+ "target_id": 88,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 163,
+ "origin_id": 78,
+ "origin_slot": 1,
+ "target_id": 89,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 304,
+ "origin_id": 140,
+ "origin_slot": 1,
+ "target_id": 89,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 161,
+ "origin_id": 55,
+ "origin_slot": 1,
+ "target_id": 89,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 300,
+ "origin_id": 51,
+ "origin_slot": 0,
+ "target_id": 140,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 301,
+ "origin_id": 78,
+ "origin_slot": 1,
+ "target_id": 140,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 305,
+ "origin_id": 51,
+ "origin_slot": 1,
+ "target_id": 141,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 306,
+ "origin_id": 71,
+ "origin_slot": 1,
+ "target_id": 141,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 74,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 51,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 75,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 53,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 82,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 57,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 91,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 61,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 94,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 63,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 117,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 129,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 79,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 137,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 82,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 148,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 85,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 157,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 88,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 101,
+ "origin_id": 53,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 192,
+ "origin_id": 61,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 193,
+ "origin_id": 88,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 194,
+ "origin_id": 57,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 195,
+ "origin_id": 63,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
+ {
+ "id": 196,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 5,
+ "type": "IMAGE"
+ },
+ {
+ "id": 197,
+ "origin_id": 85,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 6,
+ "type": "IMAGE"
+ },
+ {
+ "id": 198,
+ "origin_id": 82,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 7,
+ "type": "IMAGE"
+ },
+ {
+ "id": 199,
+ "origin_id": 79,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 8,
+ "type": "IMAGE"
+ },
+ {
+ "id": 226,
+ "origin_id": 136,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 9,
+ "type": "IMAGE"
+ },
+ {
+ "id": 227,
+ "origin_id": 53,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 228,
+ "origin_id": 61,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 229,
+ "origin_id": 88,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 230,
+ "origin_id": 57,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 231,
+ "origin_id": 63,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
+ {
+ "id": 232,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 5,
+ "type": "IMAGE"
+ },
+ {
+ "id": 233,
+ "origin_id": 85,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 6,
+ "type": "IMAGE"
+ },
+ {
+ "id": 234,
+ "origin_id": 82,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 7,
+ "type": "IMAGE"
+ },
+ {
+ "id": 235,
+ "origin_id": 79,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 8,
+ "type": "IMAGE"
+ }
+ ],
+ "extra": {},
+ "category": "Image Tools/Crop",
+ "description": "Splits an image into a 3×3 grid of nine equal tiles."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": [],
+ "links_added_by_ue": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Depth to Image (Z-Image-Turbo).json b/blueprints/Depth to Image (Z-Image-Turbo).json
index 0b657534f..2790827a3 100644
--- a/blueprints/Depth to Image (Z-Image-Turbo).json
+++ b/blueprints/Depth to Image (Z-Image-Turbo).json
@@ -160,7 +160,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Depth to Image (Z-Image-Turbo)",
+ "name": "Depth to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1579,7 +1579,8 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
- "category": "Image generation and editing/Depth to image"
+ "category": "Image generation and editing/Conditioned",
+ "description": "Generates an image from a depth map using Z-Image-Turbo with text conditioning."
},
{
"id": "458bdf3c-4b58-421c-af50-c9c663a4d74c",
@@ -2461,7 +2462,8 @@
]
},
"workflowRendererVersion": "LG"
- }
+ },
+ "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
}
]
},
@@ -2482,4 +2484,4 @@
"VHS_KeepIntermediate": true
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Depth to Video (ltx 2.0).json b/blueprints/Depth to Video (ltx 2.0).json
index 98c39eea5..56912de51 100644
--- a/blueprints/Depth to Video (ltx 2.0).json
+++ b/blueprints/Depth to Video (ltx 2.0).json
@@ -261,7 +261,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Depth to Video (LTX 2.0)",
+ "name": "Depth to Video (LTX 2.0)",
"inputNode": {
"id": -10,
"bounding": [
@@ -4233,7 +4233,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Video generation and editing/Depth to video"
+ "category": "Video generation and editing/Conditioned",
+ "description": "Generates depth-controlled video with LTX-2: motion and structure follow a depth-reference video alongside text prompting, optional first-frame image conditioning, with optional synchronized audio."
},
{
"id": "38b60539-50a7-42f9-a5fe-bdeca26272e2",
@@ -5192,7 +5193,8 @@
],
"extra": {
"workflowRendererVersion": "LG"
- }
+ },
+ "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
}
]
},
@@ -5208,4 +5210,4 @@
"workflowRendererVersion": "LG"
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Edge-Preserving Blur.json b/blueprints/Edge-Preserving Blur.json
index 18012beb1..fbda9f126 100644
--- a/blueprints/Edge-Preserving Blur.json
+++ b/blueprints/Edge-Preserving Blur.json
@@ -450,9 +450,10 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Blur"
+ "category": "Image Tools/Blur",
+ "description": "Applies bilateral (edge-preserving) blur to soften images while retaining detail."
}
]
},
"extra": {}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Film Grain.json b/blueprints/Film Grain.json
index a680b3ece..3226ea9aa 100644
--- a/blueprints/Film Grain.json
+++ b/blueprints/Film Grain.json
@@ -580,8 +580,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Adds procedural film grain texture for a cinematic look via GPU fragment shader."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/First-Last-Frame to Video (LTX-2.3).json b/blueprints/First-Last-Frame to Video (LTX-2.3).json
new file mode 100644
index 000000000..4cae2dc24
--- /dev/null
+++ b/blueprints/First-Last-Frame to Video (LTX-2.3).json
@@ -0,0 +1,3361 @@
+{
+ "revision": 0,
+ "last_node_id": 228,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 228,
+ "type": "a5982aee-8136-4819-86a0-cf9d9e510ad6",
+ "pos": [
+ 1490,
+ 4730
+ ],
+ "size": [
+ 274.8169921875,
+ 276
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "first_frame",
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": null
+ },
+ {
+ "label": "last_frame",
+ "localized_name": "input_1",
+ "name": "input_1",
+ "type": "IMAGE,MASK",
+ "link": null
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "label": "width",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "height",
+ "name": "value_1",
+ "type": "INT",
+ "widget": {
+ "name": "value_1"
+ },
+ "link": null
+ },
+ {
+ "label": "duration",
+ "name": "value_2",
+ "type": "INT",
+ "widget": {
+ "name": "value_2"
+ },
+ "link": null
+ },
+ {
+ "label": "fps",
+ "name": "value_3",
+ "type": "INT",
+ "widget": {
+ "name": "value_3"
+ },
+ "link": null
+ },
+ {
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ },
+ {
+ "label": "ckpt_name",
+ "name": "ckpt_name_1",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name_1"
+ },
+ "link": null
+ },
+ {
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "222",
+ "text"
+ ],
+ [
+ "215",
+ "value"
+ ],
+ [
+ "216",
+ "value"
+ ],
+ [
+ "198",
+ "value"
+ ],
+ [
+ "205",
+ "value"
+ ],
+ [
+ "196",
+ "noise_seed"
+ ],
+ [
+ "224",
+ "ckpt_name"
+ ],
+ [
+ "225",
+ "text_encoder"
+ ]
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1"
+ },
+ "widgets_values": [],
+ "title": "First-Last-Frame to Video (LTX-2.3)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "a5982aee-8136-4819-86a0-cf9d9e510ad6",
+ "version": 1,
+ "state": {
+ "lastGroupId": 22,
+ "lastNodeId": 228,
+ "lastLinkId": 276,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "First-Last-Frame to Video (LTX-2.3)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ 270,
+ 3100,
+ 120,
+ 240
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 3620,
+ 3120,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "6fe179c4-d96f-4383-b202-844f6de4922e",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "linkIds": [
+ 251
+ ],
+ "localized_name": "input",
+ "label": "first_frame",
+ "pos": [
+ 370,
+ 3120
+ ]
+ },
+ {
+ "id": "e80df1ae-5f39-4f86-91bd-0467635e2f2d",
+ "name": "input_1",
+ "type": "IMAGE,MASK",
+ "linkIds": [
+ 253
+ ],
+ "localized_name": "input_1",
+ "label": "last_frame",
+ "pos": [
+ 370,
+ 3140
+ ]
+ },
+ {
+ "id": "433148fa-bf73-4ab1-81d9-09e2e38ed861",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 265
+ ],
+ "pos": [
+ 370,
+ 3160
+ ]
+ },
+ {
+ "id": "36915bc8-a6ed-4d48-8619-e0e8723228e9",
+ "name": "value",
+ "type": "INT",
+ "linkIds": [
+ 266
+ ],
+ "label": "width",
+ "pos": [
+ 370,
+ 3180
+ ]
+ },
+ {
+ "id": "425a36b8-91ab-41b7-81e9-496eba064ec8",
+ "name": "value_1",
+ "type": "INT",
+ "linkIds": [
+ 267
+ ],
+ "label": "height",
+ "pos": [
+ 370,
+ 3200
+ ]
+ },
+ {
+ "id": "0c9e003b-bd07-4b7d-aa6d-789e138ed161",
+ "name": "value_2",
+ "type": "INT",
+ "linkIds": [
+ 268
+ ],
+ "label": "duration",
+ "pos": [
+ 370,
+ 3220
+ ]
+ },
+ {
+ "id": "581b52ff-21c5-4774-ac2a-8f69a7e09e2e",
+ "name": "value_3",
+ "type": "INT",
+ "linkIds": [
+ 269
+ ],
+ "label": "fps",
+ "pos": [
+ 370,
+ 3240
+ ]
+ },
+ {
+ "id": "d03cc171-45da-4658-99aa-77252bbcf522",
+ "name": "noise_seed",
+ "type": "INT",
+ "linkIds": [
+ 270
+ ],
+ "pos": [
+ 370,
+ 3260
+ ]
+ },
+ {
+ "id": "e68e61c8-905e-43ac-8c76-65ac52270a08",
+ "name": "ckpt_name_1",
+ "type": "COMBO",
+ "linkIds": [
+ 272,
+ 275,
+ 276
+ ],
+ "label": "ckpt_name",
+ "pos": [
+ 370,
+ 3280
+ ]
+ },
+ {
+ "id": "5d065f3b-891b-499f-950b-c2df0be24536",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "linkIds": [
+ 273
+ ],
+ "pos": [
+ 370,
+ 3300
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "0c8c2dc0-c67c-4bc2-9e57-6aa00db2e3a9",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "linkIds": [
+ 252
+ ],
+ "localized_name": "VIDEO",
+ "pos": [
+ 3640,
+ 3140
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 195,
+ "type": "LTXVPreprocess",
+ "pos": [
+ 1480,
+ 3780
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 203
+ },
+ {
+ "localized_name": "img_compression",
+ "name": "img_compression",
+ "type": "INT",
+ "widget": {
+ "name": "img_compression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output_image",
+ "name": "output_image",
+ "type": "IMAGE",
+ "links": [
+ 229
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVPreprocess",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25
+ ]
+ },
+ {
+ "id": 196,
+ "type": "RandomNoise",
+ "pos": [
+ 1990,
+ 2320
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": 270
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 246
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "noise_seed": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "RandomNoise",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 315253765879496,
+ "randomize"
+ ]
+ },
+ {
+ "id": 197,
+ "type": "LTXVEmptyLatentAudio",
+ "pos": [
+ 2090,
+ 3820
+ ],
+ "size": [
+ 280,
+ 170
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 205
+ },
+ {
+ "localized_name": "frames_number",
+ "name": "frames_number",
+ "type": "INT",
+ "widget": {
+ "name": "frames_number"
+ },
+ "link": 262
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "INT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 207
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Latent",
+ "name": "Latent",
+ "type": "LATENT",
+ "links": [
+ 245
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "Node name for S&R": "LTXVEmptyLatentAudio",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 97,
+ 25,
+ 1
+ ]
+ },
+ {
+ "id": 198,
+ "type": "PrimitiveInt",
+ "pos": [
+ 760,
+ 3650
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 268
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 260
+ ]
+ }
+ ],
+ "title": "Duration",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 5,
+ "fixed"
+ ]
+ },
+ {
+ "id": 199,
+ "type": "LTXVPreprocess",
+ "pos": [
+ 1480,
+ 3340
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 210
+ },
+ {
+ "localized_name": "img_compression",
+ "name": "img_compression",
+ "type": "INT",
+ "widget": {
+ "name": "img_compression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output_image",
+ "name": "output_image",
+ "type": "IMAGE",
+ "links": [
+ 240
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVPreprocess",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25
+ ]
+ },
+ {
+ "id": 200,
+ "type": "LTXVCropGuides",
+ "pos": [
+ 2820,
+ 2450
+ ],
+ "size": [
+ 280,
+ 120
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 213
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 214
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 215
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": []
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": []
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 211
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.5.2"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.8.2",
+ "Node name for S&R": "LTXVCropGuides",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 201,
+ "type": "EmptyLTXVLatentVideo",
+ "pos": [
+ 2090,
+ 3580
+ ],
+ "size": [
+ 280,
+ 200
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 218
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 219
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 263
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 239
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.60",
+ "Node name for S&R": "EmptyLTXVLatentVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 512,
+ 97,
+ 1
+ ]
+ },
+ {
+ "id": 202,
+ "type": "LTXVConditioning",
+ "pos": [
+ 2090,
+ 3400
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 221
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 222
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "FLOAT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 223
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 236
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 237
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "LTXVConditioning",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25
+ ]
+ },
+ {
+ "id": 203,
+ "type": "GetImageSize",
+ "pos": [
+ 1480,
+ 3500
+ ],
+ "size": [
+ 230,
+ 130
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 224
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 218
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 219
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "GetImageSize",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 204,
+ "type": "LTXVAddGuide",
+ "pos": [
+ 2750,
+ 3700
+ ],
+ "size": [
+ 280,
+ 240
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 225
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 226
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 227
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 228
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 229
+ },
+ {
+ "localized_name": "frame_idx",
+ "name": "frame_idx",
+ "type": "INT",
+ "widget": {
+ "name": "frame_idx"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 213,
+ 242
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 214,
+ 243
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 244
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "LTXVAddGuide",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ -1,
+ 0.7
+ ]
+ },
+ {
+ "id": 205,
+ "type": "PrimitiveInt",
+ "pos": [
+ 760,
+ 3800
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 269
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 207,
+ 235,
+ 261
+ ]
+ }
+ ],
+ "title": "Frame Rate(int)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25,
+ "fixed"
+ ]
+ },
+ {
+ "id": 206,
+ "type": "LTXVAddGuide",
+ "pos": [
+ 2750,
+ 3430
+ ],
+ "size": [
+ 280,
+ 240
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 236
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 237
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 238
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 239
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 240
+ },
+ {
+ "localized_name": "frame_idx",
+ "name": "frame_idx",
+ "type": "INT",
+ "widget": {
+ "name": "frame_idx"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 225
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 226
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 228
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "LTXVAddGuide",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 0.7
+ ]
+ },
+ {
+ "id": 207,
+ "type": "CFGGuider",
+ "pos": [
+ 1990,
+ 2500
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 241
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 242
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 243
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "links": [
+ 247
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "CFGGuider",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 208,
+ "type": "SamplerEulerAncestral",
+ "pos": [
+ 1990,
+ 2720
+ ],
+ "size": [
+ 280,
+ 120
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "eta",
+ "name": "eta",
+ "type": "FLOAT",
+ "widget": {
+ "name": "eta"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "s_noise",
+ "name": "s_noise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "s_noise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 248
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "SamplerEulerAncestral",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 209,
+ "type": "ManualSigmas",
+ "pos": [
+ 1990,
+ 2910
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "STRING",
+ "widget": {
+ "name": "sigmas"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 249
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "ManualSigmas",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "1., 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0"
+ ]
+ },
+ {
+ "id": 210,
+ "type": "LTXVConcatAVLatent",
+ "pos": [
+ 1990,
+ 3090
+ ],
+ "size": [
+ 280,
+ 100
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "link": 244
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "link": 245
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 250
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVConcatAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 211,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 2460,
+ 2330
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 246
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 247
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 248
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 249
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 250
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "links": []
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": [
+ 204
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 212,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 760,
+ 3970
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 235
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 223,
+ 234
+ ]
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.17.0",
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a"
+ ]
+ },
+ {
+ "id": 213,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ 1130,
+ 3340
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 251
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "resize_type.width",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.width"
+ },
+ "link": 208
+ },
+ {
+ "localized_name": "height",
+ "name": "resize_type.height",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.height"
+ },
+ "link": 209
+ },
+ {
+ "localized_name": "crop",
+ "name": "resize_type.crop",
+ "type": "COMBO",
+ "widget": {
+ "name": "resize_type.crop"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 210,
+ 224
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "resize_type.width": true,
+ "resize_type.height": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "ResizeImageMaskNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale dimensions",
+ 640,
+ 360,
+ "center",
+ "nearest-exact"
+ ]
+ },
+ {
+ "id": 214,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ 1130,
+ 3780
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 253
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "resize_type.width",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.width"
+ },
+ "link": 201
+ },
+ {
+ "localized_name": "height",
+ "name": "resize_type.height",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.height"
+ },
+ "link": 202
+ },
+ {
+ "localized_name": "crop",
+ "name": "resize_type.crop",
+ "type": "COMBO",
+ "widget": {
+ "name": "resize_type.crop"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 203
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "resize_type.width": true,
+ "resize_type.height": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "ResizeImageMaskNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale dimensions",
+ 640,
+ 360,
+ "center",
+ "nearest-exact"
+ ]
+ },
+ {
+ "id": 215,
+ "type": "PrimitiveInt",
+ "pos": [
+ 760,
+ 3340
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 266
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 201,
+ 208
+ ]
+ }
+ ],
+ "title": "Width",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1280,
+ "fixed"
+ ]
+ },
+ {
+ "id": 216,
+ "type": "PrimitiveInt",
+ "pos": [
+ 760,
+ 3490
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 267
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 202,
+ 209
+ ]
+ }
+ ],
+ "title": "Height",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 720,
+ "fixed"
+ ]
+ },
+ {
+ "id": 217,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 1320,
+ 2870
+ ],
+ "size": [
+ 590,
+ 200
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 230
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 222
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, unreadable text on shirt or hat, incorrect lettering on cap (“PNTR”), incorrect t-shirt slogan (“JUST DO IT”), missing microphone, misplaced microphone, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, smiling, laughing, exaggerated sadness, wrong gaze direction, eyes looking at camera, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, missing sniff sounds, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, missing door or shelves, missing shallow depth of field, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+ ],
+ "color": "#323",
+ "bgcolor": "#535"
+ },
+ {
+ "id": 218,
+ "type": "CreateVideo",
+ "pos": [
+ 3280,
+ 2320
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 232
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 233
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 234
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 252
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "CreateVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 24
+ ]
+ },
+ {
+ "id": 219,
+ "type": "VAEDecodeTiled",
+ "pos": [
+ 2820,
+ 2630
+ ],
+ "size": [
+ 280,
+ 200
+ ],
+ "flags": {},
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 211
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 212
+ },
+ {
+ "localized_name": "tile_size",
+ "name": "tile_size",
+ "type": "INT",
+ "widget": {
+ "name": "tile_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "overlap",
+ "name": "overlap",
+ "type": "INT",
+ "widget": {
+ "name": "overlap"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temporal_size",
+ "name": "temporal_size",
+ "type": "INT",
+ "widget": {
+ "name": "temporal_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temporal_overlap",
+ "name": "temporal_overlap",
+ "type": "INT",
+ "widget": {
+ "name": "temporal_overlap"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 232
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "VAEDecodeTiled",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 64,
+ 4096,
+ 64
+ ]
+ },
+ {
+ "id": 220,
+ "type": "LTXVAudioVAEDecode",
+ "pos": [
+ 2820,
+ 2920
+ ],
+ "size": [
+ 280,
+ 100
+ ],
+ "flags": {},
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 216
+ },
+ {
+ "label": "Audio VAE",
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 217
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio",
+ "name": "Audio",
+ "type": "AUDIO",
+ "links": [
+ 233
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVAudioVAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 221,
+ "type": "LTXVSeparateAVLatent",
+ "pos": [
+ 2460,
+ 2580
+ ],
+ "size": [
+ 250,
+ 100
+ ],
+ "flags": {},
+ "order": 26,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "av_latent",
+ "name": "av_latent",
+ "type": "LATENT",
+ "link": 204
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "links": [
+ 215
+ ]
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "links": [
+ 216
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "LTXVSeparateAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 222,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 1310,
+ 2380
+ ],
+ "size": [
+ 620,
+ 420
+ ],
+ "flags": {},
+ "order": 27,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 231
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 265
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 221
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 223,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 770,
+ 2380
+ ],
+ "size": [
+ 420,
+ 160
+ ],
+ "flags": {},
+ "order": 28,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 276
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 241
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 212,
+ 227,
+ 238
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.5.2"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.10.0",
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-distilled-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-distilled-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-distilled-fp8.safetensors"
+ ]
+ },
+ {
+ "id": 224,
+ "type": "LTXVAudioVAELoader",
+ "pos": [
+ 770,
+ 2660
+ ],
+ "size": [
+ 420,
+ 110
+ ],
+ "flags": {},
+ "order": 29,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 272
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio VAE",
+ "name": "Audio VAE",
+ "type": "VAE",
+ "links": [
+ 205,
+ 217
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.5.2"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.10.0",
+ "Node name for S&R": "LTXVAudioVAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-distilled-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-distilled-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-distilled-fp8.safetensors"
+ ]
+ },
+ {
+ "id": 225,
+ "type": "LTXAVTextEncoderLoader",
+ "pos": [
+ 770,
+ 2890
+ ],
+ "size": [
+ 410,
+ 160
+ ],
+ "flags": {},
+ "order": 30,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "text_encoder",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": 273
+ },
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 275
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 230,
+ 231
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.5.2"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.10.0",
+ "Node name for S&R": "LTXAVTextEncoderLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "gemma_3_12B_it_fp4_mixed.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ltx-2/resolve/main/split_files/text_encoders/gemma_3_12B_it_fp4_mixed.safetensors",
+ "directory": "text_encoders"
+ },
+ {
+ "name": "ltx-2.3-22b-distilled-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-distilled-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "gemma_3_12B_it_fp4_mixed.safetensors",
+ "ltx-2.3-22b-distilled-fp8.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 226,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 760,
+ 4020
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 31,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 260
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 261
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 262,
+ 263
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a * b + 1"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Conditioning",
+ "bounding": [
+ 1850,
+ 3250,
+ 1370,
+ 800
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Settings",
+ "bounding": [
+ 730,
+ 3250,
+ 290,
+ 800
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "First Frame",
+ "bounding": [
+ 1050,
+ 3250,
+ 770,
+ 400
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Last Frame",
+ "bounding": [
+ 1050,
+ 3680,
+ 770,
+ 370
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 5,
+ "title": "Model",
+ "bounding": [
+ 730,
+ 2240,
+ 500,
+ 980
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "Prompt",
+ "bounding": [
+ 1260,
+ 2240,
+ 680,
+ 980
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Sampling",
+ "bounding": [
+ 1970,
+ 2240,
+ 770,
+ 980
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 8,
+ "title": "Decoding",
+ "bounding": [
+ 2770,
+ 2240,
+ 450,
+ 980
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 203,
+ "origin_id": 214,
+ "origin_slot": 0,
+ "target_id": 195,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 205,
+ "origin_id": 224,
+ "origin_slot": 0,
+ "target_id": 197,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 207,
+ "origin_id": 205,
+ "origin_slot": 0,
+ "target_id": 197,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 210,
+ "origin_id": 213,
+ "origin_slot": 0,
+ "target_id": 199,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 213,
+ "origin_id": 204,
+ "origin_slot": 0,
+ "target_id": 200,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 214,
+ "origin_id": 204,
+ "origin_slot": 1,
+ "target_id": 200,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 215,
+ "origin_id": 221,
+ "origin_slot": 0,
+ "target_id": 200,
+ "target_slot": 2,
+ "type": "LATENT"
+ },
+ {
+ "id": 218,
+ "origin_id": 203,
+ "origin_slot": 0,
+ "target_id": 201,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 219,
+ "origin_id": 203,
+ "origin_slot": 1,
+ "target_id": 201,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 221,
+ "origin_id": 222,
+ "origin_slot": 0,
+ "target_id": 202,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 222,
+ "origin_id": 217,
+ "origin_slot": 0,
+ "target_id": 202,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 223,
+ "origin_id": 212,
+ "origin_slot": 0,
+ "target_id": 202,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 224,
+ "origin_id": 213,
+ "origin_slot": 0,
+ "target_id": 203,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 225,
+ "origin_id": 206,
+ "origin_slot": 0,
+ "target_id": 204,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 226,
+ "origin_id": 206,
+ "origin_slot": 1,
+ "target_id": 204,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 227,
+ "origin_id": 223,
+ "origin_slot": 2,
+ "target_id": 204,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 228,
+ "origin_id": 206,
+ "origin_slot": 2,
+ "target_id": 204,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 229,
+ "origin_id": 195,
+ "origin_slot": 0,
+ "target_id": 204,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
+ {
+ "id": 236,
+ "origin_id": 202,
+ "origin_slot": 0,
+ "target_id": 206,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 237,
+ "origin_id": 202,
+ "origin_slot": 1,
+ "target_id": 206,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 238,
+ "origin_id": 223,
+ "origin_slot": 2,
+ "target_id": 206,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 239,
+ "origin_id": 201,
+ "origin_slot": 0,
+ "target_id": 206,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 240,
+ "origin_id": 199,
+ "origin_slot": 0,
+ "target_id": 206,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
+ {
+ "id": 241,
+ "origin_id": 223,
+ "origin_slot": 0,
+ "target_id": 207,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 242,
+ "origin_id": 204,
+ "origin_slot": 0,
+ "target_id": 207,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 243,
+ "origin_id": 204,
+ "origin_slot": 1,
+ "target_id": 207,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 244,
+ "origin_id": 204,
+ "origin_slot": 2,
+ "target_id": 210,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 245,
+ "origin_id": 197,
+ "origin_slot": 0,
+ "target_id": 210,
+ "target_slot": 1,
+ "type": "LATENT"
+ },
+ {
+ "id": 246,
+ "origin_id": 196,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 247,
+ "origin_id": 207,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 248,
+ "origin_id": 208,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 249,
+ "origin_id": 209,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 250,
+ "origin_id": 210,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 235,
+ "origin_id": 205,
+ "origin_slot": 0,
+ "target_id": 212,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 208,
+ "origin_id": 215,
+ "origin_slot": 0,
+ "target_id": 213,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 209,
+ "origin_id": 216,
+ "origin_slot": 0,
+ "target_id": 213,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 201,
+ "origin_id": 215,
+ "origin_slot": 0,
+ "target_id": 214,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 202,
+ "origin_id": 216,
+ "origin_slot": 0,
+ "target_id": 214,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 230,
+ "origin_id": 225,
+ "origin_slot": 0,
+ "target_id": 217,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 232,
+ "origin_id": 219,
+ "origin_slot": 0,
+ "target_id": 218,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 233,
+ "origin_id": 220,
+ "origin_slot": 0,
+ "target_id": 218,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 234,
+ "origin_id": 212,
+ "origin_slot": 0,
+ "target_id": 218,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 211,
+ "origin_id": 200,
+ "origin_slot": 2,
+ "target_id": 219,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 212,
+ "origin_id": 223,
+ "origin_slot": 2,
+ "target_id": 219,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 216,
+ "origin_id": 221,
+ "origin_slot": 1,
+ "target_id": 220,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 217,
+ "origin_id": 224,
+ "origin_slot": 0,
+ "target_id": 220,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 204,
+ "origin_id": 211,
+ "origin_slot": 1,
+ "target_id": 221,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 231,
+ "origin_id": 225,
+ "origin_slot": 0,
+ "target_id": 222,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 251,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 213,
+ "target_slot": 0,
+ "type": "IMAGE,MASK"
+ },
+ {
+ "id": 253,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 214,
+ "target_slot": 0,
+ "type": "IMAGE,MASK"
+ },
+ {
+ "id": 252,
+ "origin_id": 218,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 260,
+ "origin_id": 198,
+ "origin_slot": 0,
+ "target_id": 226,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 261,
+ "origin_id": 205,
+ "origin_slot": 0,
+ "target_id": 226,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 262,
+ "origin_id": 226,
+ "origin_slot": 1,
+ "target_id": 197,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 263,
+ "origin_id": 226,
+ "origin_slot": 1,
+ "target_id": 201,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 265,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 222,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 266,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 215,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 267,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 216,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 268,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 198,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 269,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 205,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 270,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 196,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 272,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 224,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 273,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 225,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 275,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 225,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 276,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 223,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Video generation and editing/Conditioned",
+ "description": "Generates a video interpolating between first and last keyframes using LTX-2.3."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/First-Last-Frame to Video.json b/blueprints/First-Last-Frame to Video.json
new file mode 100644
index 000000000..d76e1e045
--- /dev/null
+++ b/blueprints/First-Last-Frame to Video.json
@@ -0,0 +1,3361 @@
+{
+ "revision": 0,
+ "last_node_id": 227,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 227,
+ "type": "283e4561-61a2-4538-b960-265736eb041f",
+ "pos": [
+ 620,
+ 3140
+ ],
+ "size": [
+ 540,
+ 0
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "first_frame",
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": null
+ },
+ {
+ "label": "last_frame",
+ "localized_name": "input_1",
+ "name": "input_1",
+ "type": "IMAGE,MASK",
+ "link": null
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "label": "width",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "height",
+ "name": "value_1",
+ "type": "INT",
+ "widget": {
+ "name": "value_1"
+ },
+ "link": null
+ },
+ {
+ "label": "duration",
+ "name": "value_2",
+ "type": "INT",
+ "widget": {
+ "name": "value_2"
+ },
+ "link": null
+ },
+ {
+ "label": "fps",
+ "name": "value_3",
+ "type": "INT",
+ "widget": {
+ "name": "value_3"
+ },
+ "link": null
+ },
+ {
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ },
+ {
+ "label": "ckpt_name",
+ "name": "ckpt_name_1",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name_1"
+ },
+ "link": null
+ },
+ {
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": []
+ }
+ ],
+ "title": "First-Last-Frame to Video",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "222",
+ "text"
+ ],
+ [
+ "215",
+ "value"
+ ],
+ [
+ "216",
+ "value"
+ ],
+ [
+ "198",
+ "value"
+ ],
+ [
+ "205",
+ "value"
+ ],
+ [
+ "196",
+ "noise_seed"
+ ],
+ [
+ "224",
+ "ckpt_name"
+ ],
+ [
+ "225",
+ "text_encoder"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ }
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "283e4561-61a2-4538-b960-265736eb041f",
+ "version": 1,
+ "state": {
+ "lastGroupId": 22,
+ "lastNodeId": 227,
+ "lastLinkId": 276,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "First-Last-Frame to Video",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ 270,
+ 3100,
+ 120,
+ 240
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 3620,
+ 3120,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "6fe179c4-d96f-4383-b202-844f6de4922e",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "linkIds": [
+ 251
+ ],
+ "localized_name": "input",
+ "label": "first_frame",
+ "pos": [
+ 370,
+ 3120
+ ]
+ },
+ {
+ "id": "e80df1ae-5f39-4f86-91bd-0467635e2f2d",
+ "name": "input_1",
+ "type": "IMAGE,MASK",
+ "linkIds": [
+ 253
+ ],
+ "localized_name": "input_1",
+ "label": "last_frame",
+ "pos": [
+ 370,
+ 3140
+ ]
+ },
+ {
+ "id": "433148fa-bf73-4ab1-81d9-09e2e38ed861",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 265
+ ],
+ "pos": [
+ 370,
+ 3160
+ ]
+ },
+ {
+ "id": "36915bc8-a6ed-4d48-8619-e0e8723228e9",
+ "name": "value",
+ "type": "INT",
+ "linkIds": [
+ 266
+ ],
+ "label": "width",
+ "pos": [
+ 370,
+ 3180
+ ]
+ },
+ {
+ "id": "425a36b8-91ab-41b7-81e9-496eba064ec8",
+ "name": "value_1",
+ "type": "INT",
+ "linkIds": [
+ 267
+ ],
+ "label": "height",
+ "pos": [
+ 370,
+ 3200
+ ]
+ },
+ {
+ "id": "0c9e003b-bd07-4b7d-aa6d-789e138ed161",
+ "name": "value_2",
+ "type": "INT",
+ "linkIds": [
+ 268
+ ],
+ "label": "duration",
+ "pos": [
+ 370,
+ 3220
+ ]
+ },
+ {
+ "id": "581b52ff-21c5-4774-ac2a-8f69a7e09e2e",
+ "name": "value_3",
+ "type": "INT",
+ "linkIds": [
+ 269
+ ],
+ "label": "fps",
+ "pos": [
+ 370,
+ 3240
+ ]
+ },
+ {
+ "id": "d03cc171-45da-4658-99aa-77252bbcf522",
+ "name": "noise_seed",
+ "type": "INT",
+ "linkIds": [
+ 270
+ ],
+ "pos": [
+ 370,
+ 3260
+ ]
+ },
+ {
+ "id": "e68e61c8-905e-43ac-8c76-65ac52270a08",
+ "name": "ckpt_name_1",
+ "type": "COMBO",
+ "linkIds": [
+ 272,
+ 275,
+ 276
+ ],
+ "label": "ckpt_name",
+ "pos": [
+ 370,
+ 3280
+ ]
+ },
+ {
+ "id": "5d065f3b-891b-499f-950b-c2df0be24536",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "linkIds": [
+ 273
+ ],
+ "pos": [
+ 370,
+ 3300
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "0c8c2dc0-c67c-4bc2-9e57-6aa00db2e3a9",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "linkIds": [
+ 252
+ ],
+ "localized_name": "VIDEO",
+ "pos": [
+ 3640,
+ 3140
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 195,
+ "type": "LTXVPreprocess",
+ "pos": [
+ 1480,
+ 3780
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 203
+ },
+ {
+ "localized_name": "img_compression",
+ "name": "img_compression",
+ "type": "INT",
+ "widget": {
+ "name": "img_compression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output_image",
+ "name": "output_image",
+ "type": "IMAGE",
+ "links": [
+ 229
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVPreprocess",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25
+ ]
+ },
+ {
+ "id": 196,
+ "type": "RandomNoise",
+ "pos": [
+ 1990,
+ 2320
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": 270
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 246
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "noise_seed": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "RandomNoise",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 315253765879496,
+ "randomize"
+ ]
+ },
+ {
+ "id": 197,
+ "type": "LTXVEmptyLatentAudio",
+ "pos": [
+ 2090,
+ 3820
+ ],
+ "size": [
+ 280,
+ 170
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 205
+ },
+ {
+ "localized_name": "frames_number",
+ "name": "frames_number",
+ "type": "INT",
+ "widget": {
+ "name": "frames_number"
+ },
+ "link": 262
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "INT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 207
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Latent",
+ "name": "Latent",
+ "type": "LATENT",
+ "links": [
+ 245
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVEmptyLatentAudio",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 97,
+ 25,
+ 1
+ ]
+ },
+ {
+ "id": 198,
+ "type": "PrimitiveInt",
+ "pos": [
+ 760,
+ 3650
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 268
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 260
+ ]
+ }
+ ],
+ "title": "Duration",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 5,
+ "fixed"
+ ]
+ },
+ {
+ "id": 199,
+ "type": "LTXVPreprocess",
+ "pos": [
+ 1480,
+ 3340
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 210
+ },
+ {
+ "localized_name": "img_compression",
+ "name": "img_compression",
+ "type": "INT",
+ "widget": {
+ "name": "img_compression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output_image",
+ "name": "output_image",
+ "type": "IMAGE",
+ "links": [
+ 240
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVPreprocess",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25
+ ]
+ },
+ {
+ "id": 200,
+ "type": "LTXVCropGuides",
+ "pos": [
+ 2820,
+ 2450
+ ],
+ "size": [
+ 280,
+ 120
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 213
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 214
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 215
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": []
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": []
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 211
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.8.2",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.5.2"
+ },
+ "Node name for S&R": "LTXVCropGuides",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 201,
+ "type": "EmptyLTXVLatentVideo",
+ "pos": [
+ 2090,
+ 3580
+ ],
+ "size": [
+ 280,
+ 200
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 218
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 219
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 263
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 239
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.60",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "EmptyLTXVLatentVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 512,
+ 97,
+ 1
+ ]
+ },
+ {
+ "id": 202,
+ "type": "LTXVConditioning",
+ "pos": [
+ 2090,
+ 3400
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 221
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 222
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "FLOAT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 223
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 236
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 237
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVConditioning",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25
+ ]
+ },
+ {
+ "id": 203,
+ "type": "GetImageSize",
+ "pos": [
+ 1480,
+ 3500
+ ],
+ "size": [
+ 230,
+ 130
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 224
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 218
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 219
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "GetImageSize",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 204,
+ "type": "LTXVAddGuide",
+ "pos": [
+ 2750,
+ 3700
+ ],
+ "size": [
+ 280,
+ 240
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 225
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 226
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 227
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 228
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 229
+ },
+ {
+ "localized_name": "frame_idx",
+ "name": "frame_idx",
+ "type": "INT",
+ "widget": {
+ "name": "frame_idx"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 213,
+ 242
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 214,
+ 243
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 244
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVAddGuide",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ -1,
+ 0.7
+ ]
+ },
+ {
+ "id": 205,
+ "type": "PrimitiveInt",
+ "pos": [
+ 760,
+ 3800
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 269
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 207,
+ 235,
+ 261
+ ]
+ }
+ ],
+ "title": "Frame Rate(int)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25,
+ "fixed"
+ ]
+ },
+ {
+ "id": 206,
+ "type": "LTXVAddGuide",
+ "pos": [
+ 2750,
+ 3430
+ ],
+ "size": [
+ 280,
+ 240
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 236
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 237
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 238
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 239
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 240
+ },
+ {
+ "localized_name": "frame_idx",
+ "name": "frame_idx",
+ "type": "INT",
+ "widget": {
+ "name": "frame_idx"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 225
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 226
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 228
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVAddGuide",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 0.7
+ ]
+ },
+ {
+ "id": 207,
+ "type": "CFGGuider",
+ "pos": [
+ 1990,
+ 2500
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 241
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 242
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 243
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "links": [
+ 247
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CFGGuider",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 208,
+ "type": "SamplerEulerAncestral",
+ "pos": [
+ 1990,
+ 2720
+ ],
+ "size": [
+ 280,
+ 120
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "eta",
+ "name": "eta",
+ "type": "FLOAT",
+ "widget": {
+ "name": "eta"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "s_noise",
+ "name": "s_noise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "s_noise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 248
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "SamplerEulerAncestral",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 209,
+ "type": "ManualSigmas",
+ "pos": [
+ 1990,
+ 2910
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "STRING",
+ "widget": {
+ "name": "sigmas"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 249
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ManualSigmas",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "1., 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0"
+ ]
+ },
+ {
+ "id": 210,
+ "type": "LTXVConcatAVLatent",
+ "pos": [
+ 1990,
+ 3090
+ ],
+ "size": [
+ 280,
+ 100
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "link": 244
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "link": 245
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 250
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVConcatAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 211,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 2460,
+ 2330
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 246
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 247
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 248
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 249
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 250
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "links": []
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": [
+ 204
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 212,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 760,
+ 3970
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 235
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 223,
+ 234
+ ]
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.17.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a"
+ ]
+ },
+ {
+ "id": 213,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ 1130,
+ 3340
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 251
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "resize_type.width",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.width"
+ },
+ "link": 208
+ },
+ {
+ "localized_name": "height",
+ "name": "resize_type.height",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.height"
+ },
+ "link": 209
+ },
+ {
+ "localized_name": "crop",
+ "name": "resize_type.crop",
+ "type": "COMBO",
+ "widget": {
+ "name": "resize_type.crop"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 210,
+ 224
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "resize_type.width": true,
+ "resize_type.height": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ResizeImageMaskNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale dimensions",
+ 640,
+ 360,
+ "center",
+ "nearest-exact"
+ ]
+ },
+ {
+ "id": 214,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ 1130,
+ 3780
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 253
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "resize_type.width",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.width"
+ },
+ "link": 201
+ },
+ {
+ "localized_name": "height",
+ "name": "resize_type.height",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.height"
+ },
+ "link": 202
+ },
+ {
+ "localized_name": "crop",
+ "name": "resize_type.crop",
+ "type": "COMBO",
+ "widget": {
+ "name": "resize_type.crop"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 203
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "resize_type.width": true,
+ "resize_type.height": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ResizeImageMaskNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale dimensions",
+ 640,
+ 360,
+ "center",
+ "nearest-exact"
+ ]
+ },
+ {
+ "id": 215,
+ "type": "PrimitiveInt",
+ "pos": [
+ 760,
+ 3340
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 266
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 201,
+ 208
+ ]
+ }
+ ],
+ "title": "Width",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1280,
+ "fixed"
+ ]
+ },
+ {
+ "id": 216,
+ "type": "PrimitiveInt",
+ "pos": [
+ 760,
+ 3490
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 267
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 202,
+ 209
+ ]
+ }
+ ],
+ "title": "height",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 720,
+ "fixed"
+ ]
+ },
+ {
+ "id": 217,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 1320,
+ 2870
+ ],
+ "size": [
+ 590,
+ 200
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 230
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 222
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "blurry, out of focus, overexposed, underexposed, low contrast, washed out colors, excessive noise, grainy texture, poor lighting, flickering, motion blur, distorted proportions, unnatural skin tones, deformed facial features, asymmetrical face, missing facial features, extra limbs, disfigured hands, wrong hand count, artifacts around text, unreadable text on shirt or hat, incorrect lettering on cap (“PNTR”), incorrect t-shirt slogan (“JUST DO IT”), missing microphone, misplaced microphone, inconsistent perspective, camera shake, incorrect depth of field, background too sharp, background clutter, distracting reflections, harsh shadows, inconsistent lighting direction, color banding, cartoonish rendering, 3D CGI look, unrealistic materials, uncanny valley effect, incorrect ethnicity, wrong gender, exaggerated expressions, smiling, laughing, exaggerated sadness, wrong gaze direction, eyes looking at camera, mismatched lip sync, silent or muted audio, distorted voice, robotic voice, echo, background noise, off-sync audio, missing sniff sounds, incorrect dialogue, added dialogue, repetitive speech, jittery movement, awkward pauses, incorrect timing, unnatural transitions, inconsistent framing, tilted camera, missing door or shelves, missing shallow depth of field, flat lighting, inconsistent tone, cinematic oversaturation, stylized filters, or AI artifacts."
+ ],
+ "color": "#323",
+ "bgcolor": "#535"
+ },
+ {
+ "id": 218,
+ "type": "CreateVideo",
+ "pos": [
+ 3280,
+ 2320
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 232
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 233
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 234
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 252
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CreateVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 24
+ ]
+ },
+ {
+ "id": 219,
+ "type": "VAEDecodeTiled",
+ "pos": [
+ 2820,
+ 2630
+ ],
+ "size": [
+ 280,
+ 200
+ ],
+ "flags": {},
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 211
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 212
+ },
+ {
+ "localized_name": "tile_size",
+ "name": "tile_size",
+ "type": "INT",
+ "widget": {
+ "name": "tile_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "overlap",
+ "name": "overlap",
+ "type": "INT",
+ "widget": {
+ "name": "overlap"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temporal_size",
+ "name": "temporal_size",
+ "type": "INT",
+ "widget": {
+ "name": "temporal_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temporal_overlap",
+ "name": "temporal_overlap",
+ "type": "INT",
+ "widget": {
+ "name": "temporal_overlap"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 232
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEDecodeTiled",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 64,
+ 4096,
+ 64
+ ]
+ },
+ {
+ "id": 220,
+ "type": "LTXVAudioVAEDecode",
+ "pos": [
+ 2820,
+ 2920
+ ],
+ "size": [
+ 280,
+ 100
+ ],
+ "flags": {},
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 216
+ },
+ {
+ "label": "Audio VAE",
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 217
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio",
+ "name": "Audio",
+ "type": "AUDIO",
+ "links": [
+ 233
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVAudioVAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 221,
+ "type": "LTXVSeparateAVLatent",
+ "pos": [
+ 2460,
+ 2580
+ ],
+ "size": [
+ 250,
+ 100
+ ],
+ "flags": {},
+ "order": 26,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "av_latent",
+ "name": "av_latent",
+ "type": "LATENT",
+ "link": 204
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "links": [
+ 215
+ ]
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "links": [
+ 216
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVSeparateAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 222,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 1310,
+ 2380
+ ],
+ "size": [
+ 620,
+ 420
+ ],
+ "flags": {},
+ "order": 27,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 231
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 265
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 221
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.5.2",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 223,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 770,
+ 2380
+ ],
+ "size": [
+ 420,
+ 160
+ ],
+ "flags": {},
+ "order": 28,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 276
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 241
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 212,
+ 227,
+ 238
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.10.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.5.2"
+ },
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-distilled-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-distilled-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-distilled-fp8.safetensors"
+ ]
+ },
+ {
+ "id": 224,
+ "type": "LTXVAudioVAELoader",
+ "pos": [
+ 770,
+ 2660
+ ],
+ "size": [
+ 420,
+ 110
+ ],
+ "flags": {},
+ "order": 29,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 272
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio VAE",
+ "name": "Audio VAE",
+ "type": "VAE",
+ "links": [
+ 205,
+ 217
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.10.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.5.2"
+ },
+ "Node name for S&R": "LTXVAudioVAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-distilled-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-distilled-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-distilled-fp8.safetensors"
+ ]
+ },
+ {
+ "id": 225,
+ "type": "LTXAVTextEncoderLoader",
+ "pos": [
+ 770,
+ 2890
+ ],
+ "size": [
+ 410,
+ 160
+ ],
+ "flags": {},
+ "order": 30,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "text_encoder",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": 273
+ },
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 275
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 230,
+ 231
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.10.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.5.2"
+ },
+ "Node name for S&R": "LTXAVTextEncoderLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "gemma_3_12B_it_fp4_mixed.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ltx-2/resolve/main/split_files/text_encoders/gemma_3_12B_it_fp4_mixed.safetensors",
+ "directory": "text_encoders"
+ },
+ {
+ "name": "ltx-2.3-22b-distilled-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-distilled-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "gemma_3_12B_it_fp4_mixed.safetensors",
+ "ltx-2.3-22b-distilled-fp8.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 226,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 760,
+ 4020
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 31,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 260
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 261
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 262,
+ 263
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a * b + 1"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Conditioning",
+ "bounding": [
+ 1850,
+ 3250,
+ 1370,
+ 800
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Settings",
+ "bounding": [
+ 730,
+ 3250,
+ 290,
+ 800
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "First Frame",
+ "bounding": [
+ 1050,
+ 3250,
+ 770,
+ 400
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Last Frame",
+ "bounding": [
+ 1050,
+ 3680,
+ 770,
+ 370
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 5,
+ "title": "Model",
+ "bounding": [
+ 730,
+ 2240,
+ 500,
+ 980
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "Prompt",
+ "bounding": [
+ 1260,
+ 2240,
+ 680,
+ 980
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Sampling",
+ "bounding": [
+ 1970,
+ 2240,
+ 770,
+ 980
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 8,
+ "title": "Decoding",
+ "bounding": [
+ 2770,
+ 2240,
+ 450,
+ 980
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 203,
+ "origin_id": 214,
+ "origin_slot": 0,
+ "target_id": 195,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 205,
+ "origin_id": 224,
+ "origin_slot": 0,
+ "target_id": 197,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 207,
+ "origin_id": 205,
+ "origin_slot": 0,
+ "target_id": 197,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 210,
+ "origin_id": 213,
+ "origin_slot": 0,
+ "target_id": 199,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 213,
+ "origin_id": 204,
+ "origin_slot": 0,
+ "target_id": 200,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 214,
+ "origin_id": 204,
+ "origin_slot": 1,
+ "target_id": 200,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 215,
+ "origin_id": 221,
+ "origin_slot": 0,
+ "target_id": 200,
+ "target_slot": 2,
+ "type": "LATENT"
+ },
+ {
+ "id": 218,
+ "origin_id": 203,
+ "origin_slot": 0,
+ "target_id": 201,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 219,
+ "origin_id": 203,
+ "origin_slot": 1,
+ "target_id": 201,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 221,
+ "origin_id": 222,
+ "origin_slot": 0,
+ "target_id": 202,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 222,
+ "origin_id": 217,
+ "origin_slot": 0,
+ "target_id": 202,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 223,
+ "origin_id": 212,
+ "origin_slot": 0,
+ "target_id": 202,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 224,
+ "origin_id": 213,
+ "origin_slot": 0,
+ "target_id": 203,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 225,
+ "origin_id": 206,
+ "origin_slot": 0,
+ "target_id": 204,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 226,
+ "origin_id": 206,
+ "origin_slot": 1,
+ "target_id": 204,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 227,
+ "origin_id": 223,
+ "origin_slot": 2,
+ "target_id": 204,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 228,
+ "origin_id": 206,
+ "origin_slot": 2,
+ "target_id": 204,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 229,
+ "origin_id": 195,
+ "origin_slot": 0,
+ "target_id": 204,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
+ {
+ "id": 236,
+ "origin_id": 202,
+ "origin_slot": 0,
+ "target_id": 206,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 237,
+ "origin_id": 202,
+ "origin_slot": 1,
+ "target_id": 206,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 238,
+ "origin_id": 223,
+ "origin_slot": 2,
+ "target_id": 206,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 239,
+ "origin_id": 201,
+ "origin_slot": 0,
+ "target_id": 206,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 240,
+ "origin_id": 199,
+ "origin_slot": 0,
+ "target_id": 206,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
+ {
+ "id": 241,
+ "origin_id": 223,
+ "origin_slot": 0,
+ "target_id": 207,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 242,
+ "origin_id": 204,
+ "origin_slot": 0,
+ "target_id": 207,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 243,
+ "origin_id": 204,
+ "origin_slot": 1,
+ "target_id": 207,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 244,
+ "origin_id": 204,
+ "origin_slot": 2,
+ "target_id": 210,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 245,
+ "origin_id": 197,
+ "origin_slot": 0,
+ "target_id": 210,
+ "target_slot": 1,
+ "type": "LATENT"
+ },
+ {
+ "id": 246,
+ "origin_id": 196,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 247,
+ "origin_id": 207,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 248,
+ "origin_id": 208,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 249,
+ "origin_id": 209,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 250,
+ "origin_id": 210,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 235,
+ "origin_id": 205,
+ "origin_slot": 0,
+ "target_id": 212,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 208,
+ "origin_id": 215,
+ "origin_slot": 0,
+ "target_id": 213,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 209,
+ "origin_id": 216,
+ "origin_slot": 0,
+ "target_id": 213,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 201,
+ "origin_id": 215,
+ "origin_slot": 0,
+ "target_id": 214,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 202,
+ "origin_id": 216,
+ "origin_slot": 0,
+ "target_id": 214,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 230,
+ "origin_id": 225,
+ "origin_slot": 0,
+ "target_id": 217,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 232,
+ "origin_id": 219,
+ "origin_slot": 0,
+ "target_id": 218,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 233,
+ "origin_id": 220,
+ "origin_slot": 0,
+ "target_id": 218,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 234,
+ "origin_id": 212,
+ "origin_slot": 0,
+ "target_id": 218,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 211,
+ "origin_id": 200,
+ "origin_slot": 2,
+ "target_id": 219,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 212,
+ "origin_id": 223,
+ "origin_slot": 2,
+ "target_id": 219,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 216,
+ "origin_id": 221,
+ "origin_slot": 1,
+ "target_id": 220,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 217,
+ "origin_id": 224,
+ "origin_slot": 0,
+ "target_id": 220,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 204,
+ "origin_id": 211,
+ "origin_slot": 1,
+ "target_id": 221,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 231,
+ "origin_id": 225,
+ "origin_slot": 0,
+ "target_id": 222,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 251,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 213,
+ "target_slot": 0,
+ "type": "IMAGE,MASK"
+ },
+ {
+ "id": 253,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 214,
+ "target_slot": 0,
+ "type": "IMAGE,MASK"
+ },
+ {
+ "id": 252,
+ "origin_id": 218,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 260,
+ "origin_id": 198,
+ "origin_slot": 0,
+ "target_id": 226,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 261,
+ "origin_id": 205,
+ "origin_slot": 0,
+ "target_id": 226,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 262,
+ "origin_id": 226,
+ "origin_slot": 1,
+ "target_id": 197,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 263,
+ "origin_id": 226,
+ "origin_slot": 1,
+ "target_id": 201,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 265,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 222,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 266,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 215,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 267,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 216,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 268,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 198,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 269,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 205,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 270,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 196,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 272,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 224,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 273,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 225,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 275,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 225,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 276,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 223,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Video generation and editing/FLF2V",
+ "description": "Generates a video that interpolates between the first and last keyframes using LTX-2.3, including optional audio."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Frame Interpolation.json b/blueprints/Frame Interpolation.json
new file mode 100644
index 000000000..8e183de7e
--- /dev/null
+++ b/blueprints/Frame Interpolation.json
@@ -0,0 +1,858 @@
+{
+ "revision": 0,
+ "last_node_id": 16,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 16,
+ "type": "022693be-2baa-4009-870a-28921508a7ef",
+ "pos": [
+ -2990,
+ -3240
+ ],
+ "size": [
+ 410,
+ 200
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": null
+ },
+ {
+ "label": "multiplier",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "enable_fps_multiplier",
+ "name": "value_1",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value_1"
+ },
+ "link": null
+ },
+ {
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "label": "VIDEO",
+ "name": "VIDEO_1",
+ "type": "VIDEO",
+ "links": []
+ },
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": null
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "9",
+ "value"
+ ],
+ [
+ "13",
+ "value"
+ ],
+ [
+ "1",
+ "model_name"
+ ]
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3"
+ },
+ "widgets_values": [],
+ "title": "Frame Interpolation"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "022693be-2baa-4009-870a-28921508a7ef",
+ "version": 1,
+ "state": {
+ "lastGroupId": 0,
+ "lastNodeId": 17,
+ "lastLinkId": 28,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Frame Interpolation",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -2810,
+ -3070,
+ 159.7421875,
+ 120
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -1270,
+ -3075,
+ 120,
+ 80
+ ]
+ },
+ "inputs": [
+ {
+ "id": "05e31c51-dcb6-4a1e-9651-1b9ad4f7a287",
+ "name": "video",
+ "type": "VIDEO",
+ "linkIds": [
+ 2
+ ],
+ "localized_name": "video",
+ "pos": [
+ -2670.2578125,
+ -3050
+ ]
+ },
+ {
+ "id": "feecb409-7d1c-4a99-9c63-50c5fecdd3c9",
+ "name": "value",
+ "type": "INT",
+ "linkIds": [
+ 22
+ ],
+ "label": "multiplier",
+ "pos": [
+ -2670.2578125,
+ -3030
+ ]
+ },
+ {
+ "id": "0b8a861b-b581-4068-9e8c-f8d15daf1ca6",
+ "name": "value_1",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 23
+ ],
+ "label": "enable_fps_multiplier",
+ "pos": [
+ -2670.2578125,
+ -3010
+ ]
+ },
+ {
+ "id": "a22b101e-8773-4e17-a297-7ee3aae09162",
+ "name": "model_name",
+ "type": "COMBO",
+ "linkIds": [
+ 24
+ ],
+ "pos": [
+ -2670.2578125,
+ -2990
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ef2ada05-d5aa-492a-9394-6c3e71e39ebb",
+ "name": "VIDEO_1",
+ "type": "VIDEO",
+ "linkIds": [
+ 26
+ ],
+ "label": "VIDEO",
+ "pos": [
+ -1250,
+ -3055
+ ]
+ },
+ {
+ "id": "5aacc622-2a07-4983-b31c-e04461f7f953",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 28
+ ],
+ "pos": [
+ -1250,
+ -3035
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 1,
+ "type": "FrameInterpolationModelLoader",
+ "pos": [
+ -2510,
+ -3370
+ ],
+ "size": [
+ 370,
+ 90
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 24
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INTERP_MODEL",
+ "name": "INTERP_MODEL",
+ "type": "INTERP_MODEL",
+ "links": [
+ 1
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "FrameInterpolationModelLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "models": [
+ {
+ "name": "film_net_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/frame_interpolation/resolve/main/frame_interpolation/film_net_fp16.safetensors",
+ "directory": "frame_interpolation"
+ }
+ ]
+ },
+ "widgets_values": [
+ "film_net_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 2,
+ "type": "FrameInterpolate",
+ "pos": [
+ -2040,
+ -3370
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "interp_model",
+ "name": "interp_model",
+ "type": "INTERP_MODEL",
+ "link": 1
+ },
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 3
+ },
+ {
+ "localized_name": "multiplier",
+ "name": "multiplier",
+ "type": "INT",
+ "widget": {
+ "name": "multiplier"
+ },
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 4,
+ 28
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "FrameInterpolate",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3"
+ },
+ "widgets_values": [
+ 2
+ ]
+ },
+ {
+ "id": 5,
+ "type": "CreateVideo",
+ "pos": [
+ -1600,
+ -3370
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 4
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 5
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 12
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 26
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CreateVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3"
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 9,
+ "type": "PrimitiveInt",
+ "pos": [
+ -2500,
+ -2970
+ ],
+ "size": [
+ 270,
+ 90
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 22
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 8,
+ 19
+ ]
+ }
+ ],
+ "title": "Int (Multiplier)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3"
+ },
+ "widgets_values": [
+ 2,
+ "fixed"
+ ]
+ },
+ {
+ "id": 10,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -1610,
+ -3120
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 11
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 13
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 15
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 12
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3"
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 13,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -2500,
+ -2770
+ ],
+ "size": [
+ 310,
+ 90
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 23
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 15
+ ]
+ }
+ ],
+ "title": "Boolean (Apply multiplier to FPS?)",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3"
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 3,
+ "type": "GetVideoComponents",
+ "pos": [
+ -2500,
+ -3170
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 2
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 3
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": [
+ 5
+ ]
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": [
+ 11,
+ 18
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3"
+ }
+ },
+ {
+ "id": 11,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -2090,
+ -3070
+ ],
+ "size": [
+ 400,
+ 210
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 18
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 19
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 13
+ ]
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3"
+ },
+ "widgets_values": [
+ "min(abs(b), 16) * a"
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 1,
+ "origin_id": 1,
+ "origin_slot": 0,
+ "target_id": 2,
+ "target_slot": 0,
+ "type": "INTERP_MODEL"
+ },
+ {
+ "id": 3,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 2,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 8,
+ "origin_id": 9,
+ "origin_slot": 0,
+ "target_id": 2,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 4,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": 5,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 5,
+ "origin_id": 3,
+ "origin_slot": 1,
+ "target_id": 5,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 12,
+ "origin_id": 10,
+ "origin_slot": 0,
+ "target_id": 5,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 11,
+ "origin_id": 3,
+ "origin_slot": 2,
+ "target_id": 10,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 13,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 15,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 18,
+ "origin_id": 3,
+ "origin_slot": 2,
+ "target_id": 11,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 19,
+ "origin_id": 9,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 2,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 22,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 9,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 23,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 13,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 24,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 1,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 26,
+ "origin_id": 5,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 28,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "IMAGE"
+ }
+ ],
+ "extra": {},
+ "category": "Video Tools",
+ "description": "Increases video frame rate by synthesizing intermediate frames with a frame interpolation model."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Geometry Estimation (MoGe).json b/blueprints/Geometry Estimation (MoGe).json
new file mode 100644
index 000000000..e6f08bf71
--- /dev/null
+++ b/blueprints/Geometry Estimation (MoGe).json
@@ -0,0 +1,1266 @@
+{
+ "revision": 0,
+ "last_node_id": 67,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 67,
+ "type": "936dfaf2-575a-48b5-9e0c-df391319d11f",
+ "pos": [
+ -3950,
+ 5000
+ ],
+ "size": [
+ 430,
+ 480
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source_image",
+ "name": "source_image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "inference_resolution",
+ "name": "inference_resolution",
+ "type": "INT",
+ "widget": {
+ "name": "inference_resolution"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "inference_batch_size",
+ "name": "inference_batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "inference_batch_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "mesh_frame_index",
+ "name": "mesh_frame_index",
+ "type": "INT",
+ "widget": {
+ "name": "mesh_frame_index"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "mesh_decimation",
+ "name": "mesh_decimation",
+ "type": "INT",
+ "widget": {
+ "name": "mesh_decimation"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "mesh_gap_threshold",
+ "name": "mesh_gap_threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "mesh_gap_threshold"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "mesh_texture",
+ "name": "mesh_texture",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "mesh_texture"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "moge_model",
+ "name": "moge_model",
+ "type": "COMBO",
+ "widget": {
+ "name": "moge_model"
+ },
+ "link": null
+ },
+ {
+ "label": "auto_resize_input",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "mesh",
+ "name": "mesh",
+ "type": "MESH",
+ "links": []
+ },
+ {
+ "localized_name": "normal_opengl",
+ "name": "normal_opengl",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "localized_name": "normal_directx",
+ "name": "normal_directx",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "55",
+ "resolution_level"
+ ],
+ [
+ "55",
+ "batch_size"
+ ],
+ [
+ "54",
+ "batch_index"
+ ],
+ [
+ "54",
+ "decimation"
+ ],
+ [
+ "54",
+ "discontinuity_threshold"
+ ],
+ [
+ "54",
+ "texture"
+ ],
+ [
+ "58",
+ "model_name"
+ ],
+ [
+ "66",
+ "switch"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Geometry Estimation (MoGe)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "936dfaf2-575a-48b5-9e0c-df391319d11f",
+ "version": 1,
+ "state": {
+ "lastGroupId": 1,
+ "lastNodeId": 69,
+ "lastLinkId": 91,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Geometry Estimation (MoGe)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -5130,
+ 5320,
+ 167.337890625,
+ 228
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -3090,
+ 4966,
+ 131.51953125,
+ 108
+ ]
+ },
+ "inputs": [
+ {
+ "id": "cc8ce79d-ba20-4a25-a51c-c2afcd35e520",
+ "name": "source_image",
+ "type": "IMAGE",
+ "linkIds": [
+ 48,
+ 55,
+ 56,
+ 82
+ ],
+ "localized_name": "source_image",
+ "pos": [
+ -4986.662109375,
+ 5344
+ ]
+ },
+ {
+ "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52",
+ "name": "inference_resolution",
+ "type": "INT",
+ "linkIds": [
+ 73
+ ],
+ "localized_name": "inference_resolution",
+ "pos": [
+ -4986.662109375,
+ 5364
+ ]
+ },
+ {
+ "id": "616638fe-f603-4d10-bae9-fc87c134380f",
+ "name": "inference_batch_size",
+ "type": "INT",
+ "linkIds": [
+ 74
+ ],
+ "localized_name": "inference_batch_size",
+ "pos": [
+ -4986.662109375,
+ 5384
+ ]
+ },
+ {
+ "id": "fcacfca9-7927-4c38-94da-8ab22256325f",
+ "name": "mesh_frame_index",
+ "type": "INT",
+ "linkIds": [
+ 75
+ ],
+ "localized_name": "mesh_frame_index",
+ "pos": [
+ -4986.662109375,
+ 5404
+ ]
+ },
+ {
+ "id": "acbfe7f9-1b69-42c1-8614-4ccf54b28d4e",
+ "name": "mesh_decimation",
+ "type": "INT",
+ "linkIds": [
+ 76
+ ],
+ "localized_name": "mesh_decimation",
+ "pos": [
+ -4986.662109375,
+ 5424
+ ]
+ },
+ {
+ "id": "cd20f9a7-3a0a-4c4c-98d7-96f423867b87",
+ "name": "mesh_gap_threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 77
+ ],
+ "localized_name": "mesh_gap_threshold",
+ "pos": [
+ -4986.662109375,
+ 5444
+ ]
+ },
+ {
+ "id": "6f5c15f7-7f77-4fc9-b47b-3514467b06b6",
+ "name": "mesh_texture",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 78
+ ],
+ "localized_name": "mesh_texture",
+ "pos": [
+ -4986.662109375,
+ 5464
+ ]
+ },
+ {
+ "id": "65694805-186e-4181-a721-df8b5af49d31",
+ "name": "moge_model",
+ "type": "COMBO",
+ "linkIds": [
+ 79
+ ],
+ "localized_name": "moge_model",
+ "pos": [
+ -4986.662109375,
+ 5484
+ ]
+ },
+ {
+ "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 83
+ ],
+ "label": "auto_resize_input",
+ "pos": [
+ -4986.662109375,
+ 5504
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "3c616ea0-9a4c-4cff-a405-662320229df0",
+ "name": "mesh",
+ "type": "MESH",
+ "linkIds": [
+ 34
+ ],
+ "localized_name": "mesh",
+ "pos": [
+ -3066,
+ 4990
+ ]
+ },
+ {
+ "id": "ff85a763-b7f7-4bcc-9b1d-a4eaf55ad2f9",
+ "name": "normal_opengl",
+ "type": "IMAGE",
+ "linkIds": [
+ 62
+ ],
+ "localized_name": "normal_opengl",
+ "pos": [
+ -3066,
+ 5010
+ ]
+ },
+ {
+ "id": "26b3f88a-0ba0-4d4d-9c7d-0ad76106c844",
+ "name": "normal_directx",
+ "type": "IMAGE",
+ "linkIds": [
+ 63
+ ],
+ "localized_name": "normal_directx",
+ "pos": [
+ -3066,
+ 5030
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 54,
+ "type": "MoGePointMapToMesh",
+ "pos": [
+ -3440,
+ 5220
+ ],
+ "size": [
+ 290,
+ 200
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 33
+ },
+ {
+ "localized_name": "batch_index",
+ "name": "batch_index",
+ "type": "INT",
+ "widget": {
+ "name": "batch_index"
+ },
+ "link": 75
+ },
+ {
+ "localized_name": "decimation",
+ "name": "decimation",
+ "type": "INT",
+ "widget": {
+ "name": "decimation"
+ },
+ "link": 76
+ },
+ {
+ "localized_name": "discontinuity_threshold",
+ "name": "discontinuity_threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "discontinuity_threshold"
+ },
+ "link": 77
+ },
+ {
+ "localized_name": "texture",
+ "name": "texture",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "texture"
+ },
+ "link": 78
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MESH",
+ "name": "MESH",
+ "type": "MESH",
+ "links": [
+ 34
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGePointMapToMesh",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 1,
+ 0.04,
+ true
+ ]
+ },
+ {
+ "id": 55,
+ "type": "MoGeInference",
+ "pos": [
+ -3790,
+ 5180
+ ],
+ "size": [
+ 270,
+ 230
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_model",
+ "name": "moge_model",
+ "type": "MOGE_MODEL",
+ "link": 58
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 81
+ },
+ {
+ "localized_name": "resolution_level",
+ "name": "resolution_level",
+ "type": "INT",
+ "widget": {
+ "name": "resolution_level"
+ },
+ "link": 73
+ },
+ {
+ "localized_name": "fov_x_degrees",
+ "name": "fov_x_degrees",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fov_x_degrees"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": 74
+ },
+ {
+ "localized_name": "force_projection",
+ "name": "force_projection",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "force_projection"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "apply_mask",
+ "name": "apply_mask",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "apply_mask"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "links": [
+ 33,
+ 59,
+ 60
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeInference",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 9,
+ 0,
+ 4,
+ true,
+ true
+ ]
+ },
+ {
+ "id": 58,
+ "type": "LoadMoGeModel",
+ "pos": [
+ -4180,
+ 4910
+ ],
+ "size": [
+ 270,
+ 140
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 79
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MOGE_MODEL",
+ "name": "MOGE_MODEL",
+ "type": "MOGE_MODEL",
+ "links": [
+ 58
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadMoGeModel",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "models": [
+ {
+ "name": "moge_2_vitl_normal_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors",
+ "directory": "geometry_estimation"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "moge_2_vitl_normal_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 59,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -4720,
+ 4910
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 49
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": [
+ 53
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a > 2048"
+ ]
+ },
+ {
+ "id": 60,
+ "type": "GetImageSize",
+ "pos": [
+ -4980,
+ 4910
+ ],
+ "size": [
+ 230,
+ 160
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 48
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 49
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 61,
+ "type": "ResizeImagesByLongerEdge",
+ "pos": [
+ -4650,
+ 5210
+ ],
+ "size": [
+ 310,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 55
+ },
+ {
+ "localized_name": "longer_edge",
+ "name": "longer_edge",
+ "type": "INT",
+ "widget": {
+ "name": "longer_edge"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 54
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ResizeImagesByLongerEdge",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 2048
+ ]
+ },
+ {
+ "id": 62,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -4180,
+ 5120
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 56
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 54
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 53
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 80
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 63,
+ "type": "MoGeRender",
+ "pos": [
+ -3430,
+ 4890
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 59
+ },
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "COMBO",
+ "widget": {
+ "name": "output"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 62
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeRender",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "normal_opengl"
+ ]
+ },
+ {
+ "id": 64,
+ "type": "MoGeRender",
+ "pos": [
+ -3430,
+ 5050
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 60
+ },
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "COMBO",
+ "widget": {
+ "name": "output"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 63
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeRender",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "normal_directx"
+ ]
+ },
+ {
+ "id": 66,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -4160,
+ 5340
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 82
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 80
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 83
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 81
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ true
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "auto_resize_if_width_gt_2048",
+ "bounding": [
+ -5000,
+ 4840,
+ 690,
+ 280
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 33,
+ "origin_id": 55,
+ "origin_slot": 0,
+ "target_id": 54,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 58,
+ "origin_id": 58,
+ "origin_slot": 0,
+ "target_id": 55,
+ "target_slot": 0,
+ "type": "MOGE_MODEL"
+ },
+ {
+ "id": 49,
+ "origin_id": 60,
+ "origin_slot": 0,
+ "target_id": 59,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 54,
+ "origin_id": 61,
+ "origin_slot": 0,
+ "target_id": 62,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 53,
+ "origin_id": 59,
+ "origin_slot": 2,
+ "target_id": 62,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 59,
+ "origin_id": 55,
+ "origin_slot": 0,
+ "target_id": 63,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 60,
+ "origin_id": 55,
+ "origin_slot": 0,
+ "target_id": 64,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 48,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 60,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 55,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 61,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 56,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 62,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 34,
+ "origin_id": 54,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "MESH"
+ },
+ {
+ "id": 62,
+ "origin_id": 63,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 63,
+ "origin_id": 64,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 73,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 55,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 74,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 55,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 75,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 54,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 76,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 54,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 77,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 54,
+ "target_slot": 3,
+ "type": "FLOAT"
+ },
+ {
+ "id": 78,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 54,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 79,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 58,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 80,
+ "origin_id": 62,
+ "origin_slot": 0,
+ "target_id": 66,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 81,
+ "origin_id": 66,
+ "origin_slot": 0,
+ "target_id": 55,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 82,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 66,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 83,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 66,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ }
+ ],
+ "category": "3D/Geometry Estimation",
+ "description": "Estimates 3D scene geometry from an input image using MoGe, outputting a mesh plus OpenGL and DirectX normal maps.",
+ "extra": {}
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Get Any Video Frame.json b/blueprints/Get Any Video Frame.json
new file mode 100644
index 000000000..9ff0f8e6e
--- /dev/null
+++ b/blueprints/Get Any Video Frame.json
@@ -0,0 +1,485 @@
+{
+ "revision": 0,
+ "last_node_id": 98,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 98,
+ "type": "dca6e78d-fb06-421e-97f7-6ce17a665260",
+ "pos": [
+ -410,
+ -2230
+ ],
+ "size": [
+ 270,
+ 104
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "video",
+ "type": "VIDEO",
+ "link": null
+ },
+ {
+ "label": "frame_index",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "title": "Get Any Video Frame",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "100",
+ "value"
+ ]
+ ]
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "dca6e78d-fb06-421e-97f7-6ce17a665260",
+ "version": 1,
+ "state": {
+ "lastGroupId": 1,
+ "lastNodeId": 136,
+ "lastLinkId": 302,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Get Any Video Frame",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ 380,
+ -57,
+ 120,
+ 80
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1460,
+ -57,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "2ceec378-8dcf-4340-8570-155967f59a93",
+ "name": "video",
+ "type": "VIDEO",
+ "linkIds": [
+ 4
+ ],
+ "pos": [
+ 480,
+ -37
+ ]
+ },
+ {
+ "id": "819955f6-c686-4896-8032-ff2d0059109a",
+ "name": "value",
+ "type": "INT",
+ "linkIds": [
+ 283
+ ],
+ "label": "frame_index",
+ "pos": [
+ 480,
+ -17
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "1ab0684d-6a44-45b6-8aa4-a0b971a1d41e",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 5
+ ],
+ "pos": [
+ 1480,
+ -37
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 1,
+ "type": "GetVideoComponents",
+ "pos": [
+ 560,
+ -150
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 4
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 1,
+ 2
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": null
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents"
+ }
+ },
+ {
+ "id": 2,
+ "type": "GetImageSize",
+ "pos": [
+ 560,
+ 50
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 1
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": [
+ 285
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize"
+ }
+ },
+ {
+ "id": 3,
+ "type": "ImageFromBatch",
+ "pos": [
+ 1130,
+ -150
+ ],
+ "size": [
+ 270,
+ 140
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 2
+ },
+ {
+ "localized_name": "batch_index",
+ "name": "batch_index",
+ "type": "INT",
+ "widget": {
+ "name": "batch_index"
+ },
+ "link": 286
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 5
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageFromBatch"
+ },
+ "widgets_values": [
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 99,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 910,
+ 100
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 284
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 285
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 286
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "min(max(int(a if a >= 0 else b + a), 0), b - 1)"
+ ]
+ },
+ {
+ "id": 100,
+ "type": "PrimitiveInt",
+ "pos": [
+ 560,
+ 250
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 283
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 284
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PrimitiveInt"
+ },
+ "widgets_values": [
+ 0,
+ "fixed"
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 1,
+ "origin_id": 1,
+ "origin_slot": 0,
+ "target_id": 2,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 2,
+ "origin_id": 1,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 4,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 1,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 5,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 283,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 100,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 284,
+ "origin_id": 100,
+ "origin_slot": 0,
+ "target_id": 99,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 285,
+ "origin_id": 2,
+ "origin_slot": 2,
+ "target_id": 99,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 286,
+ "origin_id": 99,
+ "origin_slot": 1,
+ "target_id": 3,
+ "target_slot": 1,
+ "type": "INT"
+ }
+ ],
+ "extra": {},
+ "category": "Video Tools",
+ "description": "Extracts one image frame from a video at a chosen index; negative indices count back from the last frame, and out-of-range indices are clamped to the first or last frame."
+ }
+ ]
+ },
+ "extra": {
+ "ds": {
+ "scale": 1.197015527856339,
+ "offset": [
+ -168.76833554248222,
+ 540.6638955283997
+ ]
+ },
+ "frontendVersion": "1.42.8"
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Glow.json b/blueprints/Glow.json
index 8c690fc68..2bbfdee51 100644
--- a/blueprints/Glow.json
+++ b/blueprints/Glow.json
@@ -268,7 +268,7 @@
"Node name for S&R": "GLSLShader"
},
"widgets_values": [
- "#version 300 es\nprecision mediump float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform int u_int0; // Blend mode\nuniform int u_int1; // Color tint\nuniform float u_float0; // Intensity\nuniform float u_float1; // Radius\nuniform float u_float2; // Threshold\n\nin vec2 v_texCoord;\nout vec4 fragColor;\n\nconst int BLEND_ADD = 0;\nconst int BLEND_SCREEN = 1;\nconst int BLEND_SOFT = 2;\nconst int BLEND_OVERLAY = 3;\nconst int BLEND_LIGHTEN = 4;\n\nconst float GOLDEN_ANGLE = 2.39996323;\nconst int MAX_SAMPLES = 48;\nconst vec3 LUMA = vec3(0.299, 0.587, 0.114);\n\nfloat hash(vec2 p) {\n p = fract(p * vec2(123.34, 456.21));\n p += dot(p, p + 45.32);\n return fract(p.x * p.y);\n}\n\nvec3 hexToRgb(int h) {\n return vec3(\n float((h >> 16) & 255),\n float((h >> 8) & 255),\n float(h & 255)\n ) * (1.0 / 255.0);\n}\n\nvec3 blend(vec3 base, vec3 glow, int mode) {\n if (mode == BLEND_SCREEN) {\n return 1.0 - (1.0 - base) * (1.0 - glow);\n }\n if (mode == BLEND_SOFT) {\n return mix(\n base - (1.0 - 2.0 * glow) * base * (1.0 - base),\n base + (2.0 * glow - 1.0) * (sqrt(base) - base),\n step(0.5, glow)\n );\n }\n if (mode == BLEND_OVERLAY) {\n return mix(\n 2.0 * base * glow,\n 1.0 - 2.0 * (1.0 - base) * (1.0 - glow),\n step(0.5, base)\n );\n }\n if (mode == BLEND_LIGHTEN) {\n return max(base, glow);\n }\n return base + glow;\n}\n\nvoid main() {\n vec4 original = texture(u_image0, v_texCoord);\n \n float intensity = u_float0 * 0.05;\n float radius = u_float1 * u_float1 * 0.012;\n \n if (intensity < 0.001 || radius < 0.1) {\n fragColor = original;\n return;\n }\n \n float threshold = 1.0 - u_float2 * 0.01;\n float t0 = threshold - 0.15;\n float t1 = threshold + 0.15;\n \n vec2 texelSize = 1.0 / u_resolution;\n float radius2 = radius * radius;\n \n float sampleScale = clamp(radius * 0.75, 0.35, 1.0);\n int samples = int(float(MAX_SAMPLES) * sampleScale);\n \n float noise = hash(gl_FragCoord.xy);\n float angleOffset = noise * GOLDEN_ANGLE;\n float 
radiusJitter = 0.85 + noise * 0.3;\n \n float ca = cos(GOLDEN_ANGLE);\n float sa = sin(GOLDEN_ANGLE);\n vec2 dir = vec2(cos(angleOffset), sin(angleOffset));\n \n vec3 glow = vec3(0.0);\n float totalWeight = 0.0;\n \n // Center tap\n float centerMask = smoothstep(t0, t1, dot(original.rgb, LUMA));\n glow += original.rgb * centerMask * 2.0;\n totalWeight += 2.0;\n \n for (int i = 1; i < MAX_SAMPLES; i++) {\n if (i >= samples) break;\n \n float fi = float(i);\n float dist = sqrt(fi / float(samples)) * radius * radiusJitter;\n \n vec2 offset = dir * dist * texelSize;\n vec3 c = texture(u_image0, v_texCoord + offset).rgb;\n float mask = smoothstep(t0, t1, dot(c, LUMA));\n \n float w = 1.0 - (dist * dist) / (radius2 * 1.5);\n w = max(w, 0.0);\n w *= w;\n \n glow += c * mask * w;\n totalWeight += w;\n \n dir = vec2(\n dir.x * ca - dir.y * sa,\n dir.x * sa + dir.y * ca\n );\n }\n \n glow *= intensity / max(totalWeight, 0.001);\n \n if (u_int1 > 0) {\n glow *= hexToRgb(u_int1);\n }\n \n vec3 result = blend(original.rgb, glow, u_int0);\n result += (noise - 0.5) * (1.0 / 255.0);\n \n fragColor = vec4(clamp(result, 0.0, 1.0), original.a);\n}",
+ "#version 300 es\nprecision mediump float;\n\nuniform sampler2D u_image0;\nuniform int u_int0; // Blend mode\nuniform int u_int1; // Color tint\nuniform float u_float0; // Intensity\nuniform float u_float1; // Radius\nuniform float u_float2; // Threshold\n\nin vec2 v_texCoord;\nout vec4 fragColor;\n\nconst int BLEND_ADD = 0;\nconst int BLEND_SCREEN = 1;\nconst int BLEND_SOFT = 2;\nconst int BLEND_OVERLAY = 3;\nconst int BLEND_LIGHTEN = 4;\n\nconst float GOLDEN_ANGLE = 2.39996323;\nconst int MAX_SAMPLES = 48;\nconst vec3 LUMA = vec3(0.299, 0.587, 0.114);\n\nfloat hash(vec2 p) {\n p = fract(p * vec2(123.34, 456.21));\n p += dot(p, p + 45.32);\n return fract(p.x * p.y);\n}\n\nvec3 hexToRgb(int h) {\n return vec3(\n float((h >> 16) & 255),\n float((h >> 8) & 255),\n float(h & 255)\n ) * (1.0 / 255.0);\n}\n\nvec3 blend(vec3 base, vec3 glow, int mode) {\n if (mode == BLEND_SCREEN) {\n return 1.0 - (1.0 - base) * (1.0 - glow);\n }\n if (mode == BLEND_SOFT) {\n return mix(\n base - (1.0 - 2.0 * glow) * base * (1.0 - base),\n base + (2.0 * glow - 1.0) * (sqrt(base) - base),\n step(0.5, glow)\n );\n }\n if (mode == BLEND_OVERLAY) {\n return mix(\n 2.0 * base * glow,\n 1.0 - 2.0 * (1.0 - base) * (1.0 - glow),\n step(0.5, base)\n );\n }\n if (mode == BLEND_LIGHTEN) {\n return max(base, glow);\n }\n return base + glow;\n}\n\nvoid main() {\n vec4 original = texture(u_image0, v_texCoord);\n \n float intensity = u_float0 * 0.05;\n float radius = u_float1 * u_float1 * 0.012;\n \n if (intensity < 0.001 || radius < 0.1) {\n fragColor = original;\n return;\n }\n \n float threshold = 1.0 - u_float2 * 0.01;\n float t0 = threshold - 0.15;\n float t1 = threshold + 0.15;\n \n vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));\n float radius2 = radius * radius;\n \n float sampleScale = clamp(radius * 0.75, 0.35, 1.0);\n int samples = int(float(MAX_SAMPLES) * sampleScale);\n \n float noise = hash(gl_FragCoord.xy);\n float angleOffset = noise * GOLDEN_ANGLE;\n float radiusJitter = 0.85 
+ noise * 0.3;\n \n float ca = cos(GOLDEN_ANGLE);\n float sa = sin(GOLDEN_ANGLE);\n vec2 dir = vec2(cos(angleOffset), sin(angleOffset));\n \n vec3 glow = vec3(0.0);\n float totalWeight = 0.0;\n \n // Center tap\n float centerMask = smoothstep(t0, t1, dot(original.rgb, LUMA));\n glow += original.rgb * centerMask * 2.0;\n totalWeight += 2.0;\n \n for (int i = 1; i < MAX_SAMPLES; i++) {\n if (i >= samples) break;\n \n float fi = float(i);\n float dist = sqrt(fi / float(samples)) * radius * radiusJitter;\n \n vec2 offset = dir * dist * texelSize;\n vec3 c = texture(u_image0, v_texCoord + offset).rgb;\n float mask = smoothstep(t0, t1, dot(c, LUMA));\n \n float w = 1.0 - (dist * dist) / (radius2 * 1.5);\n w = max(w, 0.0);\n w *= w;\n \n glow += c * mask * w;\n totalWeight += w;\n \n dir = vec2(\n dir.x * ca - dir.y * sa,\n dir.x * sa + dir.y * ca\n );\n }\n \n glow *= intensity / max(totalWeight, 0.001);\n \n if (u_int1 > 0) {\n glow *= hexToRgb(u_int1);\n }\n \n vec3 result = blend(original.rgb, glow, u_int0);\n result += (noise - 0.5) * (1.0 / 255.0);\n \n fragColor = vec4(clamp(result, 0.0, 1.0), original.a);\n}",
"from_input"
]
},
@@ -575,8 +575,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Adds a glow/bloom effect around bright image areas via GPU fragment shader."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Hue and Saturation.json b/blueprints/Hue and Saturation.json
index 1a2df8937..cddf0154a 100644
--- a/blueprints/Hue and Saturation.json
+++ b/blueprints/Hue and Saturation.json
@@ -752,8 +752,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Adjusts hue, saturation, and lightness of an image using a real-time GPU fragment shader."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Image Blur.json b/blueprints/Image Blur.json
index b1d449e32..0ca8d9931 100644
--- a/blueprints/Image Blur.json
+++ b/blueprints/Image Blur.json
@@ -331,7 +331,7 @@
"Node name for S&R": "GLSLShader"
},
"widgets_values": [
- "#version 300 es\n#pragma passes 2\nprecision highp float;\n\n// Blur type constants\nconst int BLUR_GAUSSIAN = 0;\nconst int BLUR_BOX = 1;\nconst int BLUR_RADIAL = 2;\n\n// Radial blur config\nconst int RADIAL_SAMPLES = 12;\nconst float RADIAL_STRENGTH = 0.0003;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform int u_int0; // Blur type (BLUR_GAUSSIAN, BLUR_BOX, BLUR_RADIAL)\nuniform float u_float0; // Blur radius/amount\nuniform int u_pass; // Pass index (0 = horizontal, 1 = vertical)\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nfloat gaussian(float x, float sigma) {\n return exp(-(x * x) / (2.0 * sigma * sigma));\n}\n\nvoid main() {\n vec2 texelSize = 1.0 / u_resolution;\n float radius = max(u_float0, 0.0);\n\n // Radial (angular) blur - single pass, doesn't use separable\n if (u_int0 == BLUR_RADIAL) {\n // Only execute on first pass\n if (u_pass > 0) {\n fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n vec2 center = vec2(0.5);\n vec2 dir = v_texCoord - center;\n float dist = length(dir);\n\n if (dist < 1e-4) {\n fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n vec4 sum = vec4(0.0);\n float totalWeight = 0.0;\n float angleStep = radius * RADIAL_STRENGTH;\n\n dir /= dist;\n\n float cosStep = cos(angleStep);\n float sinStep = sin(angleStep);\n\n float negAngle = -float(RADIAL_SAMPLES) * angleStep;\n vec2 rotDir = vec2(\n dir.x * cos(negAngle) - dir.y * sin(negAngle),\n dir.x * sin(negAngle) + dir.y * cos(negAngle)\n );\n\n for (int i = -RADIAL_SAMPLES; i <= RADIAL_SAMPLES; i++) {\n vec2 uv = center + rotDir * dist;\n float w = 1.0 - abs(float(i)) / float(RADIAL_SAMPLES);\n sum += texture(u_image0, uv) * w;\n totalWeight += w;\n\n rotDir = vec2(\n rotDir.x * cosStep - rotDir.y * sinStep,\n rotDir.x * sinStep + rotDir.y * cosStep\n );\n }\n\n fragColor0 = sum / max(totalWeight, 0.001);\n return;\n }\n\n // Separable Gaussian / Box blur\n int samples = int(ceil(radius));\n\n if (samples == 0) {\n 
fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n // Direction: pass 0 = horizontal, pass 1 = vertical\n vec2 dir = (u_pass == 0) ? vec2(1.0, 0.0) : vec2(0.0, 1.0);\n\n vec4 color = vec4(0.0);\n float totalWeight = 0.0;\n float sigma = radius / 2.0;\n\n for (int i = -samples; i <= samples; i++) {\n vec2 offset = dir * float(i) * texelSize;\n vec4 sample_color = texture(u_image0, v_texCoord + offset);\n\n float weight;\n if (u_int0 == BLUR_GAUSSIAN) {\n weight = gaussian(float(i), sigma);\n } else {\n // BLUR_BOX\n weight = 1.0;\n }\n\n color += sample_color * weight;\n totalWeight += weight;\n }\n\n fragColor0 = color / totalWeight;\n}\n",
+ "#version 300 es\n#pragma passes 2\nprecision highp float;\n\n// Blur type constants\nconst int BLUR_GAUSSIAN = 0;\nconst int BLUR_BOX = 1;\nconst int BLUR_RADIAL = 2;\n\n// Radial blur config\nconst int RADIAL_SAMPLES = 12;\nconst float RADIAL_STRENGTH = 0.0003;\n\nuniform sampler2D u_image0;\nuniform int u_int0; // Blur type (BLUR_GAUSSIAN, BLUR_BOX, BLUR_RADIAL)\nuniform float u_float0; // Blur radius/amount\nuniform int u_pass; // Pass index (0 = horizontal, 1 = vertical)\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nfloat gaussian(float x, float sigma) {\n return exp(-(x * x) / (2.0 * sigma * sigma));\n}\n\nvoid main() {\n vec2 texelSize = 1.0 / vec2(textureSize(u_image0, 0));\n float radius = max(u_float0, 0.0);\n\n // Radial (angular) blur - single pass, doesn't use separable\n if (u_int0 == BLUR_RADIAL) {\n // Only execute on first pass\n if (u_pass > 0) {\n fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n vec2 center = vec2(0.5);\n vec2 dir = v_texCoord - center;\n float dist = length(dir);\n\n if (dist < 1e-4) {\n fragColor0 = texture(u_image0, v_texCoord);\n return;\n }\n\n vec4 sum = vec4(0.0);\n float totalWeight = 0.0;\n float angleStep = radius * RADIAL_STRENGTH;\n\n dir /= dist;\n\n float cosStep = cos(angleStep);\n float sinStep = sin(angleStep);\n\n float negAngle = -float(RADIAL_SAMPLES) * angleStep;\n vec2 rotDir = vec2(\n dir.x * cos(negAngle) - dir.y * sin(negAngle),\n dir.x * sin(negAngle) + dir.y * cos(negAngle)\n );\n\n for (int i = -RADIAL_SAMPLES; i <= RADIAL_SAMPLES; i++) {\n vec2 uv = center + rotDir * dist;\n float w = 1.0 - abs(float(i)) / float(RADIAL_SAMPLES);\n sum += texture(u_image0, uv) * w;\n totalWeight += w;\n\n rotDir = vec2(\n rotDir.x * cosStep - rotDir.y * sinStep,\n rotDir.x * sinStep + rotDir.y * cosStep\n );\n }\n\n fragColor0 = sum / max(totalWeight, 0.001);\n return;\n }\n\n // Separable Gaussian / Box blur\n int samples = int(ceil(radius));\n\n if (samples == 0) {\n fragColor0 = 
texture(u_image0, v_texCoord);\n return;\n }\n\n // Direction: pass 0 = horizontal, pass 1 = vertical\n vec2 dir = (u_pass == 0) ? vec2(1.0, 0.0) : vec2(0.0, 1.0);\n\n vec4 color = vec4(0.0);\n float totalWeight = 0.0;\n float sigma = radius / 2.0;\n\n for (int i = -samples; i <= samples; i++) {\n vec2 offset = dir * float(i) * texelSize;\n vec4 sample_color = texture(u_image0, v_texCoord + offset);\n\n float weight;\n if (u_int0 == BLUR_GAUSSIAN) {\n weight = gaussian(float(i), sigma);\n } else {\n // BLUR_BOX\n weight = 1.0;\n }\n\n color += sample_color * weight;\n totalWeight += weight;\n }\n\n fragColor0 = color / totalWeight;\n}\n",
"from_input"
]
}
@@ -374,7 +374,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Blur"
+ "category": "Image Tools/Blur",
+ "description": "Applies Gaussian, Box, or Radial blur to soften images and create stylized depth or motion effects."
}
]
}
diff --git a/blueprints/Image Captioning (gemini).json b/blueprints/Image Captioning (gemini).json
index 98cfb8999..9005e5191 100644
--- a/blueprints/Image Captioning (gemini).json
+++ b/blueprints/Image Captioning (gemini).json
@@ -310,8 +310,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Text generation/Image Captioning"
+ "category": "Image Tools",
+ "description": "Generates descriptive captions for images using Google's Gemini multimodal LLM."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Image Channels.json b/blueprints/Image Channels.json
index 9c7b675b2..b6fdff5be 100644
--- a/blueprints/Image Channels.json
+++ b/blueprints/Image Channels.json
@@ -315,8 +315,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Manipulates individual RGBA channels for masking, compositing, and channel effects."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Image to Depth Map (Lotus).json b/blueprints/Image Depth Estimation (Lotus Depth).json
similarity index 92%
rename from blueprints/Image to Depth Map (Lotus).json
rename to blueprints/Image Depth Estimation (Lotus Depth).json
index 089f2cd42..8aa338d0d 100644
--- a/blueprints/Image to Depth Map (Lotus).json
+++ b/blueprints/Image Depth Estimation (Lotus Depth).json
@@ -1,19 +1,18 @@
{
- "id": "6af0a6c1-0161-4528-8685-65776e838d44",
"revision": 0,
- "last_node_id": 75,
- "last_link_id": 245,
+ "last_node_id": 76,
+ "last_link_id": 0,
"nodes": [
{
- "id": 75,
- "type": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf",
+ "id": 76,
+ "type": "96338968-1242-4f02-b6a1-d496af4bcffe",
"pos": [
- 600,
- 830
+ 670,
+ 1280
],
"size": [
400,
- 110
+ 201.3125
],
"flags": {},
"order": 0,
@@ -59,47 +58,44 @@
"links": []
}
],
+ "title": "Image Depth Estimation (Lotus Depth)",
"properties": {
"proxyWidgets": [
[
- "-1",
+ "28",
"sigma"
],
[
- "-1",
+ "10",
"unet_name"
],
[
- "-1",
+ "14",
"vae_name"
]
],
"cnr_id": "comfy-core",
"ver": "0.14.1"
},
- "widgets_values": [
- 999.0000000000002,
- "lotus-depth-d-v1-1.safetensors",
- "vae-ft-mse-840000-ema-pruned.safetensors"
- ]
+ "widgets_values": []
}
],
"links": [],
- "groups": [],
+ "version": 0.4,
"definitions": {
"subgraphs": [
{
- "id": "488652fd-6edf-4d06-8f9f-4d84d3a34eaf",
+ "id": "96338968-1242-4f02-b6a1-d496af4bcffe",
"version": 1,
"state": {
"lastGroupId": 1,
- "lastNodeId": 75,
+ "lastNodeId": 76,
"lastLinkId": 245,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
- "name": "local-Image to Depth Map (Lotus)",
+ "name": "Image Depth Estimation (Lotus Depth)",
"inputNode": {
"id": -10,
"bounding": [
@@ -191,12 +187,12 @@
"id": 10,
"type": "UNETLoader",
"pos": [
- 108.05555555555557,
- -253.05555555555557
+ 110,
+ -250
],
"size": [
- 254.93706597222226,
- 82
+ 260,
+ 90
],
"flags": {},
"order": 4,
@@ -234,9 +230,9 @@
}
],
"properties": {
+ "Node name for S&R": "UNETLoader",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "UNETLoader",
"models": [
{
"name": "lotus-depth-d-v1-1.safetensors",
@@ -255,12 +251,12 @@
"id": 18,
"type": "DisableNoise",
"pos": [
- 607.0641494069639,
- -268.33337840371513
+ 610,
+ -270
],
"size": [
- 175,
- 33.333333333333336
+ 180,
+ 40
],
"flags": {},
"order": 0,
@@ -278,26 +274,25 @@
}
],
"properties": {
+ "Node name for S&R": "DisableNoise",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "DisableNoise",
"widget_ue_connectable": {}
- },
- "widgets_values": []
+ }
},
{
- "id": 23,
+ "id": 74,
"type": "VAEEncode",
"pos": [
620,
160
],
"size": [
- 175,
+ 180,
50
],
"flags": {},
- "order": 10,
+ "order": 11,
"mode": 0,
"inputs": [
{
@@ -325,12 +320,11 @@
}
],
"properties": {
+ "Node name for S&R": "VAEEncode",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "VAEEncode",
"widget_ue_connectable": {}
- },
- "widgets_values": []
+ }
},
{
"id": 21,
@@ -341,7 +335,7 @@
],
"size": [
210,
- 58
+ 60
],
"flags": {},
"order": 1,
@@ -369,9 +363,9 @@
}
],
"properties": {
+ "Node name for S&R": "KSamplerSelect",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "KSamplerSelect",
"widget_ue_connectable": {}
},
"widgets_values": [
@@ -386,7 +380,7 @@
-170
],
"size": [
- 175,
+ 180,
50
],
"flags": {},
@@ -418,12 +412,11 @@
}
],
"properties": {
+ "Node name for S&R": "BasicGuider",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "BasicGuider",
"widget_ue_connectable": {}
- },
- "widgets_values": []
+ }
},
{
"id": 16,
@@ -433,8 +426,8 @@
-130
],
"size": [
- 295.99609375,
- 271.65798611111114
+ 300,
+ 280
],
"flags": {},
"order": 6,
@@ -490,12 +483,11 @@
}
],
"properties": {
+ "Node name for S&R": "SamplerCustomAdvanced",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "SamplerCustomAdvanced",
"widget_ue_connectable": {}
- },
- "widgets_values": []
+ }
},
{
"id": 28,
@@ -506,10 +498,10 @@
],
"size": [
210,
- 58
+ 60
],
"flags": {},
- "order": 11,
+ "order": 10,
"mode": 0,
"inputs": [
{
@@ -540,9 +532,9 @@
}
],
"properties": {
+ "Node name for S&R": "SetFirstSigma",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "SetFirstSigma",
"widget_ue_connectable": {}
},
"widgets_values": [
@@ -557,7 +549,7 @@
-120
],
"size": [
- 175,
+ 180,
50
],
"flags": {},
@@ -589,12 +581,11 @@
}
],
"properties": {
+ "Node name for S&R": "VAEDecode",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "VAEDecode",
"widget_ue_connectable": {}
- },
- "widgets_values": []
+ }
},
{
"id": 22,
@@ -604,8 +595,8 @@
-220
],
"size": [
- 175,
- 33.333333333333336
+ 180,
+ 40
],
"flags": {},
"order": 9,
@@ -630,12 +621,11 @@
}
],
"properties": {
+ "Node name for S&R": "ImageInvert",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "ImageInvert",
"widget_ue_connectable": {}
- },
- "widgets_values": []
+ }
},
{
"id": 14,
@@ -645,8 +635,8 @@
-90
],
"size": [
- 254.93706597222226,
- 58
+ 260,
+ 60
],
"flags": {},
"order": 5,
@@ -675,9 +665,9 @@
}
],
"properties": {
+ "Node name for S&R": "VAELoader",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "VAELoader",
"models": [
{
"name": "vae-ft-mse-840000-ema-pruned.safetensors",
@@ -692,15 +682,15 @@
]
},
{
- "id": 68,
+ "id": 75,
"type": "LotusConditioning",
"pos": [
400,
-150
],
"size": [
- 175,
- 33.333333333333336
+ 180,
+ 40
],
"flags": {},
"order": 2,
@@ -718,12 +708,11 @@
}
],
"properties": {
+ "Node name for S&R": "LotusConditioning",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "LotusConditioning",
"widget_ue_connectable": {}
- },
- "widgets_values": []
+ }
},
{
"id": 20,
@@ -734,7 +723,7 @@
],
"size": [
210,
- 106
+ 110
],
"flags": {},
"order": 8,
@@ -786,9 +775,9 @@
}
],
"properties": {
+ "Node name for S&R": "BasicScheduler",
"cnr_id": "comfy-core",
"ver": "0.3.34",
- "Node name for S&R": "BasicScheduler",
"widget_ue_connectable": {}
},
"widgets_values": [
@@ -850,7 +839,7 @@
},
{
"id": 201,
- "origin_id": 23,
+ "origin_id": 74,
"origin_slot": 0,
"target_id": 16,
"target_slot": 4,
@@ -866,7 +855,7 @@
},
{
"id": 238,
- "origin_id": 68,
+ "origin_id": 75,
"origin_slot": 0,
"target_id": 19,
"target_slot": 1,
@@ -892,7 +881,7 @@
"id": 38,
"origin_id": 14,
"origin_slot": 0,
- "target_id": 23,
+ "target_id": 74,
"target_slot": 1,
"type": "VAE"
},
@@ -908,7 +897,7 @@
"id": 37,
"origin_id": -10,
"origin_slot": 0,
- "target_id": 23,
+ "target_id": 74,
"target_slot": 0,
"type": "IMAGE"
},
@@ -948,11 +937,11 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image generation and editing/Depth to image"
+ "category": "Conditioning & Preprocessors/Depth",
+ "description": "Estimates a monocular depth map from an input image using the Lotus depth estimation model."
}
]
},
- "config": {},
"extra": {
"ds": {
"scale": 1.3589709866044692,
@@ -960,8 +949,6 @@
-138.53613935617864,
-786.0629126022195
]
- },
- "workflowRendererVersion": "LG"
- },
- "version": 0.4
-}
+ }
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Image Depth Estimation (MoGe).json b/blueprints/Image Depth Estimation (MoGe).json
new file mode 100644
index 000000000..e2d5d1298
--- /dev/null
+++ b/blueprints/Image Depth Estimation (MoGe).json
@@ -0,0 +1,1154 @@
+{
+ "revision": 0,
+ "last_node_id": 49,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 49,
+ "type": "ca1fac5f-abe5-4729-b7fe-2299f6630a65",
+ "pos": [
+ -3970,
+ 5000
+ ],
+ "size": [
+ 430,
+ 330
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source_image",
+ "name": "source_image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "inference_resolution",
+ "name": "inference_resolution",
+ "type": "INT",
+ "widget": {
+ "name": "inference_resolution"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "inference_batch_size",
+ "name": "inference_batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "inference_batch_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "moge_model",
+ "name": "moge_model",
+ "type": "COMBO",
+ "widget": {
+ "name": "moge_model"
+ },
+ "link": null
+ },
+ {
+ "label": "auto_resize_input",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "depth_colored",
+ "name": "depth_colored",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "localized_name": "depth",
+ "name": "depth",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": []
+ }
+ ],
+ "title": "Image Depth Estimation (MoGe)",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "13",
+ "resolution_level"
+ ],
+ [
+ "13",
+ "batch_size"
+ ],
+ [
+ "32",
+ "model_name"
+ ],
+ [
+ "53",
+ "switch"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "ca1fac5f-abe5-4729-b7fe-2299f6630a65",
+ "version": 1,
+ "state": {
+ "lastGroupId": 1,
+ "lastNodeId": 69,
+ "lastLinkId": 90,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Depth Estimation (MoGe)",
+ "description": "Estimates monocular depth from an input image using MoGe, outputting both raw and colorized depth maps plus a mask.",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -5130,
+ 5320,
+ 167.337890625,
+ 148
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -3090,
+ 4966,
+ 129,
+ 108
+ ]
+ },
+ "inputs": [
+ {
+ "id": "cc8ce79d-ba20-4a25-a51c-c2afcd35e520",
+ "name": "source_image",
+ "type": "IMAGE",
+ "linkIds": [
+ 48,
+ 55,
+ 56,
+ 82
+ ],
+ "localized_name": "source_image",
+ "pos": [
+ -4986.662109375,
+ 5344
+ ]
+ },
+ {
+ "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52",
+ "name": "inference_resolution",
+ "type": "INT",
+ "linkIds": [
+ 73
+ ],
+ "localized_name": "inference_resolution",
+ "pos": [
+ -4986.662109375,
+ 5364
+ ]
+ },
+ {
+ "id": "616638fe-f603-4d10-bae9-fc87c134380f",
+ "name": "inference_batch_size",
+ "type": "INT",
+ "linkIds": [
+ 74
+ ],
+ "localized_name": "inference_batch_size",
+ "pos": [
+ -4986.662109375,
+ 5384
+ ]
+ },
+ {
+ "id": "65694805-186e-4181-a721-df8b5af49d31",
+ "name": "moge_model",
+ "type": "COMBO",
+ "linkIds": [
+ 79
+ ],
+ "localized_name": "moge_model",
+ "pos": [
+ -4986.662109375,
+ 5404
+ ]
+ },
+ {
+ "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 83
+ ],
+ "label": "auto_resize_input",
+ "pos": [
+ -4986.662109375,
+ 5424
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "59c37b52-074f-49fc-9731-483f899c12c4",
+ "name": "depth_colored",
+ "type": "IMAGE",
+ "linkIds": [
+ 36
+ ],
+ "localized_name": "depth_colored",
+ "pos": [
+ -3066,
+ 4990
+ ]
+ },
+ {
+ "id": "f583e936-da5c-4630-9901-391fa605c1f8",
+ "name": "depth",
+ "type": "IMAGE",
+ "linkIds": [
+ 40
+ ],
+ "localized_name": "depth",
+ "pos": [
+ -3066,
+ 5010
+ ]
+ },
+ {
+ "id": "6845b6a1-1980-454a-9451-314f24495c1d",
+ "name": "MASK",
+ "type": "MASK",
+ "linkIds": [
+ 86
+ ],
+ "pos": [
+ -3066,
+ 5030
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 13,
+ "type": "MoGeInference",
+ "pos": [
+ -3790,
+ 5180
+ ],
+ "size": [
+ 270,
+ 230
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_model",
+ "name": "moge_model",
+ "type": "MOGE_MODEL",
+ "link": 58
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 81
+ },
+ {
+ "localized_name": "resolution_level",
+ "name": "resolution_level",
+ "type": "INT",
+ "widget": {
+ "name": "resolution_level"
+ },
+ "link": 73
+ },
+ {
+ "localized_name": "fov_x_degrees",
+ "name": "fov_x_degrees",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fov_x_degrees"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": 74
+ },
+ {
+ "localized_name": "force_projection",
+ "name": "force_projection",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "force_projection"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "apply_mask",
+ "name": "apply_mask",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "apply_mask"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "links": [
+ 35,
+ 39,
+ 61
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeInference",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 3,
+ 0,
+ 4,
+ true,
+ true
+ ]
+ },
+ {
+ "id": 23,
+ "type": "MoGeRender",
+ "pos": [
+ -3430,
+ 4870
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 35
+ },
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "COMBO",
+ "widget": {
+ "name": "output"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 36
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeRender",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "depth_colored"
+ ]
+ },
+ {
+ "id": 25,
+ "type": "MoGeRender",
+ "pos": [
+ -3430,
+ 5030
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 39
+ },
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "COMBO",
+ "widget": {
+ "name": "output"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 40
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeRender",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "depth"
+ ]
+ },
+ {
+ "id": 32,
+ "type": "LoadMoGeModel",
+ "pos": [
+ -4180,
+ 4880
+ ],
+ "size": [
+ 270,
+ 140
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 79
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MOGE_MODEL",
+ "name": "MOGE_MODEL",
+ "type": "MOGE_MODEL",
+ "links": [
+ 58
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadMoGeModel",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "models": [
+ {
+ "name": "moge_2_vitl_normal_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors",
+ "directory": "geometry_estimation"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "moge_2_vitl_normal_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 36,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -4720,
+ 4910
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 49
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": [
+ 53
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a > 2048"
+ ]
+ },
+ {
+ "id": 37,
+ "type": "GetImageSize",
+ "pos": [
+ -4980,
+ 4910
+ ],
+ "size": [
+ 230,
+ 160
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 48
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 49
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 40,
+ "type": "ResizeImagesByLongerEdge",
+ "pos": [
+ -4650,
+ 5210
+ ],
+ "size": [
+ 310,
+ 110
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 55
+ },
+ {
+ "localized_name": "longer_edge",
+ "name": "longer_edge",
+ "type": "INT",
+ "widget": {
+ "name": "longer_edge"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 54
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ResizeImagesByLongerEdge",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 2048
+ ]
+ },
+ {
+ "id": 42,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -4180,
+ 5060
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 56
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 54
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 53
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 80
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 45,
+ "type": "MoGeRender",
+ "pos": [
+ -3430,
+ 5200
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 61
+ },
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "COMBO",
+ "widget": {
+ "name": "output"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 85
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeRender",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "mask"
+ ]
+ },
+ {
+ "id": 53,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -4160,
+ 5340
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 82
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 80
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 83
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 81
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 68,
+ "type": "ImageToMask",
+ "pos": [
+ -3420,
+ 5360
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 85
+ },
+ {
+ "localized_name": "channel",
+ "name": "channel",
+ "type": "COMBO",
+ "widget": {
+ "name": "channel"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MASK",
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 86
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageToMask"
+ },
+ "widgets_values": [
+ "red"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "auto_resize_if_width_gt_2048",
+ "bounding": [
+ -5000,
+ 4840,
+ 690,
+ 280
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 58,
+ "origin_id": 32,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 0,
+ "type": "MOGE_MODEL"
+ },
+ {
+ "id": 35,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 23,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 39,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 25,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 49,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 36,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 54,
+ "origin_id": 40,
+ "origin_slot": 0,
+ "target_id": 42,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 53,
+ "origin_id": 36,
+ "origin_slot": 2,
+ "target_id": 42,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 61,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 45,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 48,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 37,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 55,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 40,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 56,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 42,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 36,
+ "origin_id": 23,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 40,
+ "origin_id": 25,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 73,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 13,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 74,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 13,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 79,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 32,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 80,
+ "origin_id": 42,
+ "origin_slot": 0,
+ "target_id": 53,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 81,
+ "origin_id": 53,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 82,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 53,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 83,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 53,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 85,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 68,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 86,
+ "origin_id": 68,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "MASK"
+ }
+ ],
+ "extra": {},
+ "category": "Conditioning & Preprocessors/Depth"
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Video Inpaint(Wan2.1 VACE).json b/blueprints/Image Edit (FireRed Image Edit 1.1).json
similarity index 53%
rename from blueprints/Video Inpaint(Wan2.1 VACE).json
rename to blueprints/Image Edit (FireRed Image Edit 1.1).json
index f404e6773..b82c7d18b 100644
--- a/blueprints/Video Inpaint(Wan2.1 VACE).json
+++ b/blueprints/Image Edit (FireRed Image Edit 1.1).json
@@ -1,59 +1,66 @@
{
- "id": "2f429c60-2e03-4117-908b-31e1fab04bba",
"revision": 0,
- "last_node_id": 229,
- "last_link_id": 366,
+ "last_node_id": 213,
+ "last_link_id": 0,
"nodes": [
{
- "id": 229,
- "type": "53a657f3-c9eb-40f2-9ebd-1ed77d25ed67",
+ "id": 213,
+ "type": "e35fbbeb-d7b1-46d1-a74e-959517d0fb1a",
"pos": [
- -230,
- 160
+ -700,
+ -470
],
"size": [
- 400,
- 480
+ 500,
+ 0
],
"flags": {},
- "order": 0,
+ "order": 2,
"mode": 0,
"inputs": [
{
- "label": "video mask",
- "localized_name": "mask",
- "name": "mask",
- "type": "MASK",
- "link": null
- },
- {
- "localized_name": "video",
- "name": "video",
- "type": "VIDEO",
- "link": null
- },
- {
- "name": "width",
- "type": "INT",
- "widget": {
- "name": "width"
- },
- "link": null
- },
- {
- "name": "height",
- "type": "INT",
- "widget": {
- "name": "height"
- },
- "link": null
- },
- {
- "label": "reference image",
- "name": "reference_image_1",
+ "localized_name": "image",
+ "name": "image",
"type": "IMAGE",
"link": null
},
+ {
+ "label": "image2 (optional)",
+ "name": "image2_1",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "label": "image3 (optional)",
+ "name": "image3_1",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": null
+ },
+ {
+ "label": "enable_turbo_mode",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
{
"name": "unet_name",
"type": "COMBO",
@@ -62,14 +69,6 @@
},
"link": null
},
- {
- "name": "lora_name",
- "type": "COMBO",
- "widget": {
- "name": "lora_name"
- },
- "link": null
- },
{
"name": "clip_name",
"type": "COMBO",
@@ -85,376 +84,276 @@
"name": "vae_name"
},
"link": null
+ },
+ {
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
}
],
"outputs": [
{
- "localized_name": "VIDEO",
- "name": "VIDEO",
- "type": "VIDEO",
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
"links": []
}
],
"properties": {
"proxyWidgets": [
[
- "6",
- "text"
+ "208",
+ "prompt"
],
[
- "-1",
- "width"
+ "207",
+ "value"
],
[
- "-1",
- "height"
- ],
- [
- "3",
+ "210",
"seed"
],
[
- "3",
- "control_after_generate"
- ],
- [
- "-1",
+ "205",
"unet_name"
],
[
- "-1",
- "lora_name"
- ],
- [
- "-1",
+ "203",
"clip_name"
],
[
- "-1",
+ "202",
"vae_name"
+ ],
+ [
+ "204",
+ "lora_name"
+ ],
+ [
+ "210",
+ "control_after_generate"
]
],
"cnr_id": "comfy-core",
- "ver": "0.13.0"
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
},
- "widgets_values": [
- null,
- 720,
- 720,
- null,
- null,
- "wan2.1_vace_14B_fp16.safetensors",
- "Wan21_CausVid_14B_T2V_lora_rank32.safetensors",
- "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
- "wan_2.1_vae.safetensors"
- ]
+ "widgets_values": [],
+ "title": "Image Edit (FireRed Image Edit 1.1)"
}
],
"links": [],
- "groups": [],
+ "version": 0.4,
"definitions": {
"subgraphs": [
{
- "id": "53a657f3-c9eb-40f2-9ebd-1ed77d25ed67",
+ "id": "e35fbbeb-d7b1-46d1-a74e-959517d0fb1a",
"version": 1,
"state": {
- "lastGroupId": 25,
- "lastNodeId": 229,
- "lastLinkId": 366,
+ "lastGroupId": 8,
+ "lastNodeId": 213,
+ "lastLinkId": 378,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
- "name": "local-Video Inpaint(Wan2.1 VACE)",
+ "name": "Image Edit (FireRed Image Edit 1.1)",
"inputNode": {
"id": -10,
"bounding": [
- -970,
- 800,
- 132.54296875,
- 220
+ -1670,
+ -1370,
+ 151.744140625,
+ 240
]
},
"outputNode": {
"id": -20,
"bounding": [
- 1480,
- 535,
+ 1860,
+ -1340,
120,
60
]
},
"inputs": [
{
- "id": "9fdda38d-6aa7-48ad-b425-f493d8aa585c",
- "name": "mask",
- "type": "MASK",
- "linkIds": [
- 351,
- 335,
- 345
- ],
- "localized_name": "mask",
- "label": "video mask",
- "pos": [
- -857.45703125,
- 820
- ]
- },
- {
- "id": "8b1788cc-46d2-4f40-8b33-70fd56b4cb24",
- "name": "video",
- "type": "VIDEO",
- "linkIds": [
- 336
- ],
- "localized_name": "video",
- "pos": [
- -857.45703125,
- 840
- ]
- },
- {
- "id": "09393f21-257e-4476-bb02-54899a8252b8",
- "name": "width",
- "type": "INT",
- "linkIds": [
- 355
- ],
- "pos": [
- -857.45703125,
- 860
- ]
- },
- {
- "id": "07a030f7-7eac-4b3f-b8f3-f00ee87b191d",
- "name": "height",
- "type": "INT",
- "linkIds": [
- 356
- ],
- "pos": [
- -857.45703125,
- 880
- ]
- },
- {
- "id": "255908d3-6cc9-48fc-b76b-ab9fb72695bc",
- "name": "reference_image_1",
+ "id": "1d810e30-f1fb-4d10-95f8-3c5f7db2c8b7",
+ "name": "image",
"type": "IMAGE",
"linkIds": [
- 361
+ 371
],
- "label": "reference image",
+ "localized_name": "image",
"pos": [
- -857.45703125,
- 900
+ -1538.255859375,
+ -1350
]
},
{
- "id": "18a5d241-523c-433d-ae05-25b6e69d1e29",
- "name": "unet_name",
- "type": "COMBO",
+ "id": "a8decf32-2262-4cdd-9e6b-c0ca7d4cdebe",
+ "name": "image2_1",
+ "type": "IMAGE",
"linkIds": [
- 363
+ 355,
+ 356
],
+ "label": "image2 (optional)",
"pos": [
- -857.45703125,
- 920
+ -1538.255859375,
+ -1330
]
},
{
- "id": "d7576e1b-da5f-402f-81b2-d37f838b1f8f",
- "name": "lora_name",
- "type": "COMBO",
+ "id": "3ff7a4ed-8e3d-45d4-b1d8-40ed88a6def6",
+ "name": "image3_1",
+ "type": "IMAGE",
+ "linkIds": [
+ 357,
+ 358
+ ],
+ "label": "image3 (optional)",
+ "pos": [
+ -1538.255859375,
+ -1310
+ ]
+ },
+ {
+ "id": "01d9e68c-c664-4584-9cde-66f60e54eb3c",
+ "name": "prompt",
+ "type": "STRING",
+ "linkIds": [
+ 359
+ ],
+ "pos": [
+ -1538.255859375,
+ -1290
+ ]
+ },
+ {
+ "id": "97d24b10-6540-48c4-81eb-a432832f5729",
+ "name": "value",
+ "type": "BOOLEAN",
"linkIds": [
364
],
+ "label": "enable_turbo_mode",
"pos": [
- -857.45703125,
- 940
+ -1538.255859375,
+ -1270
]
},
{
- "id": "41676a3e-c710-4723-821e-f651ad3784b1",
+ "id": "15890efb-ba15-41cd-91ef-5adad7a52167",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 372
+ ],
+ "pos": [
+ -1538.255859375,
+ -1250
+ ]
+ },
+ {
+ "id": "43f22fe2-6836-4f75-8146-04c84fbba75d",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 373
+ ],
+ "pos": [
+ -1538.255859375,
+ -1230
+ ]
+ },
+ {
+ "id": "cd5e4502-2aca-4645-9e2e-ca8719f05bf6",
"name": "clip_name",
"type": "COMBO",
"linkIds": [
- 365
+ 374
],
"pos": [
- -857.45703125,
- 960
+ -1538.255859375,
+ -1210
]
},
{
- "id": "41fc878c-9aa6-4c12-bef3-ceda6b094b7c",
+ "id": "f6ae73dc-39e8-44b2-958d-705ae159ea86",
"name": "vae_name",
"type": "COMBO",
"linkIds": [
- 366
+ 375
],
"pos": [
- -857.45703125,
- 980
+ -1538.255859375,
+ -1190
+ ]
+ },
+ {
+ "id": "66dc179d-e6c9-4485-a2db-a47d25b44363",
+ "name": "lora_name",
+ "type": "COMBO",
+ "linkIds": [
+ 376
+ ],
+ "pos": [
+ -1538.255859375,
+ -1170
]
}
],
"outputs": [
{
- "id": "d4861f39-1011-49dc-80fd-ee318b614a8d",
- "name": "VIDEO",
- "type": "VIDEO",
+ "id": "712c5c76-8620-44e1-9c9d-0798b6cdb77a",
+ "name": "IMAGE",
+ "type": "IMAGE",
"linkIds": [
- 129
+ 292
],
- "localized_name": "VIDEO",
+ "localized_name": "IMAGE",
"pos": [
- 1500,
- 555
+ 1880,
+ -1320
]
}
],
"widgets": [],
"nodes": [
{
- "id": 58,
- "type": "TrimVideoLatent",
+ "id": 193,
+ "type": "ModelSamplingAuraFlow",
"pos": [
- 760,
- 390
+ 1010,
+ -1680
],
"size": [
- 315,
- 60
- ],
- "flags": {
- "collapsed": false
- },
- "order": 13,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "samples",
- "name": "samples",
- "type": "LATENT",
- "link": 116
- },
- {
- "localized_name": "trim_amount",
- "name": "trim_amount",
- "type": "INT",
- "widget": {
- "name": "trim_amount"
- },
- "link": 115
- }
- ],
- "outputs": [
- {
- "localized_name": "LATENT",
- "name": "LATENT",
- "type": "LATENT",
- "links": [
- 117
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "TrimVideoLatent",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {
- "trim_amount": true
- }
- },
- "widgets_values": [
- 0
- ]
- },
- {
- "id": 8,
- "type": "VAEDecode",
- "pos": [
- 770,
- 500
- ],
- "size": [
- 315,
- 46
- ],
- "flags": {
- "collapsed": false
- },
- "order": 11,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "samples",
- "name": "samples",
- "type": "LATENT",
- "link": 117
- },
- {
- "localized_name": "vae",
- "name": "vae",
- "type": "VAE",
- "link": 76
- }
- ],
- "outputs": [
- {
- "localized_name": "IMAGE",
- "name": "IMAGE",
- "type": "IMAGE",
- "slot_index": 0,
- "links": [
- 139
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "VAEDecode",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
- },
- "widgets_values": []
- },
- {
- "id": 48,
- "type": "ModelSamplingSD3",
- "pos": [
- 400,
- 50
- ],
- "size": [
- 315,
- 58
+ 290,
+ 110
],
"flags": {},
- "order": 9,
+ "order": 4,
"mode": 0,
"inputs": [
{
"localized_name": "model",
"name": "model",
"type": "MODEL",
- "link": 279
+ "link": 326
},
{
"localized_name": "shift",
@@ -471,542 +370,1160 @@
"localized_name": "MODEL",
"name": "MODEL",
"type": "MODEL",
- "slot_index": 0,
"links": [
- 280
+ 294
]
}
],
"properties": {
+ "Node name for S&R": "ModelSamplingAuraFlow",
"cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "ModelSamplingSD3",
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
+ "secondTabWidth": 65
},
"widgets_values": [
- 5
+ 3.1
]
},
{
- "id": 219,
- "type": "InvertMask",
+ "id": 194,
+ "type": "ComfySwitchNode",
"pos": [
- 400,
- 990
+ 680,
+ -1690
],
"size": [
- 140,
- 26
+ 260,
+ 140
],
"flags": {},
- "order": 24,
+ "order": 5,
"mode": 0,
"inputs": [
{
- "localized_name": "mask",
- "name": "mask",
- "type": "MASK",
- "link": 351
- }
- ],
- "outputs": [
- {
- "localized_name": "MASK",
- "name": "MASK",
- "type": "MASK",
- "links": [
- 352
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.40",
- "Node name for S&R": "InvertMask"
- },
- "widgets_values": []
- },
- {
- "id": 216,
- "type": "MaskToImage",
- "pos": [
- 560,
- 990
- ],
- "size": [
- 193.2779296875,
- 26
- ],
- "flags": {},
- "order": 23,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "mask",
- "name": "mask",
- "type": "MASK",
- "link": 352
- }
- ],
- "outputs": [
- {
- "localized_name": "IMAGE",
- "name": "IMAGE",
- "type": "IMAGE",
- "links": [
- 334
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.40",
- "Node name for S&R": "MaskToImage"
- },
- "widgets_values": []
- },
- {
- "id": 213,
- "type": "RebatchImages",
- "pos": [
- 410,
- 690
- ],
- "size": [
- 230,
- 60
- ],
- "flags": {},
- "order": 21,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "images",
- "name": "images",
- "type": "IMAGE",
- "link": 360
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 324
},
{
- "localized_name": "batch_size",
- "name": "batch_size",
- "type": "INT",
- "widget": {
- "name": "batch_size"
- },
- "link": 340
- }
- ],
- "outputs": [
- {
- "localized_name": "IMAGE",
- "name": "IMAGE",
- "shape": 6,
- "type": "IMAGE",
- "links": [
- 333
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.40",
- "Node name for S&R": "RebatchImages"
- },
- "widgets_values": [
- 1
- ]
- },
- {
- "id": 68,
- "type": "CreateVideo",
- "pos": [
- 1150,
- 50
- ],
- "size": [
- 270,
- 78
- ],
- "flags": {
- "collapsed": false
- },
- "order": 14,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "images",
- "name": "images",
- "type": "IMAGE",
- "link": 139
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 325
},
{
- "localized_name": "audio",
- "name": "audio",
- "shape": 7,
- "type": "AUDIO",
- "link": 362
- },
- {
- "localized_name": "fps",
- "name": "fps",
- "type": "FLOAT",
- "widget": {
- "name": "fps"
- },
- "link": 353
- }
- ],
- "outputs": [
- {
- "localized_name": "VIDEO",
- "name": "VIDEO",
- "type": "VIDEO",
- "links": [
- 129
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "CreateVideo",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
- },
- "widgets_values": [
- 16
- ]
- },
- {
- "id": 208,
- "type": "ImageCompositeMasked",
- "pos": [
- 410,
- 790
- ],
- "size": [
- 230,
- 146
- ],
- "flags": {},
- "order": 18,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "destination",
- "name": "destination",
- "type": "IMAGE",
- "link": 333
- },
- {
- "localized_name": "source",
- "name": "source",
- "type": "IMAGE",
- "link": 334
- },
- {
- "localized_name": "mask",
- "name": "mask",
- "shape": 7,
- "type": "MASK",
- "link": 335
- },
- {
- "localized_name": "x",
- "name": "x",
- "type": "INT",
- "widget": {
- "name": "x"
- },
- "link": null
- },
- {
- "localized_name": "y",
- "name": "y",
- "type": "INT",
- "widget": {
- "name": "y"
- },
- "link": null
- },
- {
- "localized_name": "resize_source",
- "name": "resize_source",
+ "localized_name": "switch",
+ "name": "switch",
"type": "BOOLEAN",
"widget": {
- "name": "resize_source"
+ "name": "switch"
},
- "link": null
+ "link": 323
}
],
"outputs": [
{
- "localized_name": "IMAGE",
- "name": "IMAGE",
- "type": "IMAGE",
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
"links": [
- 341,
- 344
+ 326
]
}
],
+ "title": "Switch (Model)",
"properties": {
+ "Node name for S&R": "ComfySwitchNode",
"cnr_id": "comfy-core",
- "ver": "0.3.40",
- "Node name for S&R": "ImageCompositeMasked"
- },
- "widgets_values": [
- 0,
- 0,
- true
- ]
- },
- {
- "id": 214,
- "type": "PreviewImage",
- "pos": [
- 760,
- 690
- ],
- "size": [
- 300,
- 300
- ],
- "flags": {},
- "order": 22,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "images",
- "name": "images",
- "type": "IMAGE",
- "link": 341
- }
- ],
- "outputs": [],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.40",
- "Node name for S&R": "PreviewImage"
- },
- "widgets_values": []
- },
- {
- "id": 111,
- "type": "MaskToImage",
- "pos": [
- 20,
- 1270
- ],
- "size": [
- 240,
- 26
- ],
- "flags": {},
- "order": 15,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "mask",
- "name": "mask",
- "type": "MASK",
- "link": 345
- }
- ],
- "outputs": [
- {
- "localized_name": "IMAGE",
- "name": "IMAGE",
- "type": "IMAGE",
- "links": [
- 201
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "MaskToImage",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
+ "secondTabWidth": 65
},
- "widgets_values": []
+ "widgets_values": [
+ false
+ ]
},
{
- "id": 129,
- "type": "RepeatImageBatch",
+ "id": 195,
+ "type": "PrimitiveInt",
"pos": [
- 20,
- 1160
+ 190,
+ -1680
],
"size": [
- 240,
- 60
+ 230,
+ 110
],
"flags": {},
- "order": 16,
+ "order": 0,
"mode": 0,
"inputs": [
{
- "localized_name": "image",
- "name": "image",
- "type": "IMAGE",
- "link": 201
- },
- {
- "localized_name": "amount",
- "name": "amount",
+ "localized_name": "value",
+ "name": "value",
"type": "INT",
"widget": {
- "name": "amount"
- },
- "link": 346
- }
- ],
- "outputs": [
- {
- "localized_name": "IMAGE",
- "name": "IMAGE",
- "type": "IMAGE",
- "links": [
- 202
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "RepeatImageBatch",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {
- "amount": true
- }
- },
- "widgets_values": [
- 17
- ]
- },
- {
- "id": 130,
- "type": "ImageToMask",
- "pos": [
- 20,
- 1050
- ],
- "size": [
- 240,
- 60
- ],
- "flags": {},
- "order": 17,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "image",
- "name": "image",
- "type": "IMAGE",
- "link": 202
- },
- {
- "localized_name": "channel",
- "name": "channel",
- "type": "COMBO",
- "widget": {
- "name": "channel"
+ "name": "value"
},
"link": null
}
],
"outputs": [
{
- "localized_name": "MASK",
- "name": "MASK",
- "type": "MASK",
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
"links": [
- 349
+ 329
]
}
],
+ "title": "Int (Steps)",
"properties": {
+ "Node name for S&R": "PrimitiveInt",
"cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "ImageToMask",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
+ "secondTabWidth": 65
},
"widgets_values": [
- "red"
+ 40,
+ "fixed"
]
},
{
- "id": 3,
- "type": "KSampler",
+ "id": 196,
+ "type": "CFGNorm",
"pos": [
- 770,
- 50
+ 1010,
+ -1510
],
"size": [
- 315,
- 262
+ 290,
+ 110
],
"flags": {},
- "order": 10,
+ "order": 6,
"mode": 0,
"inputs": [
{
"localized_name": "model",
"name": "model",
"type": "MODEL",
- "link": 280
+ "link": 294
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "patched_model",
+ "name": "patched_model",
+ "type": "MODEL",
+ "links": [
+ 295
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CFGNorm",
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 197,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 680,
+ -1250
+ ],
+ "size": [
+ 230,
+ 130
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 333
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 334
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 336
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 335
+ ]
+ }
+ ],
+ "title": "Switch (CFG)",
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 198,
+ "type": "PrimitiveInt",
+ "pos": [
+ 190,
+ -1060
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 337
+ ]
+ }
+ ],
+ "title": "Int (Steps)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 8,
+ "fixed"
+ ]
+ },
+ {
+ "id": 199,
+ "type": "PrimitiveFloat",
+ "pos": [
+ 190,
+ -1500
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 333
+ ]
+ }
+ ],
+ "title": "Float (CFG)",
+ "properties": {
+ "Node name for S&R": "PrimitiveFloat",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 4
+ ]
+ },
+ {
+ "id": 200,
+ "type": "PrimitiveFloat",
+ "pos": [
+ 190,
+ -1230
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 334
+ ]
+ }
+ ],
+ "title": "Float (CFG)",
+ "properties": {
+ "Node name for S&R": "PrimitiveFloat",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 201,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 680,
+ -1470
+ ],
+ "size": [
+ 230,
+ 130
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 329
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 337
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 330
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 345
+ ]
+ }
+ ],
+ "title": "Switch (Steps)",
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 202,
+ "type": "VAELoader",
+ "pos": [
+ -960,
+ -1100
+ ],
+ "size": [
+ 400,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 375
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 298,
+ 299,
+ 300,
+ 314
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAELoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "models": [
+ {
+ "name": "qwen_image_vae.safetensors",
+ "url": "https://huggingface.co/FireRedTeam/FireRed-Image-Edit-1.0-ComfyUI/resolve/main/qwen_image_vae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_image_vae.safetensors"
+ ]
+ },
+ {
+ "id": 203,
+ "type": "CLIPLoader",
+ "pos": [
+ -960,
+ -1400
+ ],
+ "size": [
+ 400,
+ 150
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 374
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 296,
+ 297
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "models": [
+ {
+ "name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/HunyuanVideo_1.5_repackaged/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "qwen_image",
+ "default"
+ ]
+ },
+ {
+ "id": 204,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ 100,
+ -900
+ ],
+ "size": [
+ 400,
+ 140
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 316
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": 376
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 325
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "models": [
+ {
+ "name": "FireRed-Image-Edit-1.0-Lightning-8steps-v1.0.safetensors",
+ "url": "https://huggingface.co/FireRedTeam/FireRed-Image-Edit-1.0-ComfyUI/resolve/main/FireRed-Image-Edit-1.0-Lightning-8steps-v1.0.safetensors",
+ "directory": "loras"
+ }
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "FireRed-Image-Edit-1.0-Lightning-8steps-v1.0.safetensors",
+ 1
+ ]
+ },
+ {
+ "id": 205,
+ "type": "UNETLoader",
+ "pos": [
+ -960,
+ -1670
+ ],
+ "size": [
+ 400,
+ 110
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 373
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 316,
+ 324
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "models": [
+ {
+ "name": "FireRed-Image-Edit-1.1-transformer.safetensors",
+ "url": "https://huggingface.co/FireRedTeam/FireRed-Image-Edit-1.1-ComfyUI/resolve/main/FireRed-Image-Edit-1.1-transformer.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "FireRed-Image-Edit-1.1-transformer.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 206,
+ "type": "VAEEncode",
+ "pos": [
+ -390,
+ -810
+ ],
+ "size": [
+ 390,
+ 100
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "pixels",
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 368
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 300
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 303
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 207,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ 160,
+ -650
+ ],
+ "size": [
+ 400,
+ 100
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 364
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 323,
+ 330,
+ 336
+ ]
+ }
+ ],
+ "title": "Enable Lightning LoRA?",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 208,
+ "type": "TextEncodeQwenImageEditPlus",
+ "pos": [
+ -480,
+ -1690
+ ],
+ "size": [
+ 470,
+ 370
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 296
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "shape": 7,
+ "type": "VAE",
+ "link": 298
+ },
+ {
+ "localized_name": "image1",
+ "name": "image1",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 369
+ },
+ {
+ "localized_name": "image2",
+ "name": "image2",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 355
+ },
+ {
+ "localized_name": "image3",
+ "name": "image3",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 357
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": 359
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 312
+ ]
+ }
+ ],
+ "title": "TextEncodeQwenImageEditPlus (Positive)",
+ "properties": {
+ "Node name for S&R": "TextEncodeQwenImageEditPlus",
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 209,
+ "type": "TextEncodeQwenImageEditPlus",
+ "pos": [
+ -470,
+ -1240
+ ],
+ "size": [
+ 460,
+ 290
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 297
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "shape": 7,
+ "type": "VAE",
+ "link": 299
+ },
+ {
+ "localized_name": "image1",
+ "name": "image1",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 370
+ },
+ {
+ "localized_name": "image2",
+ "name": "image2",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 356
+ },
+ {
+ "localized_name": "image3",
+ "name": "image3",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 358
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 313
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TextEncodeQwenImageEditPlus",
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#323",
+ "bgcolor": "#535"
+ },
+ {
+ "id": 210,
+ "type": "KSampler",
+ "pos": [
+ 1010,
+ -1340
+ ],
+ "size": [
+ 270,
+ 480
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 295
},
{
"localized_name": "positive",
"name": "positive",
"type": "CONDITIONING",
- "link": 98
+ "link": 312
},
{
"localized_name": "negative",
"name": "negative",
"type": "CONDITIONING",
- "link": 99
+ "link": 313
},
{
"localized_name": "latent_image",
"name": "latent_image",
"type": "LATENT",
- "link": 160
+ "link": 303
},
{
"localized_name": "seed",
@@ -1015,7 +1532,7 @@
"widget": {
"name": "seed"
},
- "link": null
+ "link": 372
},
{
"localized_name": "steps",
@@ -1024,7 +1541,7 @@
"widget": {
"name": "steps"
},
- "link": null
+ "link": 345
},
{
"localized_name": "cfg",
@@ -1033,7 +1550,7 @@
"widget": {
"name": "cfg"
},
- "link": null
+ "link": 335
},
{
"localized_name": "sampler_name",
@@ -1068,559 +1585,65 @@
"localized_name": "LATENT",
"name": "LATENT",
"type": "LATENT",
- "slot_index": 0,
"links": [
- 116
+ 273
]
}
],
"properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
"Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
+ "secondTabWidth": 65
},
"widgets_values": [
- 584027519362099,
- "randomize",
+ 43,
+ "fixed",
+ 40,
4,
- 1,
- "uni_pc",
+ "euler",
"simple",
1
]
},
{
- "id": 224,
- "type": "MarkdownNote",
+ "id": 211,
+ "type": "VAEDecode",
"pos": [
- 420,
- -160
+ 1440,
+ -1340
],
"size": [
- 310,
- 110
+ 230,
+ 100
],
- "flags": {},
- "order": 0,
- "mode": 0,
- "inputs": [],
- "outputs": [],
- "title": "About Video Size",
- "properties": {},
- "widgets_values": [
- "| Model | 480P | 720P |\n| ------------------------------------------------------------ | ---- | ---- |\n| [VACE-1.3B](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B) | ✅ | ❌ |\n| [VACE-14B](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B) | ✅ | ✅ |"
- ],
- "color": "#432",
- "bgcolor": "#000"
- },
- {
- "id": 223,
- "type": "MarkdownNote",
- "pos": [
- 770,
- -210
- ],
- "size": [
- 303.90106201171875,
- 158.5415802001953
- ],
- "flags": {},
- "order": 1,
- "mode": 0,
- "inputs": [],
- "outputs": [],
- "title": "KSampler Setting",
- "properties": {},
- "widgets_values": [
- "## Default\n\n- steps:20\n- cfg:6.0\n\n## For CausVid LoRA\n\n- steps: 2-4\n- cfg: 1.0\n\n"
- ],
- "color": "#432",
- "bgcolor": "#000"
- },
- {
- "id": 6,
- "type": "CLIPTextEncode",
- "pos": [
- -80,
- 60
- ],
- "size": [
- 420,
- 280
- ],
- "flags": {},
- "order": 7,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "clip",
- "name": "clip",
- "type": "CLIP",
- "link": 74
- },
- {
- "localized_name": "text",
- "name": "text",
- "type": "STRING",
- "widget": {
- "name": "text"
- },
- "link": null
- }
- ],
- "outputs": [
- {
- "localized_name": "CONDITIONING",
- "name": "CONDITIONING",
- "type": "CONDITIONING",
- "slot_index": 0,
- "links": [
- 96
- ]
- }
- ],
- "title": "CLIP Text Encode (Positive Prompt)",
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "CLIPTextEncode",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
+ "flags": {
+ "collapsed": false
},
- "widgets_values": [
- ""
- ],
- "color": "#232",
- "bgcolor": "#353"
- },
- {
- "id": 140,
- "type": "UNETLoader",
- "pos": [
- -505.8336486816406,
- 88.22794342041016
- ],
- "size": [
- 360,
- 82
- ],
- "flags": {},
- "order": 2,
+ "order": 18,
"mode": 0,
"inputs": [
{
- "localized_name": "unet_name",
- "name": "unet_name",
- "type": "COMBO",
- "widget": {
- "name": "unet_name"
- },
- "link": 363
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 273
},
{
- "localized_name": "weight_dtype",
- "name": "weight_dtype",
- "type": "COMBO",
- "widget": {
- "name": "weight_dtype"
- },
- "link": null
- }
- ],
- "outputs": [
- {
- "localized_name": "MODEL",
- "name": "MODEL",
- "type": "MODEL",
- "slot_index": 0,
- "links": [
- 248
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "UNETLoader",
- "models": [
- {
- "name": "wan2.1_vace_14B_fp16.safetensors",
- "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/wan2.1_vace_14B_fp16.safetensors",
- "directory": "diffusion_models"
- }
- ],
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
- },
- "widgets_values": [
- "wan2.1_vace_14B_fp16.safetensors",
- "fp8_e4m3fn_fast"
- ]
- },
- {
- "id": 154,
- "type": "LoraLoaderModelOnly",
- "pos": [
- -505.8336486816406,
- 228.2279510498047
- ],
- "size": [
- 360,
- 85.11004638671875
- ],
- "flags": {},
- "order": 6,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "model",
- "name": "model",
- "type": "MODEL",
- "link": 248
- },
- {
- "localized_name": "lora_name",
- "name": "lora_name",
- "type": "COMBO",
- "widget": {
- "name": "lora_name"
- },
- "link": 364
- },
- {
- "localized_name": "strength_model",
- "name": "strength_model",
- "type": "FLOAT",
- "widget": {
- "name": "strength_model"
- },
- "link": null
- }
- ],
- "outputs": [
- {
- "localized_name": "MODEL",
- "name": "MODEL",
- "type": "MODEL",
- "links": [
- 279
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "LoraLoaderModelOnly",
- "models": [
- {
- "name": "Wan21_CausVid_14B_T2V_lora_rank32.safetensors",
- "url": "https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors",
- "directory": "loras"
- }
- ],
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
- },
- "widgets_values": [
- "Wan21_CausVid_14B_T2V_lora_rank32.safetensors",
- 0.30000000000000004
- ]
- },
- {
- "id": 38,
- "type": "CLIPLoader",
- "pos": [
- -499.14141845703125,
- 368.0911865234375
- ],
- "size": [
- 360,
- 106
- ],
- "flags": {},
- "order": 3,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "clip_name",
- "name": "clip_name",
- "type": "COMBO",
- "widget": {
- "name": "clip_name"
- },
- "link": 365
- },
- {
- "localized_name": "type",
- "name": "type",
- "type": "COMBO",
- "widget": {
- "name": "type"
- },
- "link": null
- },
- {
- "localized_name": "device",
- "name": "device",
- "shape": 7,
- "type": "COMBO",
- "widget": {
- "name": "device"
- },
- "link": null
- }
- ],
- "outputs": [
- {
- "localized_name": "CLIP",
- "name": "CLIP",
- "type": "CLIP",
- "slot_index": 0,
- "links": [
- 74,
- 75
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "CLIPLoader",
- "models": [
- {
- "name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
- "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true",
- "directory": "text_encoders"
- }
- ],
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
- },
- "widgets_values": [
- "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
- "wan",
- "default"
- ]
- },
- {
- "id": 39,
- "type": "VAELoader",
- "pos": [
- -498.5298156738281,
- 517.2576293945312
- ],
- "size": [
- 360,
- 60
- ],
- "flags": {},
- "order": 4,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "vae_name",
- "name": "vae_name",
- "type": "COMBO",
- "widget": {
- "name": "vae_name"
- },
- "link": 366
- }
- ],
- "outputs": [
- {
- "localized_name": "VAE",
- "name": "VAE",
+ "localized_name": "vae",
+ "name": "vae",
"type": "VAE",
- "slot_index": 0,
- "links": [
- 76,
- 101
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "VAELoader",
- "models": [
- {
- "name": "wan_2.1_vae.safetensors",
- "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors",
- "directory": "vae"
- }
- ],
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
- },
- "widgets_values": [
- "wan_2.1_vae.safetensors"
- ]
- },
- {
- "id": 221,
- "type": "MarkdownNote",
- "pos": [
- 380,
- 1090
- ],
- "size": [
- 480,
- 170
- ],
- "flags": {},
- "order": 5,
- "mode": 0,
- "inputs": [],
- "outputs": [],
- "title": "[EN] About video mask",
- "properties": {
- "widget_ue_connectable": {}
- },
- "widgets_values": [
- "Currently, it's difficult to perfectly draw dynamic masks for different frames using only core nodes. However, to avoid requiring users to install additional custom nodes, our templates only use core nodes. You can refer to this implementation idea to achieve video inpainting.\n\nYou can use KJNode’s Points Editor and Sam2Segmentation to create some dynamic mask functions.\n\nCustom node links:\n- [ComfyUI-KJNodes](https://github.com/kijai/ComfyUI-KJNodes)\n- [ComfyUI-segment-anything-2](https://github.com/kijai/ComfyUI-segment-anything-2)"
- ],
- "color": "#432",
- "bgcolor": "#000"
- },
- {
- "id": 7,
- "type": "CLIPTextEncode",
- "pos": [
- -80,
- 390
- ],
- "size": [
- 425.27801513671875,
- 180.6060791015625
- ],
- "flags": {},
- "order": 8,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "clip",
- "name": "clip",
- "type": "CLIP",
- "link": 75
- },
- {
- "localized_name": "text",
- "name": "text",
- "type": "STRING",
- "widget": {
- "name": "text"
- },
- "link": null
- }
- ],
- "outputs": [
- {
- "localized_name": "CONDITIONING",
- "name": "CONDITIONING",
- "type": "CONDITIONING",
- "slot_index": 0,
- "links": [
- 97
- ]
- }
- ],
- "title": "CLIP Text Encode (Negative Prompt)",
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "CLIPTextEncode",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {}
- },
- "widgets_values": [
- "过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝,"
- ],
- "color": "#223",
- "bgcolor": "#335"
- },
- {
- "id": 229,
- "type": "ImageFromBatch",
- "pos": [
- -510,
- 800
- ],
- "size": [
- 270,
- 82
- ],
- "flags": {},
- "order": 25,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "image",
- "name": "image",
- "type": "IMAGE",
- "link": 358
- },
- {
- "localized_name": "batch_index",
- "name": "batch_index",
- "type": "INT",
- "widget": {
- "name": "batch_index"
- },
- "link": null
- },
- {
- "localized_name": "length",
- "name": "length",
- "type": "INT",
- "widget": {
- "name": "length"
- },
- "link": null
+ "link": 314
}
],
"outputs": [
@@ -1628,300 +1651,115 @@
"localized_name": "IMAGE",
"name": "IMAGE",
"type": "IMAGE",
+ "slot_index": 0,
"links": [
- 359,
- 360
+ 292
]
}
],
"properties": {
+ "Node name for S&R": "VAEDecode",
"cnr_id": "comfy-core",
- "ver": "0.13.0",
- "Node name for S&R": "ImageFromBatch"
- },
- "widgets_values": [
- 0,
- 81
- ]
- },
- {
- "id": 49,
- "type": "WanVaceToVideo",
- "pos": [
- 400,
- 200
- ],
- "size": [
- 315,
- 254
- ],
- "flags": {},
- "order": 12,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "positive",
- "name": "positive",
- "type": "CONDITIONING",
- "link": 96
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
},
- {
- "localized_name": "negative",
- "name": "negative",
- "type": "CONDITIONING",
- "link": 97
- },
- {
- "localized_name": "vae",
- "name": "vae",
- "type": "VAE",
- "link": 101
- },
- {
- "localized_name": "control_video",
- "name": "control_video",
- "shape": 7,
- "type": "IMAGE",
- "link": 344
- },
- {
- "localized_name": "control_masks",
- "name": "control_masks",
- "shape": 7,
- "type": "MASK",
- "link": 349
- },
- {
- "localized_name": "reference_image",
- "name": "reference_image",
- "shape": 7,
- "type": "IMAGE",
- "link": 361
- },
- {
- "localized_name": "width",
- "name": "width",
- "type": "INT",
- "widget": {
- "name": "width"
- },
- "link": 355
- },
- {
- "localized_name": "height",
- "name": "height",
- "type": "INT",
- "widget": {
- "name": "height"
- },
- "link": 356
- },
- {
- "localized_name": "length",
- "name": "length",
- "type": "INT",
- "widget": {
- "name": "length"
- },
- "link": null
- },
- {
- "localized_name": "batch_size",
- "name": "batch_size",
- "type": "INT",
- "widget": {
- "name": "batch_size"
- },
- "link": null
- },
- {
- "localized_name": "strength",
- "name": "strength",
- "type": "FLOAT",
- "widget": {
- "name": "strength"
- },
- "link": null
- }
- ],
- "outputs": [
- {
- "localized_name": "positive",
- "name": "positive",
- "type": "CONDITIONING",
- "links": [
- 98
- ]
- },
- {
- "localized_name": "negative",
- "name": "negative",
- "type": "CONDITIONING",
- "links": [
- 99
- ]
- },
- {
- "localized_name": "latent",
- "name": "latent",
- "type": "LATENT",
- "links": [
- 160
- ]
- },
- {
- "localized_name": "trim_latent",
- "name": "trim_latent",
- "type": "INT",
- "links": [
- 115
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.34",
- "Node name for S&R": "WanVaceToVideo",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
- "secondTabWidth": 65,
- "widget_ue_connectable": {
- "width": true,
- "height": true,
- "length": true
- }
- },
- "widgets_values": [
- 720,
- 720,
- 81,
- 1,
- 1
- ]
+ "secondTabWidth": 65
+ }
},
{
- "id": 211,
- "type": "GetImageSize",
+ "id": 212,
+ "type": "ResizeImageMaskNode",
"pos": [
- 70,
- 800
+ -900,
+ -810
],
"size": [
- 190,
- 66
- ],
- "flags": {
- "collapsed": false
- },
- "order": 20,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "image",
- "name": "image",
- "type": "IMAGE",
- "link": 359
- }
- ],
- "outputs": [
- {
- "localized_name": "width",
- "name": "width",
- "type": "INT",
- "links": null
- },
- {
- "localized_name": "height",
- "name": "height",
- "type": "INT",
- "links": null
- },
- {
- "localized_name": "batch_size",
- "name": "batch_size",
- "type": "INT",
- "links": [
- 340,
- 346
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.40",
- "Node name for S&R": "GetImageSize"
- },
- "widgets_values": []
- },
- {
- "id": 210,
- "type": "GetVideoComponents",
- "pos": [
- -510,
- 690
- ],
- "size": [
- 193.530859375,
- 66
+ 280,
+ 110
],
"flags": {},
"order": 19,
"mode": 0,
"inputs": [
{
- "localized_name": "video",
- "name": "video",
- "type": "VIDEO",
- "link": 336
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 371
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "resize_type.megapixels",
+ "name": "resize_type.megapixels",
+ "type": "FLOAT",
+ "widget": {
+ "name": "resize_type.megapixels"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
}
],
"outputs": [
{
- "localized_name": "images",
- "name": "images",
- "type": "IMAGE",
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
"links": [
- 358
- ]
- },
- {
- "localized_name": "audio",
- "name": "audio",
- "type": "AUDIO",
- "links": [
- 362
- ]
- },
- {
- "localized_name": "fps",
- "name": "fps",
- "type": "FLOAT",
- "links": [
- 353
+ 368,
+ 369,
+ 370
]
}
],
"properties": {
+ "Node name for S&R": "ResizeImageMaskNode",
"cnr_id": "comfy-core",
- "ver": "0.3.40",
- "Node name for S&R": "GetVideoComponents"
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ }
},
- "widgets_values": []
+ "widgets_values": [
+ "scale total pixels",
+ 1,
+ "lanczos"
+ ]
}
],
"groups": [
{
"id": 1,
- "title": "Step1 - Load models here",
+ "title": "Model",
"bounding": [
- -540,
- -30,
- 430,
- 620
+ -990,
+ -1770,
+ 460,
+ 870
],
"color": "#3f789e",
"font_size": 24,
@@ -1931,88 +1769,36 @@
"id": 2,
"title": "Prompt",
"bounding": [
- -90,
- -30,
- 450,
- 620
+ -500,
+ -1770,
+ 510,
+ 870
],
"color": "#3f789e",
"font_size": 24,
"flags": {}
},
{
- "id": 3,
- "title": "Sampling & Decoding",
+ "id": 7,
+ "title": "Original",
"bounding": [
- 380,
- -30,
- 720,
- 620
+ 40,
+ -1770,
+ 530,
+ 410
],
"color": "#3f789e",
"font_size": 24,
"flags": {}
},
{
- "id": 10,
- "title": "Repeat Mask Batch",
+ "id": 8,
+ "title": "Lightning LoRA",
"bounding": [
- -90,
- 910,
- 450,
- 460
- ],
- "color": "#3f789e",
- "font_size": 24,
- "flags": {}
- },
- {
- "id": 21,
- "title": "Get video info",
- "bounding": [
- -540,
- 610,
- 900,
- 290
- ],
- "color": "#3f789e",
- "font_size": 24,
- "flags": {}
- },
- {
- "id": 22,
- "title": "Composite video & masks",
- "bounding": [
- 380,
- 610,
- 720,
- 420
- ],
- "color": "#3f789e",
- "font_size": 24,
- "flags": {}
- },
- {
- "id": 23,
- "title": "Step4 - Set video size & length",
- "bounding": [
- 390,
- 130,
- 360,
- 340
- ],
- "color": "#A88",
- "font_size": 24,
- "flags": {}
- },
- {
- "id": 25,
- "title": "14B",
- "bounding": [
- -520,
- 10,
- 380,
- 308.7100524902344
+ 40,
+ -1330,
+ 560,
+ 610
],
"color": "#3f789e",
"font_size": 24,
@@ -2021,367 +1807,343 @@
],
"links": [
{
- "id": 116,
- "origin_id": 3,
+ "id": 326,
+ "origin_id": 194,
"origin_slot": 0,
- "target_id": 58,
- "target_slot": 0,
- "type": "LATENT"
- },
- {
- "id": 115,
- "origin_id": 49,
- "origin_slot": 3,
- "target_id": 58,
- "target_slot": 1,
- "type": "INT"
- },
- {
- "id": 117,
- "origin_id": 58,
- "origin_slot": 0,
- "target_id": 8,
- "target_slot": 0,
- "type": "LATENT"
- },
- {
- "id": 76,
- "origin_id": 39,
- "origin_slot": 0,
- "target_id": 8,
- "target_slot": 1,
- "type": "VAE"
- },
- {
- "id": 279,
- "origin_id": 154,
- "origin_slot": 0,
- "target_id": 48,
+ "target_id": 193,
"target_slot": 0,
"type": "MODEL"
},
{
- "id": 352,
- "origin_id": 219,
+ "id": 324,
+ "origin_id": 205,
"origin_slot": 0,
- "target_id": 216,
+ "target_id": 194,
"target_slot": 0,
- "type": "MASK"
+ "type": "MODEL"
},
{
- "id": 340,
- "origin_id": 211,
- "origin_slot": 2,
- "target_id": 213,
+ "id": 325,
+ "origin_id": 204,
+ "origin_slot": 0,
+ "target_id": 194,
"target_slot": 1,
- "type": "INT"
+ "type": "MODEL"
},
{
- "id": 96,
- "origin_id": 6,
+ "id": 323,
+ "origin_id": 207,
"origin_slot": 0,
- "target_id": 49,
- "target_slot": 0,
- "type": "CONDITIONING"
- },
- {
- "id": 97,
- "origin_id": 7,
- "origin_slot": 0,
- "target_id": 49,
- "target_slot": 1,
- "type": "CONDITIONING"
- },
- {
- "id": 101,
- "origin_id": 39,
- "origin_slot": 0,
- "target_id": 49,
+ "target_id": 194,
"target_slot": 2,
- "type": "VAE"
+ "type": "BOOLEAN"
},
{
- "id": 344,
- "origin_id": 208,
+ "id": 294,
+ "origin_id": 193,
"origin_slot": 0,
- "target_id": 49,
- "target_slot": 3,
- "type": "IMAGE"
- },
- {
- "id": 349,
- "origin_id": 130,
- "origin_slot": 0,
- "target_id": 49,
- "target_slot": 4,
- "type": "MASK"
- },
- {
- "id": 139,
- "origin_id": 8,
- "origin_slot": 0,
- "target_id": 68,
+ "target_id": 196,
"target_slot": 0,
- "type": "IMAGE"
- },
- {
- "id": 353,
- "origin_id": 210,
- "origin_slot": 2,
- "target_id": 68,
- "target_slot": 2,
- "type": "FLOAT"
+ "type": "MODEL"
},
{
"id": 333,
- "origin_id": 213,
+ "origin_id": 199,
"origin_slot": 0,
- "target_id": 208,
+ "target_id": 197,
"target_slot": 0,
- "type": "IMAGE"
+ "type": "FLOAT"
},
{
"id": 334,
- "origin_id": 216,
+ "origin_id": 200,
"origin_slot": 0,
- "target_id": 208,
+ "target_id": 197,
"target_slot": 1,
- "type": "IMAGE"
+ "type": "FLOAT"
},
{
- "id": 341,
- "origin_id": 208,
+ "id": 336,
+ "origin_id": 207,
"origin_slot": 0,
- "target_id": 214,
- "target_slot": 0,
- "type": "IMAGE"
+ "target_id": 197,
+ "target_slot": 2,
+ "type": "BOOLEAN"
},
{
- "id": 201,
- "origin_id": 111,
+ "id": 329,
+ "origin_id": 195,
"origin_slot": 0,
- "target_id": 129,
+ "target_id": 201,
"target_slot": 0,
- "type": "IMAGE"
+ "type": "INT"
},
{
- "id": 346,
- "origin_id": 211,
- "origin_slot": 2,
- "target_id": 129,
+ "id": 337,
+ "origin_id": 198,
+ "origin_slot": 0,
+ "target_id": 201,
"target_slot": 1,
"type": "INT"
},
{
- "id": 202,
- "origin_id": 129,
+ "id": 330,
+ "origin_id": 207,
"origin_slot": 0,
- "target_id": 130,
- "target_slot": 0,
- "type": "IMAGE"
+ "target_id": 201,
+ "target_slot": 2,
+ "type": "BOOLEAN"
},
{
- "id": 280,
- "origin_id": 48,
+ "id": 297,
+ "origin_id": 203,
"origin_slot": 0,
- "target_id": 3,
+ "target_id": 209,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 299,
+ "origin_id": 202,
+ "origin_slot": 0,
+ "target_id": 209,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 316,
+ "origin_id": 205,
+ "origin_slot": 0,
+ "target_id": 204,
"target_slot": 0,
"type": "MODEL"
},
{
- "id": 98,
- "origin_id": 49,
+ "id": 296,
+ "origin_id": 203,
"origin_slot": 0,
- "target_id": 3,
+ "target_id": 208,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 298,
+ "origin_id": 202,
+ "origin_slot": 0,
+ "target_id": 208,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 300,
+ "origin_id": 202,
+ "origin_slot": 0,
+ "target_id": 206,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 295,
+ "origin_id": 196,
+ "origin_slot": 0,
+ "target_id": 210,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 312,
+ "origin_id": 208,
+ "origin_slot": 0,
+ "target_id": 210,
"target_slot": 1,
"type": "CONDITIONING"
},
{
- "id": 99,
- "origin_id": 49,
- "origin_slot": 1,
- "target_id": 3,
+ "id": 313,
+ "origin_id": 209,
+ "origin_slot": 0,
+ "target_id": 210,
"target_slot": 2,
"type": "CONDITIONING"
},
{
- "id": 160,
- "origin_id": 49,
- "origin_slot": 2,
- "target_id": 3,
+ "id": 303,
+ "origin_id": 206,
+ "origin_slot": 0,
+ "target_id": 210,
"target_slot": 3,
"type": "LATENT"
},
{
- "id": 74,
- "origin_id": 38,
+ "id": 345,
+ "origin_id": 201,
"origin_slot": 0,
- "target_id": 6,
- "target_slot": 0,
- "type": "CLIP"
- },
- {
- "id": 248,
- "origin_id": 140,
- "origin_slot": 0,
- "target_id": 154,
- "target_slot": 0,
- "type": "MODEL"
- },
- {
- "id": 75,
- "origin_id": 38,
- "origin_slot": 0,
- "target_id": 7,
- "target_slot": 0,
- "type": "CLIP"
- },
- {
- "id": 351,
- "origin_id": -10,
- "origin_slot": 0,
- "target_id": 219,
- "target_slot": 0,
- "type": "MASK"
+ "target_id": 210,
+ "target_slot": 5,
+ "type": "INT"
},
{
"id": 335,
- "origin_id": -10,
+ "origin_id": 197,
"origin_slot": 0,
- "target_id": 208,
- "target_slot": 2,
- "type": "MASK"
- },
- {
- "id": 345,
- "origin_id": -10,
- "origin_slot": 0,
- "target_id": 111,
- "target_slot": 0,
- "type": "MASK"
- },
- {
- "id": 336,
- "origin_id": -10,
- "origin_slot": 1,
"target_id": 210,
- "target_slot": 0,
- "type": "VIDEO"
+ "target_slot": 6,
+ "type": "FLOAT"
},
{
- "id": 129,
- "origin_id": 68,
+ "id": 273,
+ "origin_id": 210,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 314,
+ "origin_id": 202,
+ "origin_slot": 0,
+ "target_id": 211,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 292,
+ "origin_id": 211,
"origin_slot": 0,
"target_id": -20,
"target_slot": 0,
- "type": "VIDEO"
+ "type": "IMAGE"
},
{
"id": 355,
"origin_id": -10,
- "origin_slot": 2,
- "target_id": 49,
- "target_slot": 6,
- "type": "INT"
+ "origin_slot": 1,
+ "target_id": 208,
+ "target_slot": 3,
+ "type": "IMAGE"
},
{
"id": 356,
"origin_id": -10,
- "origin_slot": 3,
- "target_id": 49,
- "target_slot": 7,
- "type": "INT"
+ "origin_slot": 1,
+ "target_id": 209,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 357,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 208,
+ "target_slot": 4,
+ "type": "IMAGE"
},
{
"id": 358,
- "origin_id": 210,
- "origin_slot": 0,
- "target_id": 229,
- "target_slot": 0,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 209,
+ "target_slot": 4,
"type": "IMAGE"
},
{
"id": 359,
- "origin_id": 229,
- "origin_slot": 0,
- "target_id": 211,
- "target_slot": 0,
- "type": "IMAGE"
- },
- {
- "id": 360,
- "origin_id": 229,
- "origin_slot": 0,
- "target_id": 213,
- "target_slot": 0,
- "type": "IMAGE"
- },
- {
- "id": 361,
"origin_id": -10,
- "origin_slot": 4,
- "target_id": 49,
+ "origin_slot": 3,
+ "target_id": 208,
"target_slot": 5,
- "type": "IMAGE"
- },
- {
- "id": 362,
- "origin_id": 210,
- "origin_slot": 1,
- "target_id": 68,
- "target_slot": 1,
- "type": "AUDIO"
- },
- {
- "id": 363,
- "origin_id": -10,
- "origin_slot": 5,
- "target_id": 140,
- "target_slot": 0,
- "type": "COMBO"
+ "type": "STRING"
},
{
"id": 364,
"origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 207,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 368,
+ "origin_id": 212,
+ "origin_slot": 0,
+ "target_id": 206,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 369,
+ "origin_id": 212,
+ "origin_slot": 0,
+ "target_id": 208,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 370,
+ "origin_id": 212,
+ "origin_slot": 0,
+ "target_id": 209,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 371,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 212,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 372,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 210,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 373,
+ "origin_id": -10,
"origin_slot": 6,
- "target_id": 154,
- "target_slot": 1,
+ "target_id": 205,
+ "target_slot": 0,
"type": "COMBO"
},
{
- "id": 365,
+ "id": 374,
"origin_id": -10,
"origin_slot": 7,
- "target_id": 38,
+ "target_id": 203,
"target_slot": 0,
"type": "COMBO"
},
{
- "id": 366,
+ "id": 375,
"origin_id": -10,
"origin_slot": 8,
- "target_id": 39,
+ "target_id": 202,
"target_slot": 0,
"type": "COMBO"
+ },
+ {
+ "id": 376,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 204,
+ "target_slot": 1,
+ "type": "COMBO"
}
],
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Video generation and editing/Inpaint video"
+ "category": "Image generation and editing/Edit image",
+ "description": "Edits images via text instructions using FireRed Image Edit 1.1, a diffusion-based instruction-following editing model."
}
]
},
- "config": {},
"extra": {
- "workflowRendererVersion": "LG",
- "ds": {
- "scale": 0.8183828377358485,
- "offset": [
- 1215.8643989712405,
- 178.87024992690183
- ]
- }
- },
- "version": 0.4
-}
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Image Edit (Flux.2 Dev).json b/blueprints/Image Edit (Flux.2 Dev).json
new file mode 100644
index 000000000..92827bf17
--- /dev/null
+++ b/blueprints/Image Edit (Flux.2 Dev).json
@@ -0,0 +1,2050 @@
+{
+ "revision": 0,
+ "last_node_id": 139,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 139,
+ "type": "41b0c117-7470-454c-914e-b8742dc06d62",
+ "pos": [
+ -650,
+ 570
+ ],
+ "size": [
+ 400,
+ 0
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image",
+ "localized_name": "pixels",
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ },
+ {
+ "label": "enable_turbo_mode",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "turbo_lora",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "123",
+ "text"
+ ],
+ [
+ "129",
+ "unet_name"
+ ],
+ [
+ "124",
+ "clip_name"
+ ],
+ [
+ "121",
+ "vae_name"
+ ],
+ [
+ "138",
+ "value"
+ ],
+ [
+ "128",
+ "lora_name"
+ ],
+ [
+ "125",
+ "noise_seed"
+ ],
+ [
+ "125",
+ "control_after_generate"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "text": true,
+ "value": true,
+ "lora_name": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Image Edit (Flux.2 Dev)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "41b0c117-7470-454c-914e-b8742dc06d62",
+ "version": 1,
+ "state": {
+ "lastGroupId": 8,
+ "lastNodeId": 139,
+ "lastLinkId": 194,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Edit (Flux.2 Dev)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1520,
+ 400,
+ 151.744140625,
+ 180
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1240,
+ 420,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "fc74acd5-30a9-410b-abb5-4a4171ba3d25",
+ "name": "pixels",
+ "type": "IMAGE",
+ "linkIds": [
+ 126,
+ 169
+ ],
+ "localized_name": "pixels",
+ "label": "image",
+ "pos": [
+ -1388.255859375,
+ 420
+ ]
+ },
+ {
+ "id": "3e69affa-397b-4d52-82d7-68dfcef9e761",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 168
+ ],
+ "label": "prompt",
+ "pos": [
+ -1388.255859375,
+ 440
+ ]
+ },
+ {
+ "id": "2f016a8a-fb3e-4cb9-97f2-a991defe4fa2",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 177
+ ],
+ "pos": [
+ -1388.255859375,
+ 460
+ ]
+ },
+ {
+ "id": "799b9dc7-0c90-4b19-9a13-e01d896bea1f",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 178
+ ],
+ "pos": [
+ -1388.255859375,
+ 480
+ ]
+ },
+ {
+ "id": "e58a83c9-1b93-4378-9598-f24068820313",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 179
+ ],
+ "pos": [
+ -1388.255859375,
+ 500
+ ]
+ },
+ {
+ "id": "8335a4a9-0ce4-4e67-a641-1c9d7a762977",
+ "name": "value",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 191
+ ],
+ "label": "enable_turbo_mode",
+ "pos": [
+ -1388.255859375,
+ 520
+ ]
+ },
+ {
+ "id": "890b22b4-44a7-4707-912a-ca8b4ee7b7c9",
+ "name": "lora_name",
+ "type": "COMBO",
+ "linkIds": [
+ 192
+ ],
+ "label": "turbo_lora",
+ "pos": [
+ -1388.255859375,
+ 540
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "3eaa05d6-4960-4a7c-bf2a-8b585fbb7c9c",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 9
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1260,
+ 440
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 118,
+ "type": "Flux2Scheduler",
+ "pos": [
+ 540,
+ 430
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 188
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 170
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 172
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 132
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "Flux2Scheduler",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 20,
+ 1248,
+ 832
+ ]
+ },
+ {
+ "id": 119,
+ "type": "BasicGuider",
+ "pos": [
+ 530,
+ 120
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 185
+ },
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 166
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "slot_index": 0,
+ "links": [
+ 30
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "BasicGuider",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 120,
+ "type": "KSamplerSelect",
+ "pos": [
+ 530,
+ 270
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 19
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "KSamplerSelect",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "euler"
+ ]
+ },
+ {
+ "id": 121,
+ "type": "VAELoader",
+ "pos": [
+ -970,
+ 390
+ ],
+ "size": [
+ 300,
+ 110
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 179
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 127,
+ 159
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "full_encoder_small_decoder.safetensors",
+ "url": "https://huggingface.co/black-forest-labs/FLUX.2-small-decoder/resolve/main/full_encoder_small_decoder.safetensors",
+ "directory": "vae"
+ }
+ ]
+ },
+ "widgets_values": [
+ "full_encoder_small_decoder.safetensors"
+ ]
+ },
+ {
+ "id": 122,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 790,
+ -50
+ ],
+ "size": [
+ 280,
+ 170
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 37
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 30
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 19
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 132
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 161
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 24
+ ]
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 123,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -630,
+ -50
+ ],
+ "size": [
+ 430,
+ 360
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 117
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 168
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 41
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 124,
+ "type": "CLIPLoader",
+ "pos": [
+ -970,
+ 160
+ ],
+ "size": [
+ 300,
+ 150
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 178
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 117
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "mistral_3_small_flux2_bf16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/flux2-dev/resolve/main/split_files/text_encoders/mistral_3_small_flux2_bf16.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "mistral_3_small_flux2_bf16.safetensors",
+ "flux2",
+ "default"
+ ]
+ },
+ {
+ "id": 125,
+ "type": "RandomNoise",
+ "pos": [
+ 530,
+ -50
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 37
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "RandomNoise",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 342971778941390,
+ "randomize"
+ ]
+ },
+ {
+ "id": 126,
+ "type": "VAEDecode",
+ "pos": [
+ 830,
+ 410
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 24
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 159
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 9
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 127,
+ "type": "FluxGuidance",
+ "pos": [
+ -520,
+ 390
+ ],
+ "size": [
+ 320,
+ 110
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 41
+ },
+ {
+ "localized_name": "guidance",
+ "name": "guidance",
+ "type": "FLOAT",
+ "widget": {
+ "name": "guidance"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 144
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "FluxGuidance",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 4
+ ],
+ "color": "#233",
+ "bgcolor": "#355"
+ },
+ {
+ "id": 128,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ -150,
+ 200
+ ],
+ "size": [
+ 300,
+ 140
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 181
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": 192
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 183
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "Flux_2-Turbo-LoRA_comfyui.safetensors",
+ "url": "https://huggingface.co/ByteZSzn/Flux.2-Turbo-ComfyUI/resolve/main/Flux_2-Turbo-LoRA_comfyui.safetensors",
+ "directory": "loras"
+ }
+ ]
+ },
+ "widgets_values": [
+ "Flux_2-Turbo-LoRA_comfyui.safetensors",
+ 1
+ ]
+ },
+ {
+ "id": 129,
+ "type": "UNETLoader",
+ "pos": [
+ -970,
+ -40
+ ],
+ "size": [
+ 300,
+ 110
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 177
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 181,
+ 184
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "UNETLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "flux2_dev_fp8mixed.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/flux2-dev/resolve/main/split_files/diffusion_models/flux2_dev_fp8mixed.safetensors",
+ "directory": "diffusion_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "flux2_dev_fp8mixed.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 130,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 220,
+ 10
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 184
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 183
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 190
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 185
+ ]
+ }
+ ],
+ "title": "Switch(model)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 131,
+ "type": "PrimitiveInt",
+ "pos": [
+ -150,
+ 430
+ ],
+ "size": [
+ 300,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 186
+ ]
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 8,
+ "fixed"
+ ]
+ },
+ {
+ "id": 132,
+ "type": "PrimitiveInt",
+ "pos": [
+ -150,
+ -50
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 187
+ ]
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 20,
+ "fixed"
+ ]
+ },
+ {
+ "id": 133,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 220,
+ 280
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 187
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 186
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 189
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 188
+ ]
+ }
+ ],
+ "title": "Switch(steps)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 134,
+ "type": "EmptyFlux2LatentImage",
+ "pos": [
+ 530,
+ 790
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 171
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 173
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 161
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "EmptyFlux2LatentImage",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1248,
+ 832,
+ 1
+ ]
+ },
+ {
+ "id": 135,
+ "type": "GetImageSize",
+ "pos": [
+ -100,
+ 810
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 169
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 170,
+ 171
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 172,
+ 173
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "GetImageSize",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 136,
+ "type": "VAEEncode",
+ "pos": [
+ -910,
+ 600
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "pixels",
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 126
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 127
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 125
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 137,
+ "type": "ReferenceLatent",
+ "pos": [
+ -470,
+ 580
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 144
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "shape": 7,
+ "type": "LATENT",
+ "link": 125
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 166
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ReferenceLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 138,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -130,
+ 640
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 191
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 189,
+ 190
+ ]
+ }
+ ],
+ "title": "Enable 8 steps lora",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveBoolean",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Models",
+ "bounding": [
+ -980,
+ -120,
+ 320,
+ 640
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Custom sampler",
+ "bounding": [
+ 520,
+ -120,
+ 590,
+ 740
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Image size",
+ "bounding": [
+ 510,
+ 690,
+ 590,
+ 290
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Prompt",
+ "bounding": [
+ -640,
+ -120,
+ 450,
+ 640
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Original",
+ "bounding": [
+ -160,
+ -120,
+ 340,
+ 230
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 8,
+ "title": "8 Steps LoRA",
+ "bounding": [
+ -160,
+ 130,
+ 340,
+ 430
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 41,
+ "origin_id": 123,
+ "origin_slot": 0,
+ "target_id": 127,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 144,
+ "origin_id": 127,
+ "origin_slot": 0,
+ "target_id": 137,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 125,
+ "origin_id": 136,
+ "origin_slot": 0,
+ "target_id": 137,
+ "target_slot": 1,
+ "type": "LATENT"
+ },
+ {
+ "id": 37,
+ "origin_id": 125,
+ "origin_slot": 0,
+ "target_id": 122,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 30,
+ "origin_id": 119,
+ "origin_slot": 0,
+ "target_id": 122,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 19,
+ "origin_id": 120,
+ "origin_slot": 0,
+ "target_id": 122,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 132,
+ "origin_id": 118,
+ "origin_slot": 0,
+ "target_id": 122,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 161,
+ "origin_id": 134,
+ "origin_slot": 0,
+ "target_id": 122,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 24,
+ "origin_id": 122,
+ "origin_slot": 0,
+ "target_id": 126,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 159,
+ "origin_id": 121,
+ "origin_slot": 0,
+ "target_id": 126,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 117,
+ "origin_id": 124,
+ "origin_slot": 0,
+ "target_id": 123,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 127,
+ "origin_id": 121,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 126,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 9,
+ "origin_id": 126,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 166,
+ "origin_id": 137,
+ "origin_slot": 0,
+ "target_id": 119,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 168,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 123,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 169,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 135,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 170,
+ "origin_id": 135,
+ "origin_slot": 0,
+ "target_id": 118,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 171,
+ "origin_id": 135,
+ "origin_slot": 0,
+ "target_id": 134,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 172,
+ "origin_id": 135,
+ "origin_slot": 1,
+ "target_id": 118,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 173,
+ "origin_id": 135,
+ "origin_slot": 1,
+ "target_id": 134,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 177,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 129,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 178,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 124,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 179,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 121,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 181,
+ "origin_id": 129,
+ "origin_slot": 0,
+ "target_id": 128,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 183,
+ "origin_id": 128,
+ "origin_slot": 0,
+ "target_id": 130,
+ "target_slot": 1,
+ "type": "MODEL"
+ },
+ {
+ "id": 184,
+ "origin_id": 129,
+ "origin_slot": 0,
+ "target_id": 130,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 185,
+ "origin_id": 130,
+ "origin_slot": 0,
+ "target_id": 119,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 186,
+ "origin_id": 131,
+ "origin_slot": 0,
+ "target_id": 133,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 187,
+ "origin_id": 132,
+ "origin_slot": 0,
+ "target_id": 133,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 188,
+ "origin_id": 133,
+ "origin_slot": 0,
+ "target_id": 118,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 189,
+ "origin_id": 138,
+ "origin_slot": 0,
+ "target_id": 133,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 190,
+ "origin_id": 138,
+ "origin_slot": 0,
+ "target_id": 130,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 191,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 138,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 192,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 128,
+ "target_slot": 1,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Edit image",
+ "description": "Edits an image from text instructions using Flux.2 [dev], with guidance, schedulers, and optional Turbo LoRAs."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Image Edit (Flux.2 Klein 4B).json b/blueprints/Image Edit (Flux.2 Klein 4B).json
index 78bbb7414..7f6fa7a4b 100644
--- a/blueprints/Image Edit (Flux.2 Klein 4B).json
+++ b/blueprints/Image Edit (Flux.2 Klein 4B).json
@@ -128,7 +128,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Image Edit (Flux.2 Klein 4B)",
+ "name": "Image Edit (Flux.2 Klein 4B)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1472,7 +1472,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image generation and editing/Edit image"
+ "category": "Image generation and editing/Edit image",
+ "description": "Edits an input image via text instructions using FLUX.2 [klein] 4B."
},
{
"id": "6007e698-2ebd-4917-84d8-299b35d7b7ab",
@@ -1821,7 +1822,8 @@
],
"extra": {
"workflowRendererVersion": "LG"
- }
+ },
+ "description": "Applies reference image conditioning for style/identity transfer (Flux.2 Klein 4B)."
}
]
},
diff --git a/blueprints/Image Edit (LongCat Image Edit).json b/blueprints/Image Edit (LongCat Image Edit).json
new file mode 100644
index 000000000..de1c155a2
--- /dev/null
+++ b/blueprints/Image Edit (LongCat Image Edit).json
@@ -0,0 +1,1428 @@
+{
+ "revision": 0,
+ "last_node_id": 176,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 176,
+ "type": "372a02a0-a79c-40b4-84a9-34f246fe0e9c",
+ "pos": [
+ 967.0861152473078,
+ 4977.534165136897
+ ],
+ "size": [
+ 330,
+ 380
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": null
+ },
+ {
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "27",
+ "prompt"
+ ],
+ [
+ "33",
+ "steps"
+ ],
+ [
+ "33",
+ "cfg"
+ ],
+ [
+ "33",
+ "seed"
+ ],
+ [
+ "34",
+ "unet_name"
+ ],
+ [
+ "38",
+ "clip_name"
+ ],
+ [
+ "26",
+ "vae_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [],
+ "title": "Image Edit (LongCat Image Edit)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "372a02a0-a79c-40b4-84a9-34f246fe0e9c",
+ "version": 1,
+ "state": {
+ "lastGroupId": 8,
+ "lastNodeId": 176,
+ "lastLinkId": 376,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Edit (LongCat Image Edit)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -750,
+ 380,
+ 120,
+ 200
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1680,
+ 340,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "616c4f3e-8b64-4711-bee2-5ecbe1814fe4",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 14
+ ],
+ "localized_name": "image",
+ "pos": [
+ -650,
+ 400
+ ]
+ },
+ {
+ "id": "d39759fc-a5a9-4b82-a88f-df9b953f1d98",
+ "name": "prompt",
+ "type": "STRING",
+ "linkIds": [
+ 36
+ ],
+ "pos": [
+ -650,
+ 420
+ ]
+ },
+ {
+ "id": "48627f43-cdf1-4ea9-9e11-ec13451a7323",
+ "name": "steps",
+ "type": "INT",
+ "linkIds": [
+ 37
+ ],
+ "pos": [
+ -650,
+ 440
+ ]
+ },
+ {
+ "id": "2213f872-d40f-4fc3-be01-b8fc73f1d92c",
+ "name": "cfg",
+ "type": "FLOAT",
+ "linkIds": [
+ 42
+ ],
+ "pos": [
+ -650,
+ 460
+ ]
+ },
+ {
+ "id": "2c7b3e65-e71e-4a9b-a9f8-d2e814ccb6af",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 43
+ ],
+ "pos": [
+ -650,
+ 480
+ ]
+ },
+ {
+ "id": "bddb2317-7210-48d5-81fd-6b2d6fac33f4",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 44
+ ],
+ "pos": [
+ -650,
+ 500
+ ]
+ },
+ {
+ "id": "a283167b-6d7f-4d19-ad86-1fff2335c08d",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 45
+ ],
+ "pos": [
+ -650,
+ 520
+ ]
+ },
+ {
+ "id": "e033047f-cc37-4043-b4a0-25d7bab661af",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 46
+ ],
+ "pos": [
+ -650,
+ 540
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "0a288e93-c03f-4805-80f3-4e320a6a492e",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 20
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1700,
+ 360
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 26,
+ "type": "VAELoader",
+ "pos": [
+ -360,
+ 590
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 46
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 4,
+ 5,
+ 6,
+ 7
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAELoader",
+ "models": [
+ {
+ "name": "ae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/vae/ae.safetensors",
+ "directory": "vae"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ae.safetensors"
+ ]
+ },
+ {
+ "id": 27,
+ "type": "TextEncodeQwenImageEdit",
+ "pos": [
+ 10,
+ 200
+ ],
+ "size": [
+ 280,
+ 190
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 2
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "shape": 7,
+ "type": "VAE",
+ "link": 4
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 15
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": 36
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 8
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "TextEncodeQwenImageEdit"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 28,
+ "type": "TextEncodeQwenImageEdit",
+ "pos": [
+ 10,
+ 440
+ ],
+ "size": [
+ 280,
+ 190
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 3
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "shape": 7,
+ "type": "VAE",
+ "link": 5
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 16
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 9
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "TextEncodeQwenImageEdit"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 29,
+ "type": "FluxKontextMultiReferenceLatentMethod",
+ "pos": [
+ 660,
+ 200
+ ],
+ "size": [
+ 270,
+ 80
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "showAdvanced": false,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 10
+ },
+ {
+ "localized_name": "reference_latents_method",
+ "name": "reference_latents_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "reference_latents_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 12
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "FluxKontextMultiReferenceLatentMethod"
+ },
+ "widgets_values": [
+ "index"
+ ]
+ },
+ {
+ "id": 30,
+ "type": "FluxGuidance",
+ "pos": [
+ 330,
+ 440
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 9
+ },
+ {
+ "localized_name": "guidance",
+ "name": "guidance",
+ "type": "FLOAT",
+ "widget": {
+ "name": "guidance"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 11
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "FluxGuidance"
+ },
+ "widgets_values": [
+ 4.5
+ ]
+ },
+ {
+ "id": 31,
+ "type": "FluxGuidance",
+ "pos": [
+ 330,
+ 200
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 8
+ },
+ {
+ "localized_name": "guidance",
+ "name": "guidance",
+ "type": "FLOAT",
+ "widget": {
+ "name": "guidance"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 10
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "FluxGuidance"
+ },
+ "widgets_values": [
+ 4.5
+ ]
+ },
+ {
+ "id": 32,
+ "type": "FluxKontextMultiReferenceLatentMethod",
+ "pos": [
+ 660,
+ 440
+ ],
+ "size": [
+ 270,
+ 80
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 11
+ },
+ {
+ "localized_name": "reference_latents_method",
+ "name": "reference_latents_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "reference_latents_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 13
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "FluxKontextMultiReferenceLatentMethod"
+ },
+ "widgets_values": [
+ "index"
+ ]
+ },
+ {
+ "id": 33,
+ "type": "KSampler",
+ "pos": [
+ 1080,
+ 210
+ ],
+ "size": [
+ 270,
+ 460
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 1
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 12
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 13
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 18
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 43
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 37
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": 42
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 19
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 43,
+ "fixed",
+ 50,
+ 4.5,
+ "euler",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 34,
+ "type": "UNETLoader",
+ "pos": [
+ -360,
+ 170
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 44
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 1
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "UNETLoader",
+ "models": [
+ {
+ "name": "longcat_image_edit_bf16.safetensors",
+ "url": "https://huggingface.co/TalmajM/LongCat-Image-Edit_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/longcat_image_edit_bf16.safetensors",
+ "directory": "diffusion_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "longcat_image_edit_bf16.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 35,
+ "type": "VAEEncode",
+ "pos": [
+ 710,
+ 790
+ ],
+ "size": [
+ 260,
+ 100
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "pixels",
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 17
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 6
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 18
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEEncode"
+ }
+ },
+ {
+ "id": 36,
+ "type": "VAEDecode",
+ "pos": [
+ 1100,
+ 800
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 19
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 7
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 20
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 37,
+ "type": "ImageScaleToTotalPixels",
+ "pos": [
+ -370,
+ 790
+ ],
+ "size": [
+ 270,
+ 140
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 14
+ },
+ {
+ "localized_name": "upscale_method",
+ "name": "upscale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "upscale_method"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "megapixels",
+ "name": "megapixels",
+ "type": "FLOAT",
+ "widget": {
+ "name": "megapixels"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "resolution_steps",
+ "name": "resolution_steps",
+ "type": "INT",
+ "widget": {
+ "name": "resolution_steps"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 15,
+ 16,
+ 17
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ImageScaleToTotalPixels"
+ },
+ "widgets_values": [
+ "lanczos",
+ 1,
+ 16
+ ]
+ },
+ {
+ "id": 38,
+ "type": "CLIPLoader",
+ "pos": [
+ -360,
+ 360
+ ],
+ "size": [
+ 270,
+ 150
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 45
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "slot_index": 0,
+ "links": [
+ 2,
+ 3
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPLoader",
+ "models": [
+ {
+ "name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "longcat_image",
+ "default"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Models",
+ "bounding": [
+ -380,
+ 100,
+ 320,
+ 630
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Conditioning",
+ "bounding": [
+ -30,
+ 100,
+ 1030,
+ 630
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Sample",
+ "bounding": [
+ 1030,
+ 100,
+ 360,
+ 630
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 2,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 27,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 4,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 27,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 15,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 27,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 3,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 28,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 5,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 28,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 16,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 28,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 10,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 29,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 9,
+ "origin_id": 28,
+ "origin_slot": 0,
+ "target_id": 30,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 8,
+ "origin_id": 27,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 11,
+ "origin_id": 30,
+ "origin_slot": 0,
+ "target_id": 32,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 1,
+ "origin_id": 34,
+ "origin_slot": 0,
+ "target_id": 33,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 12,
+ "origin_id": 29,
+ "origin_slot": 0,
+ "target_id": 33,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 13,
+ "origin_id": 32,
+ "origin_slot": 0,
+ "target_id": 33,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 18,
+ "origin_id": 35,
+ "origin_slot": 0,
+ "target_id": 33,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 17,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 35,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 6,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 35,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 19,
+ "origin_id": 33,
+ "origin_slot": 0,
+ "target_id": 36,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 7,
+ "origin_id": 26,
+ "origin_slot": 0,
+ "target_id": 36,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 14,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 37,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 20,
+ "origin_id": 36,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 36,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 27,
+ "target_slot": 3,
+ "type": "STRING"
+ },
+ {
+ "id": 37,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 33,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 42,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 33,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 43,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 33,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 44,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 34,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 45,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 38,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 46,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 26,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Image generation and editing/Edit image",
+ "description": "Edits images via text instructions using LongCat Image Edit, an instruction-following image editing diffusion model."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Image Edit (Qwen 2509).json b/blueprints/Image Edit (Qwen 2509).json
new file mode 100644
index 000000000..f7be322a0
--- /dev/null
+++ b/blueprints/Image Edit (Qwen 2509).json
@@ -0,0 +1,1947 @@
+{
+ "revision": 0,
+ "last_node_id": 433,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 433,
+ "type": "eba40a3a-f6c5-48ac-b58e-55525d06b373",
+ "pos": [
+ 90,
+ -160
+ ],
+ "size": [
+ 390,
+ 610
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "label": "image2 (optional)",
+ "name": "image2",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "label": "image3 (optional)",
+ "name": "image3",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "label": "enable_turbo_mode",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "111",
+ "prompt"
+ ],
+ [
+ "3",
+ "seed"
+ ],
+ [
+ "443",
+ "value"
+ ],
+ [
+ "37",
+ "unet_name"
+ ],
+ [
+ "38",
+ "clip_name"
+ ],
+ [
+ "39",
+ "vae_name"
+ ],
+ [
+ "3",
+ "control_after_generate"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.3.62"
+ },
+ "widgets_values": [],
+ "title": "Image Edit (Qwen 2509)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "eba40a3a-f6c5-48ac-b58e-55525d06b373",
+ "version": 1,
+ "state": {
+ "lastGroupId": 51,
+ "lastNodeId": 468,
+ "lastLinkId": 731,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Edit (Qwen 2509)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1160,
+ 280,
+ 151.744140625,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 2030,
+ -20,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "d5089bd3-63bc-4a24-b478-6565ed2364e3",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 248
+ ],
+ "label": "image",
+ "pos": [
+ -1028.255859375,
+ 300
+ ]
+ },
+ {
+ "id": "9e80fff0-ed0a-439f-a16e-a4a6cc1eb601",
+ "name": "image2",
+ "type": "IMAGE",
+ "linkIds": [
+ 235,
+ 236
+ ],
+ "label": "image2 (optional)",
+ "pos": [
+ -1028.255859375,
+ 320
+ ]
+ },
+ {
+ "id": "49d98fd6-01b5-440b-8603-579252fd7fef",
+ "name": "image3",
+ "type": "IMAGE",
+ "linkIds": [
+ 237,
+ 238
+ ],
+ "label": "image3 (optional)",
+ "pos": [
+ -1028.255859375,
+ 340
+ ]
+ },
+ {
+ "id": "5de32f24-a7b5-4423-b772-72824005f585",
+ "name": "prompt",
+ "type": "STRING",
+ "linkIds": [
+ 244
+ ],
+ "pos": [
+ -1028.255859375,
+ 360
+ ]
+ },
+ {
+ "id": "85fb3d74-7881-4c71-bc8c-624be5eedc3d",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 718
+ ],
+ "pos": [
+ -1028.255859375,
+ 380
+ ]
+ },
+ {
+ "id": "b0c828de-d7eb-42a3-8dfb-4f53360d4fc9",
+ "name": "value",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 719
+ ],
+ "label": "enable_turbo_mode",
+ "pos": [
+ -1028.255859375,
+ 400
+ ]
+ },
+ {
+ "id": "072baa05-5551-4a98-bd66-015a36833ac2",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 720
+ ],
+ "pos": [
+ -1028.255859375,
+ 420
+ ]
+ },
+ {
+ "id": "d2891d11-b336-4750-9742-b93717c9ae39",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 721
+ ],
+ "pos": [
+ -1028.255859375,
+ 440
+ ]
+ },
+ {
+ "id": "4218135f-5128-4b7e-8572-92cc55615793",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 722
+ ],
+ "pos": [
+ -1028.255859375,
+ 460
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "c4ebfc18-de83-4361-8e42-767c3c8c25c0",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 110
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 2050,
+ 0
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 75,
+ "type": "CFGNorm",
+ "pos": [
+ 1080,
+ 30
+ ],
+ "size": [
+ 290,
+ 110
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 141
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "patched_model",
+ "name": "patched_model",
+ "type": "MODEL",
+ "links": [
+ 186
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CFGNorm",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.50",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "strength": true
+ }
+ }
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 39,
+ "type": "VAELoader",
+ "pos": [
+ -730,
+ 410
+ ],
+ "size": [
+ 330,
+ 110
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 722
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 76,
+ 168,
+ 206,
+ 207
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAELoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "models": [
+ {
+ "name": "qwen_image_vae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/vae/qwen_image_vae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "qwen_image_vae.safetensors"
+ ]
+ },
+ {
+ "id": 38,
+ "type": "CLIPLoader",
+ "pos": [
+ -730,
+ 150
+ ],
+ "size": [
+ 330,
+ 150
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 721
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "slot_index": 0,
+ "links": [
+ 204,
+ 205
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "models": [
+ {
+ "name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "qwen_image",
+ "default"
+ ]
+ },
+ {
+ "id": 37,
+ "type": "UNETLoader",
+ "pos": [
+ -730,
+ -60
+ ],
+ "size": [
+ 330,
+ 110
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 720
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 184,
+ 710
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "models": [
+ {
+ "name": "qwen_image_edit_2509_fp8_e4m3fn.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image-Edit_ComfyUI/resolve/main/split_files/diffusion_models/qwen_image_edit_2509_fp8_e4m3fn.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "qwen_image_edit_2509_fp8_e4m3fn.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 110,
+ "type": "TextEncodeQwenImageEditPlus",
+ "pos": [
+ -240,
+ 320
+ ],
+ "size": [
+ 400,
+ 240
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 204
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "shape": 7,
+ "type": "VAE",
+ "link": 206
+ },
+ {
+ "localized_name": "image1",
+ "name": "image1",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 251
+ },
+ {
+ "localized_name": "image2",
+ "name": "image2",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 236
+ },
+ {
+ "localized_name": "image3",
+ "name": "image3",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 238
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 210
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TextEncodeQwenImageEditPlus",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.59"
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#223",
+ "bgcolor": "#335"
+ },
+ {
+ "id": 66,
+ "type": "ModelSamplingAuraFlow",
+ "pos": [
+ 1070,
+ -120
+ ],
+ "size": [
+ 290,
+ 110
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 708
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 141
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ModelSamplingAuraFlow",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ 3
+ ]
+ },
+ {
+ "id": 111,
+ "type": "TextEncodeQwenImageEditPlus",
+ "pos": [
+ -250,
+ -70
+ ],
+ "size": [
+ 410,
+ 330
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 205
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "shape": 7,
+ "type": "VAE",
+ "link": 207
+ },
+ {
+ "localized_name": "image1",
+ "name": "image1",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 250
+ },
+ {
+ "localized_name": "image2",
+ "name": "image2",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 235
+ },
+ {
+ "localized_name": "image3",
+ "name": "image3",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 237
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": 244
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 211
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TextEncodeQwenImageEditPlus",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.59"
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 88,
+ "type": "VAEEncode",
+ "pos": [
+ -70,
+ 640
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "pixels",
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 249
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 168
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 246
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.50",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {}
+ }
+ }
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 1590,
+ -60
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 128
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 76
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 110
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ }
+ },
+ {
+ "id": 89,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ 320,
+ 300
+ ],
+ "size": [
+ 300,
+ 140
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 184
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 709
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.50",
+ "models": [
+ {
+ "name": "Qwen-Image-Edit-2509-Lightning-4steps-V1.0-bf16.safetensors",
+ "url": "https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Edit-2509/Qwen-Image-Edit-2509-Lightning-4steps-V1.0-bf16.safetensors",
+ "directory": "loras"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "lora_name": true,
+ "strength_model": true
+ }
+ }
+ },
+ "widgets_values": [
+ "Qwen-Image-Edit-2509-Lightning-4steps-V1.0-bf16.safetensors",
+ 1
+ ]
+ },
+ {
+ "id": 117,
+ "type": "FluxKontextImageScale",
+ "pos": [
+ -680,
+ 630
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 248
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 249,
+ 250,
+ 251
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "FluxKontextImageScale"
+ }
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1070,
+ 210
+ ],
+ "size": [
+ 300,
+ 590
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 186
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 211
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 210
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 246
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 718
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 707
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": 706
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 128
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ 973414316252139,
+ "randomize",
+ 4,
+ 1,
+ "euler",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 436,
+ "type": "PrimitiveInt",
+ "pos": [
+ 320,
+ 500
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 713
+ ]
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt"
+ },
+ "widgets_values": [
+ 4,
+ "fixed"
+ ]
+ },
+ {
+ "id": 437,
+ "type": "PrimitiveFloat",
+ "pos": [
+ 320,
+ 670
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 714
+ ]
+ }
+ ],
+ "title": "CFG",
+ "properties": {
+ "Node name for S&R": "PrimitiveFloat"
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 438,
+ "type": "PrimitiveInt",
+ "pos": [
+ 320,
+ -100
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 711
+ ]
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt"
+ },
+ "widgets_values": [
+ 20,
+ "fixed"
+ ]
+ },
+ {
+ "id": 439,
+ "type": "PrimitiveFloat",
+ "pos": [
+ 320,
+ 70
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 712
+ ]
+ }
+ ],
+ "title": "CFG",
+ "properties": {
+ "Node name for S&R": "PrimitiveFloat"
+ },
+ "widgets_values": [
+ 4
+ ]
+ },
+ {
+ "id": 440,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 750,
+ -80
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 710
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 709
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 715
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 708
+ ]
+ }
+ ],
+ "title": "Switch (Model)",
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 441,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 730,
+ 340
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 711
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 713
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 716
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 707
+ ]
+ }
+ ],
+ "title": "Switch (Steps)",
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 442,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 730,
+ 520
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 712
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 714
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 717
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 706
+ ]
+ }
+ ],
+ "title": "Switch (CFG)",
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 443,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ 330,
+ 850
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 719
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 715,
+ 716,
+ 717
+ ]
+ }
+ ],
+ "title": "Enable Lightning LoRA",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean"
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 444,
+ "type": "MarkdownNote",
+ "pos": [
+ 240,
+ -500
+ ],
+ "size": [
+ 450,
+ 310
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [],
+ "title": "Note: KSampler settings",
+ "properties": {},
+ "widgets_values": [
+ "You can test and find the best setting by yourself. The following table is for reference.\n| Parameters | Qwen Team | Comfy Original | with 4steps LoRA |\n|--------|---------|------------|---------------------------|\n| Steps | 50 | 20 | 4 |\n| CFG | 4.0 | 2.5 | 1.0 |"
+ ],
+ "color": "#432",
+ "bgcolor": "#000"
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Step1 - Load models",
+ "bounding": [
+ -770,
+ -170,
+ 410,
+ 750
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Step 4 - Prompt",
+ "bounding": [
+ -330,
+ -170,
+ 570,
+ 750
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 50,
+ "title": "Lightning LoRA",
+ "bounding": [
+ 270,
+ 220,
+ 390,
+ 570
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 51,
+ "title": "Original Settings",
+ "bounding": [
+ 270,
+ -170,
+ 390,
+ 360
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 141,
+ "origin_id": 66,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 128,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 76,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 184,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 89,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 186,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 211,
+ "origin_id": 111,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 210,
+ "origin_id": 110,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 168,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 88,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 204,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 110,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 206,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 110,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 205,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 111,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 207,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 111,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 110,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 235,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 111,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 236,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 110,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 237,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 111,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
+ {
+ "id": 238,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 110,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
+ {
+ "id": 244,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 111,
+ "target_slot": 5,
+ "type": "STRING"
+ },
+ {
+ "id": 246,
+ "origin_id": 88,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 248,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 117,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 249,
+ "origin_id": 117,
+ "origin_slot": 0,
+ "target_id": 88,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 250,
+ "origin_id": 117,
+ "origin_slot": 0,
+ "target_id": 111,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 251,
+ "origin_id": 117,
+ "origin_slot": 0,
+ "target_id": 110,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 706,
+ "origin_id": 442,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 707,
+ "origin_id": 441,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 708,
+ "origin_id": 440,
+ "origin_slot": 0,
+ "target_id": 66,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 709,
+ "origin_id": 89,
+ "origin_slot": 0,
+ "target_id": 440,
+ "target_slot": 1,
+ "type": "MODEL"
+ },
+ {
+ "id": 710,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 440,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 711,
+ "origin_id": 438,
+ "origin_slot": 0,
+ "target_id": 441,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 712,
+ "origin_id": 439,
+ "origin_slot": 0,
+ "target_id": 442,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 713,
+ "origin_id": 436,
+ "origin_slot": 0,
+ "target_id": 441,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 714,
+ "origin_id": 437,
+ "origin_slot": 0,
+ "target_id": 442,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 715,
+ "origin_id": 443,
+ "origin_slot": 0,
+ "target_id": 440,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 716,
+ "origin_id": 443,
+ "origin_slot": 0,
+ "target_id": 441,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 717,
+ "origin_id": 443,
+ "origin_slot": 0,
+ "target_id": 442,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 718,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 3,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 719,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 443,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 720,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 37,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 721,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 38,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 722,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 39,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Edit image",
+ "description": "Edits images from text instructions using Qwen-Image-Edit-2509 with optional Lightning LoRA for few-step sampling."
+ }
+ ]
+ },
+ "extra": {}
+}
diff --git a/blueprints/Image Edit (Qwen 2511).json b/blueprints/Image Edit (Qwen 2511).json
index 582171fa0..1aa7e5765 100644
--- a/blueprints/Image Edit (Qwen 2511).json
+++ b/blueprints/Image Edit (Qwen 2511).json
@@ -132,7 +132,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Image Edit (Qwen 2511)",
+ "name": "Image Edit (Qwen 2511)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1468,7 +1468,8 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
- "category": "Image generation and editing/Edit image"
+ "category": "Image generation and editing/Edit image",
+ "description": "Edits images via text instructions using Qwen-Image-Edit-2511 with improved character consistency and integrated LoRA."
}
]
},
@@ -1489,4 +1490,4 @@
}
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Image Face Detection (Mediapipe).json b/blueprints/Image Face Detection (Mediapipe).json
new file mode 100644
index 000000000..e2548d485
--- /dev/null
+++ b/blueprints/Image Face Detection (Mediapipe).json
@@ -0,0 +1,779 @@
+{
+ "revision": 0,
+ "last_node_id": 33,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 33,
+ "type": "6062babb-b649-4a71-be9e-20ebce567744",
+ "pos": [
+ -450,
+ 4240
+ ],
+ "size": [
+ 420,
+ 400
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "face_landmarker",
+ "type": "FACE_LANDMARKER",
+ "link": null
+ },
+ {
+ "name": "detector_variant",
+ "type": "COMBO",
+ "widget": {
+ "name": "detector_variant"
+ },
+ "link": null
+ },
+ {
+ "name": "num_faces",
+ "type": "INT",
+ "widget": {
+ "name": "num_faces"
+ },
+ "link": null
+ },
+ {
+ "label": "custom_face_oval",
+ "name": "regions.face_oval",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.face_oval"
+ },
+ "link": null
+ },
+ {
+ "label": "custom_lips",
+ "name": "regions.lips",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.lips"
+ },
+ "link": null
+ },
+ {
+ "label": "custom_left_eye",
+ "name": "regions.left_eye",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.left_eye"
+ },
+ "link": null
+ },
+ {
+ "label": "custom_right_eye",
+ "name": "regions.right_eye",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.right_eye"
+ },
+ "link": null
+ },
+ {
+ "label": "custom_irises",
+ "name": "regions.irises",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.irises"
+ },
+ "link": null
+ },
+ {
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "face_landmarks",
+ "name": "face_landmarks",
+ "type": "FACE_LANDMARKS",
+ "links": []
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": []
+ },
+ {
+ "label": "mask",
+ "name": "MASK_1",
+ "type": "MASK",
+ "links": []
+ }
+ ],
+ "title": "Image Face Detection (Mediapipe)",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "11",
+ "detector_variant"
+ ],
+ [
+ "11",
+ "num_faces"
+ ],
+ [
+ "20",
+ "regions.face_oval"
+ ],
+ [
+ "20",
+ "regions.lips"
+ ],
+ [
+ "20",
+ "regions.left_eye"
+ ],
+ [
+ "20",
+ "regions.right_eye"
+ ],
+ [
+ "20",
+ "regions.irises"
+ ],
+ [
+ "2",
+ "model_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "6062babb-b649-4a71-be9e-20ebce567744",
+ "version": 1,
+ "state": {
+ "lastGroupId": 2,
+ "lastNodeId": 158,
+ "lastLinkId": 140,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Face Detection (Mediapipe)",
+ "description": "Detects facial landmarks from an image using MediaPipe, outputting landmark data, face bounding boxes, and an optional face-region mask.",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -710,
+ 4300,
+ 148.880859375,
+ 248
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 140,
+ 4480,
+ 137.677734375,
+ 108
+ ]
+ },
+ "inputs": [
+ {
+ "id": "705dc1ae-6dc9-4155-92df-52f816ad451e",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 60
+ ],
+ "localized_name": "image",
+ "pos": [
+ -585.119140625,
+ 4324
+ ]
+ },
+ {
+ "id": "d6277190-732c-4604-b7cd-d3a9588bf761",
+ "name": "face_landmarker",
+ "type": "FACE_LANDMARKER",
+ "linkIds": [
+ 74
+ ],
+ "pos": [
+ -585.119140625,
+ 4344
+ ]
+ },
+ {
+ "id": "ac473a08-6a86-42a7-b460-e70c6c5e1e2b",
+ "name": "detector_variant",
+ "type": "COMBO",
+ "linkIds": [
+ 75
+ ],
+ "pos": [
+ -585.119140625,
+ 4364
+ ]
+ },
+ {
+ "id": "1bec2252-ca2d-496e-8a33-33a61d21f897",
+ "name": "num_faces",
+ "type": "INT",
+ "linkIds": [
+ 76
+ ],
+ "pos": [
+ -585.119140625,
+ 4384
+ ]
+ },
+ {
+ "id": "17994fa2-0ea0-4c9b-a70a-19789c459c80",
+ "name": "regions.face_oval",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 77
+ ],
+ "label": "custom_face_oval",
+ "pos": [
+ -585.119140625,
+ 4404
+ ]
+ },
+ {
+ "id": "1c6c5893-2aee-4c37-b702-15ef2e20d863",
+ "name": "regions.lips",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 78
+ ],
+ "label": "custom_lips",
+ "pos": [
+ -585.119140625,
+ 4424
+ ]
+ },
+ {
+ "id": "f353fcea-4b6f-42a1-8fdd-32b3aa1e1f09",
+ "name": "regions.left_eye",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 79
+ ],
+ "label": "custom_left_eye",
+ "pos": [
+ -585.119140625,
+ 4444
+ ]
+ },
+ {
+ "id": "1387e121-c1fb-4522-8f0d-43459e11dd86",
+ "name": "regions.right_eye",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 80
+ ],
+ "label": "custom_right_eye",
+ "pos": [
+ -585.119140625,
+ 4464
+ ]
+ },
+ {
+ "id": "14acb0a0-d1f4-48f3-ba31-811b26236ef9",
+ "name": "regions.irises",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 81
+ ],
+ "label": "custom_irises",
+ "pos": [
+ -585.119140625,
+ 4484
+ ]
+ },
+ {
+ "id": "25a82859-87de-42c8-8431-09948665546e",
+ "name": "model_name",
+ "type": "COMBO",
+ "linkIds": [
+ 86
+ ],
+ "pos": [
+ -585.119140625,
+ 4504
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "d2ba3f92-e8b1-49c3-9590-cfad56c54cf4",
+ "name": "face_landmarks",
+ "type": "FACE_LANDMARKS",
+ "linkIds": [
+ 44
+ ],
+ "localized_name": "face_landmarks",
+ "pos": [
+ 164,
+ 4504
+ ]
+ },
+ {
+ "id": "4f356bb0-d4c4-4f93-b4cf-0845a65c4e6d",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 25
+ ],
+ "localized_name": "bboxes",
+ "pos": [
+ 164,
+ 4524
+ ]
+ },
+ {
+ "id": "f6309e1d-6397-4363-b38f-778a122abc51",
+ "name": "MASK_1",
+ "type": "MASK",
+ "linkIds": [
+ 83
+ ],
+ "label": "mask",
+ "pos": [
+ 164,
+ 4544
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 11,
+ "type": "MediaPipeFaceLandmarker",
+ "pos": [
+ -280,
+ 4280
+ ],
+ "size": [
+ 350,
+ 220
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "face_detection_model",
+ "name": "face_detection_model",
+ "type": "FACE_DETECTION_MODEL",
+ "link": 66
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 60
+ },
+ {
+ "localized_name": "detector_variant",
+ "name": "detector_variant",
+ "type": "COMBO",
+ "widget": {
+ "name": "detector_variant"
+ },
+ "link": 75
+ },
+ {
+ "localized_name": "num_faces",
+ "name": "num_faces",
+ "type": "INT",
+ "widget": {
+ "name": "num_faces"
+ },
+ "link": 76
+ },
+ {
+ "localized_name": "min_confidence",
+ "name": "min_confidence",
+ "type": "FLOAT",
+ "widget": {
+ "name": "min_confidence"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "missing_frame_fallback",
+ "name": "missing_frame_fallback",
+ "type": "COMBO",
+ "widget": {
+ "name": "missing_frame_fallback"
+ },
+ "link": null
+ },
+ {
+ "name": "face_landmarker",
+ "type": "FACE_LANDMARKER",
+ "link": 74
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "face_landmarks",
+ "name": "face_landmarks",
+ "type": "FACE_LANDMARKS",
+ "links": [
+ 44,
+ 46
+ ]
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 25
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MediaPipeFaceLandmarker",
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "full",
+ 0,
+ 0.5,
+ "empty"
+ ]
+ },
+ {
+ "id": 2,
+ "type": "LoadMediaPipeFaceLandmarker",
+ "pos": [
+ -290,
+ 4060
+ ],
+ "size": [
+ 350,
+ 140
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 86
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FACE_DETECTION_MODEL",
+ "name": "FACE_DETECTION_MODEL",
+ "type": "FACE_DETECTION_MODEL",
+ "links": [
+ 66
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadMediaPipeFaceLandmarker",
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0",
+ "models": [
+ {
+ "name": "mediapipe_face_fp32.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/mediapipe/resolve/main/detection/mediapipe_face_fp32.safetensors",
+ "directory": "detection"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "mediapipe_face_fp32.safetensors"
+ ]
+ },
+ {
+ "id": 20,
+ "type": "MediaPipeFaceMask",
+ "pos": [
+ -290,
+ 4560
+ ],
+ "size": [
+ 360,
+ 180
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "face_landmarks",
+ "name": "face_landmarks",
+ "type": "FACE_LANDMARKS",
+ "link": 46
+ },
+ {
+ "localized_name": "regions",
+ "name": "regions",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "regions"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "regions.face_oval",
+ "name": "regions.face_oval",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.face_oval"
+ },
+ "link": 77
+ },
+ {
+ "localized_name": "regions.lips",
+ "name": "regions.lips",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.lips"
+ },
+ "link": 78
+ },
+ {
+ "localized_name": "regions.left_eye",
+ "name": "regions.left_eye",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.left_eye"
+ },
+ "link": 79
+ },
+ {
+ "localized_name": "regions.right_eye",
+ "name": "regions.right_eye",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.right_eye"
+ },
+ "link": 80
+ },
+ {
+ "localized_name": "regions.irises",
+ "name": "regions.irises",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.irises"
+ },
+ "link": 81
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MASK",
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 83
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MediaPipeFaceMask",
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "custom",
+ true,
+ false,
+ false,
+ false,
+ false
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 66,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 0,
+ "type": "FACE_DETECTION_MODEL"
+ },
+ {
+ "id": 46,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": 20,
+ "target_slot": 0,
+ "type": "FACE_LANDMARKS"
+ },
+ {
+ "id": 60,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 44,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "FACE_LANDMARKS"
+ },
+ {
+ "id": 25,
+ "origin_id": 11,
+ "origin_slot": 1,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 74,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 11,
+ "target_slot": 6,
+ "type": "FACE_LANDMARKER"
+ },
+ {
+ "id": 75,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 11,
+ "target_slot": 2,
+ "type": "COMBO"
+ },
+ {
+ "id": 76,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 11,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 77,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 20,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 78,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 20,
+ "target_slot": 3,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 79,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 20,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 80,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 20,
+ "target_slot": 5,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 81,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 20,
+ "target_slot": 6,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 83,
+ "origin_id": 20,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "MASK"
+ },
+ {
+ "id": 86,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 2,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Conditioning & Preprocessors/Face Detection"
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Image Inpainting (Flux.1 Fill Dev).json b/blueprints/Image Inpainting (Flux.1 Fill Dev).json
new file mode 100644
index 000000000..c1326ed3d
--- /dev/null
+++ b/blueprints/Image Inpainting (Flux.1 Fill Dev).json
@@ -0,0 +1,1206 @@
+{
+ "revision": 0,
+ "last_node_id": 232,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 232,
+ "type": "6e8d6e38-bdc3-436c-be85-ef9e67e70e07",
+ "pos": [
+ 1270,
+ 4640
+ ],
+ "size": [
+ 400,
+ 470
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image",
+ "localized_name": "pixels",
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "link": null
+ },
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name1",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name1"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name2",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name2"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "23",
+ "text"
+ ],
+ [
+ "3",
+ "seed"
+ ],
+ [
+ "31",
+ "unet_name"
+ ],
+ [
+ "34",
+ "clip_name1"
+ ],
+ [
+ "34",
+ "clip_name2"
+ ],
+ [
+ "230",
+ "vae_name"
+ ]
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1"
+ },
+ "widgets_values": [],
+ "title": "Image Inpainting (Flux.1 Fill Dev)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "6e8d6e38-bdc3-436c-be85-ef9e67e70e07",
+ "version": 1,
+ "state": {
+ "lastGroupId": 22,
+ "lastNodeId": 232,
+ "lastLinkId": 286,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Inpainting (Flux.1 Fill Dev)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -850,
+ 164,
+ 120,
+ 200
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1230,
+ 140,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "65727ee9-09d0-40c9-bd86-11e0823eb676",
+ "name": "pixels",
+ "type": "IMAGE",
+ "linkIds": [
+ 99
+ ],
+ "localized_name": "pixels",
+ "label": "image",
+ "pos": [
+ -750,
+ 184
+ ]
+ },
+ {
+ "id": "28424f77-56c5-49c1-ba41-6bd78287c186",
+ "name": "mask",
+ "type": "MASK",
+ "linkIds": [
+ 100
+ ],
+ "localized_name": "mask",
+ "pos": [
+ -750,
+ 204
+ ]
+ },
+ {
+ "id": "2339e5e0-8f8d-4600-b158-7d7dae5f0535",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 277
+ ],
+ "label": "prompt",
+ "pos": [
+ -750,
+ 224
+ ]
+ },
+ {
+ "id": "5f433d9b-b97e-4bac-bb88-eb668de2d5a7",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 282
+ ],
+ "pos": [
+ -750,
+ 244
+ ]
+ },
+ {
+ "id": "35a8b6c1-c92c-4c1a-9b24-2e9bae7808f6",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 283
+ ],
+ "pos": [
+ -750,
+ 264
+ ]
+ },
+ {
+ "id": "3af8f8be-bce8-4ba0-aea0-ccf6b377d5f6",
+ "name": "clip_name1",
+ "type": "COMBO",
+ "linkIds": [
+ 284
+ ],
+ "pos": [
+ -750,
+ 284
+ ]
+ },
+ {
+ "id": "d9a4af80-4fa1-4792-b955-78bdaef4596e",
+ "name": "clip_name2",
+ "type": "COMBO",
+ "linkIds": [
+ 285
+ ],
+ "pos": [
+ -750,
+ 304
+ ]
+ },
+ {
+ "id": "d59398cf-7e9c-4dae-8c5a-08c4756f256a",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 286
+ ],
+ "pos": [
+ -750,
+ 324
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "1dee24ec-54a8-41be-aa30-a8fb797d3d23",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 95
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1250,
+ 160
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 34,
+ "type": "DualCLIPLoader",
+ "pos": [
+ -590,
+ 150
+ ],
+ "size": [
+ 320,
+ 180
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name1",
+ "name": "clip_name1",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name1"
+ },
+ "link": 284
+ },
+ {
+ "localized_name": "clip_name2",
+ "name": "clip_name2",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name2"
+ },
+ "link": 285
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 62
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "DualCLIPLoader",
+ "models": [
+ {
+ "name": "clip_l.safetensors",
+ "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/clip_l.safetensors",
+ "directory": "text_encoders"
+ },
+ {
+ "name": "t5xxl_fp16.safetensors",
+ "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "clip_l.safetensors",
+ "t5xxl_fp16.safetensors",
+ "flux",
+ "default"
+ ]
+ },
+ {
+ "id": 229,
+ "type": "FluxGuidance",
+ "pos": [
+ 410,
+ -40
+ ],
+ "size": [
+ 320,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 41
+ },
+ {
+ "localized_name": "guidance",
+ "name": "guidance",
+ "type": "FLOAT",
+ "widget": {
+ "name": "guidance"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 80
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "FluxGuidance"
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 230,
+ "type": "VAELoader",
+ "pos": [
+ -590,
+ 450
+ ],
+ "size": [
+ 320,
+ 110
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 286
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 60,
+ 82
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "VAELoader",
+ "models": [
+ {
+ "name": "ae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/resolve/main/split_files/vae/ae.safetensors",
+ "directory": "vae"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ae.safetensors"
+ ]
+ },
+ {
+ "id": 31,
+ "type": "UNETLoader",
+ "pos": [
+ -590,
+ -90
+ ],
+ "size": [
+ 320,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 283
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 85
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "UNETLoader",
+ "models": [
+ {
+ "name": "flux1-fill-dev.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/flux1-dev/resolve/main/split_files/diffusion_models/flux1-fill-dev.safetensors",
+ "directory": "diffusion_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "flux1-fill-dev.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 46,
+ "type": "ConditioningZeroOut",
+ "pos": [
+ 90,
+ 420
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 101
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 102
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "ConditioningZeroOut"
+ }
+ },
+ {
+ "id": 23,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -160,
+ -70
+ ],
+ "size": [
+ 480,
+ 410
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 62
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 277
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 41,
+ 101
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 39,
+ "type": "DifferentialDiffusion",
+ "pos": [
+ 780,
+ -110
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 85
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "shape": 7,
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 86
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "DifferentialDiffusion"
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 231,
+ "type": "VAEDecode",
+ "pos": [
+ 780,
+ 590
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 7
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 60
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 95
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 38,
+ "type": "InpaintModelConditioning",
+ "pos": [
+ 420,
+ 120
+ ],
+ "size": [
+ 310,
+ 200
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 80
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 102
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 82
+ },
+ {
+ "localized_name": "pixels",
+ "name": "pixels",
+ "type": "IMAGE",
+ "link": 99
+ },
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "link": 100
+ },
+ {
+ "localized_name": "noise_mask",
+ "name": "noise_mask",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "noise_mask"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 77
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "slot_index": 1,
+ "links": [
+ 78
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "slot_index": 2,
+ "links": [
+ 88
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "InpaintModelConditioning"
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 770,
+ 40
+ ],
+ "size": [
+ 290,
+ 470
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 86
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 77
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 78
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 88
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 282
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 7
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 0,
+ "randomize",
+ 20,
+ 1,
+ "euler",
+ "normal",
+ 1
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Load models",
+ "bounding": [
+ -620,
+ -160,
+ 410,
+ 790
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ -180,
+ -160,
+ 520,
+ 670
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 41,
+ "origin_id": 23,
+ "origin_slot": 0,
+ "target_id": 229,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 101,
+ "origin_id": 23,
+ "origin_slot": 0,
+ "target_id": 46,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 62,
+ "origin_id": 34,
+ "origin_slot": 0,
+ "target_id": 23,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 85,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 39,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 86,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 77,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 78,
+ "origin_id": 38,
+ "origin_slot": 1,
+ "target_id": 3,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 88,
+ "origin_id": 38,
+ "origin_slot": 2,
+ "target_id": 3,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 7,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 231,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 60,
+ "origin_id": 230,
+ "origin_slot": 0,
+ "target_id": 231,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 80,
+ "origin_id": 229,
+ "origin_slot": 0,
+ "target_id": 38,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 102,
+ "origin_id": 46,
+ "origin_slot": 0,
+ "target_id": 38,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 82,
+ "origin_id": 230,
+ "origin_slot": 0,
+ "target_id": 38,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 99,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 38,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 100,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 38,
+ "target_slot": 4,
+ "type": "MASK"
+ },
+ {
+ "id": 95,
+ "origin_id": 231,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 277,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 23,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 282,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 3,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 283,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 31,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 284,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 34,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 285,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 34,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 286,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 230,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Inpaint image",
+ "description": "Inpaints masked image regions using Flux.1 fill [dev], Black Forest Labs' inpainting/outpainting model."
+ }
+ ]
+ },
+ "extra": {
+ "ds": {
+ "scale": 0.8480949417360862,
+ "offset": [
+ 833.9510730024642,
+ 210.32152847588895
+ ]
+ },
+ "ue_links": []
+ }
+}
diff --git a/blueprints/Image Inpainting (Qwen-image).json b/blueprints/Image Inpainting (Qwen-image).json
index d06f31dd2..a06d57e19 100644
--- a/blueprints/Image Inpainting (Qwen-image).json
+++ b/blueprints/Image Inpainting (Qwen-image).json
@@ -124,7 +124,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Image Inpainting (Qwen-image)",
+ "name": "Image Inpainting (Qwen-image)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1548,7 +1548,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image generation and editing/Inpaint image"
+ "category": "Image generation and editing/Inpaint image",
+ "description": "Inpaints masked regions using Qwen-Image, extending its multilingual text rendering to inpainting tasks."
},
{
"id": "56a1f603-fbd2-40ed-94ef-c9ecbd96aca8",
@@ -1907,7 +1908,8 @@
],
"extra": {
"workflowRendererVersion": "LG"
- }
+ },
+ "description": "Expands and softens mask edges to reduce visible seams after image processing."
}
]
},
@@ -1923,4 +1925,4 @@
"workflowRendererVersion": "LG"
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Image Levels.json b/blueprints/Image Levels.json
index ef256a1aa..1a1b18932 100644
--- a/blueprints/Image Levels.json
+++ b/blueprints/Image Levels.json
@@ -742,9 +742,10 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Color adjust"
+ "category": "Image Tools/Color adjust",
+ "description": "Adjusts black point, white point, and gamma for tonal range control via GPU shader."
}
]
},
"extra": {}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Image Outpainting (Qwen-Image).json b/blueprints/Image Outpainting (Qwen-Image).json
index bf2c4241a..6c07227c0 100644
--- a/blueprints/Image Outpainting (Qwen-Image).json
+++ b/blueprints/Image Outpainting (Qwen-Image).json
@@ -204,7 +204,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Image Outpainting (Qwen-Image)",
+ "name": "Image Outpainting (Qwen-Image)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1919,7 +1919,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image generation and editing/Outpaint image"
+ "category": "Image generation and editing/Outpaint image",
+ "description": "Outpaints beyond image boundaries using Qwen-Image's outpainting capabilities."
},
{
"id": "f93c215e-c393-460e-9534-ed2c3d8a652e",
@@ -2278,7 +2279,8 @@
],
"extra": {
"workflowRendererVersion": "LG"
- }
+ },
+ "description": "Expands and softens mask edges to reduce visible seams after image processing."
},
{
"id": "2a4b2cc0-db37-4302-a067-da392f38f06b",
@@ -2733,7 +2735,8 @@
],
"extra": {
"workflowRendererVersion": "LG"
- }
+ },
+ "description": "Scales both image and mask together while preserving alignment for editing workflows."
}
]
},
@@ -2749,4 +2752,4 @@
}
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Image Segmentation (SAM3).json b/blueprints/Image Segmentation (SAM3).json
new file mode 100644
index 000000000..a2ef40ac8
--- /dev/null
+++ b/blueprints/Image Segmentation (SAM3).json
@@ -0,0 +1,714 @@
+{
+ "revision": 0,
+ "last_node_id": 99,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 99,
+ "type": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
+ "pos": [
+ -1630,
+ -3270
+ ],
+ "size": [
+ 290,
+ 370
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "label": "object",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "link": null
+ },
+ {
+ "name": "positive_coords",
+ "type": "STRING",
+ "link": null
+ },
+ {
+ "name": "negative_coords",
+ "type": "STRING",
+ "link": null
+ },
+ {
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": null
+ },
+ {
+ "name": "refine_iterations",
+ "type": "INT",
+ "widget": {
+ "name": "refine_iterations"
+ },
+ "link": null
+ },
+ {
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "individual_masks"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "masks",
+ "name": "masks",
+ "type": "MASK",
+ "links": []
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "78",
+ "text"
+ ],
+ [
+ "75",
+ "threshold"
+ ],
+ [
+ "75",
+ "refine_iterations"
+ ],
+ [
+ "75",
+ "individual_masks"
+ ],
+ [
+ "77",
+ "ckpt_name"
+ ]
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "text": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Image Segmentation (SAM3)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "6e7ab3ea-96aa-470f-9b94-3d9d0e01f481",
+ "version": 1,
+ "state": {
+ "lastGroupId": 0,
+ "lastNodeId": 113,
+ "lastLinkId": 283,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Segmentation (SAM3)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -2260,
+ -3450,
+ 136.369140625,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -1130,
+ -3305,
+ 120,
+ 80
+ ]
+ },
+ "inputs": [
+ {
+ "id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 264
+ ],
+ "localized_name": "image",
+ "label": "image",
+ "pos": [
+ -2143.630859375,
+ -3430
+ ]
+ },
+ {
+ "id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 265
+ ],
+ "label": "object",
+ "pos": [
+ -2143.630859375,
+ -3410
+ ]
+ },
+ {
+ "id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 266
+ ],
+ "pos": [
+ -2143.630859375,
+ -3390
+ ]
+ },
+ {
+ "id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899",
+ "name": "positive_coords",
+ "type": "STRING",
+ "linkIds": [
+ 267
+ ],
+ "pos": [
+ -2143.630859375,
+ -3370
+ ]
+ },
+ {
+ "id": "c65f8b87-9bd7-48be-9fc2-823431e95019",
+ "name": "negative_coords",
+ "type": "STRING",
+ "linkIds": [
+ 268
+ ],
+ "pos": [
+ -2143.630859375,
+ -3350
+ ]
+ },
+ {
+ "id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb",
+ "name": "threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 269
+ ],
+ "pos": [
+ -2143.630859375,
+ -3330
+ ]
+ },
+ {
+ "id": "b1439668-b050-490b-a5dc-fc4052c55666",
+ "name": "refine_iterations",
+ "type": "INT",
+ "linkIds": [
+ 270
+ ],
+ "pos": [
+ -2143.630859375,
+ -3310
+ ]
+ },
+ {
+ "id": "86e239e5-c098-4302-b54d-d42a38bc0f89",
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 271
+ ],
+ "pos": [
+ -2143.630859375,
+ -3290
+ ]
+ },
+ {
+ "id": "f9e0b9d4-b2f1-4907-a4a5-305656576706",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 272
+ ],
+ "pos": [
+ -2143.630859375,
+ -3270
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
+ "name": "masks",
+ "type": "MASK",
+ "linkIds": [
+ 231
+ ],
+ "localized_name": "masks",
+ "pos": [
+ -1110,
+ -3285
+ ]
+ },
+ {
+ "id": "8f622e40-8528-4078-b7d3-147e9f872194",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 232
+ ],
+ "localized_name": "bboxes",
+ "pos": [
+ -1110,
+ -3265
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 75,
+ "type": "SAM3_Detect",
+ "pos": [
+ -1470,
+ -3460
+ ],
+ "size": [
+ 270,
+ 260
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "model",
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 237
+ },
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 264
+ },
+ {
+ "label": "conditioning",
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "shape": 7,
+ "type": "CONDITIONING",
+ "link": 200
+ },
+ {
+ "label": "bboxes",
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": 266
+ },
+ {
+ "label": "positive_coords",
+ "localized_name": "positive_coords",
+ "name": "positive_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": 267
+ },
+ {
+ "label": "negative_coords",
+ "localized_name": "negative_coords",
+ "name": "negative_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": 268
+ },
+ {
+ "localized_name": "threshold",
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": 269
+ },
+ {
+ "localized_name": "refine_iterations",
+ "name": "refine_iterations",
+ "type": "INT",
+ "widget": {
+ "name": "refine_iterations"
+ },
+ "link": 270
+ },
+ {
+ "localized_name": "individual_masks",
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "individual_masks"
+ },
+ "link": 271
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "masks",
+ "name": "masks",
+ "type": "MASK",
+ "links": [
+ 231
+ ]
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 232
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "Node name for S&R": "SAM3_Detect",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0.5,
+ 2,
+ false
+ ]
+ },
+ {
+ "id": 77,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -1970,
+ -3200
+ ],
+ "size": [
+ 330,
+ 140
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 272
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 237
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 240
+ ]
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": null
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "sam3.1_multiplex_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "sam3.1_multiplex_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 78,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -2000,
+ -3000
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 240
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 265
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 200
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 237,
+ "origin_id": 77,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 200,
+ "origin_id": 78,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 240,
+ "origin_id": 77,
+ "origin_slot": 1,
+ "target_id": 78,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 231,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 232,
+ "origin_id": 75,
+ "origin_slot": 1,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 264,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 265,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 78,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 266,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 75,
+ "target_slot": 3,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 267,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 75,
+ "target_slot": 4,
+ "type": "STRING"
+ },
+ {
+ "id": 268,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 75,
+ "target_slot": 5,
+ "type": "STRING"
+ },
+ {
+ "id": 269,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 75,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 270,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 75,
+ "target_slot": 7,
+ "type": "INT"
+ },
+ {
+ "id": 271,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 75,
+ "target_slot": 8,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 272,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 77,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Conditioning & Preprocessors/Segmentation & Mask",
+ "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
diff --git a/blueprints/Image Upscale(Z-image-Turbo).json b/blueprints/Image Upscale(Z-image-Turbo).json
index 0d2b6e240..25d2838a8 100644
--- a/blueprints/Image Upscale(Z-image-Turbo).json
+++ b/blueprints/Image Upscale(Z-image-Turbo).json
@@ -141,7 +141,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Image Upscale(Z-image-Turbo)",
+ "name": "Image Upscale (Z-image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1302,7 +1302,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image generation and editing/Enhance"
+ "category": "Image generation and editing/Upscale",
+ "description": "Upscales images to higher resolution using Z-Image-Turbo."
}
]
},
@@ -1311,4 +1312,4 @@
"workflowRendererVersion": "LG"
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Image to Layers(Qwen-Image Layered).json b/blueprints/Image to Layers(Qwen-Image-Layered).json
similarity index 82%
rename from blueprints/Image to Layers(Qwen-Image Layered).json
rename to blueprints/Image to Layers(Qwen-Image-Layered).json
index 164ffbd8d..7b44f0563 100644
--- a/blueprints/Image to Layers(Qwen-Image Layered).json
+++ b/blueprints/Image to Layers(Qwen-Image-Layered).json
@@ -1,15 +1,14 @@
{
- "id": "1a761372-7c82-4016-b9bf-fa285967e1e9",
"revision": 0,
- "last_node_id": 83,
+ "last_node_id": 176,
"last_link_id": 0,
"nodes": [
{
- "id": 83,
- "type": "f754a936-daaf-4b6e-9658-41fdc54d301d",
+ "id": 176,
+ "type": "2d2e3c8e-53b3-4618-be52-6d1d99382f0e",
"pos": [
- 61.999827823554256,
- 153.3332507624185
+ -1150,
+ 200
],
"size": [
400,
@@ -56,6 +55,38 @@
"name": "layers"
},
"link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
}
],
"outputs": [
@@ -66,28 +97,41 @@
"links": []
}
],
+ "title": "Image to Layers (Qwen-Image-Layered)",
"properties": {
"proxyWidgets": [
[
- "-1",
+ "6",
"text"
],
[
- "-1",
+ "3",
"steps"
],
[
- "-1",
+ "3",
"cfg"
],
[
- "-1",
+ "83",
"layers"
],
[
"3",
"seed"
],
+ [
+ "37",
+ "unet_name"
+ ],
+ [
+ "38",
+ "clip_name"
+ ],
+ [
+ "39",
+ "vae_name"
+ ],
[
"3",
"control_after_generate"
@@ -95,6 +139,11 @@
],
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -103,25 +152,20 @@
"secondTabOffset": 80,
"secondTabWidth": 65
},
- "widgets_values": [
- "",
- 20,
- 2.5,
- 2
- ]
+ "widgets_values": []
}
],
"links": [],
- "groups": [],
+ "version": 0.4,
"definitions": {
"subgraphs": [
{
- "id": "f754a936-daaf-4b6e-9658-41fdc54d301d",
+ "id": "2d2e3c8e-53b3-4618-be52-6d1d99382f0e",
"version": 1,
"state": {
- "lastGroupId": 3,
- "lastNodeId": 83,
- "lastLinkId": 159,
+ "lastGroupId": 8,
+ "lastNodeId": 176,
+ "lastLinkId": 380,
"lastRerouteId": 0
},
"revision": 0,
@@ -130,10 +174,10 @@
"inputNode": {
"id": -10,
"bounding": [
- -510,
- 523,
+ -720,
+ 720,
120,
- 140
+ 220
]
},
"outputNode": {
@@ -156,8 +200,8 @@
],
"localized_name": "image",
"pos": [
- -410,
- 543
+ -620,
+ 740
]
},
{
@@ -168,8 +212,8 @@
150
],
"pos": [
- -410,
- 563
+ -620,
+ 760
]
},
{
@@ -180,8 +224,8 @@
153
],
"pos": [
- -410,
- 583
+ -620,
+ 780
]
},
{
@@ -192,8 +236,8 @@
154
],
"pos": [
- -410,
- 603
+ -620,
+ 800
]
},
{
@@ -204,8 +248,56 @@
159
],
"pos": [
- -410,
- 623
+ -620,
+ 820
+ ]
+ },
+ {
+ "id": "9f76338b-f4ca-4bb3-b61a-57b3f233061e",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 377
+ ],
+ "pos": [
+ -620,
+ 840
+ ]
+ },
+ {
+ "id": "8d0422d5-5eee-4f7e-9817-dc613cc62eca",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 378
+ ],
+ "pos": [
+ -620,
+ 860
+ ]
+ },
+ {
+ "id": "552eece2-a735-4d00-ae78-ded454622bc1",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 379
+ ],
+ "pos": [
+ -620,
+ 880
+ ]
+ },
+ {
+ "id": "1e6d141c-d0f9-4a2b-895c-b6780e57cfa0",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 380
+ ],
+ "pos": [
+ -620,
+ 900
]
}
],
@@ -231,14 +323,14 @@
"type": "CLIPLoader",
"pos": [
-320,
- 310
+ 360
],
"size": [
- 346.7470703125,
- 106
+ 350,
+ 150
],
"flags": {},
- "order": 0,
+ "order": 5,
"mode": 0,
"inputs": [
{
@@ -248,7 +340,7 @@
"widget": {
"name": "clip_name"
},
- "link": null
+ "link": 379
},
{
"localized_name": "type",
@@ -283,9 +375,14 @@
}
],
"properties": {
- "Node name for S&R": "CLIPLoader",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "CLIPLoader",
"models": [
{
"name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
@@ -312,14 +409,14 @@
"type": "VAELoader",
"pos": [
-320,
- 460
+ 580
],
"size": [
- 346.7470703125,
- 58
+ 350,
+ 110
],
"flags": {},
- "order": 1,
+ "order": 6,
"mode": 0,
"inputs": [
{
@@ -329,7 +426,7 @@
"widget": {
"name": "vae_name"
},
- "link": null
+ "link": 380
}
],
"outputs": [
@@ -345,9 +442,14 @@
}
],
"properties": {
- "Node name for S&R": "VAELoader",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "VAELoader",
"models": [
{
"name": "qwen_image_layered_vae.safetensors",
@@ -375,11 +477,11 @@
420
],
"size": [
- 425.27801513671875,
- 180.6060791015625
+ 430,
+ 190
],
"flags": {},
- "order": 3,
+ "order": 2,
"mode": 0,
"inputs": [
{
@@ -411,9 +513,14 @@
],
"title": "CLIP Text Encode (Negative Prompt)",
"properties": {
- "Node name for S&R": "CLIPTextEncode",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -432,12 +539,12 @@
"id": 70,
"type": "ReferenceLatent",
"pos": [
- 330,
- 670
+ 140,
+ 700
],
"size": [
- 204.1666717529297,
- 46
+ 210,
+ 50
],
"flags": {
"collapsed": true
@@ -470,9 +577,14 @@
}
],
"properties": {
- "Node name for S&R": "ReferenceLatent",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "ReferenceLatent",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -480,19 +592,18 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 69,
"type": "ReferenceLatent",
"pos": [
- 330,
- 710
+ 160,
+ 820
],
"size": [
- 204.1666717529297,
- 46
+ 210,
+ 50
],
"flags": {
"collapsed": true
@@ -525,9 +636,14 @@
}
],
"properties": {
- "Node name for S&R": "ReferenceLatent",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "ReferenceLatent",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -535,8 +651,7 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 66,
@@ -547,10 +662,10 @@
],
"size": [
270,
- 58
+ 110
],
"flags": {},
- "order": 4,
+ "order": 7,
"mode": 0,
"inputs": [
{
@@ -580,9 +695,14 @@
}
],
"properties": {
- "Node name for S&R": "ModelSamplingAuraFlow",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "ModelSamplingAuraFlow",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -600,11 +720,11 @@
"type": "LatentCutToBatch",
"pos": [
830,
- 160
+ 140
],
"size": [
270,
- 82
+ 140
],
"flags": {},
"order": 11,
@@ -646,9 +766,14 @@
}
],
"properties": {
- "Node name for S&R": "LatentCutToBatch",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "LatentCutToBatch",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -666,12 +791,12 @@
"id": 71,
"type": "VAEEncode",
"pos": [
- 100,
- 690
+ -280,
+ 780
],
"size": [
- 140,
- 46
+ 230,
+ 100
],
"flags": {
"collapsed": false
@@ -704,9 +829,14 @@
}
],
"properties": {
- "Node name for S&R": "VAEEncode",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "VAEEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -714,24 +844,23 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 8,
"type": "VAEDecode",
"pos": [
850,
- 310
+ 370
],
"size": [
210,
- 46
+ 50
],
"flags": {
"collapsed": true
},
- "order": 7,
+ "order": 3,
"mode": 0,
"inputs": [
{
@@ -759,9 +888,14 @@
}
],
"properties": {
- "Node name for S&R": "VAEDecode",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "VAEDecode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -769,8 +903,7 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 6,
@@ -780,11 +913,11 @@
180
],
"size": [
- 422.84503173828125,
- 164.31304931640625
+ 430,
+ 170
],
"flags": {},
- "order": 6,
+ "order": 1,
"mode": 0,
"inputs": [
{
@@ -816,9 +949,14 @@
],
"title": "CLIP Text Encode (Positive Prompt)",
"properties": {
- "Node name for S&R": "CLIPTextEncode",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -838,14 +976,14 @@
"type": "KSampler",
"pos": [
530,
- 280
+ 340
],
"size": [
270,
400
],
"flags": {},
- "order": 5,
+ "order": 0,
"mode": 0,
"inputs": [
{
@@ -879,7 +1017,7 @@
"widget": {
"name": "seed"
},
- "link": null
+ "link": 377
},
{
"localized_name": "steps",
@@ -939,9 +1077,14 @@
}
],
"properties": {
- "Node name for S&R": "KSampler",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "KSampler",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -964,12 +1107,12 @@
"id": 78,
"type": "GetImageSize",
"pos": [
- 80,
- 790
+ -280,
+ 930
],
"size": [
- 210,
- 136
+ 230,
+ 140
],
"flags": {},
"order": 12,
@@ -1007,9 +1150,14 @@
}
],
"properties": {
- "Node name for S&R": "GetImageSize",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "GetImageSize",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -1017,23 +1165,23 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 83,
"type": "EmptyQwenImageLayeredLatentImage",
"pos": [
- 320,
- 790
+ -280,
+ 1120
],
"size": [
- 330.9341796875,
- 130
+ 340,
+ 200
],
"flags": {},
"order": 13,
"mode": 0,
+ "showAdvanced": true,
"inputs": [
{
"localized_name": "width",
@@ -1083,9 +1231,14 @@
}
],
"properties": {
- "Node name for S&R": "EmptyQwenImageLayeredLatentImage",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "EmptyQwenImageLayeredLatentImage",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -1109,11 +1262,11 @@
180
],
"size": [
- 346.7470703125,
- 82
+ 350,
+ 110
],
"flags": {},
- "order": 2,
+ "order": 4,
"mode": 0,
"inputs": [
{
@@ -1123,7 +1276,7 @@
"widget": {
"name": "unet_name"
},
- "link": null
+ "link": 378
},
{
"localized_name": "weight_dtype",
@@ -1147,9 +1300,14 @@
}
],
"properties": {
- "Node name for S&R": "UNETLoader",
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {},
+ "version": "7.7"
+ },
+ "Node name for S&R": "UNETLoader",
"models": [
{
"name": "qwen_image_layered_bf16.safetensors",
@@ -1191,8 +1349,8 @@
"bounding": [
-330,
110,
- 366.7470703125,
- 421.6
+ 370,
+ 610
],
"color": "#3f789e",
"font_size": 24,
@@ -1391,16 +1549,48 @@
"target_id": 83,
"target_slot": 2,
"type": "INT"
+ },
+ {
+ "id": 377,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 3,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 378,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 37,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 379,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 38,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 380,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 39,
+ "target_slot": 0,
+ "type": "COMBO"
}
],
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image generation and editing/Image to layers"
+ "category": "Image generation and editing/Image to layers",
+ "description": "Decomposes an image into variable-resolution RGBA layers for independent editing using Qwen-Image-Layered."
}
]
},
- "config": {},
"extra": {
"ds": {
"scale": 1.14,
@@ -1409,7 +1599,6 @@
6.855893974423647
]
},
- "workflowRendererVersion": "LG"
- },
- "version": 0.4
-}
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Image to Model (Hunyuan3d 2.1).json b/blueprints/Image to Model (Hunyuan3d 2.1).json
index 4705603a8..ee5552656 100644
--- a/blueprints/Image to Model (Hunyuan3d 2.1).json
+++ b/blueprints/Image to Model (Hunyuan3d 2.1).json
@@ -72,7 +72,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Image to Model (Hunyuan3d 2.1)",
+ "name": "Image to 3D Model (Hunyuan3d 2.1)",
"inputNode": {
"id": -10,
"bounding": [
@@ -765,7 +765,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "3D/Image to 3D Model"
+ "category": "3D/Image to 3D Model",
+ "description": "Generates 3D mesh models from a single input image using Hunyuan3D 2.0/2.1."
}
]
},
diff --git a/blueprints/Image to Pose Map (SDPose Multi-Person).json b/blueprints/Image to Pose Map (SDPose Multi-Person).json
new file mode 100644
index 000000000..38df20775
--- /dev/null
+++ b/blueprints/Image to Pose Map (SDPose Multi-Person).json
@@ -0,0 +1,1206 @@
+{
+ "revision": 0,
+ "last_node_id": 675,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 675,
+ "type": "01b6a731-fb78-4070-9a38-c87146da9604",
+ "pos": [
+ -2480,
+ 3400
+ ],
+ "size": [
+ 370,
+ 590.625
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": null
+ },
+ {
+ "label": "resize_target_longer_size",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.longer_size"
+ },
+ "link": null
+ },
+ {
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_body"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_hands"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_face"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_feet"
+ },
+ "link": null
+ },
+ {
+ "name": "stick_width",
+ "type": "INT",
+ "widget": {
+ "name": "stick_width"
+ },
+ "link": null
+ },
+ {
+ "name": "face_point_size",
+ "type": "INT",
+ "widget": {
+ "name": "face_point_size"
+ },
+ "link": null
+ },
+ {
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "score_threshold"
+ },
+ "link": null
+ },
+ {
+ "label": "detect_threshold",
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": null
+ },
+ {
+ "label": "detect_class",
+ "name": "class_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "class_name"
+ },
+ "link": null
+ },
+ {
+ "name": "max_detections",
+ "type": "INT",
+ "widget": {
+ "name": "max_detections"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "links": null
+ },
+ {
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "674",
+ "resize_type.longer_size"
+ ],
+ [
+ "674",
+ "scale_method"
+ ],
+ [
+ "672",
+ "draw_body"
+ ],
+ [
+ "672",
+ "draw_hands"
+ ],
+ [
+ "672",
+ "draw_face"
+ ],
+ [
+ "672",
+ "draw_feet"
+ ],
+ [
+ "672",
+ "stick_width"
+ ],
+ [
+ "672",
+ "face_point_size"
+ ],
+ [
+ "672",
+ "score_threshold"
+ ],
+ [
+ "678",
+ "threshold"
+ ],
+ [
+ "678",
+ "class_name"
+ ],
+ [
+ "678",
+ "max_detections"
+ ],
+ [
+ "673",
+ "ckpt_name"
+ ],
+ [
+ "677",
+ "unet_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Image to Pose Map (SDPose Multi-Person)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "01b6a731-fb78-4070-9a38-c87146da9604",
+ "version": 1,
+ "state": {
+ "lastGroupId": 2,
+ "lastNodeId": 691,
+ "lastLinkId": 1740,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image to Pose Map (SDPose Multi-Person)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -3350,
+ 3410,
+ 190.8984375,
+ 348
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -1840,
+ 3570,
+ 128,
+ 108
+ ]
+ },
+ "inputs": [
+ {
+ "id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "linkIds": [
+ 1700
+ ],
+ "localized_name": "input",
+ "pos": [
+ -3183.1015625,
+ 3434
+ ]
+ },
+ {
+ "id": "088eefc1-cd8a-4573-993f-9e4da008a12d",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "linkIds": [
+ 1704
+ ],
+ "label": "resize_target_longer_size",
+ "pos": [
+ -3183.1015625,
+ 3454
+ ]
+ },
+ {
+ "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e",
+ "name": "scale_method",
+ "type": "COMBO",
+ "linkIds": [
+ 1705
+ ],
+ "pos": [
+ -3183.1015625,
+ 3474
+ ]
+ },
+ {
+ "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0",
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1706
+ ],
+ "pos": [
+ -3183.1015625,
+ 3494
+ ]
+ },
+ {
+ "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c",
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1707
+ ],
+ "pos": [
+ -3183.1015625,
+ 3514
+ ]
+ },
+ {
+ "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e",
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1708
+ ],
+ "pos": [
+ -3183.1015625,
+ 3534
+ ]
+ },
+ {
+ "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f",
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1709
+ ],
+ "pos": [
+ -3183.1015625,
+ 3554
+ ]
+ },
+ {
+ "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb",
+ "name": "stick_width",
+ "type": "INT",
+ "linkIds": [
+ 1710
+ ],
+ "pos": [
+ -3183.1015625,
+ 3574
+ ]
+ },
+ {
+ "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133",
+ "name": "face_point_size",
+ "type": "INT",
+ "linkIds": [
+ 1711
+ ],
+ "pos": [
+ -3183.1015625,
+ 3594
+ ]
+ },
+ {
+ "id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3",
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 1712
+ ],
+ "pos": [
+ -3183.1015625,
+ 3614
+ ]
+ },
+ {
+ "id": "4eb3e4ea-7a36-4511-8483-0d12aadd32f7",
+ "name": "threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 1718
+ ],
+ "label": "detect_threshold",
+ "pos": [
+ -3183.1015625,
+ 3634
+ ]
+ },
+ {
+ "id": "c76a7a05-81e6-4b17-a9e0-85f47a5844f2",
+ "name": "class_name",
+ "type": "COMBO",
+ "linkIds": [
+ 1719
+ ],
+ "label": "detect_class",
+ "pos": [
+ -3183.1015625,
+ 3654
+ ]
+ },
+ {
+ "id": "4417e988-6e80-4236-be31-4c179037f5a2",
+ "name": "max_detections",
+ "type": "INT",
+ "linkIds": [
+ 1720
+ ],
+ "pos": [
+ -3183.1015625,
+ 3674
+ ]
+ },
+ {
+ "id": "7d7c4a0b-0d1b-4c98-942b-f90548d2a492",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 1721
+ ],
+ "pos": [
+ -3183.1015625,
+ 3694
+ ]
+ },
+ {
+ "id": "4d75122c-2c14-452a-98fe-d1545d3e012a",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 1722
+ ],
+ "pos": [
+ -3183.1015625,
+ 3714
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 1701
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ -1816,
+ 3594
+ ]
+ },
+ {
+ "id": "4b64118e-3cef-4eeb-9dad-4cd09cfd63a2",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "linkIds": [
+ 1725
+ ],
+ "pos": [
+ -1816,
+ 3614
+ ]
+ },
+ {
+ "id": "a27f7e34-dcbc-4fb0-a4e1-2c5fc423ca5f",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 1726
+ ],
+ "pos": [
+ -1816,
+ 3634
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 671,
+ "type": "SDPoseKeypointExtractor",
+ "pos": [
+ -2550,
+ 3080
+ ],
+ "size": [
+ 270,
+ 180
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 1696
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 1697
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 1698
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": 1717
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "keypoints",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "links": [
+ 1699,
+ 1725
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SDPoseKeypointExtractor",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 16
+ ]
+ },
+ {
+ "id": 674,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ -2970,
+ 3580
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 1700
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "resize_type.longer_size",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.longer_size"
+ },
+ "link": 1704
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": 1705
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 1698,
+ 1716
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ResizeImageMaskNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale longer dimension",
+ 1024,
+ "lanczos"
+ ]
+ },
+ {
+ "id": 672,
+ "type": "SDPoseDrawKeypoints",
+ "pos": [
+ -2540,
+ 3590
+ ],
+ "size": [
+ 270,
+ 280
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "keypoints",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "link": 1699
+ },
+ {
+ "localized_name": "draw_body",
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_body"
+ },
+ "link": 1706
+ },
+ {
+ "localized_name": "draw_hands",
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_hands"
+ },
+ "link": 1707
+ },
+ {
+ "localized_name": "draw_face",
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_face"
+ },
+ "link": 1708
+ },
+ {
+ "localized_name": "draw_feet",
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_feet"
+ },
+ "link": 1709
+ },
+ {
+ "localized_name": "stick_width",
+ "name": "stick_width",
+ "type": "INT",
+ "widget": {
+ "name": "stick_width"
+ },
+ "link": 1710
+ },
+ {
+ "localized_name": "face_point_size",
+ "name": "face_point_size",
+ "type": "INT",
+ "widget": {
+ "name": "face_point_size"
+ },
+ "link": 1711
+ },
+ {
+ "localized_name": "score_threshold",
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "score_threshold"
+ },
+ "link": 1712
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 1701
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SDPoseDrawKeypoints",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ true,
+ true,
+ true,
+ true,
+ 4,
+ 2,
+ 0.5
+ ]
+ },
+ {
+ "id": 673,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -3040,
+ 3080
+ ],
+ "size": [
+ 390,
+ 190
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 1721
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 1696
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 1697
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "models": [
+ {
+ "name": "sdpose_wholebody_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors",
+ "directory": "checkpoints"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "sdpose_wholebody_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 677,
+ "type": "UNETLoader",
+ "pos": [
+ -3030,
+ 3330
+ ],
+ "size": [
+ 370,
+ 140
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 1722
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 1715
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "models": [
+ {
+ "name": "rt_detr_v4-x-hgnet_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/diffusion_models/rt_detr_v4-x-hgnet_fp16.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "rt_detr_v4-x-hgnet_fp16.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 678,
+ "type": "RTDETR_detect",
+ "pos": [
+ -2540,
+ 3320
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "model",
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 1715
+ },
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 1716
+ },
+ {
+ "localized_name": "threshold",
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": 1718
+ },
+ {
+ "localized_name": "class_name",
+ "name": "class_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "class_name"
+ },
+ "link": 1719
+ },
+ {
+ "localized_name": "max_detections",
+ "name": "max_detections",
+ "type": "INT",
+ "widget": {
+ "name": "max_detections"
+ },
+ "link": 1720
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 1717,
+ 1726
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "RTDETR_detect",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0.5,
+ "person",
+ 1
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 1696,
+ "origin_id": 673,
+ "origin_slot": 0,
+ "target_id": 671,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 1697,
+ "origin_id": 673,
+ "origin_slot": 2,
+ "target_id": 671,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 1698,
+ "origin_id": 674,
+ "origin_slot": 0,
+ "target_id": 671,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1699,
+ "origin_id": 671,
+ "origin_slot": 0,
+ "target_id": 672,
+ "target_slot": 0,
+ "type": "POSE_KEYPOINT"
+ },
+ {
+ "id": 1700,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 674,
+ "target_slot": 0,
+ "type": "IMAGE,MASK"
+ },
+ {
+ "id": 1701,
+ "origin_id": 672,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1704,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 674,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 1705,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 674,
+ "target_slot": 3,
+ "type": "COMBO"
+ },
+ {
+ "id": 1706,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 672,
+ "target_slot": 1,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1707,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 672,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1708,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 672,
+ "target_slot": 3,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1709,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 672,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1710,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 672,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 1711,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 672,
+ "target_slot": 6,
+ "type": "INT"
+ },
+ {
+ "id": 1712,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 672,
+ "target_slot": 7,
+ "type": "FLOAT"
+ },
+ {
+ "id": 1715,
+ "origin_id": 677,
+ "origin_slot": 0,
+ "target_id": 678,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 1716,
+ "origin_id": 674,
+ "origin_slot": 0,
+ "target_id": 678,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1717,
+ "origin_id": 678,
+ "origin_slot": 0,
+ "target_id": 671,
+ "target_slot": 3,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 1718,
+ "origin_id": -10,
+ "origin_slot": 10,
+ "target_id": 678,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 1719,
+ "origin_id": -10,
+ "origin_slot": 11,
+ "target_id": 678,
+ "target_slot": 3,
+ "type": "COMBO"
+ },
+ {
+ "id": 1720,
+ "origin_id": -10,
+ "origin_slot": 12,
+ "target_id": 678,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 1721,
+ "origin_id": -10,
+ "origin_slot": 13,
+ "target_id": 673,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 1722,
+ "origin_id": -10,
+ "origin_slot": 14,
+ "target_id": 677,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 1725,
+ "origin_id": 671,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "POSE_KEYPOINT"
+ },
+ {
+ "id": 1726,
+ "origin_id": 678,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "BOUNDING_BOX"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Conditioning & Preprocessors/Pose",
+ "description": "Detects multiple people in an image and outputs per-person pose keypoints, skeleton renders, and bounding boxes using SDPose."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Image to Pose Map (SDPose-OOD).json b/blueprints/Image to Pose Map (SDPose-OOD).json
new file mode 100644
index 000000000..76ee9ff4e
--- /dev/null
+++ b/blueprints/Image to Pose Map (SDPose-OOD).json
@@ -0,0 +1,888 @@
+{
+ "revision": 0,
+ "last_node_id": 675,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 675,
+ "type": "01b6a731-fb78-4070-9a38-c87146da9604",
+ "pos": [
+ -2480,
+ 3400
+ ],
+ "size": [
+ 360,
+ 433.3125
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": null
+ },
+ {
+ "label": "resize_target_longer_size",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.longer_size"
+ },
+ "link": null
+ },
+ {
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_body"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_hands"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_face"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_feet"
+ },
+ "link": null
+ },
+ {
+ "name": "stick_width",
+ "type": "INT",
+ "widget": {
+ "name": "stick_width"
+ },
+ "link": null
+ },
+ {
+ "name": "face_point_size",
+ "type": "INT",
+ "widget": {
+ "name": "face_point_size"
+ },
+ "link": null
+ },
+ {
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "score_threshold"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ },
+ {
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "674",
+ "resize_type.longer_size"
+ ],
+ [
+ "674",
+ "scale_method"
+ ],
+ [
+ "672",
+ "draw_body"
+ ],
+ [
+ "672",
+ "draw_hands"
+ ],
+ [
+ "672",
+ "draw_face"
+ ],
+ [
+ "672",
+ "draw_feet"
+ ],
+ [
+ "672",
+ "stick_width"
+ ],
+ [
+ "672",
+ "face_point_size"
+ ],
+ [
+ "672",
+ "score_threshold"
+ ],
+ [
+ "673",
+ "ckpt_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [],
+ "title": "Image to Pose Map (SDPose-OOD)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "01b6a731-fb78-4070-9a38-c87146da9604",
+ "version": 1,
+ "state": {
+ "lastGroupId": 0,
+ "lastNodeId": 676,
+ "lastLinkId": 1715,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image to Pose Map (SDPose-OOD)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -3290,
+ 3590,
+ 190.8984375,
+ 288
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -1756.2451602089645,
+ 3366,
+ 128,
+ 88
+ ]
+ },
+ "inputs": [
+ {
+ "id": "e24699c3-1356-4634-9eb4-19bb58e5c0b0",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "linkIds": [
+ 1700
+ ],
+ "localized_name": "input",
+ "pos": [
+ -3123.1015625,
+ 3614
+ ]
+ },
+ {
+ "id": "088eefc1-cd8a-4573-993f-9e4da008a12d",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "linkIds": [
+ 1704
+ ],
+ "label": "resize_target_longer_size",
+ "pos": [
+ -3123.1015625,
+ 3634
+ ]
+ },
+ {
+ "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e",
+ "name": "scale_method",
+ "type": "COMBO",
+ "linkIds": [
+ 1705
+ ],
+ "pos": [
+ -3123.1015625,
+ 3654
+ ]
+ },
+ {
+ "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0",
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1706
+ ],
+ "pos": [
+ -3123.1015625,
+ 3674
+ ]
+ },
+ {
+ "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c",
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1707
+ ],
+ "pos": [
+ -3123.1015625,
+ 3694
+ ]
+ },
+ {
+ "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e",
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1708
+ ],
+ "pos": [
+ -3123.1015625,
+ 3714
+ ]
+ },
+ {
+ "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f",
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1709
+ ],
+ "pos": [
+ -3123.1015625,
+ 3734
+ ]
+ },
+ {
+ "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb",
+ "name": "stick_width",
+ "type": "INT",
+ "linkIds": [
+ 1710
+ ],
+ "pos": [
+ -3123.1015625,
+ 3754
+ ]
+ },
+ {
+ "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133",
+ "name": "face_point_size",
+ "type": "INT",
+ "linkIds": [
+ 1711
+ ],
+ "pos": [
+ -3123.1015625,
+ 3774
+ ]
+ },
+ {
+ "id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3",
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 1712
+ ],
+ "pos": [
+ -3123.1015625,
+ 3794
+ ]
+ },
+ {
+ "id": "ae46de61-2cc6-483e-8ee9-87e4144a2ffa",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 1713
+ ],
+ "pos": [
+ -3123.1015625,
+ 3814
+ ]
+ },
+ {
+ "id": "41bec0c6-dffa-4c78-9289-ee678715ae54",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 1714
+ ],
+ "pos": [
+ -3123.1015625,
+ 3834
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 1701
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ -1732.2451602089645,
+ 3390
+ ]
+ },
+ {
+ "id": "29a6584e-4685-4986-8ffd-e6d8539953fd",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "linkIds": [
+ 1715
+ ],
+ "pos": [
+ -1732.2451602089645,
+ 3410
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 671,
+ "type": "SDPoseKeypointExtractor",
+ "pos": [
+ -2470,
+ 3250
+ ],
+ "size": [
+ 270,
+ 180
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 1696
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 1697
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 1698
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": 1714
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "keypoints",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "links": [
+ 1699,
+ 1715
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SDPoseKeypointExtractor",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 16
+ ]
+ },
+ {
+ "id": 674,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ -2960,
+ 3490
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 1700
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "resize_type.longer_size",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.longer_size"
+ },
+ "link": 1704
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": 1705
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 1698
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ResizeImageMaskNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "scale longer dimension",
+ 1024,
+ "area"
+ ]
+ },
+ {
+ "id": 672,
+ "type": "SDPoseDrawKeypoints",
+ "pos": [
+ -2120,
+ 3260
+ ],
+ "size": [
+ 270,
+ 280
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "keypoints",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "link": 1699
+ },
+ {
+ "localized_name": "draw_body",
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_body"
+ },
+ "link": 1706
+ },
+ {
+ "localized_name": "draw_hands",
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_hands"
+ },
+ "link": 1707
+ },
+ {
+ "localized_name": "draw_face",
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_face"
+ },
+ "link": 1708
+ },
+ {
+ "localized_name": "draw_feet",
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_feet"
+ },
+ "link": 1709
+ },
+ {
+ "localized_name": "stick_width",
+ "name": "stick_width",
+ "type": "INT",
+ "widget": {
+ "name": "stick_width"
+ },
+ "link": 1710
+ },
+ {
+ "localized_name": "face_point_size",
+ "name": "face_point_size",
+ "type": "INT",
+ "widget": {
+ "name": "face_point_size"
+ },
+ "link": 1711
+ },
+ {
+ "localized_name": "score_threshold",
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "score_threshold"
+ },
+ "link": 1712
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 1701
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SDPoseDrawKeypoints",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ true,
+ true,
+ true,
+ true,
+ 4,
+ 2,
+ 0.5
+ ]
+ },
+ {
+ "id": 673,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -2960,
+ 3250
+ ],
+ "size": [
+ 390,
+ 190
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 1713
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 1696
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 1697
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "models": [
+ {
+ "name": "sdpose_wholebody_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors",
+ "directory": "checkpoints"
+ }
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "sdpose_wholebody_fp16.safetensors"
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 1696,
+ "origin_id": 673,
+ "origin_slot": 0,
+ "target_id": 671,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 1697,
+ "origin_id": 673,
+ "origin_slot": 2,
+ "target_id": 671,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 1698,
+ "origin_id": 674,
+ "origin_slot": 0,
+ "target_id": 671,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1699,
+ "origin_id": 671,
+ "origin_slot": 0,
+ "target_id": 672,
+ "target_slot": 0,
+ "type": "POSE_KEYPOINT"
+ },
+ {
+ "id": 1700,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 674,
+ "target_slot": 0,
+ "type": "IMAGE,MASK"
+ },
+ {
+ "id": 1701,
+ "origin_id": 672,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1704,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 674,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 1705,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 674,
+ "target_slot": 3,
+ "type": "COMBO"
+ },
+ {
+ "id": 1706,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 672,
+ "target_slot": 1,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1707,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 672,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1708,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 672,
+ "target_slot": 3,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1709,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 672,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1710,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 672,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 1711,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 672,
+ "target_slot": 6,
+ "type": "INT"
+ },
+ {
+ "id": 1712,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 672,
+ "target_slot": 7,
+ "type": "FLOAT"
+ },
+ {
+ "id": 1713,
+ "origin_id": -10,
+ "origin_slot": 10,
+ "target_id": 673,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 1714,
+ "origin_id": -10,
+ "origin_slot": 11,
+ "target_id": 671,
+ "target_slot": 3,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 1715,
+ "origin_id": 671,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "POSE_KEYPOINT"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Conditioning & Preprocessors/Pose",
+ "description": "Extracts human pose keypoints and stick-figure visuals from an image using SDPose-OOD, with optional bounding-box input per subject."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Image to Video (LTX-2.3).json b/blueprints/Image to Video (LTX-2.3).json
new file mode 100644
index 000000000..3db524ea0
--- /dev/null
+++ b/blueprints/Image to Video (LTX-2.3).json
@@ -0,0 +1,4234 @@
+{
+ "revision": 0,
+ "last_node_id": 320,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 320,
+ "type": "2454ad83-157c-40dd-9f19-5daaf4041ce0",
+ "pos": [
+ 30,
+ 4150
+ ],
+ "size": [
+ 390,
+ 466.625
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "first_frame",
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": null
+ },
+ {
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "width",
+ "name": "value_2",
+ "type": "INT",
+ "widget": {
+ "name": "value_2"
+ },
+ "link": null
+ },
+ {
+ "label": "height",
+ "name": "value_3",
+ "type": "INT",
+ "widget": {
+ "name": "value_3"
+ },
+ "link": null
+ },
+ {
+ "label": "duration",
+ "name": "value_4",
+ "type": "INT",
+ "widget": {
+ "name": "value_4"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ },
+ {
+ "label": "distilled_lora",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
+ },
+ {
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": null
+ },
+ {
+ "label": "latent_upscale_model",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": null
+ },
+ {
+ "label": "fps",
+ "name": "value_5",
+ "type": "INT",
+ "widget": {
+ "name": "value_5"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": []
+ }
+ ],
+ "title": "Image to Video (LTX-2.3)",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "319",
+ "value"
+ ],
+ [
+ "312",
+ "value"
+ ],
+ [
+ "299",
+ "value"
+ ],
+ [
+ "301",
+ "value"
+ ],
+ [
+ "300",
+ "value"
+ ],
+ [
+ "316",
+ "ckpt_name"
+ ],
+ [
+ "277",
+ "control_after_generate"
+ ],
+ [
+ "277",
+ "noise_seed"
+ ],
+ [
+ "285",
+ "lora_name"
+ ],
+ [
+ "317",
+ "text_encoder"
+ ],
+ [
+ "311",
+ "model_name"
+ ]
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "value_1": true,
+ "value_2": true,
+ "value_3": true,
+ "value_4": true,
+ "lora_name": true,
+ "model_name": true,
+ "value_5": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "2454ad83-157c-40dd-9f19-5daaf4041ce0",
+ "version": 1,
+ "state": {
+ "lastGroupId": 25,
+ "lastNodeId": 323,
+ "lastLinkId": 631,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image to Video (LTX-2.3)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ 730,
+ 4110,
+ 162.162109375,
+ 240
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 6590,
+ 4360,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "7afd6ea8-c738-4fd9-97b8-66fa905cd381",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "linkIds": [
+ 535
+ ],
+ "localized_name": "input",
+ "label": "first_frame",
+ "pos": [
+ 872.162109375,
+ 4130
+ ]
+ },
+ {
+ "id": "9494c550-4172-49c6-930e-5b508f775e77",
+ "name": "value",
+ "type": "STRING",
+ "linkIds": [
+ 595
+ ],
+ "pos": [
+ 872.162109375,
+ 4150
+ ]
+ },
+ {
+ "id": "58dbb3f6-f924-4548-96ef-e0e34610bd4e",
+ "name": "value_2",
+ "type": "INT",
+ "linkIds": [
+ 597
+ ],
+ "label": "width",
+ "pos": [
+ 872.162109375,
+ 4170
+ ]
+ },
+ {
+ "id": "6086d5b8-2586-448c-a641-dd14d76dd102",
+ "name": "value_3",
+ "type": "INT",
+ "linkIds": [
+ 598
+ ],
+ "label": "height",
+ "pos": [
+ 872.162109375,
+ 4190
+ ]
+ },
+ {
+ "id": "feb8c2eb-ae48-4fa8-bc24-929552d656c3",
+ "name": "value_4",
+ "type": "INT",
+ "linkIds": [
+ 599
+ ],
+ "label": "duration",
+ "pos": [
+ 872.162109375,
+ 4210
+ ]
+ },
+ {
+ "id": "d7255058-319a-4880-8f9a-7e542c8f3c3c",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 601,
+ 604,
+ 605
+ ],
+ "pos": [
+ 872.162109375,
+ 4230
+ ]
+ },
+ {
+ "id": "4afce68d-8f65-4342-9d6d-ae0a7688c3e3",
+ "name": "lora_name",
+ "type": "COMBO",
+ "linkIds": [
+ 602
+ ],
+ "label": "distilled_lora",
+ "pos": [
+ 872.162109375,
+ 4250
+ ]
+ },
+ {
+ "id": "ab842b4b-c977-4679-b421-424722785b57",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "linkIds": [
+ 606
+ ],
+ "pos": [
+ 872.162109375,
+ 4270
+ ]
+ },
+ {
+ "id": "9e47372d-28d9-4311-91e9-e90d03f4eb43",
+ "name": "model_name",
+ "type": "COMBO",
+ "linkIds": [
+ 607
+ ],
+ "label": "latent_upscale_model",
+ "pos": [
+ 872.162109375,
+ 4290
+ ]
+ },
+ {
+ "id": "3e32ce15-0ae7-4cd0-909f-a354e8e9c4c9",
+ "name": "value_5",
+ "type": "INT",
+ "linkIds": [
+ 624
+ ],
+ "label": "fps",
+ "pos": [
+ 872.162109375,
+ 4310
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "954ef307-c897-4eea-8b5c-5c6ce15a5357",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "linkIds": [
+ 536
+ ],
+ "localized_name": "VIDEO",
+ "pos": [
+ 6610,
+ 4380
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 276,
+ "type": "RandomNoise",
+ "pos": [
+ 4700,
+ 3650
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 490
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.75",
+ "Node name for S&R": "RandomNoise",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 42,
+ "fixed"
+ ]
+ },
+ {
+ "id": 277,
+ "type": "RandomNoise",
+ "pos": [
+ 3160,
+ 3630
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 483
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "RandomNoise",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 519681071352364,
+ "randomize"
+ ]
+ },
+ {
+ "id": 278,
+ "type": "LTXVConcatAVLatent",
+ "pos": [
+ 4710,
+ 4490
+ ],
+ "size": [
+ 280,
+ 100
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "link": 512
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "link": 513
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 494
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "LTXVConcatAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 279,
+ "type": "LTXVAudioVAELoader",
+ "pos": [
+ 1660,
+ 4100
+ ],
+ "size": [
+ 430,
+ 110
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 604
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio VAE",
+ "name": "Audio VAE",
+ "type": "VAE",
+ "links": [
+ 481,
+ 496
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "Node name for S&R": "LTXVAudioVAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-dev-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-dev-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-dev-fp8.safetensors"
+ ]
+ },
+ {
+ "id": 280,
+ "type": "KSamplerSelect",
+ "pos": [
+ 4700,
+ 4100
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 492
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.75",
+ "Node name for S&R": "KSamplerSelect",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "euler_cfg_pp"
+ ]
+ },
+ {
+ "id": 281,
+ "type": "ManualSigmas",
+ "pos": [
+ 4700,
+ 4290
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "STRING",
+ "widget": {
+ "name": "sigmas"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 493
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "ManualSigmas",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "0.85, 0.7250, 0.4219, 0.0"
+ ]
+ },
+ {
+ "id": 282,
+ "type": "CFGGuider",
+ "pos": [
+ 4700,
+ 3850
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 478
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 479
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 480
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "links": [
+ 491
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "Node name for S&R": "CFGGuider",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 283,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 3550,
+ 3630
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 483
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 484
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 485
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 544
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 487
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "links": [
+ 488
+ ]
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.60",
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 284,
+ "type": "LTXVCropGuides",
+ "pos": [
+ 3830,
+ 3810
+ ],
+ "size": [
+ 250,
+ 120
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 475
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 476
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 477
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 479
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 480
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "slot_index": 2,
+ "links": []
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "Node name for S&R": "LTXVCropGuides",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 285,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ 1660,
+ 3890
+ ],
+ "size": [
+ 430,
+ 140
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 520
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": 602
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 478,
+ 541
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.75",
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-distilled-lora-384.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3/resolve/main/ltx-2.3-22b-distilled-lora-384.safetensors",
+ "directory": "loras"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-distilled-lora-384.safetensors",
+ 0.5
+ ]
+ },
+ {
+ "id": 286,
+ "type": "ResizeImagesByLongerEdge",
+ "pos": [
+ 2070,
+ 4810
+ ],
+ "size": [
+ 310,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 523
+ },
+ {
+ "localized_name": "longer_edge",
+ "name": "longer_edge",
+ "type": "INT",
+ "widget": {
+ "name": "longer_edge"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 505
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "ResizeImagesByLongerEdge",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1536
+ ]
+ },
+ {
+ "id": 287,
+ "type": "LTXVLatentUpsampler",
+ "pos": [
+ 4250,
+ 3760
+ ],
+ "size": [
+ 330,
+ 120
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 547
+ },
+ {
+ "localized_name": "upscale_model",
+ "name": "upscale_model",
+ "type": "LATENT_UPSCALE_MODEL",
+ "link": 545
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 554
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 548
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "LTXVLatentUpsampler",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 288,
+ "type": "LTXVImgToVideoInplace",
+ "pos": [
+ 4230,
+ 4100
+ ],
+ "size": [
+ 300,
+ 180
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 552
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 515
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 548
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "bypass",
+ "name": "bypass",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "bypass"
+ },
+ "link": 543
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 512
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVImgToVideoInplace",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1,
+ false
+ ]
+ },
+ {
+ "id": 289,
+ "type": "LTXVPreprocess",
+ "pos": [
+ 2100,
+ 5010
+ ],
+ "size": [
+ 290,
+ 110
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 505
+ },
+ {
+ "localized_name": "img_compression",
+ "name": "img_compression",
+ "type": "INT",
+ "widget": {
+ "name": "img_compression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output_image",
+ "name": "output_image",
+ "type": "IMAGE",
+ "links": [
+ 510,
+ 515
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVPreprocess",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 18
+ ]
+ },
+ {
+ "id": 290,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ 1660,
+ 4810
+ ],
+ "size": [
+ 300,
+ 160
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 535
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "resize_type.width",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.width"
+ },
+ "link": 558
+ },
+ {
+ "localized_name": "height",
+ "name": "resize_type.height",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.height"
+ },
+ "link": 559
+ },
+ {
+ "localized_name": "crop",
+ "name": "resize_type.crop",
+ "type": "COMBO",
+ "widget": {
+ "name": "resize_type.crop"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 523
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "ResizeImageMaskNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale dimensions",
+ 1920,
+ 1088,
+ "center",
+ "lanczos"
+ ]
+ },
+ {
+ "id": 291,
+ "type": "KSamplerSelect",
+ "pos": [
+ 3160,
+ 4040
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 485
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "KSamplerSelect",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "euler_ancestral_cfg_pp"
+ ]
+ },
+ {
+ "id": 292,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 2540,
+ 4830
+ ],
+ "size": [
+ 210,
+ 80
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 560
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 561
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a/2"
+ ]
+ },
+ {
+ "id": 293,
+ "type": "Reroute",
+ "pos": [
+ 3850,
+ 4050
+ ],
+ "size": [
+ 230,
+ 40
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 557
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 552,
+ 553,
+ 554
+ ]
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ }
+ },
+ {
+ "id": 294,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 2550,
+ 4890
+ ],
+ "size": [
+ 210,
+ 80
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 562
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 563
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a/2"
+ ]
+ },
+ {
+ "id": 295,
+ "type": "EmptyLTXVLatentVideo",
+ "pos": [
+ 2870,
+ 4940
+ ],
+ "size": [
+ 280,
+ 200
+ ],
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 561
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 563
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 631
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 511
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.60",
+ "Node name for S&R": "EmptyLTXVLatentVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 512,
+ 97,
+ 1
+ ]
+ },
+ {
+ "id": 296,
+ "type": "LTXVImgToVideoInplace",
+ "pos": [
+ 3230,
+ 4810
+ ],
+ "size": [
+ 280,
+ 180
+ ],
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 556
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 510
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 511
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "bypass",
+ "name": "bypass",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "bypass"
+ },
+ "link": 542
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 497
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVImgToVideoInplace",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0.7,
+ false
+ ]
+ },
+ {
+ "id": 297,
+ "type": "LTXVAudioVAEDecode",
+ "pos": [
+ 5760,
+ 3970
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 495
+ },
+ {
+ "label": "Audio VAE",
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 496
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio",
+ "name": "Audio",
+ "type": "AUDIO",
+ "links": [
+ 534
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVAudioVAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 298,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 2540,
+ 5030
+ ],
+ "size": [
+ 210,
+ 80
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 564
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 566,
+ 591
+ ]
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 565
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a"
+ ]
+ },
+ {
+ "id": 299,
+ "type": "PrimitiveInt",
+ "pos": [
+ 1190,
+ 4650
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 598
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 559,
+ 562
+ ]
+ }
+ ],
+ "title": "Height",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 720,
+ "fixed"
+ ]
+ },
+ {
+ "id": 300,
+ "type": "PrimitiveInt",
+ "pos": [
+ 1190,
+ 4840
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 26,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 624
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 564,
+ 629
+ ]
+ }
+ ],
+ "title": "Frame Rate",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25,
+ "fixed"
+ ]
+ },
+ {
+ "id": 301,
+ "type": "PrimitiveInt",
+ "pos": [
+ 1190,
+ 4280
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 27,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 599
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 628
+ ]
+ }
+ ],
+ "title": "Duration",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 5,
+ "fixed"
+ ]
+ },
+ {
+ "id": 302,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ 1190,
+ 4110
+ ],
+ "size": [
+ 370,
+ 100
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 542,
+ 543
+ ]
+ }
+ ],
+ "title": "Switch to Text to Video?",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.0",
+ "Node name for S&R": "PrimitiveBoolean",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 303,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 2170,
+ 3640
+ ],
+ "size": [
+ 600,
+ 390
+ ],
+ "flags": {},
+ "order": 28,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 615
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 625
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 526
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 304,
+ "type": "LTXVConditioning",
+ "pos": [
+ 2800,
+ 3810
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 29,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 526
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 527
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "FLOAT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 566
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 475,
+ 518
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 476,
+ 519
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "LTXVConditioning",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 24
+ ]
+ },
+ {
+ "id": 305,
+ "type": "LTXVEmptyLatentAudio",
+ "pos": [
+ 3540,
+ 4960
+ ],
+ "size": [
+ 280,
+ 170
+ ],
+ "flags": {},
+ "order": 30,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 481
+ },
+ {
+ "localized_name": "frames_number",
+ "name": "frames_number",
+ "type": "INT",
+ "widget": {
+ "name": "frames_number"
+ },
+ "link": 630
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "INT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 565
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Latent",
+ "name": "Latent",
+ "type": "LATENT",
+ "links": [
+ 498
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "Node name for S&R": "LTXVEmptyLatentAudio",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 97,
+ 25,
+ 1
+ ]
+ },
+ {
+ "id": 306,
+ "type": "ManualSigmas",
+ "pos": [
+ 3160,
+ 4220
+ ],
+ "size": [
+ 500,
+ 110
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "STRING",
+ "widget": {
+ "name": "sigmas"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 544
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "ManualSigmas",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0"
+ ]
+ },
+ {
+ "id": 307,
+ "type": "LTXVSeparateAVLatent",
+ "pos": [
+ 3820,
+ 3630
+ ],
+ "size": [
+ 250,
+ 100
+ ],
+ "flags": {},
+ "order": 31,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "av_latent",
+ "name": "av_latent",
+ "type": "LATENT",
+ "link": 488
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "links": [
+ 477,
+ 547
+ ]
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "links": [
+ 513
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "LTXVSeparateAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 308,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 5050,
+ 3650
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 32,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 490
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 491
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 492
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 493
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 494
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "links": [
+ 578
+ ]
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.75",
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 309,
+ "type": "LTXVSeparateAVLatent",
+ "pos": [
+ 5390,
+ 3650
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 33,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "av_latent",
+ "name": "av_latent",
+ "type": "LATENT",
+ "link": 578
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "links": [
+ 539
+ ]
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "links": [
+ 495
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "LTXVSeparateAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 310,
+ "type": "CreateVideo",
+ "pos": [
+ 6050,
+ 4490
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 34,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 538
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 534
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 591
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 536
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "CreateVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 24
+ ]
+ },
+ {
+ "id": 311,
+ "type": "LatentUpscaleModelLoader",
+ "pos": [
+ 1670,
+ 4550
+ ],
+ "size": [
+ 400,
+ 110
+ ],
+ "flags": {},
+ "order": 35,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 607
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT_UPSCALE_MODEL",
+ "name": "LATENT_UPSCALE_MODEL",
+ "type": "LATENT_UPSCALE_MODEL",
+ "links": [
+ 545
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LatentUpscaleModelLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-spatial-upscaler-x2-1.1.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3/resolve/main/ltx-2.3-spatial-upscaler-x2-1.1.safetensors",
+ "directory": "latent_upscale_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
+ ]
+ },
+ {
+ "id": 312,
+ "type": "PrimitiveInt",
+ "pos": [
+ 1190,
+ 4470
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 36,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 597
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 558,
+ 560
+ ]
+ }
+ ],
+ "title": "Width",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1280,
+ "fixed"
+ ]
+ },
+ {
+ "id": 313,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 2180,
+ 4120
+ ],
+ "size": [
+ 600,
+ 170
+ ],
+ "flags": {},
+ "order": 37,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 627
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 527
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "pc game, console game, video game, cartoon, childish, ugly"
+ ],
+ "color": "#323",
+ "bgcolor": "#535"
+ },
+ {
+ "id": 314,
+ "type": "CFGGuider",
+ "pos": [
+ 3160,
+ 3810
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 38,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 541
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 518
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 519
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "links": [
+ 484
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "CFGGuider",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 315,
+ "type": "VAEDecodeTiled",
+ "pos": [
+ 5750,
+ 3610
+ ],
+ "size": [
+ 280,
+ 200
+ ],
+ "flags": {},
+ "order": 39,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 539
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 553
+ },
+ {
+ "localized_name": "tile_size",
+ "name": "tile_size",
+ "type": "INT",
+ "widget": {
+ "name": "tile_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "overlap",
+ "name": "overlap",
+ "type": "INT",
+ "widget": {
+ "name": "overlap"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temporal_size",
+ "name": "temporal_size",
+ "type": "INT",
+ "widget": {
+ "name": "temporal_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temporal_overlap",
+ "name": "temporal_overlap",
+ "type": "INT",
+ "widget": {
+ "name": "temporal_overlap"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 538
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "VAEDecodeTiled",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 64,
+ 4096,
+ 4
+ ]
+ },
+ {
+ "id": 316,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 1660,
+ 3660
+ ],
+ "size": [
+ 430,
+ 160
+ ],
+ "flags": {},
+ "order": 40,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 601
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 520
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 556,
+ 557
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-dev-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-dev-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-dev-fp8.safetensors"
+ ]
+ },
+ {
+ "id": 317,
+ "type": "LTXAVTextEncoderLoader",
+ "pos": [
+ 1660,
+ 4280
+ ],
+ "size": [
+ 430,
+ 170
+ ],
+ "flags": {},
+ "order": 41,
+ "mode": 0,
+ "showAdvanced": false,
+ "inputs": [
+ {
+ "localized_name": "text_encoder",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": 606
+ },
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 605
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 615,
+ 627
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXAVTextEncoderLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-dev-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-dev-fp8.safetensors",
+ "directory": "checkpoints"
+ },
+ {
+ "name": "gemma_3_12B_it_fp4_mixed.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ltx-2/resolve/main/split_files/text_encoders/gemma_3_12B_it_fp4_mixed.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "gemma_3_12B_it_fp4_mixed.safetensors",
+ "ltx-2.3-22b-dev-fp8.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 318,
+ "type": "LTXVConcatAVLatent",
+ "pos": [
+ 3860,
+ 4830
+ ],
+ "size": [
+ 240,
+ 100
+ ],
+ "flags": {},
+ "order": 42,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "link": 497
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "link": 498
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 487
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVConcatAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 319,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ 1190,
+ 3680
+ ],
+ "size": [
+ 370,
+ 350
+ ],
+ "flags": {},
+ "order": 43,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": 595
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 625
+ ]
+ }
+ ],
+ "title": "Prompt",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "PrimitiveStringMultiline",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 323,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 1210,
+ 5040
+ ],
+ "size": [
+ 360,
+ 210
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 44,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 628
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 629
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 630,
+ 631
+ ]
+ }
+ ],
+ "title": "Math Expression (length)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a * b + 1"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Model",
+ "bounding": [
+ 1630,
+ 3550,
+ 480,
+ 1140
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Generate Low Resolution",
+ "bounding": [
+ 3130,
+ 3550,
+ 1000,
+ 1140
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ 2140,
+ 3550,
+ 960,
+ 1140
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "Generate High Resolution",
+ "bounding": [
+ 4670,
+ 3550,
+ 990,
+ 1130
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Latent Upscale",
+ "bounding": [
+ 4160,
+ 3550,
+ 480,
+ 1130
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 19,
+ "title": "Video Settings",
+ "bounding": [
+ 1150,
+ 3550,
+ 460,
+ 1610
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 20,
+ "title": "Image Preprocess",
+ "bounding": [
+ 1630,
+ 4720,
+ 830,
+ 440
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 21,
+ "title": "Empty Latent",
+ "bounding": [
+ 2820,
+ 4720,
+ 1310,
+ 450
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 22,
+ "title": "Number conversion",
+ "bounding": [
+ 2480,
+ 4720,
+ 310,
+ 440
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 512,
+ "origin_id": 288,
+ "origin_slot": 0,
+ "target_id": 278,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 513,
+ "origin_id": 307,
+ "origin_slot": 1,
+ "target_id": 278,
+ "target_slot": 1,
+ "type": "LATENT"
+ },
+ {
+ "id": 478,
+ "origin_id": 285,
+ "origin_slot": 0,
+ "target_id": 282,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 479,
+ "origin_id": 284,
+ "origin_slot": 0,
+ "target_id": 282,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 480,
+ "origin_id": 284,
+ "origin_slot": 1,
+ "target_id": 282,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 541,
+ "origin_id": 285,
+ "origin_slot": 0,
+ "target_id": 314,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 518,
+ "origin_id": 304,
+ "origin_slot": 0,
+ "target_id": 314,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 519,
+ "origin_id": 304,
+ "origin_slot": 1,
+ "target_id": 314,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 483,
+ "origin_id": 277,
+ "origin_slot": 0,
+ "target_id": 283,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 484,
+ "origin_id": 314,
+ "origin_slot": 0,
+ "target_id": 283,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 485,
+ "origin_id": 291,
+ "origin_slot": 0,
+ "target_id": 283,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 544,
+ "origin_id": 306,
+ "origin_slot": 0,
+ "target_id": 283,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 487,
+ "origin_id": 318,
+ "origin_slot": 0,
+ "target_id": 283,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 475,
+ "origin_id": 304,
+ "origin_slot": 0,
+ "target_id": 284,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 476,
+ "origin_id": 304,
+ "origin_slot": 1,
+ "target_id": 284,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 477,
+ "origin_id": 307,
+ "origin_slot": 0,
+ "target_id": 284,
+ "target_slot": 2,
+ "type": "LATENT"
+ },
+ {
+ "id": 520,
+ "origin_id": 316,
+ "origin_slot": 0,
+ "target_id": 285,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 523,
+ "origin_id": 290,
+ "origin_slot": 0,
+ "target_id": 286,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 547,
+ "origin_id": 307,
+ "origin_slot": 0,
+ "target_id": 287,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 545,
+ "origin_id": 311,
+ "origin_slot": 0,
+ "target_id": 287,
+ "target_slot": 1,
+ "type": "LATENT_UPSCALE_MODEL"
+ },
+ {
+ "id": 554,
+ "origin_id": 293,
+ "origin_slot": 0,
+ "target_id": 287,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 552,
+ "origin_id": 293,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 515,
+ "origin_id": 289,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 548,
+ "origin_id": 287,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 2,
+ "type": "LATENT"
+ },
+ {
+ "id": 543,
+ "origin_id": 302,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 505,
+ "origin_id": 286,
+ "origin_slot": 0,
+ "target_id": 289,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 558,
+ "origin_id": 312,
+ "origin_slot": 0,
+ "target_id": 290,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 559,
+ "origin_id": 299,
+ "origin_slot": 0,
+ "target_id": 290,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 560,
+ "origin_id": 312,
+ "origin_slot": 0,
+ "target_id": 292,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 557,
+ "origin_id": 316,
+ "origin_slot": 2,
+ "target_id": 293,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 562,
+ "origin_id": 299,
+ "origin_slot": 0,
+ "target_id": 294,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 561,
+ "origin_id": 292,
+ "origin_slot": 1,
+ "target_id": 295,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 563,
+ "origin_id": 294,
+ "origin_slot": 1,
+ "target_id": 295,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 556,
+ "origin_id": 316,
+ "origin_slot": 2,
+ "target_id": 296,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 510,
+ "origin_id": 289,
+ "origin_slot": 0,
+ "target_id": 296,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 511,
+ "origin_id": 295,
+ "origin_slot": 0,
+ "target_id": 296,
+ "target_slot": 2,
+ "type": "LATENT"
+ },
+ {
+ "id": 542,
+ "origin_id": 302,
+ "origin_slot": 0,
+ "target_id": 296,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 495,
+ "origin_id": 309,
+ "origin_slot": 1,
+ "target_id": 297,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 496,
+ "origin_id": 279,
+ "origin_slot": 0,
+ "target_id": 297,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 564,
+ "origin_id": 300,
+ "origin_slot": 0,
+ "target_id": 298,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 526,
+ "origin_id": 303,
+ "origin_slot": 0,
+ "target_id": 304,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 527,
+ "origin_id": 313,
+ "origin_slot": 0,
+ "target_id": 304,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 566,
+ "origin_id": 298,
+ "origin_slot": 0,
+ "target_id": 304,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 497,
+ "origin_id": 296,
+ "origin_slot": 0,
+ "target_id": 318,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 498,
+ "origin_id": 305,
+ "origin_slot": 0,
+ "target_id": 318,
+ "target_slot": 1,
+ "type": "LATENT"
+ },
+ {
+ "id": 481,
+ "origin_id": 279,
+ "origin_slot": 0,
+ "target_id": 305,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 565,
+ "origin_id": 298,
+ "origin_slot": 1,
+ "target_id": 305,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 488,
+ "origin_id": 283,
+ "origin_slot": 0,
+ "target_id": 307,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 490,
+ "origin_id": 276,
+ "origin_slot": 0,
+ "target_id": 308,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 491,
+ "origin_id": 282,
+ "origin_slot": 0,
+ "target_id": 308,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 492,
+ "origin_id": 280,
+ "origin_slot": 0,
+ "target_id": 308,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 493,
+ "origin_id": 281,
+ "origin_slot": 0,
+ "target_id": 308,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 494,
+ "origin_id": 278,
+ "origin_slot": 0,
+ "target_id": 308,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 578,
+ "origin_id": 308,
+ "origin_slot": 0,
+ "target_id": 309,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 539,
+ "origin_id": 309,
+ "origin_slot": 0,
+ "target_id": 315,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 553,
+ "origin_id": 293,
+ "origin_slot": 0,
+ "target_id": 315,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 538,
+ "origin_id": 315,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 534,
+ "origin_id": 297,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 591,
+ "origin_id": 298,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 535,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 290,
+ "target_slot": 0,
+ "type": "IMAGE,MASK"
+ },
+ {
+ "id": 536,
+ "origin_id": 310,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 595,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 319,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 597,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 312,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 598,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 299,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 599,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 301,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 601,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 316,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 602,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 285,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 604,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 279,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 605,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 317,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 606,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 317,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 607,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 311,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 615,
+ "origin_id": 317,
+ "origin_slot": 0,
+ "target_id": 303,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 624,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 300,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 625,
+ "origin_id": 319,
+ "origin_slot": 0,
+ "target_id": 303,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 627,
+ "origin_id": 317,
+ "origin_slot": 0,
+ "target_id": 313,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 628,
+ "origin_id": 301,
+ "origin_slot": 0,
+ "target_id": 323,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 629,
+ "origin_id": 300,
+ "origin_slot": 0,
+ "target_id": 323,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 630,
+ "origin_id": 323,
+ "origin_slot": 1,
+ "target_id": 305,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 631,
+ "origin_id": 323,
+ "origin_slot": 1,
+ "target_id": 295,
+ "target_slot": 2,
+ "type": "INT"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "Vue-corrected"
+ },
+ "category": "Video generation and editing/Image to video",
+ "description": "Generates video from a single input image using LTX-2.3."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Image to Video (Wan 2.2).json b/blueprints/Image to Video (Wan 2.2).json
index a8dafd3c9..a24adcfb6 100644
--- a/blueprints/Image to Video (Wan 2.2).json
+++ b/blueprints/Image to Video (Wan 2.2).json
@@ -206,7 +206,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Image to Video (Wan 2.2)",
+ "name": "Image to Video (Wan 2.2)",
"inputNode": {
"id": -10,
"bounding": [
@@ -2027,7 +2027,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Video generation and editing/Image to video"
+ "category": "Video generation and editing/Image to video",
+ "description": "Image-to-video with Wan 2.2 using a start image plus text prompt to extend motion from the still frame."
}
]
},
diff --git a/blueprints/Merge Videos.json b/blueprints/Merge Videos.json
new file mode 100644
index 000000000..689e6ec16
--- /dev/null
+++ b/blueprints/Merge Videos.json
@@ -0,0 +1,1219 @@
+{
+ "revision": 0,
+ "last_node_id": 26,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 26,
+ "type": "32e6dbcc-e2d7-45c0-a245-fc74b8271dfb",
+ "pos": [
+ -980,
+ 480
+ ],
+ "size": [
+ 290,
+ 190
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "base_video",
+ "localized_name": "clip_to_resize",
+ "name": "clip_to_resize",
+ "type": "VIDEO",
+ "link": null
+ },
+ {
+ "label": "second_video",
+ "localized_name": "base_video",
+ "name": "base_video",
+ "type": "VIDEO",
+ "link": null
+ },
+ {
+ "label": "pad_second_video",
+ "localized_name": "pad_second_video",
+ "name": "pad_second_video",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "pad_second_video"
+ },
+ "link": null
+ },
+ {
+ "name": "interpolation",
+ "type": "COMBO",
+ "widget": {
+ "name": "interpolation"
+ },
+ "link": null
+ },
+ {
+ "name": "padding_color",
+ "type": "COMBO",
+ "widget": {
+ "name": "padding_color"
+ },
+ "link": null
+ },
+ {
+ "label": "drop_audio",
+ "localized_name": "drop_audio",
+ "name": "drop_audio",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "drop_audio"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "merged_video",
+ "name": "merged_video",
+ "type": "VIDEO",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "28",
+ "value"
+ ],
+ [
+ "6",
+ "interpolation"
+ ],
+ [
+ "6",
+ "padding_color"
+ ],
+ [
+ "11",
+ "value"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Merge Videos"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "32e6dbcc-e2d7-45c0-a245-fc74b8271dfb",
+ "version": 1,
+ "state": {
+ "lastGroupId": 2,
+ "lastNodeId": 34,
+ "lastLinkId": 75,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Merge Videos",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1990,
+ 700,
+ 152.5546875,
+ 168
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1210,
+ 614,
+ 128,
+ 68
+ ]
+ },
+ "inputs": [
+ {
+ "id": "2fb09e41-c5fa-4654-b9d2-569b59626ec4",
+ "name": "clip_to_resize",
+ "type": "VIDEO",
+ "linkIds": [
+ 50
+ ],
+ "localized_name": "clip_to_resize",
+ "label": "base_video",
+ "pos": [
+ -1861.4453125,
+ 724
+ ]
+ },
+ {
+ "id": "017f8d09-7900-4dc9-b95c-0cab31bcde7d",
+ "name": "base_video",
+ "type": "VIDEO",
+ "linkIds": [
+ 51
+ ],
+ "localized_name": "base_video",
+ "label": "second_video",
+ "pos": [
+ -1861.4453125,
+ 744
+ ]
+ },
+ {
+ "id": "a39894ce-1785-4037-b39c-b40d2e470c43",
+ "name": "pad_second_video",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 59
+ ],
+ "localized_name": "pad_second_video",
+ "label": "pad_second_video",
+ "pos": [
+ -1861.4453125,
+ 764
+ ]
+ },
+ {
+ "id": "b4fb86cb-8d87-4193-8533-88a57df50e18",
+ "name": "interpolation",
+ "type": "COMBO",
+ "linkIds": [
+ 60
+ ],
+ "pos": [
+ -1861.4453125,
+ 784
+ ]
+ },
+ {
+ "id": "2413a2e2-cfdc-4d1d-9e2e-81e7acdf35e3",
+ "name": "padding_color",
+ "type": "COMBO",
+ "linkIds": [
+ 62
+ ],
+ "pos": [
+ -1861.4453125,
+ 804
+ ]
+ },
+ {
+ "id": "338b1e09-0efb-424f-949b-e730a0aa8527",
+ "name": "drop_audio",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 63
+ ],
+ "localized_name": "drop_audio",
+ "label": "drop_audio",
+ "pos": [
+ -1861.4453125,
+ 824
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "be99efc6-7fb3-4059-93d0-136dc8cc8faf",
+ "name": "merged_video",
+ "type": "VIDEO",
+ "linkIds": [
+ 16
+ ],
+ "localized_name": "merged_video",
+ "pos": [
+ 1234,
+ 638
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 11,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -990,
+ 1230
+ ],
+ "size": [
+ 270,
+ 80
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 63
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 14
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 10,
+ "type": "EmptyAudio",
+ "pos": [
+ -990,
+ 1060
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "duration",
+ "name": "duration",
+ "type": "FLOAT",
+ "widget": {
+ "name": "duration"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sample_rate",
+ "name": "sample_rate",
+ "type": "INT",
+ "widget": {
+ "name": "sample_rate"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "channels",
+ "name": "channels",
+ "type": "INT",
+ "widget": {
+ "name": "channels"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "AUDIO",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "links": [
+ 22
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyAudio",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 60,
+ 44100,
+ 2
+ ]
+ },
+ {
+ "id": 3,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -370,
+ 1010
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 21
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 22
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 14
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 12
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 6,
+ "type": "ResizeAndPadImage",
+ "pos": [
+ -400,
+ 440
+ ],
+ "size": [
+ 270,
+ 210
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "showAdvanced": true,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 39
+ },
+ {
+ "localized_name": "target_width",
+ "name": "target_width",
+ "type": "INT",
+ "widget": {
+ "name": "target_width"
+ },
+ "link": 4
+ },
+ {
+ "localized_name": "target_height",
+ "name": "target_height",
+ "type": "INT",
+ "widget": {
+ "name": "target_height"
+ },
+ "link": 5
+ },
+ {
+ "localized_name": "padding_color",
+ "name": "padding_color",
+ "type": "COMBO",
+ "widget": {
+ "name": "padding_color"
+ },
+ "link": 62
+ },
+ {
+ "localized_name": "interpolation",
+ "name": "interpolation",
+ "type": "COMBO",
+ "widget": {
+ "name": "interpolation"
+ },
+ "link": 60
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 75
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ResizeAndPadImage",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 512,
+ 512,
+ "white",
+ "lanczos"
+ ]
+ },
+ {
+ "id": 8,
+ "type": "CreateVideo",
+ "pos": [
+ 880,
+ 280
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 19
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 12
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 15
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 16
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CreateVideo",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 9,
+ "type": "AudioMerge",
+ "pos": [
+ -990,
+ 890
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "audio1",
+ "name": "audio1",
+ "type": "AUDIO",
+ "link": 9
+ },
+ {
+ "localized_name": "audio2",
+ "name": "audio2",
+ "type": "AUDIO",
+ "link": 10
+ },
+ {
+ "localized_name": "merge_method",
+ "name": "merge_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "merge_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "AUDIO",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "links": [
+ 21
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "AudioMerge",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "add"
+ ]
+ },
+ {
+ "id": 2,
+ "type": "GetVideoComponents",
+ "pos": [
+ -1590,
+ 460
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 51
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 39,
+ 54
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": [
+ 9
+ ]
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 27,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 60,
+ 70
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 54
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 75
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 56
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 55
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 1,
+ "type": "GetVideoComponents",
+ "pos": [
+ -1600,
+ 30
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 50
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 3,
+ 17
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": [
+ 10
+ ]
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": [
+ 15
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 7,
+ "type": "GetImageSize",
+ "pos": [
+ -1000,
+ 480
+ ],
+ "size": [
+ 260,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 3
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 4
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 5
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 28,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -1590,
+ 190
+ ],
+ "size": [
+ 270,
+ 80
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 59
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 56
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 13,
+ "type": "BatchImagesNode",
+ "pos": [
+ 530,
+ 10
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image0",
+ "localized_name": "images.image0",
+ "name": "images.image0",
+ "type": "IMAGE",
+ "link": 17
+ },
+ {
+ "label": "image1",
+ "localized_name": "images.image1",
+ "name": "images.image1",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 55
+ },
+ {
+ "label": "image2",
+ "localized_name": "images.image2",
+ "name": "images.image2",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 19
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "BatchImagesNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Audio",
+ "bounding": [
+ -1000,
+ 820,
+ 915,
+ 496
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 21,
+ "origin_id": 9,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "AUDIO"
+ },
+ {
+ "id": 22,
+ "origin_id": 10,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 14,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 9,
+ "origin_id": 2,
+ "origin_slot": 1,
+ "target_id": 9,
+ "target_slot": 0,
+ "type": "AUDIO"
+ },
+ {
+ "id": 10,
+ "origin_id": 1,
+ "origin_slot": 1,
+ "target_id": 9,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 39,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 4,
+ "origin_id": 7,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 5,
+ "origin_id": 7,
+ "origin_slot": 1,
+ "target_id": 6,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 3,
+ "origin_id": 1,
+ "origin_slot": 0,
+ "target_id": 7,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 17,
+ "origin_id": 1,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 19,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 12,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 15,
+ "origin_id": 1,
+ "origin_slot": 2,
+ "target_id": 8,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 16,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 50,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 1,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 51,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 2,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 54,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": 27,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 55,
+ "origin_id": 27,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 56,
+ "origin_id": 28,
+ "origin_slot": 0,
+ "target_id": 27,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 59,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 28,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 60,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 6,
+ "target_slot": 4,
+ "type": "COMBO"
+ },
+ {
+ "id": 62,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 6,
+ "target_slot": 3,
+ "type": "COMBO"
+ },
+ {
+ "id": 63,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 11,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 75,
+ "origin_id": 6,
+ "origin_slot": 0,
+ "target_id": 27,
+ "target_slot": 1,
+ "type": "IMAGE"
+ }
+ ],
+ "extra": {},
+ "category": "Video Tools",
+ "description": "Concatenates two videos end-to-end with optional resize, letterbox padding, and audio merge or drop."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Pose to Image (Z-Image-Turbo).json b/blueprints/Pose to Image (Z-Image-Turbo).json
index a55410ba4..92ee80907 100644
--- a/blueprints/Pose to Image (Z-Image-Turbo).json
+++ b/blueprints/Pose to Image (Z-Image-Turbo).json
@@ -134,7 +134,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Pose to Image (Z-Image-Turbo)",
+ "name": "Pose to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1298,7 +1298,8 @@
"VHS_MetadataImage": true,
"VHS_KeepIntermediate": true
},
- "category": "Image generation and editing/Pose to image"
+ "category": "Image generation and editing/Conditioned",
+ "description": "Generates an image from pose keypoints using Z-Image-Turbo with text conditioning."
}
]
},
@@ -1319,4 +1320,4 @@
}
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Pose to Video (LTX 2.0).json b/blueprints/Pose to Video (LTX 2.0).json
index ae369941c..04eb69972 100644
--- a/blueprints/Pose to Video (LTX 2.0).json
+++ b/blueprints/Pose to Video (LTX 2.0).json
@@ -1,28 +1,26 @@
{
- "id": "01cd475b-52df-43bf-aafa-484a5976d2d2",
"revision": 0,
- "last_node_id": 160,
- "last_link_id": 410,
+ "last_node_id": 143,
+ "last_link_id": 0,
"nodes": [
{
- "id": 1,
- "type": "f0e58a6b-7246-4103-9fec-73b423634b1f",
+ "id": 143,
+ "type": "68857357-cbc2-4c3a-a786-c3a58d43f9b1",
"pos": [
- 210,
- 3830
+ 290,
+ 3960
],
"size": [
- 420,
+ 400,
500
],
"flags": {
"collapsed": false
},
- "order": 0,
+ "order": 13,
"mode": 0,
"inputs": [
{
- "label": "prompt",
"name": "text",
"type": "STRING",
"widget": {
@@ -31,33 +29,32 @@
"link": null
},
{
- "label": "first_frame_strength",
- "name": "strength",
- "type": "FLOAT",
- "widget": {
- "name": "strength"
- },
- "link": null
- },
- {
- "label": "disable_first_frame",
- "name": "bypass",
- "type": "BOOLEAN",
- "widget": {
- "name": "bypass"
- },
- "link": null
- },
- {
- "label": "first frame",
+ "label": "control_images",
"name": "image",
"type": "IMAGE",
"link": null
},
{
- "label": "control image",
- "name": "input",
- "type": "IMAGE,MASK",
+ "label": "first_frame",
+ "name": "image_1",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "label": "image_strength",
+ "name": "strength_1",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_1"
+ },
+ "link": null
+ },
+ {
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
"link": null
},
{
@@ -69,6 +66,7 @@
"link": null
},
{
+ "label": "control_lora",
"name": "lora_name",
"type": "COMBO",
"widget": {
@@ -77,7 +75,15 @@
"link": null
},
{
- "label": "distll_lora",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": null
+ },
+ {
+ "label": "distill_lora",
"name": "lora_name_1",
"type": "COMBO",
"widget": {
@@ -93,30 +99,6 @@
"name": "model_name"
},
"link": null
- },
- {
- "name": "resize_type.width",
- "type": "INT",
- "widget": {
- "name": "resize_type.width"
- },
- "link": null
- },
- {
- "name": "resize_type.height",
- "type": "INT",
- "widget": {
- "name": "resize_type.height"
- },
- "link": null
- },
- {
- "name": "length",
- "type": "INT",
- "widget": {
- "name": "length"
- },
- "link": null
}
],
"outputs": [
@@ -130,56 +112,49 @@
"properties": {
"proxyWidgets": [
[
- "-1",
+ "124",
"text"
],
[
- "-1",
- "resize_type.width"
- ],
- [
- "-1",
- "resize_type.height"
- ],
- [
- "-1",
- "length"
- ],
- [
- "-1",
+ "149",
"strength"
],
- [
- "-1",
- "bypass"
- ],
[
"126",
"noise_seed"
],
[
- "126",
- "control_after_generate"
- ],
- [
- "-1",
+ "103",
"ckpt_name"
],
[
- "-1",
+ "134",
"lora_name"
],
[
- "-1",
- "model_name"
+ "97",
+ "text_encoder"
],
[
- "-1",
- "lora_name_1"
+ "105",
+ "lora_name"
+ ],
+ [
+ "100",
+ "model_name"
]
],
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "lora_name": true,
+ "strength": true,
+ "bypass": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -188,52 +163,40 @@
"secondTabOffset": 80,
"secondTabWidth": 65
},
- "widgets_values": [
- "",
- 1280,
- 720,
- 97,
- 1,
- false,
- null,
- null,
- "ltx-2-19b-dev-fp8.safetensors",
- "ltx-2-19b-ic-lora-pose-control.safetensors",
- "ltx-2-spatial-upscaler-x2-1.0.safetensors",
- "ltx-2-19b-distilled-lora-384.safetensors"
- ]
+ "widgets_values": [],
+ "title": "Pose to Video (LTX 2.0)"
}
],
"links": [],
- "groups": [],
+ "version": 0.4,
"definitions": {
"subgraphs": [
{
- "id": "f0e58a6b-7246-4103-9fec-73b423634b1f",
+ "id": "68857357-cbc2-4c3a-a786-c3a58d43f9b1",
"version": 1,
"state": {
- "lastGroupId": 11,
- "lastNodeId": 160,
- "lastLinkId": 410,
+ "lastGroupId": 14,
+ "lastNodeId": 701,
+ "lastLinkId": 1774,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
- "name": "local-Pose to Video (LTX 2.0)",
+ "name": "Pose to Video (LTX 2.0)",
"inputNode": {
"id": -10,
"bounding": [
- -2220,
- 4180,
- 153.3203125,
- 280
+ -2050,
+ 4100,
+ 127.029296875,
+ 240
]
},
"outputNode": {
"id": -20,
"bounding": [
- 1750.2777777777776,
- 4091.1111111111113,
+ 1750,
+ 4090,
120,
60
]
@@ -246,154 +209,128 @@
"linkIds": [
345
],
- "label": "prompt",
"pos": [
- -2086.6796875,
+ -1942.970703125,
+ 4120
+ ]
+ },
+ {
+ "id": "35a07084-3ecf-482a-a330-b40278770ca3",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 348,
+ 380
+ ],
+ "label": "control_images",
+ "pos": [
+ -1942.970703125,
+ 4140
+ ]
+ },
+ {
+ "id": "bea20802-d654-4287-a8ef-0f834314bcf9",
+ "name": "image_1",
+ "type": "IMAGE",
+ "linkIds": [
+ 364,
+ 379
+ ],
+ "label": "first_frame",
+ "pos": [
+ -1942.970703125,
+ 4160
+ ]
+ },
+ {
+ "id": "b9b4151d-df88-40c0-a2bd-6e35b94557fe",
+ "name": "strength_1",
+ "type": "FLOAT",
+ "linkIds": [
+ 1758,
+ 1759
+ ],
+ "label": "image_strength",
+ "pos": [
+ -1942.970703125,
+ 4180
+ ]
+ },
+ {
+ "id": "b51f6a12-9152-4526-b115-443cfd23003f",
+ "name": "noise_seed",
+ "type": "INT",
+ "linkIds": [
+ 1767
+ ],
+ "pos": [
+ -1942.970703125,
4200
]
},
{
- "id": "59430efe-1090-4e36-8afe-b21ce7f4268b",
- "name": "strength",
- "type": "FLOAT",
+ "id": "47248f12-f174-4e35-854c-fa5eebea2903",
+ "name": "ckpt_name",
+ "type": "COMBO",
"linkIds": [
- 370,
- 371
+ 1768,
+ 1770,
+ 1771
],
- "label": "first_frame_strength",
"pos": [
- -2086.6796875,
+ -1942.970703125,
4220
]
},
{
- "id": "6145a9b9-68ed-4956-89f7-7a5ebdd5c99e",
- "name": "bypass",
- "type": "BOOLEAN",
+ "id": "6feb34cf-7972-4d3a-91fc-11070a84dc5f",
+ "name": "lora_name",
+ "type": "COMBO",
"linkIds": [
- 363,
- 368
+ 1769
],
- "label": "disable_first_frame",
+ "label": "control_lora",
"pos": [
- -2086.6796875,
+ -1942.970703125,
4240
]
},
{
- "id": "f7aa8c12-bdba-4bbd-84cf-b49cfc32a1dd",
- "name": "image",
- "type": "IMAGE",
+ "id": "6b423a3e-6c0e-445d-93c0-2cc3945400d1",
+ "name": "text_encoder",
+ "type": "COMBO",
"linkIds": [
- 398,
- 399
+ 1772
],
- "label": "first frame",
"pos": [
- -2086.6796875,
+ -1942.970703125,
4260
]
},
{
- "id": "da40a4c0-cd19-46c6-8eb3-62d0026fbe85",
- "name": "input",
- "type": "IMAGE,MASK",
+ "id": "ffd38c52-cc57-4e68-b140-94e7b03499b1",
+ "name": "lora_name_1",
+ "type": "COMBO",
"linkIds": [
- 400
+ 1773
],
- "label": "control image",
+ "label": "distill_lora",
"pos": [
- -2086.6796875,
+ -1942.970703125,
4280
]
},
{
- "id": "8005344b-99d6-4829-a619-c4e8ef640eb9",
- "name": "ckpt_name",
- "type": "COMBO",
- "linkIds": [
- 401,
- 402,
- 403
- ],
- "pos": [
- -2086.6796875,
- 4300
- ]
- },
- {
- "id": "25e7c4e8-850c-4f37-bc14-e3f4b5f228c0",
- "name": "lora_name",
- "type": "COMBO",
- "linkIds": [
- 404,
- 405
- ],
- "pos": [
- -2086.6796875,
- 4320
- ]
- },
- {
- "id": "f16a18dd-947e-400a-8889-02cf998f760a",
- "name": "lora_name_1",
- "type": "COMBO",
- "linkIds": [
- 406
- ],
- "label": "distll_lora",
- "pos": [
- -2086.6796875,
- 4340
- ]
- },
- {
- "id": "1abf156c-4c85-4ee5-8671-62df3177d835",
+ "id": "6d8b9605-acf0-4dd7-8d45-f824c2fd5895",
"name": "model_name",
"type": "COMBO",
"linkIds": [
- 407
+ 1774
],
"label": "upscale_model",
"pos": [
- -2086.6796875,
- 4360
- ]
- },
- {
- "id": "203402cf-4253-4daf-bf78-5def9496e0af",
- "name": "resize_type.width",
- "type": "INT",
- "linkIds": [
- 408
- ],
- "pos": [
- -2086.6796875,
- 4380
- ]
- },
- {
- "id": "e6d8ac4a-34d4-46c6-bcb2-4e66a696438c",
- "name": "resize_type.height",
- "type": "INT",
- "linkIds": [
- 409
- ],
- "pos": [
- -2086.6796875,
- 4400
- ]
- },
- {
- "id": "6aa6cf2c-bc4f-4f8b-be62-aa15793375dc",
- "name": "length",
- "type": "INT",
- "linkIds": [
- 410
- ],
- "pos": [
- -2086.6796875,
- 4420
+ -1942.970703125,
+ 4300
]
}
],
@@ -407,8 +344,8 @@
],
"localized_name": "VIDEO",
"pos": [
- 1770.2777777777776,
- 4111.111111111111
+ 1770,
+ 4110
]
}
],
@@ -418,15 +355,15 @@
"id": 93,
"type": "CFGGuider",
"pos": [
- -697.721823660531,
- 3671.1105325465196
+ -690,
+ 3710
],
"size": [
- 269.97395833333337,
- 98
+ 270,
+ 160
],
"flags": {},
- "order": 16,
+ "order": 7,
"mode": 0,
"inputs": [
{
@@ -470,6 +407,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "CFGGuider",
"enableTabs": false,
"tabWidth": 65,
@@ -487,12 +429,12 @@
"id": 94,
"type": "KSamplerSelect",
"pos": [
- -697.721823660531,
- 3841.1107362825187
+ -690,
+ 3940
],
"size": [
- 269.97395833333337,
- 58
+ 270,
+ 110
],
"flags": {},
"order": 0,
@@ -521,6 +463,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "KSamplerSelect",
"enableTabs": false,
"tabWidth": 65,
@@ -538,12 +485,12 @@
"id": 99,
"type": "ManualSigmas",
"pos": [
- 410.27824286284044,
- 3851.110970278795
+ 450,
+ 3910
],
"size": [
- 269.97395833333337,
- 58
+ 270,
+ 110
],
"flags": {},
"order": 1,
@@ -572,6 +519,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "ManualSigmas",
"enableTabs": false,
"tabWidth": 65,
@@ -589,15 +541,15 @@
"id": 100,
"type": "LatentUpscaleModelLoader",
"pos": [
- -69.72208571196083,
- 3701.1104657166875
+ -70,
+ 3790
],
"size": [
- 389.97395833333337,
- 58
+ 390,
+ 110
],
"flags": {},
- "order": 2,
+ "order": 11,
"mode": 0,
"inputs": [
{
@@ -607,7 +559,7 @@
"widget": {
"name": "model_name"
},
- "link": 407
+ "link": 1774
}
],
"outputs": [
@@ -623,21 +575,26 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LatentUpscaleModelLoader",
- "models": [
- {
- "name": "ltx-2-spatial-upscaler-x2-1.0.safetensors",
- "url": "https://huggingface.co/Lightricks/LTX-2/resolve/main/ltx-2-spatial-upscaler-x2-1.0.safetensors",
- "directory": "latent_upscale_models"
- }
- ],
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
- "secondTabWidth": 65
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2-spatial-upscaler-x2-1.0.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2/resolve/main/ltx-2-spatial-upscaler-x2-1.0.safetensors",
+ "directory": "latent_upscale_models"
+ }
+ ]
},
"widgets_values": [
"ltx-2-spatial-upscaler-x2-1.0.safetensors"
@@ -647,15 +604,15 @@
"id": 101,
"type": "LTXVConcatAVLatent",
"pos": [
- 410.27824286284044,
- 4101.110949206838
+ 450,
+ 4220
],
"size": [
- 269.97395833333337,
- 46
+ 270,
+ 120
],
"flags": {},
- "order": 18,
+ "order": 12,
"mode": 0,
"inputs": [
{
@@ -684,6 +641,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVConcatAVLatent",
"enableTabs": false,
"tabWidth": 65,
@@ -692,22 +654,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 108,
"type": "CFGGuider",
"pos": [
- 410.27824286284044,
- 3701.1104657166875
+ 450,
+ 3720
],
"size": [
- 269.97395833333337,
- 98
+ 270,
+ 160
],
"flags": {},
- "order": 22,
+ "order": 18,
"mode": 0,
"inputs": [
{
@@ -751,6 +712,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "CFGGuider",
"enableTabs": false,
"tabWidth": 65,
@@ -764,19 +730,101 @@
1
]
},
+ {
+ "id": 111,
+ "type": "LTXVEmptyLatentAudio",
+ "pos": [
+ -1100,
+ 4940
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 285
+ },
+ {
+ "localized_name": "frames_number",
+ "name": "frames_number",
+ "type": "INT",
+ "widget": {
+ "name": "frames_number"
+ },
+ "link": 329
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "INT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 354
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Latent",
+ "name": "Latent",
+ "type": "LATENT",
+ "links": [
+ 300
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVEmptyLatentAudio",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 97,
+ 25,
+ 1
+ ]
+ },
{
"id": 123,
"type": "SamplerCustomAdvanced",
"pos": [
- -387.72197839215096,
- 3521.1103425011374
+ -380,
+ 3530
],
"size": [
- 213.09895833333334,
- 106
+ 230,
+ 170
],
"flags": {},
- "order": 31,
+ "order": 29,
"mode": 0,
"inputs": [
{
@@ -829,6 +877,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.60",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "SamplerCustomAdvanced",
"enableTabs": false,
"tabWidth": 65,
@@ -837,22 +890,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 114,
"type": "LTXVConditioning",
"pos": [
- -1133.7215420073496,
- 4141.110347554622
+ -1130,
+ 4140
],
"size": [
- 269.97395833333337,
- 78
+ 270,
+ 130
],
"flags": {},
- "order": 27,
+ "order": 23,
"mode": 0,
"inputs": [
{
@@ -898,6 +950,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVConditioning",
"enableTabs": false,
"tabWidth": 65,
@@ -915,15 +972,15 @@
"id": 119,
"type": "CLIPTextEncode",
"pos": [
- -1163.7218246405453,
- 3881.1109034489627
+ -1160,
+ 3880
],
"size": [
400,
- 88
+ 200
],
"flags": {},
- "order": 12,
+ "order": 27,
"mode": 0,
"inputs": [
{
@@ -955,6 +1012,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
@@ -974,15 +1036,15 @@
"id": 116,
"type": "LTXVConcatAVLatent",
"pos": [
- -519.7217122979332,
- 4701.110031965835
+ -520,
+ 4830
],
"size": [
- 187.5,
- 46
+ 230,
+ 100
],
"flags": {},
- "order": 29,
+ "order": 25,
"mode": 0,
"inputs": [
{
@@ -1012,6 +1074,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVConcatAVLatent",
"enableTabs": false,
"tabWidth": 65,
@@ -1020,22 +1087,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 122,
"type": "LTXVSeparateAVLatent",
"pos": [
- -393.72183921949465,
- 3801.1107787938904
+ -380,
+ 3810
],
"size": [
- 239.97395833333334,
- 46
+ 240,
+ 100
],
"flags": {},
- "order": 30,
+ "order": 28,
"mode": 0,
"inputs": [
{
@@ -1066,6 +1132,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVSeparateAVLatent",
"enableTabs": false,
"tabWidth": 65,
@@ -1074,22 +1145,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 124,
"type": "CLIPTextEncode",
"pos": [
- -1174.7214530029996,
- 3515.1112854387566
+ -1170,
+ 3510
],
"size": [
- 409.97395833333337,
- 88
+ 410,
+ 320
],
"flags": {},
- "order": 32,
+ "order": 30,
"mode": 0,
"inputs": [
{
@@ -1121,6 +1191,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
@@ -1140,15 +1215,15 @@
"id": 98,
"type": "KSamplerSelect",
"pos": [
- 410.27824286284044,
- 3981.1101681370833
+ 450,
+ 4070
],
"size": [
- 269.97395833333337,
- 58
+ 270,
+ 110
],
"flags": {},
- "order": 3,
+ "order": 2,
"mode": 0,
"inputs": [
{
@@ -1174,6 +1249,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.75",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "KSamplerSelect",
"enableTabs": false,
"tabWidth": 65,
@@ -1191,12 +1271,12 @@
"id": 105,
"type": "LoraLoaderModelOnly",
"pos": [
- -69.72208571196083,
- 3571.110499039739
+ -70,
+ 3570
],
"size": [
- 389.97395833333337,
- 82
+ 390,
+ 140
],
"flags": {},
"order": 15,
@@ -1215,7 +1295,7 @@
"widget": {
"name": "lora_name"
},
- "link": 406
+ "link": 1773
},
{
"localized_name": "strength_model",
@@ -1240,21 +1320,26 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.75",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LoraLoaderModelOnly",
- "models": [
- {
- "name": "ltx-2-19b-distilled-lora-384.safetensors",
- "url": "https://huggingface.co/Lightricks/LTX-2/resolve/main/ltx-2-19b-distilled-lora-384.safetensors",
- "directory": "loras"
- }
- ],
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
- "secondTabWidth": 65
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2-19b-distilled-lora-384.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2/resolve/main/ltx-2-19b-distilled-lora-384.safetensors",
+ "directory": "loras"
+ }
+ ]
},
"widgets_values": [
"ltx-2-19b-distilled-lora-384.safetensors",
@@ -1265,15 +1350,15 @@
"id": 95,
"type": "LTXVScheduler",
"pos": [
- -699.7218704597861,
- 3981.1101681370833
+ -690,
+ 4130
],
"size": [
- 269.97395833333337,
- 154
+ 270,
+ 170
],
"flags": {},
- "order": 17,
+ "order": 8,
"mode": 0,
"inputs": [
{
@@ -1342,6 +1427,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVScheduler",
"enableTabs": false,
"tabWidth": 65,
@@ -1363,15 +1453,15 @@
"id": 126,
"type": "RandomNoise",
"pos": [
- -697.721823660531,
- 3521.1103425011374
+ -690,
+ 3520
],
"size": [
- 269.97395833333337,
- 82
+ 270,
+ 110
],
"flags": {},
- "order": 4,
+ "order": 31,
"mode": 0,
"inputs": [
{
@@ -1381,7 +1471,7 @@
"widget": {
"name": "noise_seed"
},
- "link": null
+ "link": 1767
}
],
"outputs": [
@@ -1397,6 +1487,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "RandomNoise",
"enableTabs": false,
"tabWidth": 65,
@@ -1408,22 +1503,22 @@
},
"widgets_values": [
0,
- "randomize"
+ "fixed"
]
},
{
"id": 107,
"type": "SamplerCustomAdvanced",
"pos": [
- 710.2782734905775,
- 3571.110499039739
+ 730,
+ 3570
],
"size": [
- 212.36979166666669,
- 106
+ 230,
+ 170
],
"flags": {},
- "order": 21,
+ "order": 17,
"mode": 0,
"inputs": [
{
@@ -1476,6 +1571,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.75",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "SamplerCustomAdvanced",
"enableTabs": false,
"tabWidth": 65,
@@ -1484,22 +1584,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
- "id": 143,
+ "id": 187,
"type": "RandomNoise",
"pos": [
- 410.27824286284044,
- 3571.110499039739
+ 450,
+ 3570
],
"size": [
- 269.97395833333337,
- 82
+ 270,
+ 110
],
"flags": {},
- "order": 5,
+ "order": 3,
"mode": 0,
"inputs": [
{
@@ -1525,6 +1624,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "RandomNoise",
"enableTabs": false,
"tabWidth": 65,
@@ -1543,12 +1647,12 @@
"id": 139,
"type": "LTXVAudioVAEDecode",
"pos": [
- 1130.2783163694094,
- 3841.1107362825187
+ 1130,
+ 3840
],
"size": [
- 239.97395833333334,
- 46
+ 240,
+ 100
],
"flags": {},
"order": 35,
@@ -1565,7 +1669,7 @@
"localized_name": "audio_vae",
"name": "audio_vae",
"type": "VAE",
- "link": 383
+ "link": 340
}
],
"outputs": [
@@ -1581,6 +1685,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVAudioVAEDecode",
"enableTabs": false,
"tabWidth": 65,
@@ -1589,22 +1698,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 106,
"type": "CreateVideo",
"pos": [
- 1420.2783925712918,
- 3761.1104019496292
+ 1420,
+ 3760
],
"size": [
- 269.97395833333337,
- 78
+ 270,
+ 130
],
"flags": {},
- "order": 20,
+ "order": 16,
"mode": 0,
"inputs": [
{
@@ -1643,6 +1751,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "CreateVideo",
"enableTabs": false,
"tabWidth": 65,
@@ -1660,15 +1773,15 @@
"id": 134,
"type": "LoraLoaderModelOnly",
"pos": [
- -1649.721454901846,
- 3761.1104019496292
+ -1650,
+ 3750
],
"size": [
- 419.97395833333337,
- 82
+ 420,
+ 140
],
"flags": {},
- "order": 13,
+ "order": 33,
"mode": 0,
"inputs": [
{
@@ -1684,7 +1797,7 @@
"widget": {
"name": "lora_name"
},
- "link": 404
+ "link": 1769
},
{
"localized_name": "strength_model",
@@ -1710,21 +1823,26 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LoraLoaderModelOnly",
- "models": [
- {
- "name": "ltx-2-19b-ic-lora-pose-control.safetensors",
- "url": "https://huggingface.co/Lightricks/LTX-2-19b-IC-LoRA-Pose-Control/resolve/main/ltx-2-19b-ic-lora-pose-control.safetensors",
- "directory": "loras"
- }
- ],
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
"hasSecondTab": false,
"secondTabText": "Send Back",
"secondTabOffset": 80,
- "secondTabWidth": 65
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2-19b-ic-lora-pose-control.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2-19b-IC-LoRA-Pose-Control/resolve/main/ltx-2-19b-ic-lora-pose-control.safetensors",
+ "directory": "loras"
+ }
+ ]
},
"widgets_values": [
"ltx-2-19b-ic-lora-pose-control.safetensors",
@@ -1737,12 +1855,12 @@
"id": 138,
"type": "LTXVSeparateAVLatent",
"pos": [
- 730.2784619127078,
- 3731.1109580277
+ 740,
+ 3810
],
"size": [
- 193.2916015625,
- 46
+ 230,
+ 100
],
"flags": {},
"order": 34,
@@ -1777,6 +1895,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVSeparateAVLatent",
"enableTabs": false,
"tabWidth": 65,
@@ -1785,22 +1908,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
- "id": 144,
+ "id": 188,
"type": "VAEDecodeTiled",
"pos": [
- 1120.2783619435547,
- 3641.110599376351
+ 1120,
+ 3640
],
"size": [
- 269.97395833333337,
+ 270,
150
],
"flags": {},
- "order": 36,
+ "order": 38,
"mode": 0,
"inputs": [
{
@@ -1865,6 +1987,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "VAEDecodeTiled",
"enableTabs": false,
"tabWidth": 65,
@@ -1885,15 +2012,15 @@
"id": 113,
"type": "VAEDecode",
"pos": [
- 1130.2783163694094,
- 3531.1113453160738
+ 1130,
+ 3530
],
"size": [
- 239.97395833333334,
- 46
+ 240,
+ 100
],
"flags": {},
- "order": 26,
+ "order": 22,
"mode": 0,
"inputs": [
{
@@ -1920,6 +2047,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.75",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "VAEDecode",
"enableTabs": false,
"tabWidth": 65,
@@ -1928,22 +2060,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 145,
"type": "PrimitiveInt",
"pos": [
- -1600,
- 4940
+ -1610,
+ 4800
],
"size": [
- 269.97395833333337,
- 82
+ 270,
+ 110
],
"flags": {},
- "order": 6,
+ "order": 4,
"mode": 0,
"inputs": [
{
@@ -1969,6 +2100,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "PrimitiveInt",
"enableTabs": false,
"tabWidth": 65,
@@ -1987,15 +2123,15 @@
"id": 148,
"type": "PrimitiveFloat",
"pos": [
- -1600,
- 5070
+ -1610,
+ 4930
],
"size": [
- 269.97395833333337,
- 58
+ 270,
+ 110
],
"flags": {},
- "order": 7,
+ "order": 5,
"mode": 0,
"inputs": [
{
@@ -2022,6 +2158,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "PrimitiveFloat",
"enableTabs": false,
"tabWidth": 65,
@@ -2035,19 +2176,105 @@
24
]
},
+ {
+ "id": 115,
+ "type": "EmptyLTXVLatentVideo",
+ "pos": [
+ -1100,
+ 4740
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 296
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 297
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 330
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 360
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.60",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "EmptyLTXVLatentVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 512,
+ 97,
+ 1
+ ]
+ },
{
"id": 118,
"type": "Reroute",
"pos": [
- -229.7217758812614,
- 4211.111007032079
+ -350,
+ 3980
],
"size": [
- 75,
- 26
+ 230,
+ 40
],
"flags": {},
- "order": 14,
+ "order": 26,
"mode": 0,
"inputs": [
{
@@ -2069,22 +2296,29 @@
],
"properties": {
"showOutputText": false,
- "horizontal": false
+ "horizontal": false,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
}
},
{
- "id": 151,
+ "id": 189,
"type": "LTXVImgToVideoInplace",
"pos": [
- -19.72161465663438,
- 4071.1107364662485
+ 180,
+ 4040
],
"size": [
- 269.97395833333337,
- 122
+ 260,
+ 190
],
- "flags": {},
- "order": 38,
+ "flags": {
+ "collapsed": false
+ },
+ "order": 39,
"mode": 0,
"inputs": [
{
@@ -2097,7 +2331,7 @@
"localized_name": "image",
"name": "image",
"type": "IMAGE",
- "link": 398
+ "link": 379
},
{
"localized_name": "latent",
@@ -2112,7 +2346,7 @@
"widget": {
"name": "strength"
},
- "link": 371
+ "link": 1759
},
{
"localized_name": "bypass",
@@ -2137,6 +2371,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVImgToVideoInplace",
"enableTabs": false,
"tabWidth": 65,
@@ -2155,15 +2394,15 @@
"id": 104,
"type": "LTXVCropGuides",
"pos": [
- -9.721939801202097,
- 3841.1107362825187
+ -90,
+ 4210
],
"size": [
- 239.97395833333334,
- 66
+ 240,
+ 120
],
"flags": {},
- "order": 19,
+ "order": 14,
"mode": 0,
"inputs": [
{
@@ -2215,6 +2454,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.68",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVCropGuides",
"enableTabs": false,
"tabWidth": 65,
@@ -2223,22 +2467,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 112,
"type": "LTXVLatentUpsampler",
"pos": [
- -9.721939801202097,
- 3961.111517352274
+ -90,
+ 4030
],
"size": [
- 259.97395833333337,
- 66
+ 260,
+ 120
],
"flags": {},
- "order": 25,
+ "order": 21,
"mode": 0,
"inputs": [
{
@@ -2274,6 +2517,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVLatentUpsampler",
"enableTabs": false,
"tabWidth": 65,
@@ -2282,22 +2530,117 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
+ }
+ },
+ {
+ "id": 154,
+ "type": "MarkdownNote",
+ "pos": [
+ -1640,
+ 5050
+ ],
+ "size": [
+ 350,
+ 170
+ ],
+ "flags": {
+ "collapsed": false
},
- "widgets_values": []
+ "order": 6,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [],
+ "title": "Frame Rate Note",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "Please make sure the frame rate value is the same in both boxes"
+ ],
+ "color": "#432",
+ "bgcolor": "#653"
+ },
+ {
+ "id": 96,
+ "type": "LTXVAudioVAELoader",
+ "pos": [
+ -1650,
+ 3970
+ ],
+ "size": [
+ 420,
+ 110
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 1770
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio VAE",
+ "name": "Audio VAE",
+ "type": "VAE",
+ "links": [
+ 285,
+ 340
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVAudioVAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2-19b-dev-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2/resolve/main/ltx-2-19b-dev-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2-19b-dev-fp8.safetensors"
+ ]
},
{
"id": 97,
"type": "LTXAVTextEncoderLoader",
"pos": [
- -1649.721454901846,
- 4041.1110828665023
+ -1650,
+ 4160
],
"size": [
- 419.97395833333337,
- 106
+ 420,
+ 150
],
"flags": {},
- "order": 8,
+ "order": 10,
"mode": 0,
"inputs": [
{
@@ -2307,7 +2650,7 @@
"widget": {
"name": "text_encoder"
},
- "link": 405
+ "link": 1772
},
{
"localized_name": "ckpt_name",
@@ -2316,7 +2659,7 @@
"widget": {
"name": "ckpt_name"
},
- "link": 403
+ "link": 1771
},
{
"localized_name": "device",
@@ -2342,7 +2685,19 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXAVTextEncoderLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
"models": [
{
"name": "ltx-2-19b-dev-fp8.safetensors",
@@ -2354,17 +2709,10 @@
"url": "https://huggingface.co/Comfy-Org/ltx-2/resolve/main/split_files/text_encoders/gemma_3_12B_it_fp4_mixed.safetensors",
"directory": "text_encoders"
}
- ],
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65
+ ]
},
"widgets_values": [
- "ltx-2-19b-ic-lora-pose-control.safetensors",
+ "gemma_3_12B_it_fp4_mixed.safetensors",
"ltx-2-19b-dev-fp8.safetensors",
"default"
]
@@ -2373,15 +2721,15 @@
"id": 103,
"type": "CheckpointLoaderSimple",
"pos": [
- -1649.721454901846,
- 3591.1104777840524
+ -1650,
+ 3520
],
"size": [
- 419.97395833333337,
- 98
+ 420,
+ 160
],
"flags": {},
- "order": 9,
+ "order": 13,
"mode": 0,
"inputs": [
{
@@ -2391,7 +2739,7 @@
"widget": {
"name": "ckpt_name"
},
- "link": 401
+ "link": 1768
}
],
"outputs": [
@@ -2424,137 +2772,89 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.56",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "CheckpointLoaderSimple",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
"models": [
{
"name": "ltx-2-19b-dev-fp8.safetensors",
"url": "https://huggingface.co/Lightricks/LTX-2/resolve/main/ltx-2-19b-dev-fp8.safetensors",
"directory": "checkpoints"
}
- ],
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65
+ ]
},
"widgets_values": [
"ltx-2-19b-dev-fp8.safetensors"
]
},
{
- "id": 156,
- "type": "LTXVAudioVAELoader",
+ "id": 110,
+ "type": "GetImageSize",
"pos": [
- -1636.9543279290153,
- 3911.095334870057
+ -1610,
+ 4630
],
"size": [
- 399.0494791666667,
- 58
+ 260,
+ 120
],
"flags": {},
- "order": 10,
+ "order": 19,
"mode": 0,
"inputs": [
- {
- "localized_name": "ckpt_name",
- "name": "ckpt_name",
- "type": "COMBO",
- "widget": {
- "name": "ckpt_name"
- },
- "link": 402
- }
- ],
- "outputs": [
- {
- "localized_name": "Audio VAE",
- "name": "Audio VAE",
- "type": "VAE",
- "links": [
- 382,
- 383
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.11.0",
- "Node name for S&R": "LTXVAudioVAELoader"
- },
- "widgets_values": [
- "ltx-2-19b-dev-fp8.safetensors"
- ]
- },
- {
- "id": 149,
- "type": "LTXVImgToVideoInplace",
- "pos": [
- -1089.7215608128167,
- 4401.110560478942
- ],
- "size": [
- 269.97395833333337,
- 122
- ],
- "flags": {},
- "order": 37,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "vae",
- "name": "vae",
- "type": "VAE",
- "link": 359
- },
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
- "link": 399
- },
- {
- "localized_name": "latent",
- "name": "latent",
- "type": "LATENT",
- "link": 360
- },
- {
- "localized_name": "strength",
- "name": "strength",
- "type": "FLOAT",
- "widget": {
- "name": "strength"
- },
- "link": 370
- },
- {
- "localized_name": "bypass",
- "name": "bypass",
- "type": "BOOLEAN",
- "widget": {
- "name": "bypass"
- },
- "link": 363
+ "link": 381
}
],
"outputs": [
{
- "localized_name": "latent",
- "name": "latent",
- "type": "LATENT",
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
"links": [
- 357
+ 296
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 297
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": [
+ 329,
+ 330
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
- "Node name for S&R": "LTXVImgToVideoInplace",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "GetImageSize",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -2562,25 +2862,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": [
- 1,
- false
- ]
+ }
},
{
"id": 132,
"type": "LTXVAddGuide",
"pos": [
- -599.7217670603999,
- 4421.110609115862
+ -600,
+ 4550
],
"size": [
- 269.97395833333337,
- 162
+ 270,
+ 240
],
"flags": {},
- "order": 33,
+ "order": 32,
"mode": 0,
"inputs": [
{
@@ -2611,7 +2907,7 @@
"localized_name": "image",
"name": "image",
"type": "IMAGE",
- "link": 395
+ "link": 348
},
{
"localized_name": "frame_idx",
@@ -2663,6 +2959,11 @@
"properties": {
"cnr_id": "comfy-core",
"ver": "0.3.75",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
"Node name for S&R": "LTXVAddGuide",
"enableTabs": false,
"tabWidth": 65,
@@ -2678,114 +2979,76 @@
]
},
{
- "id": 154,
- "type": "MarkdownNote",
+ "id": 149,
+ "type": "LTXVImgToVideoInplace",
"pos": [
- -1630,
- 5190
+ -1090,
+ 4530
],
"size": [
- 350,
- 88
- ],
- "flags": {
- "collapsed": false
- },
- "order": 11,
- "mode": 0,
- "inputs": [],
- "outputs": [],
- "title": "Frame Rate Note",
- "properties": {},
- "widgets_values": [
- "Please make sure the frame rate value is the same in both boxes"
- ],
- "color": "#432",
- "bgcolor": "#653"
- },
- {
- "id": 159,
- "type": "ResizeImageMaskNode",
- "pos": [
- -1610,
- 4580
- ],
- "size": [
- 284.375,
- 154
+ 270,
+ 180
],
"flags": {},
- "order": 39,
+ "order": 36,
"mode": 0,
"inputs": [
{
- "localized_name": "input",
- "name": "input",
- "type": "IMAGE,MASK",
- "link": 400
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 359
},
{
- "localized_name": "resize_type",
- "name": "resize_type",
- "type": "COMFY_DYNAMICCOMBO_V3",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 364
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 360
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
"widget": {
- "name": "resize_type"
+ "name": "strength"
},
- "link": null
+ "link": 1758
},
{
- "localized_name": "width",
- "name": "resize_type.width",
- "type": "INT",
+ "localized_name": "bypass",
+ "name": "bypass",
+ "type": "BOOLEAN",
"widget": {
- "name": "resize_type.width"
- },
- "link": 408
- },
- {
- "localized_name": "height",
- "name": "resize_type.height",
- "type": "INT",
- "widget": {
- "name": "resize_type.height"
- },
- "link": 409
- },
- {
- "localized_name": "crop",
- "name": "resize_type.crop",
- "type": "COMBO",
- "widget": {
- "name": "resize_type.crop"
- },
- "link": null
- },
- {
- "localized_name": "scale_method",
- "name": "scale_method",
- "type": "COMBO",
- "widget": {
- "name": "scale_method"
+ "name": "bypass"
},
"link": null
}
],
"outputs": [
{
- "localized_name": "resized",
- "name": "resized",
- "type": "IMAGE,MASK",
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
"links": [
- 391,
- 392,
- 395
+ 357
]
}
],
"properties": {
"cnr_id": "comfy-core",
"ver": "0.7.0",
- "Node name for S&R": "ResizeImageMaskNode",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LTXVImgToVideoInplace",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -2795,139 +3058,69 @@
"secondTabWidth": 65
},
"widgets_values": [
- "scale dimensions",
- 1280,
- 720,
- "center",
- "lanczos"
+ 1,
+ false
]
},
{
- "id": 110,
- "type": "GetImageSize",
+ "id": 155,
+ "type": "ImageScaleBy",
"pos": [
- -1600,
- 4780
+ -1620,
+ 4440
],
"size": [
- 259.97395833333337,
- 66
+ 280,
+ 140
],
"flags": {},
- "order": 23,
+ "order": 37,
"mode": 0,
"inputs": [
{
"localized_name": "image",
"name": "image",
"type": "IMAGE",
- "link": 391
- }
- ],
- "outputs": [
- {
- "localized_name": "width",
- "name": "width",
- "type": "INT",
- "links": [
- 296
- ]
+ "link": 380
},
{
- "localized_name": "height",
- "name": "height",
- "type": "INT",
- "links": [
- 297
- ]
- },
- {
- "localized_name": "batch_size",
- "name": "batch_size",
- "type": "INT",
- "links": []
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.7.0",
- "Node name for S&R": "GetImageSize",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65
- },
- "widgets_values": []
- },
- {
- "id": 115,
- "type": "EmptyLTXVLatentVideo",
- "pos": [
- -1099.721794809093,
- 4611.11072170357
- ],
- "size": [
- 269.97395833333337,
- 130
- ],
- "flags": {},
- "order": 28,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "width",
- "name": "width",
- "type": "INT",
+ "localized_name": "upscale_method",
+ "name": "upscale_method",
+ "type": "COMBO",
"widget": {
- "name": "width"
+ "name": "upscale_method"
},
- "link": 296
+ "link": null
},
{
- "localized_name": "height",
- "name": "height",
- "type": "INT",
+ "localized_name": "scale_by",
+ "name": "scale_by",
+ "type": "FLOAT",
"widget": {
- "name": "height"
- },
- "link": 297
- },
- {
- "localized_name": "length",
- "name": "length",
- "type": "INT",
- "widget": {
- "name": "length"
- },
- "link": 410
- },
- {
- "localized_name": "batch_size",
- "name": "batch_size",
- "type": "INT",
- "widget": {
- "name": "batch_size"
+ "name": "scale_by"
},
"link": null
}
],
"outputs": [
{
- "localized_name": "LATENT",
- "name": "LATENT",
- "type": "LATENT",
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
"links": [
- 360
+ 381
]
}
],
"properties": {
"cnr_id": "comfy-core",
- "ver": "0.3.60",
- "Node name for S&R": "EmptyLTXVLatentVideo",
+ "ver": "0.5.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ImageScaleBy",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -2937,87 +3130,8 @@
"secondTabWidth": 65
},
"widgets_values": [
- 768,
- 512,
- 97,
- 1
- ]
- },
- {
- "id": 111,
- "type": "LTXVEmptyLatentAudio",
- "pos": [
- -1099.721794809093,
- 4811.110229576288
- ],
- "size": [
- 269.97395833333337,
- 106
- ],
- "flags": {},
- "order": 24,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "audio_vae",
- "name": "audio_vae",
- "type": "VAE",
- "link": 382
- },
- {
- "localized_name": "frames_number",
- "name": "frames_number",
- "type": "INT",
- "widget": {
- "name": "frames_number"
- },
- "link": null
- },
- {
- "localized_name": "frame_rate",
- "name": "frame_rate",
- "type": "INT",
- "widget": {
- "name": "frame_rate"
- },
- "link": 354
- },
- {
- "localized_name": "batch_size",
- "name": "batch_size",
- "type": "INT",
- "widget": {
- "name": "batch_size"
- },
- "link": null
- }
- ],
- "outputs": [
- {
- "localized_name": "Latent",
- "name": "Latent",
- "type": "LATENT",
- "links": [
- 300
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.68",
- "Node name for S&R": "LTXVEmptyLatentAudio",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65
- },
- "widgets_values": [
- 97,
- 25,
- 1
+ "lanczos",
+ 0.5
]
}
],
@@ -3028,8 +3142,8 @@
"bounding": [
-1660,
3440,
- 440,
- 820
+ 450,
+ 940
],
"color": "#3f789e",
"font_size": 24,
@@ -3041,8 +3155,8 @@
"bounding": [
-700,
3440,
- 570,
- 820
+ 580,
+ 940
],
"color": "#3f789e",
"font_size": 24,
@@ -3054,8 +3168,8 @@
"bounding": [
-1180,
3440,
- 440,
- 820
+ 450,
+ 940
],
"color": "#3f789e",
"font_size": 24,
@@ -3066,7 +3180,7 @@
"title": "Latent",
"bounding": [
-1180,
- 4290,
+ 4420,
1050,
680
],
@@ -3080,8 +3194,8 @@
"bounding": [
-100,
3440,
- 1090,
- 820
+ 1110,
+ 940
],
"color": "#3f789e",
"font_size": 24,
@@ -3091,10 +3205,10 @@
"id": 6,
"title": "Sampler",
"bounding": [
- 350,
+ 410,
3480,
- 620,
- 750
+ 590,
+ 880
],
"color": "#3f789e",
"font_size": 24,
@@ -3106,8 +3220,8 @@
"bounding": [
-90,
3480,
- 430,
- 310
+ 450,
+ 480
],
"color": "#3f789e",
"font_size": 24,
@@ -3117,8 +3231,8 @@
"id": 11,
"title": "Frame rate",
"bounding": [
- -1610,
- 4860,
+ -1620,
+ 4730,
290,
271.6
],
@@ -3184,6 +3298,22 @@
"target_slot": 2,
"type": "CONDITIONING"
},
+ {
+ "id": 285,
+ "origin_id": 96,
+ "origin_slot": 0,
+ "target_id": 111,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 329,
+ "origin_id": 110,
+ "origin_slot": 2,
+ "target_id": 111,
+ "target_slot": 1,
+ "type": "INT"
+ },
{
"id": 260,
"origin_id": 126,
@@ -3240,6 +3370,14 @@
"target_slot": 1,
"type": "INT"
},
+ {
+ "id": 330,
+ "origin_id": 110,
+ "origin_slot": 2,
+ "target_id": 115,
+ "target_slot": 2,
+ "type": "INT"
+ },
{
"id": 325,
"origin_id": 103,
@@ -3360,6 +3498,14 @@
"target_slot": 0,
"type": "LATENT"
},
+ {
+ "id": 340,
+ "origin_id": 96,
+ "origin_slot": 0,
+ "target_id": 139,
+ "target_slot": 1,
+ "type": "VAE"
+ },
{
"id": 337,
"origin_id": 138,
@@ -3490,23 +3636,31 @@
},
{
"id": 347,
- "origin_id": 143,
+ "origin_id": 187,
"origin_slot": 0,
"target_id": 107,
"target_slot": 0,
"type": "NOISE"
},
+ {
+ "id": 348,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 132,
+ "target_slot": 4,
+ "type": "IMAGE"
+ },
{
"id": 351,
"origin_id": 138,
"origin_slot": 0,
- "target_id": 144,
+ "target_id": 188,
"target_slot": 0,
"type": "LATENT"
},
{
"id": 352,
- "origin_id": 144,
+ "origin_id": 188,
"origin_slot": 0,
"target_id": 106,
"target_slot": 0,
@@ -3516,7 +3670,7 @@
"id": 353,
"origin_id": 103,
"origin_slot": 2,
- "target_id": 144,
+ "target_id": 188,
"target_slot": 1,
"type": "VAE"
},
@@ -3569,16 +3723,16 @@
"type": "LATENT"
},
{
- "id": 363,
+ "id": 364,
"origin_id": -10,
"origin_slot": 2,
"target_id": 149,
- "target_slot": 4,
- "type": "BOOLEAN"
+ "target_slot": 1,
+ "type": "IMAGE"
},
{
"id": 365,
- "origin_id": 151,
+ "origin_id": 189,
"origin_slot": 0,
"target_id": 101,
"target_slot": 0,
@@ -3588,7 +3742,7 @@
"id": 366,
"origin_id": 112,
"origin_slot": 0,
- "target_id": 151,
+ "target_id": 189,
"target_slot": 2,
"type": "LATENT"
},
@@ -3596,92 +3750,68 @@
"id": 367,
"origin_id": 118,
"origin_slot": 0,
- "target_id": 151,
+ "target_id": 189,
"target_slot": 0,
"type": "VAE"
},
{
"id": 368,
"origin_id": -10,
- "origin_slot": 2,
- "target_id": 151,
+ "origin_slot": 4,
+ "target_id": 189,
"target_slot": 4,
"type": "BOOLEAN"
},
{
- "id": 370,
+ "id": 379,
"origin_id": -10,
- "origin_slot": 1,
- "target_id": 149,
- "target_slot": 3,
- "type": "FLOAT"
- },
- {
- "id": 371,
- "origin_id": -10,
- "origin_slot": 1,
- "target_id": 151,
- "target_slot": 3,
- "type": "FLOAT"
- },
- {
- "id": 382,
- "origin_id": 156,
- "origin_slot": 0,
- "target_id": 111,
- "target_slot": 0,
- "type": "VAE"
- },
- {
- "id": 383,
- "origin_id": 156,
- "origin_slot": 0,
- "target_id": 139,
+ "origin_slot": 2,
+ "target_id": 189,
"target_slot": 1,
- "type": "VAE"
+ "type": "IMAGE"
},
{
- "id": 391,
- "origin_id": 159,
+ "id": 380,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 155,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 381,
+ "origin_id": 155,
"origin_slot": 0,
"target_id": 110,
"target_slot": 0,
"type": "IMAGE"
},
{
- "id": 395,
- "origin_id": 159,
- "origin_slot": 0,
- "target_id": 132,
- "target_slot": 4,
- "type": "IMAGE"
- },
- {
- "id": 398,
- "origin_id": -10,
- "origin_slot": 3,
- "target_id": 151,
- "target_slot": 1,
- "type": "IMAGE"
- },
- {
- "id": 399,
+ "id": 1758,
"origin_id": -10,
"origin_slot": 3,
"target_id": 149,
- "target_slot": 1,
- "type": "IMAGE"
+ "target_slot": 3,
+ "type": "FLOAT"
},
{
- "id": 400,
+ "id": 1759,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 189,
+ "target_slot": 3,
+ "type": "FLOAT"
+ },
+ {
+ "id": 1767,
"origin_id": -10,
"origin_slot": 4,
- "target_id": 159,
+ "target_id": 126,
"target_slot": 0,
- "type": "IMAGE,MASK"
+ "type": "INT"
},
{
- "id": 401,
+ "id": 1768,
"origin_id": -10,
"origin_slot": 5,
"target_id": 103,
@@ -3689,23 +3819,7 @@
"type": "COMBO"
},
{
- "id": 402,
- "origin_id": -10,
- "origin_slot": 5,
- "target_id": 156,
- "target_slot": 0,
- "type": "COMBO"
- },
- {
- "id": 403,
- "origin_id": -10,
- "origin_slot": 5,
- "target_id": 97,
- "target_slot": 1,
- "type": "COMBO"
- },
- {
- "id": 404,
+ "id": 1769,
"origin_id": -10,
"origin_slot": 6,
"target_id": 134,
@@ -3713,76 +3827,55 @@
"type": "COMBO"
},
{
- "id": 405,
+ "id": 1770,
"origin_id": -10,
- "origin_slot": 6,
+ "origin_slot": 5,
+ "target_id": 96,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 1771,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 97,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 1772,
+ "origin_id": -10,
+ "origin_slot": 7,
"target_id": 97,
"target_slot": 0,
"type": "COMBO"
},
{
- "id": 406,
+ "id": 1773,
"origin_id": -10,
- "origin_slot": 7,
+ "origin_slot": 8,
"target_id": 105,
"target_slot": 1,
"type": "COMBO"
},
{
- "id": 407,
+ "id": 1774,
"origin_id": -10,
- "origin_slot": 8,
+ "origin_slot": 9,
"target_id": 100,
"target_slot": 0,
"type": "COMBO"
- },
- {
- "id": 408,
- "origin_id": -10,
- "origin_slot": 9,
- "target_id": 159,
- "target_slot": 2,
- "type": "INT"
- },
- {
- "id": 409,
- "origin_id": -10,
- "origin_slot": 10,
- "target_id": 159,
- "target_slot": 3,
- "type": "INT"
- },
- {
- "id": 410,
- "origin_id": -10,
- "origin_slot": 11,
- "target_id": 115,
- "target_slot": 2,
- "type": "INT"
}
],
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Video generation and editing/Pose to video"
+ "category": "Video generation and editing/Conditioned",
+ "description": "Generates video from pose reference frames using LTX-2, with optional synchronized audio."
}
]
},
- "config": {},
"extra": {
- "ds": {
- "scale": 1.3889423076923078,
- "offset": [
- 217.0560747663551,
- -3703.3333333333335
- ]
- },
- "frontendVersion": "1.37.10",
- "workflowRendererVersion": "LG",
- "VHS_latentpreview": false,
- "VHS_latentpreviewrate": 0,
- "VHS_MetadataImage": true,
- "VHS_KeepIntermediate": true
- },
- "version": 0.4
-}
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Prompt Enhance.json b/blueprints/Prompt Enhance.json
index 5e57548ff..e3a77a73b 100644
--- a/blueprints/Prompt Enhance.json
+++ b/blueprints/Prompt Enhance.json
@@ -270,9 +270,10 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Text generation/Prompt enhance"
+ "category": "Text Tools",
+ "description": "Expands short text prompts into detailed descriptions using a text generation model for better generation quality."
}
]
},
"extra": {}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Remove Background (BiRefNet).json b/blueprints/Remove Background (BiRefNet).json
new file mode 100644
index 000000000..9ec441e51
--- /dev/null
+++ b/blueprints/Remove Background (BiRefNet).json
@@ -0,0 +1,397 @@
+{
+ "revision": 0,
+ "last_node_id": 19,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 19,
+ "type": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
+ "pos": [
+ -6411.330578108367,
+ 1940.2638932730042
+ ],
+ "size": [
+ 349.609375,
+ 145.9375
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "name": "bg_removal_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "bg_removal_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "name": "mask",
+ "type": "MASK",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "14",
+ "bg_removal_name"
+ ]
+ ]
+ },
+ "widgets_values": [],
+ "title": "Remove Background (BiRefNet)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "5b40ca21-ba1a-41d5-b403-4d2d7acdc195",
+ "version": 1,
+ "state": {
+ "lastGroupId": 0,
+ "lastNodeId": 21,
+ "lastLinkId": 16,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Remove Background (BiRefNet)",
+ "description": "Removes or replaces image backgrounds using BiRefNet segmentation and alpha compositing.",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -6728.534070722246,
+ 1475.2619799128663,
+ 150.9140625,
+ 88
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -6169.049695722246,
+ 1475.2619799128663,
+ 128,
+ 88
+ ]
+ },
+ "inputs": [
+ {
+ "id": "7bc321cd-df31-4c39-aaf7-7f0d01326189",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 5,
+ 7
+ ],
+ "localized_name": "image",
+ "pos": [
+ -6601.620008222246,
+ 1499.2619799128663
+ ]
+ },
+ {
+ "id": "e89d2cd8-daa3-4e29-8a69-851db85072cb",
+ "name": "bg_removal_name",
+ "type": "COMBO",
+ "linkIds": [
+ 12
+ ],
+ "pos": [
+ -6601.620008222246,
+ 1519.2619799128663
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "16e7863c-4c38-46c2-aa74-e82991fbfe8d",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 8
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ -6145.049695722246,
+ 1499.2619799128663
+ ]
+ },
+ {
+ "id": "f7240c19-5b80-406e-a8e2-9b12440ee2d6",
+ "name": "mask",
+ "type": "MASK",
+ "linkIds": [
+ 11
+ ],
+ "pos": [
+ -6145.049695722246,
+ 1519.2619799128663
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 13,
+ "type": "RemoveBackground",
+ "pos": [
+ -6536.764823982709,
+ 1444.9963409012412
+ ],
+ "size": [
+ 302.25,
+ 72
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 5
+ },
+ {
+ "localized_name": "bg_removal_model",
+ "name": "bg_removal_model",
+ "type": "BACKGROUND_REMOVAL",
+ "link": 3
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "links": [
+ 4,
+ 11
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "RemoveBackground"
+ }
+ },
+ {
+ "id": 14,
+ "type": "LoadBackgroundRemovalModel",
+ "pos": [
+ -6540.534070722246,
+ 1302.223464635445
+ ],
+ "size": [
+ 311.484375,
+ 85.515625
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "bg_removal_name",
+ "name": "bg_removal_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "bg_removal_name"
+ },
+ "link": 12
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "bg_model",
+ "name": "bg_model",
+ "type": "BACKGROUND_REMOVAL",
+ "links": [
+ 3
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadBackgroundRemovalModel",
+ "models": [
+ {
+ "name": "birefnet.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/BiRefNet/resolve/main/background_removal/birefnet.safetensors",
+ "directory": "background_removal"
+ }
+ ]
+ },
+ "widgets_values": [
+ "birefnet.safetensors"
+ ]
+ },
+ {
+ "id": 15,
+ "type": "InvertMask",
+ "pos": [
+ -6532.446160529669,
+ 1571.1111286839914
+ ],
+ "size": [
+ 285.984375,
+ 48
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "link": 4
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MASK",
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 6
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InvertMask"
+ }
+ },
+ {
+ "id": 16,
+ "type": "JoinImageWithAlpha",
+ "pos": [
+ -6527.4370171636665,
+ 1674.3004951902876
+ ],
+ "size": [
+ 284.96875,
+ 72
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 7
+ },
+ {
+ "localized_name": "alpha",
+ "name": "alpha",
+ "type": "MASK",
+ "link": 6
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 8
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "JoinImageWithAlpha"
+ }
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 3,
+ "origin_id": 14,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 1,
+ "type": "BACKGROUND_REMOVAL"
+ },
+ {
+ "id": 4,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 15,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 6,
+ "origin_id": 15,
+ "origin_slot": 0,
+ "target_id": 16,
+ "target_slot": 1,
+ "type": "MASK"
+ },
+ {
+ "id": 5,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 7,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 16,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 8,
+ "origin_id": 16,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 11,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "MASK"
+ },
+ {
+ "id": 12,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 14,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Image Tools/Background Removal"
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Select Per-Line Text by Index.json b/blueprints/Select Per-Line Text by Index.json
new file mode 100644
index 000000000..8a4020d50
--- /dev/null
+++ b/blueprints/Select Per-Line Text by Index.json
@@ -0,0 +1,485 @@
+{
+ "revision": 0,
+ "last_node_id": 10,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 10,
+ "type": "3fb7557a-470d-4983-9d8c-6d5caa9788f0",
+ "pos": [
+ -250,
+ 8590
+ ],
+ "size": [
+ 280,
+ 360
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "text_per_line",
+ "name": "text_per_line",
+ "type": "STRING",
+ "widget": {
+ "name": "text_per_line"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "index",
+ "name": "index",
+ "type": "INT",
+ "widget": {
+ "name": "index"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "selected_line",
+ "name": "selected_line",
+ "type": "STRING",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "2",
+ "string"
+ ],
+ [
+ "3",
+ "value"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [],
+ "title": "Select Per-Line Text by Index"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "3fb7557a-470d-4983-9d8c-6d5caa9788f0",
+ "version": 1,
+ "state": {
+ "lastGroupId": 0,
+ "lastNodeId": 10,
+ "lastLinkId": 14,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Select Per-Line Text by Index",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -990,
+ 8595,
+ 128,
+ 88
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 710,
+ 8585,
+ 128,
+ 68
+ ]
+ },
+ "inputs": [
+ {
+ "id": "75417d82-a934-4ac9-b667-d8dcd5a3bfb3",
+ "name": "text_per_line",
+ "type": "STRING",
+ "linkIds": [
+ 13
+ ],
+ "localized_name": "text_per_line",
+ "pos": [
+ -886,
+ 8619
+ ]
+ },
+ {
+ "id": "46e69a73-1804-4ca6-9175-31445bf0be96",
+ "name": "index",
+ "type": "INT",
+ "linkIds": [
+ 14
+ ],
+ "localized_name": "index",
+ "pos": [
+ -886,
+ 8639
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "e34e8ad1-84d2-4bd2-a460-eb7de6067c10",
+ "name": "selected_line",
+ "type": "STRING",
+ "linkIds": [
+ 10
+ ],
+ "localized_name": "selected_line",
+ "pos": [
+ 734,
+ 8609
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 1,
+ "type": "PreviewAny",
+ "pos": [
+ -500,
+ 8400
+ ],
+ "size": [
+ 230,
+ 180
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 1
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 6
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "PreviewAny",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ },
+ {
+ "id": 2,
+ "type": "RegexExtract",
+ "pos": [
+ -240,
+ 8740
+ ],
+ "size": [
+ 470,
+ 460
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "showAdvanced": false,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 13
+ },
+ {
+ "localized_name": "regex_pattern",
+ "name": "regex_pattern",
+ "type": "STRING",
+ "widget": {
+ "name": "regex_pattern"
+ },
+ "link": 9
+ },
+ {
+ "localized_name": "mode",
+ "name": "mode",
+ "type": "COMBO",
+ "widget": {
+ "name": "mode"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "case_insensitive",
+ "name": "case_insensitive",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "case_insensitive"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "multiline",
+ "name": "multiline",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "multiline"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "dotall",
+ "name": "dotall",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "dotall"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "group_index",
+ "name": "group_index",
+ "type": "INT",
+ "widget": {
+ "name": "group_index"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 10
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "RegexExtract",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "",
+ "",
+ "First Group",
+ false,
+ false,
+ false,
+ 1
+ ]
+ },
+ {
+ "id": 3,
+ "type": "PrimitiveInt",
+ "pos": [
+ -810,
+ 8400
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 14
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 1
+ ]
+ }
+ ],
+ "title": "Int (line index)",
+ "properties": {
+ "Node name for S&R": "Int (line index)",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 0,
+ "fixed"
+ ]
+ },
+ {
+ "id": 8,
+ "type": "StringReplace",
+ "pos": [
+ -240,
+ 8400
+ ],
+ "size": [
+ 400,
+ 280
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 6
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 9
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "StringReplace",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "^(?:[^\\n]*\\n){index}([^\\n]*)(?:\\n|$)",
+ "index",
+ ""
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 1,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 1,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 9,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": 2,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 6,
+ "origin_id": 1,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 10,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 13,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 2,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 14,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "INT"
+ }
+ ],
+ "extra": {},
+ "category": "Text Tools",
+ "description": "Selects one line from multiline text by zero-based index for batch or list-driven prompt workflows."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": [],
+ "links_added_by_ue": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Sharpen.json b/blueprints/Sharpen.json
index bb79f61fc..3c4099c6b 100644
--- a/blueprints/Sharpen.json
+++ b/blueprints/Sharpen.json
@@ -267,7 +267,7 @@
"Node name for S&R": "GLSLShader"
},
"widgets_values": [
- "#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nvoid main() {\n vec2 texel = 1.0 / u_resolution;\n \n // Sample center and neighbors\n vec4 center = texture(u_image0, v_texCoord);\n vec4 top = texture(u_image0, v_texCoord + vec2( 0.0, -texel.y));\n vec4 bottom = texture(u_image0, v_texCoord + vec2( 0.0, texel.y));\n vec4 left = texture(u_image0, v_texCoord + vec2(-texel.x, 0.0));\n vec4 right = texture(u_image0, v_texCoord + vec2( texel.x, 0.0));\n \n // Edge enhancement (Laplacian)\n vec4 edges = center * 4.0 - top - bottom - left - right;\n \n // Add edges back scaled by strength\n vec4 sharpened = center + edges * u_float0;\n \n fragColor0 = vec4(clamp(sharpened.rgb, 0.0, 1.0), center.a);\n}",
+ "#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform float u_float0; // strength [0.0 – 2.0] typical: 0.3–1.0\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nvoid main() {\n vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));\n \n // Sample center and neighbors\n vec4 center = texture(u_image0, v_texCoord);\n vec4 top = texture(u_image0, v_texCoord + vec2( 0.0, -texel.y));\n vec4 bottom = texture(u_image0, v_texCoord + vec2( 0.0, texel.y));\n vec4 left = texture(u_image0, v_texCoord + vec2(-texel.x, 0.0));\n vec4 right = texture(u_image0, v_texCoord + vec2( texel.x, 0.0));\n \n // Edge enhancement (Laplacian)\n vec4 edges = center * 4.0 - top - bottom - left - right;\n \n // Add edges back scaled by strength\n vec4 sharpened = center + edges * u_float0;\n \n fragColor0 = vec4(clamp(sharpened.rgb, 0.0, 1.0), center.a);\n}",
"from_input"
]
}
@@ -302,8 +302,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Sharpen"
+ "category": "Image Tools/Sharpen",
+ "description": "Sharpens image details using a GPU fragment shader for enhanced clarity."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Split Image Grid to Tiles.json b/blueprints/Split Image Grid to Tiles.json
new file mode 100644
index 000000000..d1f6e40ef
--- /dev/null
+++ b/blueprints/Split Image Grid to Tiles.json
@@ -0,0 +1,714 @@
+{
+ "revision": 0,
+ "last_node_id": 251,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 251,
+ "type": "609e1fd1-b731-4b78-89ac-d19b1156b025",
+ "pos": [
+ -1490,
+ 130
+ ],
+ "size": [
+ 230,
+ 164
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source_image",
+ "name": "source_image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "columns",
+ "name": "columns",
+ "type": "INT",
+ "widget": {
+ "name": "columns"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "rows",
+ "name": "rows",
+ "type": "INT",
+ "widget": {
+ "name": "rows"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "tiles",
+ "name": "tiles",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "228",
+ "value"
+ ],
+ [
+ "252",
+ "value"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.20.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Split Image Grid to Tiles"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "609e1fd1-b731-4b78-89ac-d19b1156b025",
+ "version": 1,
+ "state": {
+ "lastGroupId": 9,
+ "lastNodeId": 252,
+ "lastLinkId": 429,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Split Image Grid to Tiles",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1690,
+ 260,
+ 128,
+ 108
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -510,
+ 590,
+ 128,
+ 68
+ ]
+ },
+ "inputs": [
+ {
+ "id": "866ac798-cfbc-450a-b755-e704f86404d9",
+ "name": "source_image",
+ "type": "IMAGE",
+ "linkIds": [
+ 386,
+ 389
+ ],
+ "localized_name": "source_image",
+ "pos": [
+ -1586,
+ 284
+ ]
+ },
+ {
+ "id": "bc37b1f8-8ab2-4f19-bd00-75d4fbc4feb3",
+ "name": "columns",
+ "type": "INT",
+ "linkIds": [
+ 427
+ ],
+ "localized_name": "columns",
+ "pos": [
+ -1586,
+ 304
+ ]
+ },
+ {
+ "id": "d45915da-e848-43dd-9ccc-e3161e9c99d9",
+ "name": "rows",
+ "type": "INT",
+ "linkIds": [
+ 428
+ ],
+ "localized_name": "rows",
+ "pos": [
+ -1586,
+ 324
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "18bc780f-064b-4038-87c6-67dba71deb08",
+ "name": "tiles",
+ "type": "IMAGE",
+ "linkIds": [
+ 394
+ ],
+ "localized_name": "tiles",
+ "shape": 6,
+ "pos": [
+ -486,
+ 614
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 225,
+ "type": "SplitImageToTileList",
+ "pos": [
+ -1010,
+ 620
+ ],
+ "size": [
+ 290,
+ 170
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 386
+ },
+ {
+ "localized_name": "tile_width",
+ "name": "tile_width",
+ "type": "INT",
+ "widget": {
+ "name": "tile_width"
+ },
+ "link": 403
+ },
+ {
+ "localized_name": "tile_height",
+ "name": "tile_height",
+ "type": "INT",
+ "widget": {
+ "name": "tile_height"
+ },
+ "link": 404
+ },
+ {
+ "localized_name": "overlap",
+ "name": "overlap",
+ "type": "INT",
+ "widget": {
+ "name": "overlap"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "shape": 6,
+ "type": "IMAGE",
+ "links": [
+ 394
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SplitImageToTileList",
+ "cnr_id": "comfy-core",
+ "ver": "0.20.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 0
+ ]
+ },
+ {
+ "id": 231,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -1080,
+ 330
+ ],
+ "size": [
+ 370,
+ 190
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 390
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 429
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 404
+ ]
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": null
+ }
+ ],
+ "title": "Math Expression (Height)",
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "max(1, (int(a) + int(b) - 1) // int(b))"
+ ]
+ },
+ {
+ "id": 229,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -1090,
+ -30
+ ],
+ "size": [
+ 370,
+ 190
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 387
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 388
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 403
+ ]
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": null
+ }
+ ],
+ "title": "Math Expression (Width)",
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "max(1, (int(a) + int(b) - 1) // int(b))"
+ ]
+ },
+ {
+ "id": 228,
+ "type": "PrimitiveInt",
+ "pos": [
+ -1380,
+ 90
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 427
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 388
+ ]
+ }
+ ],
+ "title": "Int (grid columns)",
+ "properties": {
+ "Node name for S&R": "Int (grid columns)",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 2,
+ "fixed"
+ ]
+ },
+ {
+ "id": 230,
+ "type": "GetImageSize",
+ "pos": [
+ -1380,
+ 290
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 389
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 387
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 390
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ }
+ },
+ {
+ "id": 252,
+ "type": "PrimitiveInt",
+ "pos": [
+ -1380,
+ 470
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 428
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 429
+ ]
+ }
+ ],
+ "title": "Int (grid rows)",
+ "properties": {
+ "Node name for S&R": "Int (grid rows)",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 3,
+ "fixed"
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 403,
+ "origin_id": 229,
+ "origin_slot": 1,
+ "target_id": 225,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 404,
+ "origin_id": 231,
+ "origin_slot": 1,
+ "target_id": 225,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 390,
+ "origin_id": 230,
+ "origin_slot": 1,
+ "target_id": 231,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 387,
+ "origin_id": 230,
+ "origin_slot": 0,
+ "target_id": 229,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 388,
+ "origin_id": 228,
+ "origin_slot": 0,
+ "target_id": 229,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 386,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 225,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 389,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 230,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 394,
+ "origin_id": 225,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 427,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 228,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 428,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 252,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 429,
+ "origin_id": 252,
+ "origin_slot": 0,
+ "target_id": 231,
+ "target_slot": 1,
+ "type": "INT"
+ }
+ ],
+ "extra": {},
+ "category": "Image Tools/Crop",
+ "description": "Splits an image into a configurable columns×rows grid of equal tiles for tiled generation or processing."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Text to Audio (ACE-Step 1.5).json b/blueprints/Text to Audio (ACE-Step 1.5).json
index 206cf16be..5b8b8626f 100644
--- a/blueprints/Text to Audio (ACE-Step 1.5).json
+++ b/blueprints/Text to Audio (ACE-Step 1.5).json
@@ -222,7 +222,7 @@
},
"revision": 0,
"config": {},
- "name": "local-Text to Audio (ACE-Step 1.5)",
+ "name": "Text to Audio (ACE-Step 1.5)",
"inputNode": {
"id": -10,
"bounding": [
@@ -1502,7 +1502,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Audio/Music generation"
+ "category": "Audio/Music generation",
+ "description": "Generates audio/music from text prompts using ACE-Step 1.5, a diffusion-based audio generation model."
}
]
},
@@ -1518,4 +1519,4 @@
}
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Text to Image (Anima).json b/blueprints/Text to Image (Anima).json
new file mode 100644
index 000000000..787908ca9
--- /dev/null
+++ b/blueprints/Text to Image (Anima).json
@@ -0,0 +1,1085 @@
+{
+ "revision": 0,
+ "last_node_id": 60,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 60,
+ "type": "a3c0dab6-b250-4585-a0f9-8fb8b074fb2f",
+ "pos": [
+ -10,
+ 130
+ ],
+ "size": [
+ 500,
+ 640
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "11",
+ "text"
+ ],
+ [
+ "28",
+ "width"
+ ],
+ [
+ "28",
+ "height"
+ ],
+ [
+ "19",
+ "steps"
+ ],
+ [
+ "19",
+ "cfg"
+ ],
+ [
+ "19",
+ "seed"
+ ],
+ [
+ "44",
+ "unet_name"
+ ],
+ [
+ "45",
+ "clip_name"
+ ],
+ [
+ "15",
+ "vae_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Anima)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "a3c0dab6-b250-4585-a0f9-8fb8b074fb2f",
+ "version": 1,
+ "state": {
+ "lastGroupId": 3,
+ "lastNodeId": 70,
+ "lastLinkId": 104,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Anima)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -330,
+ 530,
+ 120,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1229.9999873482075,
+ 505,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "4693f350-6ba0-446d-80d4-3038c661d26c",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 95
+ ],
+ "label": "prompt",
+ "pos": [
+ -230,
+ 550
+ ]
+ },
+ {
+ "id": "4a7886a9-4ed7-49bb-afc2-977bb78a303d",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 96
+ ],
+ "pos": [
+ -230,
+ 570
+ ]
+ },
+ {
+ "id": "f6c04461-d29e-49e3-8790-07bb662bbbfe",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 97
+ ],
+ "pos": [
+ -230,
+ 590
+ ]
+ },
+ {
+ "id": "7a24f998-3808-4837-8bff-52304ad09fb6",
+ "name": "steps",
+ "type": "INT",
+ "linkIds": [
+ 98
+ ],
+ "pos": [
+ -230,
+ 610
+ ]
+ },
+ {
+ "id": "aaa99698-b222-40fe-b946-28067528a85c",
+ "name": "cfg",
+ "type": "FLOAT",
+ "linkIds": [
+ 99
+ ],
+ "pos": [
+ -230,
+ 630
+ ]
+ },
+ {
+ "id": "053df9ae-7311-4816-aa23-7fa13c656ced",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 100
+ ],
+ "pos": [
+ -230,
+ 650
+ ]
+ },
+ {
+ "id": "c59194ea-015c-41a7-8edd-ae7ffc220b63",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 101
+ ],
+ "pos": [
+ -230,
+ 670
+ ]
+ },
+ {
+ "id": "e655aa3b-2db7-4e25-9ea2-61550fa7ae2d",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 102
+ ],
+ "pos": [
+ -230,
+ 690
+ ]
+ },
+ {
+ "id": "94965a7a-74dd-4f5a-87e3-9f87995d554f",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 103
+ ],
+ "pos": [
+ -230,
+ 710
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ef85ac0a-2152-4232-bfa1-929cfc913718",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 82
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1249.9999873482075,
+ 525
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 45,
+ "type": "CLIPLoader",
+ "pos": [
+ -60,
+ 380
+ ],
+ "size": [
+ 310,
+ 150
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 102
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 80,
+ 81
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.11.0",
+ "models": [
+ {
+ "name": "qwen_3_06b_base.safetensors",
+ "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/text_encoders/qwen_3_06b_base.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_3_06b_base.safetensors",
+ "stable_diffusion",
+ "default"
+ ]
+ },
+ {
+ "id": 15,
+ "type": "VAELoader",
+ "pos": [
+ -50,
+ 610
+ ],
+ "size": [
+ 310,
+ 100
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 103
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 11
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAELoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "models": [
+ {
+ "name": "qwen_image_vae.safetensors",
+ "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/vae/qwen_image_vae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_image_vae.safetensors"
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 880,
+ 840
+ ],
+ "size": [
+ 230,
+ 90
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 10
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 11
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 82
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 28,
+ "type": "EmptyLatentImage",
+ "pos": [
+ -50,
+ 830
+ ],
+ "size": [
+ 310,
+ 150
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 96
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 97
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 78
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyLatentImage",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 12,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 330,
+ 830
+ ],
+ "size": [
+ 490,
+ 140
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 81
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 40
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Negative Prompt)",
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.65",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "worst quality, low quality, score_1, score_2, score_3, blurry, jpeg artifacts, sepia"
+ ],
+ "color": "#223",
+ "bgcolor": "#335"
+ },
+ {
+ "id": 19,
+ "type": "KSampler",
+ "pos": [
+ 870,
+ 120
+ ],
+ "size": [
+ 300,
+ 620
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 79
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 39
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 40
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 78
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 100
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 98
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": 99
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 10
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ "fixed",
+ 30,
+ 4,
+ "er_sde",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 11,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 320,
+ 170
+ ],
+ "size": [
+ 490,
+ 610
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 80
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 95
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 39
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.65",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 44,
+ "type": "UNETLoader",
+ "pos": [
+ -50,
+ 170
+ ],
+ "size": [
+ 310,
+ 130
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 101
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 79
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.11.0",
+ "models": [
+ {
+ "name": "anima-base-v1.0.safetensors",
+ "url": "https://huggingface.co/circlestone-labs/Anima/resolve/main/split_files/diffusion_models/anima-base-v1.0.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "anima-base-v1.0.safetensors",
+ "default"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Model",
+ "bounding": [
+ -80,
+ 80,
+ 360,
+ 640
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Image Size(1MP)",
+ "bounding": [
+ -80,
+ 750,
+ 360,
+ 240
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ 300,
+ 80,
+ 530,
+ 910
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 10,
+ "origin_id": 19,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 11,
+ "origin_id": 15,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 81,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 12,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 79,
+ "origin_id": 44,
+ "origin_slot": 0,
+ "target_id": 19,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 39,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": 19,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 40,
+ "origin_id": 12,
+ "origin_slot": 0,
+ "target_id": 19,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 78,
+ "origin_id": 28,
+ "origin_slot": 0,
+ "target_id": 19,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 80,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 82,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 95,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 96,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 28,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 97,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 28,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 98,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 19,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 99,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 19,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 100,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 19,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 101,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 44,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 102,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 45,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 103,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 15,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Image generation and editing/Text to image"
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Text to Image (Ernie Image Turbo).json b/blueprints/Text to Image (Ernie Image Turbo).json
new file mode 100644
index 000000000..4ecdd1883
--- /dev/null
+++ b/blueprints/Text to Image (Ernie Image Turbo).json
@@ -0,0 +1,2112 @@
+{
+ "revision": 0,
+ "last_node_id": 88,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 88,
+ "type": "2a4f0815-c4d2-4e8b-9bdf-991a8403889d",
+ "pos": [
+ -120,
+ 240
+ ],
+ "size": [
+ 400,
+ 540
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "prompt",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "prompt_enhancement",
+ "name": "value_1",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value_1"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "label": "prompt_enhancer",
+ "name": "clip_name_1",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name_1"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "94",
+ "value"
+ ],
+ [
+ "96",
+ "value"
+ ],
+ [
+ "71",
+ "width"
+ ],
+ [
+ "71",
+ "height"
+ ],
+ [
+ "70",
+ "seed"
+ ],
+ [
+ "66",
+ "unet_name"
+ ],
+ [
+ "62",
+ "clip_name"
+ ],
+ [
+ "98",
+ "clip_name"
+ ],
+ [
+ "63",
+ "vae_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "value": true,
+ "value_1": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Ernie Image Turbo)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "2a4f0815-c4d2-4e8b-9bdf-991a8403889d",
+ "version": 1,
+ "state": {
+ "lastGroupId": 7,
+ "lastNodeId": 103,
+ "lastLinkId": 134,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Ernie Image Turbo)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1350,
+ 370,
+ 163.50390625,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1110,
+ 260,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "74a4609c-67df-4ae9-ab96-9ff4e3a1c3b1",
+ "name": "value",
+ "type": "STRING",
+ "linkIds": [
+ 128
+ ],
+ "label": "prompt",
+ "pos": [
+ -1206.49609375,
+ 390
+ ]
+ },
+ {
+ "id": "996f1854-7ae3-450e-821c-a9b5b7c310f9",
+ "name": "value_1",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 127
+ ],
+ "label": "prompt_enhancement",
+ "pos": [
+ -1206.49609375,
+ 410
+ ]
+ },
+ {
+ "id": "71e9c6e8-4285-4543-b1d3-81520088f6a4",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 104,
+ 129
+ ],
+ "pos": [
+ -1206.49609375,
+ 430
+ ]
+ },
+ {
+ "id": "bdb6cd97-67d9-440c-8c4c-9b7a7540edd0",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 105,
+ 130
+ ],
+ "pos": [
+ -1206.49609375,
+ 450
+ ]
+ },
+ {
+ "id": "18abb56c-30bf-4de5-83c1-c12376e8d14e",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 108
+ ],
+ "pos": [
+ -1206.49609375,
+ 470
+ ]
+ },
+ {
+ "id": "e5cd06f9-64ed-4778-97ba-b165f7a79c4e",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 109
+ ],
+ "pos": [
+ -1206.49609375,
+ 490
+ ]
+ },
+ {
+ "id": "06480e4c-4043-489b-ae68-1cf2b4246260",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 110
+ ],
+ "pos": [
+ -1206.49609375,
+ 510
+ ]
+ },
+ {
+ "id": "8d65d01b-16b2-420d-8b7b-42077c2e4976",
+ "name": "clip_name_1",
+ "type": "COMBO",
+ "linkIds": [
+ 132
+ ],
+ "label": "prompt_enhancer",
+ "pos": [
+ -1206.49609375,
+ 530
+ ]
+ },
+ {
+ "id": "697f2fdb-0fd9-4008-a895-0f9ce9e8fd88",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 133
+ ],
+ "pos": [
+ -1206.49609375,
+ 550
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "21d5fbe0-9f91-4d93-8ea8-5bbf2cd5b698",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 84
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1130,
+ 280
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 71,
+ "type": "EmptyFlux2LatentImage",
+ "pos": [
+ -470,
+ 1050
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 104
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 105
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 80
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyFlux2LatentImage",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 66,
+ "type": "UNETLoader",
+ "pos": [
+ -470,
+ 320
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 109
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 85
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "ernie-image-turbo.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/diffusion_models/ernie-image-turbo.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "ernie-image-turbo.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 65,
+ "type": "VAEDecode",
+ "pos": [
+ 710,
+ 280
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 73
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 74
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 84
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ }
+ },
+ {
+ "id": 70,
+ "type": "KSampler",
+ "pos": [
+ 350,
+ 280
+ ],
+ "size": [
+ 320,
+ 350
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 85
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 76
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 113
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 80
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 108
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 73
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 423299999918804,
+ "randomize",
+ 8,
+ 1,
+ "euler",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 67,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -140,
+ 320
+ ],
+ "size": [
+ 410,
+ 370
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 79
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 131
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 76,
+ 112
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 62,
+ "type": "CLIPLoader",
+ "pos": [
+ -470,
+ 530
+ ],
+ "size": [
+ 270,
+ 150
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 110
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 79
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "ministral-3-3b.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ministral-3-3b.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "ministral-3-3b.safetensors",
+ "flux2",
+ "default"
+ ]
+ },
+ {
+ "id": 63,
+ "type": "VAELoader",
+ "pos": [
+ -470,
+ 780
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 133
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 74
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAELoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "flux2-vae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/vae/flux2-vae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "flux2-vae.safetensors"
+ ]
+ },
+ {
+ "id": 91,
+ "type": "ConditioningZeroOut",
+ "pos": [
+ 30,
+ 760
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 112
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 113
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ConditioningZeroOut",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ }
+ },
+ {
+ "id": 93,
+ "type": "StringReplace",
+ "pos": [
+ -500,
+ -650
+ ],
+ "size": [
+ 430,
+ 450
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 115
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 121
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "StringReplace",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "[SYSTEM_PROMPT]你是一个专业的文生图 Prompt 增强助手。你将收到用户的简短图片描述及目标生成分辨率,请据此扩写为一段内容丰富、细节充分的视觉描述,以帮助文生图模型生成高质量的图片。仅输出增强后的描述,不要包含任何解释或前缀。[/SYSTEM_PROMPT][INST]{\"prompt\": \"{prompt}\", \"width\": {width}, \"height\": {height}}[/INST]",
+ "{prompt}",
+ ""
+ ]
+ },
+ {
+ "id": 94,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ -950,
+ -660
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": 128
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 115,
+ 118
+ ]
+ }
+ ],
+ "title": "String (Multiline - Prompt)",
+ "properties": {
+ "Node name for S&R": "PrimitiveStringMultiline",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 95,
+ "type": "TextGenerate",
+ "pos": [
+ 530,
+ -660
+ ],
+ "size": [
+ 400,
+ 380
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 116
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": 117
+ },
+ {
+ "localized_name": "max_length",
+ "name": "max_length",
+ "type": "INT",
+ "widget": {
+ "name": "max_length"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampling_mode",
+ "name": "sampling_mode",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "sampling_mode"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temperature",
+ "name": "sampling_mode.temperature",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.temperature"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "top_k",
+ "name": "sampling_mode.top_k",
+ "type": "INT",
+ "widget": {
+ "name": "sampling_mode.top_k"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "top_p",
+ "name": "sampling_mode.top_p",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.top_p"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "min_p",
+ "name": "sampling_mode.min_p",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.min_p"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "repetition_penalty",
+ "name": "sampling_mode.repetition_penalty",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.repetition_penalty"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "seed",
+ "name": "sampling_mode.seed",
+ "type": "INT",
+ "widget": {
+ "name": "sampling_mode.seed"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampling_mode.presence_penalty",
+ "name": "sampling_mode.presence_penalty",
+ "shape": 7,
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.presence_penalty"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "thinking",
+ "name": "thinking",
+ "shape": 7,
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "thinking"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "use_default_template",
+ "name": "use_default_template",
+ "shape": 7,
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "use_default_template"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "generated_text",
+ "name": "generated_text",
+ "type": "STRING",
+ "links": [
+ 119
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TextGenerate",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "",
+ 2048,
+ "on",
+ 0.6,
+ 64,
+ 0.8,
+ 0.05,
+ 1.05,
+ 0,
+ 0,
+ false,
+ true
+ ]
+ },
+ {
+ "id": 96,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -490,
+ 60
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 127
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 120
+ ]
+ }
+ ],
+ "title": "Enable prompt enhancement?",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 97,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 550,
+ -10
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 118
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 119
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 120
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 131,
+ 134
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 98,
+ "type": "CLIPLoader",
+ "pos": [
+ -490,
+ -150
+ ],
+ "size": [
+ 510,
+ 150
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 132
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 116
+ ]
+ }
+ ],
+ "title": "Load CLIP (PE)",
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "models": [
+ {
+ "name": "ernie-image-prompt-enhancer.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ernie-image-prompt-enhancer.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "ernie-image-prompt-enhancer.safetensors",
+ "flux2",
+ "default"
+ ]
+ },
+ {
+ "id": 99,
+ "type": "PreviewAny",
+ "pos": [
+ -950,
+ -410
+ ],
+ "size": [
+ 400,
+ 180
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 129
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 122
+ ]
+ }
+ ],
+ "title": "Preview as Text (Int to String)",
+ "properties": {
+ "Node name for S&R": "PreviewAny",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ },
+ {
+ "id": 100,
+ "type": "PreviewAny",
+ "pos": [
+ -950,
+ -190
+ ],
+ "size": [
+ 400,
+ 180
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 130
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 124
+ ]
+ }
+ ],
+ "title": "Preview as Text (Int to String)",
+ "properties": {
+ "Node name for S&R": "PreviewAny",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ },
+ {
+ "id": 101,
+ "type": "StringReplace",
+ "pos": [
+ -30,
+ -650
+ ],
+ "size": [
+ 230,
+ 450
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 121
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 122
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 123
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "StringReplace",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "",
+ "{width}",
+ ""
+ ]
+ },
+ {
+ "id": 102,
+ "type": "StringReplace",
+ "pos": [
+ 220,
+ -650
+ ],
+ "size": [
+ 250,
+ 450
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 123
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 124
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 117
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "StringReplace",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "",
+ "{height}",
+ ""
+ ]
+ },
+ {
+ "id": 103,
+ "type": "PreviewAny",
+ "pos": [
+ 970,
+ -660
+ ],
+ "size": [
+ 570,
+ 790
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 134
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": []
+ }
+ ],
+ "title": "Preview as Text (Int to String)",
+ "properties": {
+ "Node name for S&R": "PreviewAny",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 6,
+ "title": "Text to Image",
+ "bounding": [
+ -510,
+ 200,
+ 1450,
+ 1060
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Image Size",
+ "bounding": [
+ -490,
+ 950,
+ 300,
+ 290
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ -160,
+ 250,
+ 470,
+ 670
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Model",
+ "bounding": [
+ -490,
+ 250,
+ 300,
+ 670
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Prompt Enhancement",
+ "bounding": [
+ -510,
+ -720,
+ 1450,
+ 890
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 73,
+ "origin_id": 70,
+ "origin_slot": 0,
+ "target_id": 65,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 74,
+ "origin_id": 63,
+ "origin_slot": 0,
+ "target_id": 65,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 85,
+ "origin_id": 66,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 76,
+ "origin_id": 67,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 80,
+ "origin_id": 71,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 79,
+ "origin_id": 62,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 84,
+ "origin_id": 65,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 104,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 71,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 105,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 71,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 108,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 70,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 109,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 66,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 110,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 62,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 112,
+ "origin_id": 67,
+ "origin_slot": 0,
+ "target_id": 91,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 113,
+ "origin_id": 91,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 115,
+ "origin_id": 94,
+ "origin_slot": 0,
+ "target_id": 93,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 116,
+ "origin_id": 98,
+ "origin_slot": 0,
+ "target_id": 95,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 117,
+ "origin_id": 102,
+ "origin_slot": 0,
+ "target_id": 95,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 118,
+ "origin_id": 94,
+ "origin_slot": 0,
+ "target_id": 97,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 119,
+ "origin_id": 95,
+ "origin_slot": 0,
+ "target_id": 97,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 120,
+ "origin_id": 96,
+ "origin_slot": 0,
+ "target_id": 97,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 121,
+ "origin_id": 93,
+ "origin_slot": 0,
+ "target_id": 101,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 122,
+ "origin_id": 99,
+ "origin_slot": 0,
+ "target_id": 101,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 123,
+ "origin_id": 101,
+ "origin_slot": 0,
+ "target_id": 102,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 124,
+ "origin_id": 100,
+ "origin_slot": 0,
+ "target_id": 102,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 127,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 96,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 128,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 94,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 129,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 99,
+ "target_slot": 0,
+ "type": "*"
+ },
+ {
+ "id": 130,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 100,
+ "target_slot": 0,
+ "type": "*"
+ },
+ {
+ "id": 131,
+ "origin_id": 97,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 132,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 98,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 133,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 63,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 134,
+ "origin_id": 97,
+ "origin_slot": 0,
+ "target_id": 103,
+ "target_slot": 0,
+ "type": "STRING"
+ }
+ ],
+ "extra": {},
+ "category": "Image generation and editing/Text to image",
+ "description": "Faster ERNIE Image Turbo variant (~8B DiT, distilled for fewer sampling steps): same strengths in Chinese/English on-image text and layout-heavy graphics as the base ERNIE Image lineup, with bundled encoders and VAE."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
diff --git a/blueprints/Text to Image (Ernie Image).json b/blueprints/Text to Image (Ernie Image).json
new file mode 100644
index 000000000..2bab20d69
--- /dev/null
+++ b/blueprints/Text to Image (Ernie Image).json
@@ -0,0 +1,2190 @@
+{
+ "revision": 0,
+ "last_node_id": 88,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 88,
+ "type": "03921aea-a70e-44b4-bc77-f6bda10f2120",
+ "pos": [
+ -120,
+ 240
+ ],
+ "size": [
+ 400,
+ 540
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "prompt",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "prompt_enhancement",
+ "name": "value_1",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value_1"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "label": "prompt_enhancer",
+ "name": "clip_name_1",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name_1"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "78",
+ "value"
+ ],
+ [
+ "76",
+ "value"
+ ],
+ [
+ "71",
+ "width"
+ ],
+ [
+ "71",
+ "height"
+ ],
+ [
+ "70",
+ "steps"
+ ],
+ [
+ "70",
+ "cfg"
+ ],
+ [
+ "70",
+ "seed"
+ ],
+ [
+ "66",
+ "unet_name"
+ ],
+ [
+ "62",
+ "clip_name"
+ ],
+ [
+ "91",
+ "clip_name"
+ ],
+ [
+ "63",
+ "vae_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "value": true,
+ "value_1": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Ernie Image)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "03921aea-a70e-44b4-bc77-f6bda10f2120",
+ "version": 1,
+ "state": {
+ "lastGroupId": 6,
+ "lastNodeId": 99,
+ "lastLinkId": 124,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Ernie Image)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1350,
+ 370,
+ 163.50390625,
+ 260
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1110,
+ 260,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "504de359-52a4-49aa-b6be-23c1cdb0cbde",
+ "name": "value",
+ "type": "STRING",
+ "linkIds": [
+ 102
+ ],
+ "label": "prompt",
+ "pos": [
+ -1206.49609375,
+ 390
+ ]
+ },
+ {
+ "id": "29f699c6-9263-41f6-b37d-69b9fc3913dd",
+ "name": "value_1",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 103
+ ],
+ "label": "prompt_enhancement",
+ "pos": [
+ -1206.49609375,
+ 410
+ ]
+ },
+ {
+ "id": "968e6213-d1e9-4268-8f47-1d6b9a39a43e",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 104,
+ 113
+ ],
+ "pos": [
+ -1206.49609375,
+ 430
+ ]
+ },
+ {
+ "id": "181c49ef-740d-4385-aa11-79718951ccb9",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 105,
+ 114
+ ],
+ "pos": [
+ -1206.49609375,
+ 450
+ ]
+ },
+ {
+ "id": "1e85f808-66a1-41df-be52-334142b35419",
+ "name": "steps",
+ "type": "INT",
+ "linkIds": [
+ 106
+ ],
+ "pos": [
+ -1206.49609375,
+ 470
+ ]
+ },
+ {
+ "id": "2806addf-a252-4aa3-a5b7-397ab36dccec",
+ "name": "cfg",
+ "type": "FLOAT",
+ "linkIds": [
+ 107
+ ],
+ "pos": [
+ -1206.49609375,
+ 490
+ ]
+ },
+ {
+ "id": "5d036a66-5dc0-4d7c-b9a9-349e454738aa",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 108
+ ],
+ "pos": [
+ -1206.49609375,
+ 510
+ ]
+ },
+ {
+ "id": "360f9a40-aac5-4e9c-bc98-9d55a4a58be2",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 109
+ ],
+ "pos": [
+ -1206.49609375,
+ 530
+ ]
+ },
+ {
+ "id": "886301c7-6e88-4cec-96fa-8ae20e8340c5",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 110
+ ],
+ "pos": [
+ -1206.49609375,
+ 550
+ ]
+ },
+ {
+ "id": "1d73a545-6d01-462f-bc61-966d4b918ff2",
+ "name": "clip_name_1",
+ "type": "COMBO",
+ "linkIds": [
+ 120
+ ],
+ "label": "prompt_enhancer",
+ "pos": [
+ -1206.49609375,
+ 570
+ ]
+ },
+ {
+ "id": "8c61dc8c-e260-4b36-b73e-d36f90a0bbe3",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 121
+ ],
+ "pos": [
+ -1206.49609375,
+ 590
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "f4cb34c8-4090-4281-b428-7338a339d274",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 84
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1130,
+ 280
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 71,
+ "type": "EmptyFlux2LatentImage",
+ "pos": [
+ -460,
+ 1040
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 104
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 105
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 80
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptyFlux2LatentImage",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 66,
+ "type": "UNETLoader",
+ "pos": [
+ -470,
+ 320
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 109
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 85
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "ernie-image.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/diffusion_models/ernie-image.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "ernie-image.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 65,
+ "type": "VAEDecode",
+ "pos": [
+ 710,
+ 280
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 73
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 74
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 84
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ }
+ },
+ {
+ "id": 70,
+ "type": "KSampler",
+ "pos": [
+ 350,
+ 280
+ ],
+ "size": [
+ 320,
+ 350
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 85
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 76
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 83
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 80
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 108
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 106
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": 107
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 73
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 182596410725960,
+ "randomize",
+ 20,
+ 4,
+ "euler",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 67,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -140,
+ 320
+ ],
+ "size": [
+ 410,
+ 370
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 79
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 100
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 76
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 72,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -130,
+ 770
+ ],
+ "size": [
+ 390,
+ 140
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 82
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 83
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#223",
+ "bgcolor": "#335"
+ },
+ {
+ "id": 83,
+ "type": "StringReplace",
+ "pos": [
+ -500,
+ -640
+ ],
+ "size": [
+ 430,
+ 450
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 92
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 115
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "StringReplace",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "[SYSTEM_PROMPT]你是一个专业的文生图 Prompt 增强助手。你将收到用户的简短图片描述及目标生成分辨率,请据此扩写为一段内容丰富、细节充分的视觉描述,以帮助文生图模型生成高质量的图片。仅输出增强后的描述,不要包含任何解释或前缀。[/SYSTEM_PROMPT][INST]{\"prompt\": \"{prompt}\", \"width\": {width}, \"height\": {height}}[/INST]",
+ "{prompt}",
+ ""
+ ]
+ },
+ {
+ "id": 78,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ -950,
+ -650
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": 102
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 87,
+ 92
+ ]
+ }
+ ],
+ "title": "String (Multiline - Prompt)",
+ "properties": {
+ "Node name for S&R": "PrimitiveStringMultiline",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 74,
+ "type": "TextGenerate",
+ "pos": [
+ 530,
+ -650
+ ],
+ "size": [
+ 400,
+ 380
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 112
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "prompt",
+ "name": "prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "prompt"
+ },
+ "link": 119
+ },
+ {
+ "localized_name": "max_length",
+ "name": "max_length",
+ "type": "INT",
+ "widget": {
+ "name": "max_length"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampling_mode",
+ "name": "sampling_mode",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "sampling_mode"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temperature",
+ "name": "sampling_mode.temperature",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.temperature"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "top_k",
+ "name": "sampling_mode.top_k",
+ "type": "INT",
+ "widget": {
+ "name": "sampling_mode.top_k"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "top_p",
+ "name": "sampling_mode.top_p",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.top_p"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "min_p",
+ "name": "sampling_mode.min_p",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.min_p"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "repetition_penalty",
+ "name": "sampling_mode.repetition_penalty",
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.repetition_penalty"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "seed",
+ "name": "sampling_mode.seed",
+ "type": "INT",
+ "widget": {
+ "name": "sampling_mode.seed"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampling_mode.presence_penalty",
+ "name": "sampling_mode.presence_penalty",
+ "shape": 7,
+ "type": "FLOAT",
+ "widget": {
+ "name": "sampling_mode.presence_penalty"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "thinking",
+ "name": "thinking",
+ "shape": 7,
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "thinking"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "use_default_template",
+ "name": "use_default_template",
+ "shape": 7,
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "use_default_template"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "generated_text",
+ "name": "generated_text",
+ "type": "STRING",
+ "links": [
+ 89
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TextGenerate",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "",
+ 2048,
+ "on",
+ 0.6,
+ 64,
+ 0.8,
+ 0.05,
+ 1.05,
+ 0,
+ 0,
+ false,
+ true
+ ]
+ },
+ {
+ "id": 76,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -500,
+ 60
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 103
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 88
+ ]
+ }
+ ],
+ "title": "Enable prompt enhancement?",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 75,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 530,
+ 20
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 87
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 89
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 88
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 100,
+ 124
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 62,
+ "type": "CLIPLoader",
+ "pos": [
+ -460,
+ 520
+ ],
+ "size": [
+ 270,
+ 150
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 110
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 79,
+ 82
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "ministral-3-3b.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ministral-3-3b.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "ministral-3-3b.safetensors",
+ "flux2",
+ "default"
+ ]
+ },
+ {
+ "id": 63,
+ "type": "VAELoader",
+ "pos": [
+ -460,
+ 770
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 121
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 74
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAELoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "flux2-vae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/vae/flux2-vae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "flux2-vae.safetensors"
+ ]
+ },
+ {
+ "id": 91,
+ "type": "CLIPLoader",
+ "pos": [
+ -500,
+ -150
+ ],
+ "size": [
+ 510,
+ 150
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 120
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 112
+ ]
+ }
+ ],
+ "title": "Load CLIP (PE)",
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "models": [
+ {
+ "name": "ernie-image-prompt-enhancer.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ERNIE-Image/resolve/main/text_encoders/ernie-image-prompt-enhancer.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "ernie-image-prompt-enhancer.safetensors",
+ "flux2",
+ "default"
+ ]
+ },
+ {
+ "id": 92,
+ "type": "PreviewAny",
+ "pos": [
+ -950,
+ -400
+ ],
+ "size": [
+ 400,
+ 180
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 113
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 116
+ ]
+ }
+ ],
+ "title": "Preview as Text (Int to String)",
+ "properties": {
+ "Node name for S&R": "PreviewAny",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ },
+ {
+ "id": 93,
+ "type": "PreviewAny",
+ "pos": [
+ -950,
+ -180
+ ],
+ "size": [
+ 400,
+ 180
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 114
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 118
+ ]
+ }
+ ],
+ "title": "Preview as Text (Int to String)",
+ "properties": {
+ "Node name for S&R": "PreviewAny",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ },
+ {
+ "id": 94,
+ "type": "StringReplace",
+ "pos": [
+ -30,
+ -640
+ ],
+ "size": [
+ 230,
+ 450
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 115
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 116
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 117
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "StringReplace",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "",
+ "{width}",
+ ""
+ ]
+ },
+ {
+ "id": 95,
+ "type": "StringReplace",
+ "pos": [
+ 220,
+ -640
+ ],
+ "size": [
+ 250,
+ 450
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string",
+ "name": "string",
+ "type": "STRING",
+ "widget": {
+ "name": "string"
+ },
+ "link": 117
+ },
+ {
+ "localized_name": "find",
+ "name": "find",
+ "type": "STRING",
+ "widget": {
+ "name": "find"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "replace",
+ "name": "replace",
+ "type": "STRING",
+ "widget": {
+ "name": "replace"
+ },
+ "link": 118
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 119
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "StringReplace",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "",
+ "{height}",
+ ""
+ ]
+ },
+ {
+ "id": 97,
+ "type": "PreviewAny",
+ "pos": [
+ 970,
+ -650
+ ],
+ "size": [
+ 570,
+ 790
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "*",
+ "link": 124
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": []
+ }
+ ],
+ "title": "Preview as Text (Int to String)",
+ "properties": {
+ "Node name for S&R": "PreviewAny",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ null,
+ null,
+ null
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 6,
+ "title": "Text to Image",
+ "bounding": [
+ -510,
+ 200,
+ 1450,
+ 1060
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Image Size",
+ "bounding": [
+ -480,
+ 940,
+ 310,
+ 290
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ -160,
+ 250,
+ 470,
+ 670
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Model",
+ "bounding": [
+ -490,
+ 250,
+ 320,
+ 670
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 5,
+ "title": "Prompt Enhancement",
+ "bounding": [
+ -510,
+ -720,
+ 1450,
+ 890
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 73,
+ "origin_id": 70,
+ "origin_slot": 0,
+ "target_id": 65,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 74,
+ "origin_id": 63,
+ "origin_slot": 0,
+ "target_id": 65,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 85,
+ "origin_id": 66,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 76,
+ "origin_id": 67,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 83,
+ "origin_id": 72,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 80,
+ "origin_id": 71,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 79,
+ "origin_id": 62,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 100,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 82,
+ "origin_id": 62,
+ "origin_slot": 0,
+ "target_id": 72,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 92,
+ "origin_id": 78,
+ "origin_slot": 0,
+ "target_id": 83,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 87,
+ "origin_id": 78,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 89,
+ "origin_id": 74,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 88,
+ "origin_id": 76,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 84,
+ "origin_id": 65,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 102,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 78,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 103,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 76,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 104,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 71,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 105,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 71,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 106,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 70,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 107,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 70,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 108,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 70,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 109,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 66,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 110,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 62,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 112,
+ "origin_id": 91,
+ "origin_slot": 0,
+ "target_id": 74,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 113,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 92,
+ "target_slot": 0,
+ "type": "*"
+ },
+ {
+ "id": 114,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 93,
+ "target_slot": 0,
+ "type": "*"
+ },
+ {
+ "id": 115,
+ "origin_id": 83,
+ "origin_slot": 0,
+ "target_id": 94,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 116,
+ "origin_id": 92,
+ "origin_slot": 0,
+ "target_id": 94,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 117,
+ "origin_id": 94,
+ "origin_slot": 0,
+ "target_id": 95,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 118,
+ "origin_id": 93,
+ "origin_slot": 0,
+ "target_id": 95,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 119,
+ "origin_id": 95,
+ "origin_slot": 0,
+ "target_id": 74,
+ "target_slot": 2,
+ "type": "STRING"
+ },
+ {
+ "id": 120,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 91,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 121,
+ "origin_id": -10,
+ "origin_slot": 10,
+ "target_id": 63,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 124,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": 97,
+ "target_slot": 0,
+ "type": "STRING"
+ }
+ ],
+ "extra": {},
+ "category": "Image generation and editing/Text to image",
+ "description": "Generates images from text prompts using Baidu’s open ERNIE Image (~8B DiT): bilingual in-image typography and layouts (posters, infographics, multi-panel compositions) alongside general scenes, with bundled encoders and VAE."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
diff --git a/blueprints/Text to Image (Flux.1 Dev).json b/blueprints/Text to Image (Flux.1 Dev).json
new file mode 100644
index 000000000..6d8446e81
--- /dev/null
+++ b/blueprints/Text to Image (Flux.1 Dev).json
@@ -0,0 +1,1047 @@
+{
+ "revision": 0,
+ "last_node_id": 193,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 193,
+ "type": "1fd98b34-59ef-4d8d-afbf-58bdd7a1cd35",
+ "pos": [
+ -1210,
+ -1770
+ ],
+ "size": [
+ 400,
+ 380
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name1",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name1"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name2",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name2"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "45",
+ "text"
+ ],
+ [
+ "27",
+ "width"
+ ],
+ [
+ "27",
+ "height"
+ ],
+ [
+ "31",
+ "seed"
+ ],
+ [
+ "38",
+ "unet_name"
+ ],
+ [
+ "40",
+ "clip_name1"
+ ],
+ [
+ "40",
+ "clip_name2"
+ ],
+ [
+ "39",
+ "vae_name"
+ ],
+ [
+ "31",
+ "control_after_generate"
+ ]
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1"
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Flux.1 Dev)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "1fd98b34-59ef-4d8d-afbf-58bdd7a1cd35",
+ "version": 1,
+ "state": {
+ "lastGroupId": 8,
+ "lastNodeId": 193,
+ "lastLinkId": 388,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Flux.1 Dev)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1090,
+ 411,
+ 120,
+ 200
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 540,
+ 100,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "669e384e-5e26-4291-9bac-e1d1f04b4a16",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 68
+ ],
+ "label": "prompt",
+ "pos": [
+ -990,
+ 431
+ ]
+ },
+ {
+ "id": "5a5c0b01-5836-4ca6-a24f-68c0a4fb9802",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 69
+ ],
+ "pos": [
+ -990,
+ 451
+ ]
+ },
+ {
+ "id": "5e01104a-ed7f-457b-aaee-934e8ecc088d",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 70
+ ],
+ "pos": [
+ -990,
+ 471
+ ]
+ },
+ {
+ "id": "ea5ea317-a484-4605-8138-8628a4b8e502",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 382
+ ],
+ "pos": [
+ -990,
+ 491
+ ]
+ },
+ {
+ "id": "ea2332f5-bd49-4e2e-8c7a-95817dc56ed6",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 385
+ ],
+ "pos": [
+ -990,
+ 511
+ ]
+ },
+ {
+ "id": "4fca3f43-c05f-4337-bf84-2afe67e43739",
+ "name": "clip_name1",
+ "type": "COMBO",
+ "linkIds": [
+ 386
+ ],
+ "pos": [
+ -990,
+ 531
+ ]
+ },
+ {
+ "id": "357a679f-1370-4cd5-9269-0d5ae1986b49",
+ "name": "clip_name2",
+ "type": "COMBO",
+ "linkIds": [
+ 387
+ ],
+ "pos": [
+ -990,
+ 551
+ ]
+ },
+ {
+ "id": "924ffec5-81f8-4585-8761-5a80d5d775bc",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 388
+ ],
+ "pos": [
+ -990,
+ 571
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "2185cb4d-8689-4cf8-b345-75319fb46a8e",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 9
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 560,
+ 120
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 39,
+ "type": "VAELoader",
+ "pos": [
+ -800,
+ 670
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 388
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 58
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "VAELoader",
+ "models": [
+ {
+ "name": "ae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/resolve/main/split_files/vae/ae.safetensors",
+ "directory": "vae"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ae.safetensors"
+ ]
+ },
+ {
+ "id": 38,
+ "type": "UNETLoader",
+ "pos": [
+ -800,
+ 160
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 385
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 61
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "UNETLoader",
+ "models": [
+ {
+ "name": "flux1-dev.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/flux1-dev/resolve/main/flux1-dev.safetensors",
+ "directory": "diffusion_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "flux1-dev.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 40,
+ "type": "DualCLIPLoader",
+ "pos": [
+ -800,
+ 380
+ ],
+ "size": [
+ 270,
+ 180
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name1",
+ "name": "clip_name1",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name1"
+ },
+ "link": 386
+ },
+ {
+ "localized_name": "clip_name2",
+ "name": "clip_name2",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name2"
+ },
+ "link": 387
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 64
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "DualCLIPLoader",
+ "models": [
+ {
+ "name": "clip_l.safetensors",
+ "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/clip_l.safetensors",
+ "directory": "text_encoders"
+ },
+ {
+ "name": "t5xxl_fp16.safetensors",
+ "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "clip_l.safetensors",
+ "t5xxl_fp16.safetensors",
+ "flux",
+ "default"
+ ]
+ },
+ {
+ "id": 27,
+ "type": "EmptySD3LatentImage",
+ "pos": [
+ -420,
+ 640
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 69
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 70
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 51
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "EmptySD3LatentImage"
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 45,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -460,
+ 150
+ ],
+ "size": [
+ 330,
+ 220
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 64
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 68
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 65,
+ 66
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 31,
+ "type": "KSampler",
+ "pos": [
+ -50,
+ 260
+ ],
+ "size": [
+ 320,
+ 350
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 61
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 65
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 63
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 51
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 382
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 52
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 0,
+ "randomize",
+ 20,
+ 1,
+ "euler",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 20,
+ 120
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 52
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 58
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 9
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 42,
+ "type": "ConditioningZeroOut",
+ "pos": [
+ -350,
+ 420
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 66
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 63
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "ConditioningZeroOut"
+ }
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Model",
+ "bounding": [
+ -820,
+ 70,
+ 320,
+ 750
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Image Size",
+ "bounding": [
+ -470,
+ 570,
+ 380,
+ 250
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ -470,
+ 70,
+ 380,
+ 470
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 52,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 58,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 61,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 63,
+ "origin_id": 42,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 51,
+ "origin_id": 27,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 9,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 64,
+ "origin_id": 40,
+ "origin_slot": 0,
+ "target_id": 45,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 65,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 66,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 42,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 68,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 45,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 69,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 27,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 70,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 27,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 382,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 31,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 385,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 38,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 386,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 40,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 387,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 40,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 388,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 39,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Text to image",
+ "description": "Generates images from prompts using FLUX.1 [dev]: a 12B rectified-flow MMDiT with dual CLIP plus T5-XXL text encoders and guidance-distilled sampling for sharp prompt following versus classic DDPM diffusion."
+ }
+ ]
+ },
+ "extra": {
+ "ds": {
+ "scale": 0.7513148009015777,
+ "offset": [
+ 1726.1426909346173,
+ 146.66925047394233
+ ]
+ },
+ "ue_links": []
+ }
+}
diff --git a/blueprints/Text to Image (Flux.1 Krea Dev).json b/blueprints/Text to Image (Flux.1 Krea Dev).json
new file mode 100644
index 000000000..0d7fa03c4
--- /dev/null
+++ b/blueprints/Text to Image (Flux.1 Krea Dev).json
@@ -0,0 +1,1041 @@
+{
+ "revision": 0,
+ "last_node_id": 196,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 196,
+ "type": "aa0a207e-bf0e-477c-a87f-f58fcf5f7749",
+ "pos": [
+ 1010,
+ 130
+ ],
+ "size": [
+ 410,
+ 460
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name1",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name1"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name2",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name2"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "195",
+ "text"
+ ],
+ [
+ "27",
+ "width"
+ ],
+ [
+ "27",
+ "height"
+ ],
+ [
+ "31",
+ "seed"
+ ],
+ [
+ "38",
+ "unet_name"
+ ],
+ [
+ "40",
+ "clip_name1"
+ ],
+ [
+ "40",
+ "clip_name2"
+ ],
+ [
+ "39",
+ "vae_name"
+ ]
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1"
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Flux.1 Krea Dev)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "aa0a207e-bf0e-477c-a87f-f58fcf5f7749",
+ "version": 1,
+ "state": {
+ "lastGroupId": 8,
+ "lastNodeId": 196,
+ "lastLinkId": 395,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Flux.1 Krea Dev)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1050,
+ 426,
+ 120,
+ 200
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 620,
+ 140,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "c2515318-6e10-4ad9-9466-e6aa855bc849",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 71
+ ],
+ "pos": [
+ -950,
+ 446
+ ]
+ },
+ {
+ "id": "09f20672-c8a3-4180-823a-5a6af0113e4f",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 72
+ ],
+ "pos": [
+ -950,
+ 466
+ ]
+ },
+ {
+ "id": "7f54c952-896e-4356-bfb2-970e1c8f2eb7",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 73
+ ],
+ "pos": [
+ -950,
+ 486
+ ]
+ },
+ {
+ "id": "e2dc1c86-2fb4-4b80-b560-f30560af1897",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 391
+ ],
+ "pos": [
+ -950,
+ 506
+ ]
+ },
+ {
+ "id": "34b172e7-85b2-444a-9a4d-1221f272c46e",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 392
+ ],
+ "pos": [
+ -950,
+ 526
+ ]
+ },
+ {
+ "id": "073b7440-d943-4a2f-a3a1-fbdb8fcda9f9",
+ "name": "clip_name1",
+ "type": "COMBO",
+ "linkIds": [
+ 393
+ ],
+ "pos": [
+ -950,
+ 546
+ ]
+ },
+ {
+ "id": "55c1286a-4aca-41fc-b967-ae3d3fa7bc85",
+ "name": "clip_name2",
+ "type": "COMBO",
+ "linkIds": [
+ 394
+ ],
+ "pos": [
+ -950,
+ 566
+ ]
+ },
+ {
+ "id": "2241e4fc-9219-4be7-bf6d-3493b579ab5a",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 395
+ ],
+ "pos": [
+ -950,
+ 586
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "5310184a-f0a2-405f-9917-dd2a352a4fac",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 9
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 640,
+ 160
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 40,
+ "type": "DualCLIPLoader",
+ "pos": [
+ -780,
+ 360
+ ],
+ "size": [
+ 270,
+ 180
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name1",
+ "name": "clip_name1",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name1"
+ },
+ "link": 393
+ },
+ {
+ "localized_name": "clip_name2",
+ "name": "clip_name2",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name2"
+ },
+ "link": 394
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 64
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "DualCLIPLoader",
+ "models": [
+ {
+ "name": "clip_l.safetensors",
+ "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/clip_l.safetensors",
+ "directory": "text_encoders"
+ },
+ {
+ "name": "t5xxl_fp16.safetensors",
+ "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "clip_l.safetensors",
+ "t5xxl_fp16.safetensors",
+ "flux",
+ "default"
+ ]
+ },
+ {
+ "id": 39,
+ "type": "VAELoader",
+ "pos": [
+ -770,
+ 630
+ ],
+ "size": [
+ 240,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 395
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 58
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "VAELoader",
+ "models": [
+ {
+ "name": "ae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Lumina_Image_2.0_Repackaged/resolve/main/split_files/vae/ae.safetensors",
+ "directory": "vae"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ae.safetensors"
+ ]
+ },
+ {
+ "id": 38,
+ "type": "UNETLoader",
+ "pos": [
+ -780,
+ 170
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 392
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 61
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "UNETLoader",
+ "models": [
+ {
+ "name": "flux1-krea-dev_fp8_scaled.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/FLUX.1-Krea-dev_ComfyUI/resolve/main/split_files/diffusion_models/flux1-krea-dev_fp8_scaled.safetensors",
+ "directory": "diffusion_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "flux1-krea-dev_fp8_scaled.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 195,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -440,
+ 180
+ ],
+ "size": [
+ 330,
+ 210
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 64
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 71
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 65,
+ 66
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.47",
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 27,
+ "type": "EmptySD3LatentImage",
+ "pos": [
+ -390,
+ 650
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 72
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 73
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 51
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "EmptySD3LatentImage"
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 31,
+ "type": "KSampler",
+ "pos": [
+ 0,
+ 130
+ ],
+ "size": [
+ 320,
+ 350
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 61
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 65
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 63
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 51
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 391
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 52
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 0,
+ "randomize",
+ 20,
+ 1,
+ "euler",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 340,
+ 140
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 52
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 58
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 9
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 42,
+ "type": "ConditioningZeroOut",
+ "pos": [
+ -340,
+ 430
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 66
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 63
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "Node name for S&R": "ConditioningZeroOut"
+ }
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Model",
+ "bounding": [
+ -800,
+ 90,
+ 310,
+ 750
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Image Size",
+ "bounding": [
+ -460,
+ 560,
+ 400,
+ 280
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ -460,
+ 90,
+ 400,
+ 440
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 66,
+ "origin_id": 195,
+ "origin_slot": 0,
+ "target_id": 42,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 52,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 58,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 61,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 65,
+ "origin_id": 195,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 63,
+ "origin_id": 42,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 51,
+ "origin_id": 27,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 64,
+ "origin_id": 40,
+ "origin_slot": 0,
+ "target_id": 195,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 9,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 71,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 195,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 72,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 27,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 73,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 27,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 391,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 31,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 392,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 38,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 393,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 40,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 394,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 40,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 395,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 39,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Text to image",
+ "description": "FLUX.1 Krea [dev] (Black Forest Labs × Krea): an open-weight 12B rectified-flow text-to-image model and drop-in alternative to FLUX.1 [dev], tuned away from overcooked saturation toward more natural diversity in people, realism, and style while keeping ecosystem compatibility."
+ }
+ ]
+ },
+ "extra": {
+ "ds": {
+ "scale": 0.735584459955559,
+ "offset": [
+ 1936.5815687336737,
+ 303.78330847702625
+ ]
+ },
+ "ue_links": []
+ }
+}
diff --git a/blueprints/Text to Image (Flux.2 Dev).json b/blueprints/Text to Image (Flux.2 Dev).json
new file mode 100644
index 000000000..d5ca3077d
--- /dev/null
+++ b/blueprints/Text to Image (Flux.2 Dev).json
@@ -0,0 +1,1870 @@
+{
+ "revision": 0,
+ "last_node_id": 123,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 123,
+ "type": "85066daf-feda-4c7b-bbc3-d4797e8ccf0f",
+ "pos": [
+ -800,
+ 640
+ ],
+ "size": [
+ 400,
+ 0
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ },
+ {
+ "label": "turbo_lora",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
+ },
+ {
+ "label": "enable_turbo_mode",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "115",
+ "text"
+ ],
+ [
+ "113",
+ "width"
+ ],
+ [
+ "113",
+ "height"
+ ],
+ [
+ "122",
+ "unet_name"
+ ],
+ [
+ "111",
+ "clip_name"
+ ],
+ [
+ "108",
+ "vae_name"
+ ],
+ [
+ "116",
+ "lora_name"
+ ],
+ [
+ "121",
+ "value"
+ ],
+ [
+ "114",
+ "noise_seed"
+ ],
+ [
+ "114",
+ "control_after_generate"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "value": true,
+ "lora_name": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Flux.2 Dev)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "85066daf-feda-4c7b-bbc3-d4797e8ccf0f",
+ "version": 1,
+ "state": {
+ "lastGroupId": 6,
+ "lastNodeId": 123,
+ "lastLinkId": 232,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Flux.2 Dev)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1500,
+ 250,
+ 151.744140625,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1560,
+ -20,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "1f4f1091-3f97-41d8-8ed8-e8b02260cf3c",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 206
+ ],
+ "label": "prompt",
+ "pos": [
+ -1368.255859375,
+ 270
+ ]
+ },
+ {
+ "id": "b9b59411-4f5f-4482-8f78-369e6d50e71c",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 222,
+ 231
+ ],
+ "pos": [
+ -1368.255859375,
+ 290
+ ]
+ },
+ {
+ "id": "c6de9a28-3bf6-40d0-be16-f75ec517a766",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 223,
+ 232
+ ],
+ "pos": [
+ -1368.255859375,
+ 310
+ ]
+ },
+ {
+ "id": "8f1b1c75-e47c-45f5-af57-74abcfe8967c",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 225
+ ],
+ "pos": [
+ -1368.255859375,
+ 330
+ ]
+ },
+ {
+ "id": "6ac27631-1bf0-4161-9670-a662f6180b94",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 226
+ ],
+ "pos": [
+ -1368.255859375,
+ 350
+ ]
+ },
+ {
+ "id": "932e6cbe-f716-4905-ae54-d2b3543497bd",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 227
+ ],
+ "pos": [
+ -1368.255859375,
+ 370
+ ]
+ },
+ {
+ "id": "37400048-5e7b-427b-8b79-ea35841d5306",
+ "name": "lora_name",
+ "type": "COMBO",
+ "linkIds": [
+ 228
+ ],
+ "label": "turbo_lora",
+ "pos": [
+ -1368.255859375,
+ 390
+ ]
+ },
+ {
+ "id": "333212d0-f027-476f-8b97-a921e20e340a",
+ "name": "value",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 229
+ ],
+ "label": "enable_turbo_mode",
+ "pos": [
+ -1368.255859375,
+ 410
+ ]
+ },
+ {
+ "id": "e7e73fad-ce6e-48d5-b719-e2abed685185",
+ "name": "noise_seed",
+ "type": "INT",
+ "linkIds": [
+ 230
+ ],
+ "pos": [
+ -1368.255859375,
+ 430
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ed3c0a0f-a39f-453e-907f-8249c8e3335d",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 9
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1580,
+ 0
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 105,
+ "type": "BasicGuider",
+ "pos": [
+ 570,
+ 170
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 210
+ },
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 165
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "slot_index": 0,
+ "links": [
+ 30
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "BasicGuider",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 106,
+ "type": "FluxGuidance",
+ "pos": [
+ -510,
+ 470
+ ],
+ "size": [
+ 320,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 41
+ },
+ {
+ "localized_name": "guidance",
+ "name": "guidance",
+ "type": "FLOAT",
+ "widget": {
+ "name": "guidance"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 165
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "FluxGuidance",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 4
+ ],
+ "color": "#233",
+ "bgcolor": "#355"
+ },
+ {
+ "id": 107,
+ "type": "KSamplerSelect",
+ "pos": [
+ 570,
+ 350
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 19
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "KSamplerSelect",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "euler"
+ ]
+ },
+ {
+ "id": 108,
+ "type": "VAELoader",
+ "pos": [
+ -1000,
+ 460
+ ],
+ "size": [
+ 300,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 227
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 159
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "full_encoder_small_decoder.safetensors",
+ "url": "https://huggingface.co/black-forest-labs/FLUX.2-small-decoder/resolve/main/full_encoder_small_decoder.safetensors",
+ "directory": "vae"
+ }
+ ]
+ },
+ "widgets_values": [
+ "full_encoder_small_decoder.safetensors"
+ ]
+ },
+ {
+ "id": 109,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 860,
+ -20
+ ],
+ "size": [
+ 280,
+ 330
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 37
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 30
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 19
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 132
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 161
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 24
+ ]
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 110,
+ "type": "VAEDecode",
+ "pos": [
+ 1220,
+ -20
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 24
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 159
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 9
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 111,
+ "type": "CLIPLoader",
+ "pos": [
+ -1000,
+ 200
+ ],
+ "size": [
+ 300,
+ 150
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 226
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 117
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "mistral_3_small_flux2_bf16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/flux2-dev/resolve/main/split_files/text_encoders/mistral_3_small_flux2_bf16.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "mistral_3_small_flux2_bf16.safetensors",
+ "flux2",
+ "default"
+ ]
+ },
+ {
+ "id": 112,
+ "type": "Flux2Scheduler",
+ "pos": [
+ 570,
+ 550
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 213
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 231
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 232
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 132
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "Flux2Scheduler",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 20,
+ 1024,
+ 1024
+ ]
+ },
+ {
+ "id": 113,
+ "type": "EmptyFlux2LatentImage",
+ "pos": [
+ -980,
+ 660
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 222
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 223
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 161
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "EmptyFlux2LatentImage",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 114,
+ "type": "RandomNoise",
+ "pos": [
+ 570,
+ -20
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": 230
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 37
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "RandomNoise",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1027111520328378,
+ "randomize"
+ ]
+ },
+ {
+ "id": 115,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -630,
+ -40
+ ],
+ "size": [
+ 440,
+ 450
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 117
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 206
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 41
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 116,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ -150,
+ 220
+ ],
+ "size": [
+ 300,
+ 140
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 221
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": 228
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 209
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "Flux_2-Turbo-LoRA_comfyui.safetensors",
+ "url": "https://huggingface.co/ByteZSzn/Flux.2-Turbo-ComfyUI/resolve/main/Flux_2-Turbo-LoRA_comfyui.safetensors",
+ "directory": "loras"
+ }
+ ]
+ },
+ "widgets_values": [
+ "Flux_2-Turbo-LoRA_comfyui.safetensors",
+ 1
+ ]
+ },
+ {
+ "id": 117,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 220,
+ -30
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 208
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 209
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 215
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 210
+ ]
+ }
+ ],
+ "title": "Switch(model)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 118,
+ "type": "PrimitiveInt",
+ "pos": [
+ -140,
+ -30
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 211
+ ]
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 20,
+ "fixed"
+ ]
+ },
+ {
+ "id": 119,
+ "type": "PrimitiveInt",
+ "pos": [
+ -150,
+ 460
+ ],
+ "size": [
+ 300,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 212
+ ]
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 8,
+ "fixed"
+ ]
+ },
+ {
+ "id": 120,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 220,
+ 260
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 211
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 212
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 214
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 213
+ ]
+ }
+ ],
+ "title": "Switch(steps)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 121,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -110,
+ 690
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 229
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 214,
+ 215
+ ]
+ }
+ ],
+ "title": "Enable Turbo LoRA",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveBoolean",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 122,
+ "type": "UNETLoader",
+ "pos": [
+ -1000,
+ -30
+ ],
+ "size": [
+ 300,
+ 110
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 225
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 208,
+ 221
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "UNETLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "flux2_dev_fp8mixed.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/flux2-dev/resolve/main/split_files/diffusion_models/flux2_dev_fp8mixed.safetensors",
+ "directory": "diffusion_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "flux2_dev_fp8mixed.safetensors",
+ "default"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Step 1 - Upload models",
+ "bounding": [
+ -1040,
+ -110,
+ 380,
+ 710
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Custom sampler",
+ "bounding": [
+ 540,
+ -110,
+ 640,
+ 870
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Step2 - Prompt",
+ "bounding": [
+ -640,
+ -110,
+ 460,
+ 710
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 5,
+ "title": "Original",
+ "bounding": [
+ -160,
+ -110,
+ 320,
+ 230
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "8 Steps LoRA",
+ "bounding": [
+ -160,
+ 140,
+ 320,
+ 460
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 165,
+ "origin_id": 106,
+ "origin_slot": 0,
+ "target_id": 105,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 41,
+ "origin_id": 115,
+ "origin_slot": 0,
+ "target_id": 106,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 37,
+ "origin_id": 114,
+ "origin_slot": 0,
+ "target_id": 109,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 30,
+ "origin_id": 105,
+ "origin_slot": 0,
+ "target_id": 109,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 19,
+ "origin_id": 107,
+ "origin_slot": 0,
+ "target_id": 109,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 132,
+ "origin_id": 112,
+ "origin_slot": 0,
+ "target_id": 109,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 161,
+ "origin_id": 113,
+ "origin_slot": 0,
+ "target_id": 109,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 117,
+ "origin_id": 111,
+ "origin_slot": 0,
+ "target_id": 115,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 24,
+ "origin_id": 109,
+ "origin_slot": 0,
+ "target_id": 110,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 159,
+ "origin_id": 108,
+ "origin_slot": 0,
+ "target_id": 110,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 9,
+ "origin_id": 110,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 206,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 115,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 208,
+ "origin_id": 122,
+ "origin_slot": 0,
+ "target_id": 117,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 209,
+ "origin_id": 116,
+ "origin_slot": 0,
+ "target_id": 117,
+ "target_slot": 1,
+ "type": "MODEL"
+ },
+ {
+ "id": 210,
+ "origin_id": 117,
+ "origin_slot": 0,
+ "target_id": 105,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 211,
+ "origin_id": 118,
+ "origin_slot": 0,
+ "target_id": 120,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 212,
+ "origin_id": 119,
+ "origin_slot": 0,
+ "target_id": 120,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 213,
+ "origin_id": 120,
+ "origin_slot": 0,
+ "target_id": 112,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 214,
+ "origin_id": 121,
+ "origin_slot": 0,
+ "target_id": 120,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 215,
+ "origin_id": 121,
+ "origin_slot": 0,
+ "target_id": 117,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 221,
+ "origin_id": 122,
+ "origin_slot": 0,
+ "target_id": 116,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 222,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 113,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 223,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 113,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 225,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 122,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 226,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 111,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 227,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 108,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 228,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 116,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 229,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 121,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 230,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 114,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 231,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 112,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 232,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 112,
+ "target_slot": 2,
+ "type": "INT"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Text to image",
+ "description": "Generates images from prompts using FLUX.2 [dev]: a newer 32B rectified-flow stack with distilled guidance plus a stronger long-context multimodal encoder for complex scenes, sharper typography/UI text, anatomy, lighting, and high-resolution latent decoding."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
diff --git a/blueprints/Text to Image (NetaYume Lumina).json b/blueprints/Text to Image (NetaYume Lumina).json
new file mode 100644
index 000000000..9e11b7a86
--- /dev/null
+++ b/blueprints/Text to Image (NetaYume Lumina).json
@@ -0,0 +1,1470 @@
+{
+ "revision": 0,
+ "last_node_id": 219,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 219,
+ "type": "fc9485c9-2acd-482e-94f1-b5fa702f2536",
+ "pos": [
+ -1900,
+ 2330
+ ],
+ "size": [
+ 400,
+ 540
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "62",
+ "value"
+ ],
+ [
+ "53",
+ "width"
+ ],
+ [
+ "53",
+ "height"
+ ],
+ [
+ "55",
+ "seed"
+ ],
+ [
+ "56",
+ "ckpt_name"
+ ],
+ [
+ "55",
+ "control_after_generate"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [],
+ "title": "Text to Image (NetaYume Lumina)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "fc9485c9-2acd-482e-94f1-b5fa702f2536",
+ "version": 1,
+ "state": {
+ "lastGroupId": 8,
+ "lastNodeId": 219,
+ "lastLinkId": 395,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (NetaYume Lumina)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -600,
+ 90,
+ 120,
+ 140
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1740.333330193419,
+ 286.3333328495138,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "b80a1e0c-e8a6-4c4f-8eb1-825cb7e4fdcf",
+ "name": "value",
+ "type": "STRING",
+ "linkIds": [
+ 36
+ ],
+ "pos": [
+ -500,
+ 110
+ ]
+ },
+ {
+ "id": "6583bb32-7cff-4921-a771-1f0dcdf779e6",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 39
+ ],
+ "pos": [
+ -500,
+ 130
+ ]
+ },
+ {
+ "id": "c486937a-46c0-431b-8775-057897843cbd",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 40
+ ],
+ "pos": [
+ -500,
+ 150
+ ]
+ },
+ {
+ "id": "9c85c0cc-c906-405a-a4d9-43b93c47cb53",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 42
+ ],
+ "pos": [
+ -500,
+ 170
+ ]
+ },
+ {
+ "id": "f7e288ec-fa1f-4a1d-b721-6b605de9cb51",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 43
+ ],
+ "pos": [
+ -500,
+ 190
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ea4b872b-a294-4cbf-99a9-70e55c0f8b3e",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 16
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1760.333330193419,
+ 306.3333328495138
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 53,
+ "type": "EmptySD3LatentImage",
+ "pos": [
+ -220,
+ 370
+ ],
+ "size": [
+ 320,
+ 170
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 39
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 40
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 17
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "EmptySD3LatentImage"
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 54,
+ "type": "ModelSamplingAuraFlow",
+ "pos": [
+ 650,
+ 40
+ ],
+ "size": [
+ 310,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 12
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 13
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ModelSamplingAuraFlow"
+ },
+ "widgets_values": [
+ 4
+ ]
+ },
+ {
+ "id": 55,
+ "type": "KSampler",
+ "pos": [
+ 650,
+ 200
+ ],
+ "size": [
+ 320,
+ 350
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 13
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 32
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 23
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 17
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 42
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 14
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "KSampler"
+ },
+ "widgets_values": [
+ 0,
+ "randomize",
+ 30,
+ 4,
+ "res_multistep",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 56,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -220,
+ 70
+ ],
+ "size": [
+ 320,
+ 160
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 43
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 12
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "slot_index": 1,
+ "links": [
+ 22,
+ 35
+ ]
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 2,
+ "links": [
+ 8
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "models": [
+ {
+ "name": "NetaYumev35_pretrained_all_in_one.safetensors",
+ "url": "https://huggingface.co/duongve/NetaYume-Lumina-Image-2.0/resolve/main/NetaYumev35_pretrained_all_in_one.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "NetaYumev35_pretrained_all_in_one.safetensors"
+ ]
+ },
+ {
+ "id": 57,
+ "type": "a07fdf06-1bda-4dac-bdbd-63ee8ebca1c9",
+ "pos": [
+ 180,
+ 360
+ ],
+ "size": [
+ 400,
+ 140
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 22
+ },
+ {
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 23
+ ]
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "218",
+ "value"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [],
+ "color": "#223",
+ "bgcolor": "#335"
+ },
+ {
+ "id": 217,
+ "type": "VAEDecode",
+ "pos": [
+ 1040,
+ 210
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 14
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 8
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 16
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEDecode"
+ }
+ },
+ {
+ "id": 59,
+ "type": "MarkdownNote",
+ "pos": [
+ 640,
+ -390
+ ],
+ "size": [
+ 370,
+ 280
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [],
+ "title": "Note: Prompt",
+ "properties": {},
+ "widgets_values": [
+ "Check the prompt book [here](https://nieta-art.feishu.cn/wiki/RY3GwpT59icIQlkWXEfcCqIMnQd)\n\nYou should keep the prefix part fixed until the **Prompt Start** tag\n\n@whatever in the prompt is for artist tags, such as @comfyanonymous\n\nYou can find more artist tags [here](https://gumgum10.github.io/gumgum.github.io/)\n"
+ ],
+ "color": "#222",
+ "bgcolor": "#000"
+ },
+ {
+ "id": 60,
+ "type": "StringConcatenate",
+ "pos": [
+ 170,
+ -370
+ ],
+ "size": [
+ 400,
+ 250
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string_a",
+ "name": "string_a",
+ "type": "STRING",
+ "widget": {
+ "name": "string_a"
+ },
+ "link": 30
+ },
+ {
+ "localized_name": "string_b",
+ "name": "string_b",
+ "type": "STRING",
+ "widget": {
+ "name": "string_b"
+ },
+ "link": 31
+ },
+ {
+ "localized_name": "delimiter",
+ "name": "delimiter",
+ "type": "STRING",
+ "widget": {
+ "name": "delimiter"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 34
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.70",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "StringConcatenate"
+ },
+ "widgets_values": [
+ "",
+ "",
+ ""
+ ]
+ },
+ {
+ "id": 61,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 170,
+ 60
+ ],
+ "size": [
+ 430,
+ 190
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 35
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 34
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 32
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 62,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ -240,
+ -210
+ ],
+ "size": [
+ 370,
+ 140
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": 36
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 31
+ ]
+ }
+ ],
+ "title": "Prompt",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.70",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveStringMultiline"
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 63,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ -240,
+ -390
+ ],
+ "size": [
+ 370,
+ 140
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 30
+ ]
+ }
+ ],
+ "title": "System prompt",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.70",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveStringMultiline"
+ },
+ "widgets_values": [
+ "You are an assistant designed to generate high quality anime images based on textual prompts. "
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Model",
+ "bounding": [
+ -250,
+ -30,
+ 370,
+ 280
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Image Size",
+ "bounding": [
+ -250,
+ 280,
+ 370,
+ 290
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ 150,
+ -30,
+ 460,
+ 600
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Prompt Builder",
+ "bounding": [
+ -250,
+ -460,
+ 840,
+ 400
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 12,
+ "origin_id": 56,
+ "origin_slot": 0,
+ "target_id": 54,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 13,
+ "origin_id": 54,
+ "origin_slot": 0,
+ "target_id": 55,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 23,
+ "origin_id": 57,
+ "origin_slot": 0,
+ "target_id": 55,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 17,
+ "origin_id": 53,
+ "origin_slot": 0,
+ "target_id": 55,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 14,
+ "origin_id": 55,
+ "origin_slot": 0,
+ "target_id": 217,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 8,
+ "origin_id": 56,
+ "origin_slot": 2,
+ "target_id": 217,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 22,
+ "origin_id": 56,
+ "origin_slot": 1,
+ "target_id": 57,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 16,
+ "origin_id": 217,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 30,
+ "origin_id": 63,
+ "origin_slot": 0,
+ "target_id": 60,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 31,
+ "origin_id": 62,
+ "origin_slot": 0,
+ "target_id": 60,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 32,
+ "origin_id": 61,
+ "origin_slot": 0,
+ "target_id": 55,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 34,
+ "origin_id": 60,
+ "origin_slot": 0,
+ "target_id": 61,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 35,
+ "origin_id": 56,
+ "origin_slot": 1,
+ "target_id": 61,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 36,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 62,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 39,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 53,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 40,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 53,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 42,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 55,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 43,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 56,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Text to image",
+ "description": "Generates images from text prompts using NetaYume Lumina, fine-tuned from Neta Lumina for anime-style and illustration generation."
+ },
+ {
+ "id": "a07fdf06-1bda-4dac-bdbd-63ee8ebca1c9",
+ "version": 1,
+ "state": {
+ "lastGroupId": 8,
+ "lastNodeId": 219,
+ "lastLinkId": 395,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "CLIP Text Encode (Negative Prompt)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -150,
+ 675,
+ 120,
+ 80
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 905.2780151367188,
+ 675,
+ 128.6640625,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "47264a97-6fc9-454d-920f-b8a43fee0489",
+ "name": "clip",
+ "type": "CLIP",
+ "linkIds": [
+ 5
+ ],
+ "localized_name": "clip",
+ "pos": [
+ -50,
+ 695
+ ]
+ },
+ {
+ "id": "7cdb7919-1dad-4bd2-928d-c543c3fd712e",
+ "name": "value",
+ "type": "STRING",
+ "linkIds": [
+ 22
+ ],
+ "pos": [
+ -50,
+ 715
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "c3f17ad9-6954-4333-bf8e-e1cf886c351b",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "linkIds": [
+ 6
+ ],
+ "localized_name": "CONDITIONING",
+ "pos": [
+ 925.2780151367188,
+ 695
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 64,
+ "type": "StringConcatenate",
+ "pos": [
+ 420,
+ 720
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "string_a",
+ "name": "string_a",
+ "type": "STRING",
+ "widget": {
+ "name": "string_a"
+ },
+ "link": 19
+ },
+ {
+ "localized_name": "string_b",
+ "name": "string_b",
+ "type": "STRING",
+ "widget": {
+ "name": "string_b"
+ },
+ "link": 20
+ },
+ {
+ "localized_name": "delimiter",
+ "name": "delimiter",
+ "type": "STRING",
+ "widget": {
+ "name": "delimiter"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 21
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.70",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "StringConcatenate"
+ },
+ "widgets_values": [
+ "",
+ "",
+ ""
+ ]
+ },
+ {
+ "id": 65,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ 30,
+ 720
+ ],
+ "size": [
+ 370,
+ 130
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 19
+ ]
+ }
+ ],
+ "title": "System prompt",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.70",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveStringMultiline"
+ },
+ "widgets_values": [
+ "You are an assistant designed to generate low-quality images based on textual prompts "
+ ]
+ },
+ {
+ "id": 218,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ 30,
+ 900
+ ],
+ "size": [
+ 370,
+ 130
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": 22
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 20
+ ]
+ }
+ ],
+ "title": "System prompt",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.70",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveStringMultiline"
+ },
+ "widgets_values": [
+ "blurry, worst quality, low quality, jpeg artifacts, signature, watermark, username, error, deformed hands, bad anatomy, extra limbs, poorly drawn hands, poorly drawn face, mutation, deformed, extra eyes, extra arms, extra legs, malformed limbs, fused fingers, too many fingers, long neck, cross-eyed, bad proportions, missing arms, missing legs, extra digit, fewer digits, cropped"
+ ]
+ },
+ {
+ "id": 67,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 420,
+ 410
+ ],
+ "size": [
+ 430,
+ 190
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 5
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 21
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 6
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Negative Prompt)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode"
+ },
+ "widgets_values": [
+ "You are an assistant designed to generate low-quality images based on textual prompts blurry, worst quality, low quality, jpeg artifacts, signature, watermark, username, error, deformed hands, bad anatomy, extra limbs, poorly drawn hands, poorly drawn face, mutation, deformed, extra eyes, extra arms, extra legs, malformed limbs, fused fingers, too many fingers, long neck, cross-eyed, bad proportions, missing arms, missing legs, extra digit, fewer digits, cropped"
+ ],
+ "color": "#223",
+ "bgcolor": "#335"
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 19,
+ "origin_id": 65,
+ "origin_slot": 0,
+ "target_id": 64,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 20,
+ "origin_id": 218,
+ "origin_slot": 0,
+ "target_id": 64,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 21,
+ "origin_id": 64,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 5,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 6,
+ "origin_id": 67,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 22,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 218,
+ "target_slot": 0,
+ "type": "STRING"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "description": "Encodes a negative text prompt via CLIP for classifier-free guidance in anime-style generation (NetaYume Lumina)."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
diff --git a/blueprints/Text to Image (Qwen-Image 2512).json b/blueprints/Text to Image (Qwen-Image 2512).json
new file mode 100644
index 000000000..09612be8b
--- /dev/null
+++ b/blueprints/Text to Image (Qwen-Image 2512).json
@@ -0,0 +1,1952 @@
+{
+ "revision": 0,
+ "last_node_id": 263,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 263,
+ "type": "fd6ee5f8-a0a9-487a-8b44-8cb65957532a",
+ "pos": [
+ 750,
+ 760
+ ],
+ "size": [
+ 400,
+ 0
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "label": "enable_turbo_mode",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ },
+ {
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "249",
+ "text"
+ ],
+ [
+ "252",
+ "width"
+ ],
+ [
+ "252",
+ "height"
+ ],
+ [
+ "256",
+ "value"
+ ],
+ [
+ "253",
+ "seed"
+ ],
+ [
+ "248",
+ "unet_name"
+ ],
+ [
+ "245",
+ "clip_name"
+ ],
+ [
+ "246",
+ "vae_name"
+ ],
+ [
+ "259",
+ "lora_name"
+ ]
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "value": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.4",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Qwen-Image 2512)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "fd6ee5f8-a0a9-487a-8b44-8cb65957532a",
+ "version": 1,
+ "state": {
+ "lastGroupId": 7,
+ "lastNodeId": 263,
+ "lastLinkId": 375,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Qwen-Image 2512)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1080,
+ 1480,
+ 151.744140625,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1550,
+ 1460,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "74d26021-a723-4a90-8e33-5d805a7e5deb",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 360
+ ],
+ "pos": [
+ -948.255859375,
+ 1500
+ ]
+ },
+ {
+ "id": "b55f69e6-c7cb-4641-9e1f-2cb1c1942ed0",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 361
+ ],
+ "pos": [
+ -948.255859375,
+ 1520
+ ]
+ },
+ {
+ "id": "3e80284d-aba3-43cd-ab7b-ac2a619ef18c",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 362
+ ],
+ "pos": [
+ -948.255859375,
+ 1540
+ ]
+ },
+ {
+ "id": "de06e137-6cec-4cb3-a6bb-737022310a7b",
+ "name": "value",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 370
+ ],
+ "label": "enable_turbo_mode",
+ "pos": [
+ -948.255859375,
+ 1560
+ ]
+ },
+ {
+ "id": "9e500dee-a5b9-481b-ac46-64bab4bd3530",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 371
+ ],
+ "pos": [
+ -948.255859375,
+ 1580
+ ]
+ },
+ {
+ "id": "33422b12-24e5-41c6-96fc-f9a8dadd5d94",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 372
+ ],
+ "pos": [
+ -948.255859375,
+ 1600
+ ]
+ },
+ {
+ "id": "5cf753e4-236e-468e-9a06-6b8e238badc8",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 373
+ ],
+ "pos": [
+ -948.255859375,
+ 1620
+ ]
+ },
+ {
+ "id": "790e775c-a639-4e5f-9007-e2ee6764dc5e",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 374
+ ],
+ "pos": [
+ -948.255859375,
+ 1640
+ ]
+ },
+ {
+ "id": "3ebed521-3fe9-4922-ae26-2483e03d9305",
+ "name": "lora_name",
+ "type": "COMBO",
+ "linkIds": [
+ 375
+ ],
+ "pos": [
+ -948.255859375,
+ 1660
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "7db1f9e2-40ee-4f9f-bb24-a0db7b96d45e",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 333
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1570,
+ 1480
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 245,
+ "type": "CLIPLoader",
+ "pos": [
+ -590,
+ 1370
+ ],
+ "size": [
+ 280,
+ 150
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 373
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "slot_index": 0,
+ "links": [
+ 314,
+ 315
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "CLIPLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "directory": "text_encoders"
+ },
+ {
+ "name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "qwen_image",
+ "default"
+ ]
+ },
+ {
+ "id": 246,
+ "type": "VAELoader",
+ "pos": [
+ -580,
+ 1620
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 374
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 323
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "VAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "qwen_image_vae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/vae/qwen_image_vae.safetensors",
+ "directory": "vae"
+ },
+ {
+ "name": "qwen_image_vae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/vae/qwen_image_vae.safetensors",
+ "directory": "vae"
+ }
+ ]
+ },
+ "widgets_values": [
+ "qwen_image_vae.safetensors"
+ ]
+ },
+ {
+ "id": 247,
+ "type": "ModelSamplingAuraFlow",
+ "pos": [
+ 1040,
+ 1110
+ ],
+ "size": [
+ 250,
+ 110
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 367
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 316
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "ModelSamplingAuraFlow",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 3.1000000000000005
+ ]
+ },
+ {
+ "id": 248,
+ "type": "UNETLoader",
+ "pos": [
+ -590,
+ 1140
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 372
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 312,
+ 324
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "UNETLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "qwen_image_2512_fp8_e4m3fn.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/diffusion_models/qwen_image_2512_fp8_e4m3fn.safetensors",
+ "directory": "diffusion_models"
+ },
+ {
+ "name": "qwen_image_2512_fp8_e4m3fn.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/diffusion_models/qwen_image_2512_fp8_e4m3fn.safetensors",
+ "directory": "diffusion_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "qwen_image_2512_fp8_e4m3fn.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 249,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -200,
+ 1140
+ ],
+ "size": [
+ 360,
+ 420
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 314
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 360
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 317
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 250,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -200,
+ 1610
+ ],
+ "size": [
+ 370,
+ 170
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 315
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 318
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Negative Prompt)",
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "低分辨率,低画质,肢体畸形,手指畸形,画面过饱和,蜡像感,人脸无细节,过度光滑,画面具有AI感。构图混乱。文字模糊,扭曲"
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 251,
+ "type": "VAEDecode",
+ "pos": [
+ 1320,
+ 1120
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 322
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 323
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 333
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "VAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 252,
+ "type": "EmptySD3LatentImage",
+ "pos": [
+ -550,
+ 1930
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 361
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 362
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 319
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "EmptySD3LatentImage",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1328,
+ 1328,
+ 1
+ ]
+ },
+ {
+ "id": 253,
+ "type": "KSampler",
+ "pos": [
+ 1040,
+ 1250
+ ],
+ "size": [
+ 250,
+ 350
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 316
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 317
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 318
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 319
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 371
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 368
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": 369
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 322
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "Node name for S&R": "KSampler",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 464857551335368,
+ "randomize",
+ 50,
+ 4,
+ "euler",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 254,
+ "type": "PrimitiveInt",
+ "pos": [
+ 300,
+ 1150
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 355
+ ]
+ }
+ ],
+ "title": "Int (Steps)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 50,
+ "fixed"
+ ]
+ },
+ {
+ "id": 255,
+ "type": "PrimitiveFloat",
+ "pos": [
+ 300,
+ 1290
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 357
+ ]
+ }
+ ],
+ "title": "Float (CFG)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "PrimitiveFloat",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 4
+ ]
+ },
+ {
+ "id": 256,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ 300,
+ 2060
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 370
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 326,
+ 358,
+ 359
+ ]
+ }
+ ],
+ "title": "Enable 4 Steps LoRA?",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "PrimitiveBoolean",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 257,
+ "type": "PrimitiveInt",
+ "pos": [
+ 290,
+ 1540
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 347,
+ 354
+ ]
+ }
+ ],
+ "title": "Int (Steps)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 4,
+ "fixed"
+ ]
+ },
+ {
+ "id": 258,
+ "type": "PrimitiveFloat",
+ "pos": [
+ 290,
+ 1670
+ ],
+ "size": [
+ 230,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 356
+ ]
+ }
+ ],
+ "title": "Float (CFG)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "PrimitiveFloat",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 259,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ 240,
+ 1820
+ ],
+ "size": [
+ 330,
+ 140
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 312
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": 375
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 325
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.49",
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "Qwen-Image-2512-Lightning-4steps-V1.0-fp32.safetensors",
+ "url": "https://huggingface.co/lightx2v/Qwen-Image-2512-Lightning/resolve/main/Qwen-Image-2512-Lightning-4steps-V1.0-fp32.safetensors",
+ "directory": "loras"
+ }
+ ]
+ },
+ "widgets_values": [
+ "Qwen-Image-2512-Lightning-4steps-V1.0-fp32.safetensors",
+ 1
+ ]
+ },
+ {
+ "id": 260,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 710,
+ 1170
+ ],
+ "size": [
+ 230,
+ 130
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 324
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 325
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 326
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 367
+ ]
+ }
+ ],
+ "title": "Switch (model)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 261,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 710,
+ 1420
+ ],
+ "size": [
+ 230,
+ 130
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 355
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 354
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 359
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 368
+ ]
+ }
+ ],
+ "title": "Switch (steps)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 262,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 710,
+ 1660
+ ],
+ "size": [
+ 230,
+ 130
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 357
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 356
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 358
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 369
+ ]
+ }
+ ],
+ "title": "Switch (cfg)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.12.3",
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Model",
+ "bounding": [
+ -640,
+ 1060,
+ 390,
+ 740
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Image size",
+ "bounding": [
+ -630,
+ 1830,
+ 380,
+ 290
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ -220,
+ 1060,
+ 400,
+ 740
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 5,
+ "title": "4-steps LoRA",
+ "bounding": [
+ 210,
+ 1460,
+ 410,
+ 550
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "Original Settings",
+ "bounding": [
+ 210,
+ 1060,
+ 420,
+ 370
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Switch",
+ "bounding": [
+ 660,
+ 1060,
+ 320,
+ 750
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 312,
+ "origin_id": 248,
+ "origin_slot": 0,
+ "target_id": 259,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 314,
+ "origin_id": 245,
+ "origin_slot": 0,
+ "target_id": 249,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 315,
+ "origin_id": 245,
+ "origin_slot": 0,
+ "target_id": 250,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 322,
+ "origin_id": 253,
+ "origin_slot": 0,
+ "target_id": 251,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 323,
+ "origin_id": 246,
+ "origin_slot": 0,
+ "target_id": 251,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 316,
+ "origin_id": 247,
+ "origin_slot": 0,
+ "target_id": 253,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 317,
+ "origin_id": 249,
+ "origin_slot": 0,
+ "target_id": 253,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 318,
+ "origin_id": 250,
+ "origin_slot": 0,
+ "target_id": 253,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 319,
+ "origin_id": 252,
+ "origin_slot": 0,
+ "target_id": 253,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 324,
+ "origin_id": 248,
+ "origin_slot": 0,
+ "target_id": 260,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 325,
+ "origin_id": 259,
+ "origin_slot": 0,
+ "target_id": 260,
+ "target_slot": 1,
+ "type": "MODEL"
+ },
+ {
+ "id": 326,
+ "origin_id": 256,
+ "origin_slot": 0,
+ "target_id": 260,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 333,
+ "origin_id": 251,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 347,
+ "origin_id": 257,
+ "origin_slot": 0,
+ "target_id": 253,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 354,
+ "origin_id": 257,
+ "origin_slot": 0,
+ "target_id": 261,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 355,
+ "origin_id": 254,
+ "origin_slot": 0,
+ "target_id": 261,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 356,
+ "origin_id": 258,
+ "origin_slot": 0,
+ "target_id": 262,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 357,
+ "origin_id": 255,
+ "origin_slot": 0,
+ "target_id": 262,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 358,
+ "origin_id": 256,
+ "origin_slot": 0,
+ "target_id": 262,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 359,
+ "origin_id": 256,
+ "origin_slot": 0,
+ "target_id": 261,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 360,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 249,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 361,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 252,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 362,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 252,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 367,
+ "origin_id": 260,
+ "origin_slot": 0,
+ "target_id": 247,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 368,
+ "origin_id": 261,
+ "origin_slot": 0,
+ "target_id": 253,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 369,
+ "origin_id": 262,
+ "origin_slot": 0,
+ "target_id": 253,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 370,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 256,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 371,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 253,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 372,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 248,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 373,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 245,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 374,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 246,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 375,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 259,
+ "target_slot": 1,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "Vue-corrected"
+ },
+ "category": "Image generation and editing/Text to image",
+ "description": "Generates images from text prompts using Qwen-Image-2512, with enhanced human realism and finer natural detail over the base version."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Text to Image (Qwen-Image).json b/blueprints/Text to Image (Qwen-Image).json
new file mode 100644
index 000000000..e78d5a962
--- /dev/null
+++ b/blueprints/Text to Image (Qwen-Image).json
@@ -0,0 +1,1882 @@
+{
+ "revision": 0,
+ "last_node_id": 76,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 76,
+ "type": "e5cfe5ba-2ae0-4bc4-869f-ab2228cb44d3",
+ "pos": [
+ 30,
+ 10
+ ],
+ "size": [
+ 470,
+ 660
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ },
+ {
+ "label": "lightning_lora",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
+ },
+ {
+ "label": "enable_turbo_mode",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "6",
+ "text"
+ ],
+ [
+ "58",
+ "width"
+ ],
+ [
+ "58",
+ "height"
+ ],
+ [
+ "3",
+ "seed"
+ ],
+ [
+ "37",
+ "unet_name"
+ ],
+ [
+ "38",
+ "clip_name"
+ ],
+ [
+ "39",
+ "vae_name"
+ ],
+ [
+ "73",
+ "lora_name"
+ ],
+ [
+ "86",
+ "value"
+ ],
+ [
+ "3",
+ "control_after_generate"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "text": true,
+ "lora_name": true,
+ "value": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Qwen-Image)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "e5cfe5ba-2ae0-4bc4-869f-ab2228cb44d3",
+ "version": 1,
+ "state": {
+ "lastGroupId": 5,
+ "lastNodeId": 87,
+ "lastLinkId": 153,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Qwen-Image)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -810,
+ 290,
+ 151.744140625,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 2580,
+ 340,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "846fd1a5-9f4a-4e83-af40-27cafe99e5c6",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 132
+ ],
+ "label": "prompt",
+ "pos": [
+ -678.255859375,
+ 310
+ ]
+ },
+ {
+ "id": "e941d29f-bb7f-4001-a956-90a9b29ae9f9",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 134
+ ],
+ "pos": [
+ -678.255859375,
+ 330
+ ]
+ },
+ {
+ "id": "df798f50-87ba-481b-b847-ca8b7c7efff3",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 135
+ ],
+ "pos": [
+ -678.255859375,
+ 350
+ ]
+ },
+ {
+ "id": "3fcf7667-f697-43ee-bdee-0d3fed39e777",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 136
+ ],
+ "pos": [
+ -678.255859375,
+ 370
+ ]
+ },
+ {
+ "id": "e8d70f26-d9f5-4633-a39e-0bf6cf93d566",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 137
+ ],
+ "pos": [
+ -678.255859375,
+ 390
+ ]
+ },
+ {
+ "id": "8c9b537a-c6c9-4365-96ad-dbbb82d917e0",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 138
+ ],
+ "pos": [
+ -678.255859375,
+ 410
+ ]
+ },
+ {
+ "id": "7cc2f92b-6e2f-4e4e-a316-b61f58ed1442",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 139
+ ],
+ "pos": [
+ -678.255859375,
+ 430
+ ]
+ },
+ {
+ "id": "3cb1ba7c-583c-4f92-afc1-71463161e2a4",
+ "name": "lora_name",
+ "type": "COMBO",
+ "linkIds": [
+ 140
+ ],
+ "label": "lightning_lora",
+ "pos": [
+ -678.255859375,
+ 450
+ ]
+ },
+ {
+ "id": "4278102d-766c-4c6b-af2e-0fb9f26bbb27",
+ "name": "value",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 153
+ ],
+ "label": "enable_turbo_mode",
+ "pos": [
+ -678.255859375,
+ 470
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "2af20250-dc7a-4643-bc84-0a180d9ca62b",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 110
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 2600,
+ 360
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 39,
+ "type": "VAELoader",
+ "pos": [
+ -260,
+ 510
+ ],
+ "size": [
+ 330,
+ 110
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 139
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 76
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAELoader",
+ "models": [
+ {
+ "name": "qwen_image_vae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/vae/qwen_image_vae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_image_vae.safetensors"
+ ]
+ },
+ {
+ "id": 38,
+ "type": "CLIPLoader",
+ "pos": [
+ -260,
+ 280
+ ],
+ "size": [
+ 330,
+ 150
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 138
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "slot_index": 0,
+ "links": [
+ 74,
+ 75
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPLoader",
+ "models": [
+ {
+ "name": "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/text_encoders/qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_2.5_vl_7b_fp8_scaled.safetensors",
+ "qwen_image",
+ "default"
+ ]
+ },
+ {
+ "id": 58,
+ "type": "EmptySD3LatentImage",
+ "pos": [
+ -240,
+ 810
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 134
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 135
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 107
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "EmptySD3LatentImage",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1328,
+ 1328,
+ 1
+ ]
+ },
+ {
+ "id": 66,
+ "type": "ModelSamplingAuraFlow",
+ "pos": [
+ 1780,
+ 180
+ ],
+ "size": [
+ 300,
+ 110
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 147
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 125
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ModelSamplingAuraFlow",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 3.1000000000000005
+ ]
+ },
+ {
+ "id": 37,
+ "type": "UNETLoader",
+ "pos": [
+ -260,
+ 80
+ ],
+ "size": [
+ 330,
+ 110
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 137
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 129,
+ 142
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "UNETLoader",
+ "models": [
+ {
+ "name": "qwen_image_fp8_e4m3fn.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Qwen-Image_ComfyUI/resolve/main/split_files/diffusion_models/qwen_image_fp8_e4m3fn.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_image_fp8_e4m3fn.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 6,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 120,
+ 60
+ ],
+ "size": [
+ 440,
+ 340
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 74
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 132
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 46
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 7,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 130,
+ 480
+ ],
+ "size": [
+ 430,
+ 180
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 75
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 52
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Negative Prompt)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#322",
+ "bgcolor": "#533"
+ },
+ {
+ "id": 8,
+ "type": "VAEDecode",
+ "pos": [
+ 2190,
+ 350
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 128
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 76
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 110
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 73,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ 670,
+ 500
+ ],
+ "size": [
+ 400,
+ 140
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 129
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": 140
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 141
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.49",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "models": [
+ {
+ "name": "Qwen-Image-Lightning-8steps-V1.0.safetensors",
+ "url": "https://huggingface.co/lightx2v/Qwen-Image-Lightning/resolve/main/Qwen-Image-Lightning-8steps-V1.0.safetensors",
+ "directory": "loras"
+ }
+ ]
+ },
+ "widgets_values": [
+ "Qwen-Image-Lightning-8steps-V1.0.safetensors",
+ 1
+ ]
+ },
+ {
+ "id": 3,
+ "type": "KSampler",
+ "pos": [
+ 1780,
+ 330
+ ],
+ "size": [
+ 300,
+ 480
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 125
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 46
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 52
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 107
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 136
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 148
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": 149
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 128
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.48",
+ "ue_properties": {
+ "version": "7.7",
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "KSampler",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 50347169638278,
+ "randomize",
+ 8,
+ 1,
+ "euler",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 78,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 1320,
+ 180
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 142
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 141
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 150
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 147
+ ]
+ }
+ ],
+ "title": "Switch (Model)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ComfySwitchNode"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 79,
+ "type": "PrimitiveInt",
+ "pos": [
+ 680,
+ 710
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 143
+ ]
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt"
+ },
+ "widgets_values": [
+ 8,
+ "fixed"
+ ]
+ },
+ {
+ "id": 81,
+ "type": "PrimitiveFloat",
+ "pos": [
+ 680,
+ 870
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 144
+ ]
+ }
+ ],
+ "title": "CFG",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveFloat"
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 82,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 1320,
+ 400
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 146
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 143
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 151
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 148
+ ]
+ }
+ ],
+ "title": "Switch (Steps)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ComfySwitchNode"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 83,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 1320,
+ 600
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 145
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 144
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 152
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 149
+ ]
+ }
+ ],
+ "title": "Switch (CFG)",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ComfySwitchNode"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 84,
+ "type": "PrimitiveInt",
+ "pos": [
+ 680,
+ 60
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 146
+ ]
+ }
+ ],
+ "title": "Steps",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveInt"
+ },
+ "widgets_values": [
+ 20,
+ "fixed"
+ ]
+ },
+ {
+ "id": 85,
+ "type": "PrimitiveFloat",
+ "pos": [
+ 680,
+ 230
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 145
+ ]
+ }
+ ],
+ "title": "CFG",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveFloat"
+ },
+ "widgets_values": [
+ 4
+ ]
+ },
+ {
+ "id": 86,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ 710,
+ 1070
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 153
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 150,
+ 151,
+ 152
+ ]
+ }
+ ],
+ "title": "Enable Lightning LoRA",
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "PrimitiveBoolean"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 87,
+ "type": "MarkdownNote",
+ "pos": [
+ 620,
+ -160
+ ],
+ "size": [
+ 500,
+ 120
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "Try 50 steps if you want the original [Qwen-Image](https://huggingface.co/Qwen/Qwen-Image) settings, but it will take longer."
+ ],
+ "color": "#222",
+ "bgcolor": "#000"
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Step1 - Load models",
+ "bounding": [
+ -280,
+ -20,
+ 360,
+ 700
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Step2 - Image size",
+ "bounding": [
+ -280,
+ 710,
+ 360,
+ 300
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Step3 - Prompt",
+ "bounding": [
+ 110,
+ -20,
+ 470,
+ 700
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Lightx2v 8steps LoRA",
+ "bounding": [
+ 610,
+ 390,
+ 520,
+ 620
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 5,
+ "title": "Original Settings",
+ "bounding": [
+ 610,
+ -20,
+ 520,
+ 380
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 74,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 75,
+ "origin_id": 38,
+ "origin_slot": 0,
+ "target_id": 7,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 129,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 73,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 128,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 76,
+ "origin_id": 39,
+ "origin_slot": 0,
+ "target_id": 8,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 125,
+ "origin_id": 66,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 46,
+ "origin_id": 6,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 52,
+ "origin_id": 7,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 107,
+ "origin_id": 58,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 110,
+ "origin_id": 8,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 132,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 134,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 58,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 135,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 58,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 136,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 3,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 137,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 37,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 138,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 38,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 139,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 39,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 140,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 73,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 141,
+ "origin_id": 73,
+ "origin_slot": 0,
+ "target_id": 78,
+ "target_slot": 1,
+ "type": "MODEL"
+ },
+ {
+ "id": 142,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 78,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 143,
+ "origin_id": 79,
+ "origin_slot": 0,
+ "target_id": 82,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 144,
+ "origin_id": 81,
+ "origin_slot": 0,
+ "target_id": 83,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 145,
+ "origin_id": 85,
+ "origin_slot": 0,
+ "target_id": 83,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 146,
+ "origin_id": 84,
+ "origin_slot": 0,
+ "target_id": 82,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 147,
+ "origin_id": 78,
+ "origin_slot": 0,
+ "target_id": 66,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 148,
+ "origin_id": 82,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 149,
+ "origin_id": 83,
+ "origin_slot": 0,
+ "target_id": 3,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 150,
+ "origin_id": 86,
+ "origin_slot": 0,
+ "target_id": 78,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 151,
+ "origin_id": 86,
+ "origin_slot": 0,
+ "target_id": 82,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 152,
+ "origin_id": 86,
+ "origin_slot": 0,
+ "target_id": 83,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 153,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 86,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Text to image",
+ "description": "Generates images from text prompts using Qwen-Image, Alibaba's 20B MMDiT model with excellent multilingual text rendering."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Text to Image (Z-Image-Base).json b/blueprints/Text to Image (Z-Image-Base).json
new file mode 100644
index 000000000..169263712
--- /dev/null
+++ b/blueprints/Text to Image (Z-Image-Base).json
@@ -0,0 +1,1184 @@
+{
+ "revision": 0,
+ "last_node_id": 126,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 126,
+ "type": "8a2bb267-5858-4aaf-bdcd-61002711af19",
+ "pos": [
+ -2280,
+ 2850
+ ],
+ "size": [
+ 410,
+ 560
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "67",
+ "text"
+ ],
+ [
+ "68",
+ "width"
+ ],
+ [
+ "68",
+ "height"
+ ],
+ [
+ "69",
+ "steps"
+ ],
+ [
+ "69",
+ "cfg"
+ ],
+ [
+ "69",
+ "seed"
+ ],
+ [
+ "66",
+ "unet_name"
+ ],
+ [
+ "62",
+ "clip_name"
+ ],
+ [
+ "63",
+ "vae_name"
+ ],
+ [
+ "69",
+ "control_after_generate"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.13.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Text to Image (Z-Image-Base)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "8a2bb267-5858-4aaf-bdcd-61002711af19",
+ "version": 1,
+ "state": {
+ "lastGroupId": 16,
+ "lastNodeId": 126,
+ "lastLinkId": 229,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image (Z-Image-Base)",
+ "description": "Generates images from text prompts using Z-Image base weights with Qwen3 text encoder and bundled VAE.",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -220,
+ 40,
+ 120,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1840,
+ -150,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "af36fee5-4f8b-4a8e-bfa8-cb8fe7006cc3",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 108
+ ],
+ "label": "prompt",
+ "pos": [
+ -120,
+ 60
+ ]
+ },
+ {
+ "id": "357f0059-e8e6-41f6-a290-c53b0a60c0ed",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 114
+ ],
+ "pos": [
+ -120,
+ 80
+ ]
+ },
+ {
+ "id": "4a442743-a9c2-4aa5-9efd-05d43f3322d3",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 115
+ ],
+ "pos": [
+ -120,
+ 100
+ ]
+ },
+ {
+ "id": "a0fc336b-d349-418e-8415-318653f7b6b3",
+ "name": "steps",
+ "type": "INT",
+ "linkIds": [
+ 116
+ ],
+ "pos": [
+ -120,
+ 120
+ ]
+ },
+ {
+ "id": "2f253ace-1e1a-415f-9b95-a10430bd5749",
+ "name": "cfg",
+ "type": "FLOAT",
+ "linkIds": [
+ 117
+ ],
+ "pos": [
+ -120,
+ 140
+ ]
+ },
+ {
+ "id": "18a6ad37-23aa-4bf7-a0cd-1d6ca6e2a128",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 118
+ ],
+ "pos": [
+ -120,
+ 160
+ ]
+ },
+ {
+ "id": "d1fc4937-8505-4ec6-9fc4-a33ef7b45eee",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 119
+ ],
+ "pos": [
+ -120,
+ 180
+ ]
+ },
+ {
+ "id": "db45dd49-d990-4ceb-a849-f96341874cdd",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 120
+ ],
+ "pos": [
+ -120,
+ 200
+ ]
+ },
+ {
+ "id": "37b8eac6-9b1b-452b-81f3-0ba9e34a576a",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 121
+ ],
+ "pos": [
+ -120,
+ 220
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "f2bea309-bfe7-4ccb-9ffe-9475bf1da2ae",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 79
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1860,
+ -130
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 67,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 600,
+ -90
+ ],
+ "size": [
+ 410,
+ 320
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 78
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 108
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 75
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 68,
+ "type": "EmptySD3LatentImage",
+ "pos": [
+ 240,
+ 620
+ ],
+ "size": [
+ 260,
+ 170
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 114
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 115
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 77
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "EmptySD3LatentImage",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 63,
+ "type": "VAELoader",
+ "pos": [
+ 230,
+ 340
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 121
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 73
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAELoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "ae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/vae/ae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "ae.safetensors"
+ ]
+ },
+ {
+ "id": 62,
+ "type": "CLIPLoader",
+ "pos": [
+ 230,
+ 110
+ ],
+ "size": [
+ 270,
+ 150
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 120
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 78,
+ 82
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "qwen_3_4b.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/text_encoders/qwen_3_4b.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_3_4b.safetensors",
+ "lumina2",
+ "default"
+ ]
+ },
+ {
+ "id": 65,
+ "type": "VAEDecode",
+ "pos": [
+ 1450,
+ -150
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 72
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 73
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 79
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 70,
+ "type": "ModelSamplingAuraFlow",
+ "pos": [
+ 1100,
+ -150
+ ],
+ "size": [
+ 310,
+ 110
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 109
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 74
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ModelSamplingAuraFlow",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 3
+ ]
+ },
+ {
+ "id": 66,
+ "type": "UNETLoader",
+ "pos": [
+ 230,
+ -90
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 119
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 109
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "models": [
+ {
+ "name": "z_image_bf16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image/resolve/main/split_files/diffusion_models/z_image_bf16.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "z_image_bf16.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 71,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 600,
+ 310
+ ],
+ "size": [
+ 390,
+ 140
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 82
+ },
+ {
+ "label": "prompt",
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 83
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Negative Prompt)",
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#323",
+ "bgcolor": "#535"
+ },
+ {
+ "id": 69,
+ "type": "KSampler",
+ "pos": [
+ 1100,
+ 10
+ ],
+ "size": [
+ 310,
+ 440
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 74
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 75
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 83
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 77
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 118
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 116
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": 117
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 72
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ "randomize",
+ 25,
+ 4,
+ "res_multistep",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 87,
+ "type": "MarkdownNote",
+ "pos": [
+ 1110,
+ -360
+ ],
+ "size": [
+ 300,
+ 120
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [],
+ "properties": {},
+ "widgets_values": [
+ "- Steps: 30~50\n- cfg: 3~5"
+ ],
+ "color": "#222",
+ "bgcolor": "#000",
+ "title": "Original Settings"
+ }
+ ],
+ "groups": [
+ {
+ "id": 2,
+ "title": "Step2 - Image size",
+ "bounding": [
+ 200,
+ 530,
+ 330,
+ 287.9999544955691
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Step3 - Prompt",
+ "bounding": [
+ 570,
+ -200,
+ 470,
+ 700
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Step1 - Load models",
+ "bounding": [
+ 200,
+ -200,
+ 330,
+ 700
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 78,
+ "origin_id": 62,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 74,
+ "origin_id": 70,
+ "origin_slot": 0,
+ "target_id": 69,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 75,
+ "origin_id": 67,
+ "origin_slot": 0,
+ "target_id": 69,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 83,
+ "origin_id": 71,
+ "origin_slot": 0,
+ "target_id": 69,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 77,
+ "origin_id": 68,
+ "origin_slot": 0,
+ "target_id": 69,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 82,
+ "origin_id": 62,
+ "origin_slot": 0,
+ "target_id": 71,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 72,
+ "origin_id": 69,
+ "origin_slot": 0,
+ "target_id": 65,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 73,
+ "origin_id": 63,
+ "origin_slot": 0,
+ "target_id": 65,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 79,
+ "origin_id": 65,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 108,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 109,
+ "origin_id": 66,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 114,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 68,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 115,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 68,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 116,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 69,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 117,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 69,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 118,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 69,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 119,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 66,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 120,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 62,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 121,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 63,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Text to image"
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Text to Image (Z-Image-Turbo).json b/blueprints/Text to Image (Z-Image-Turbo).json
index 6aa80e327..2501486fa 100644
--- a/blueprints/Text to Image (Z-Image-Turbo).json
+++ b/blueprints/Text to Image (Z-Image-Turbo).json
@@ -1,22 +1,21 @@
{
- "id": "1c3eaa76-5cfa-4dc7-8571-97a570324e01",
"revision": 0,
- "last_node_id": 34,
- "last_link_id": 40,
+ "last_node_id": 57,
+ "last_link_id": 0,
"nodes": [
{
- "id": 5,
- "type": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
+ "id": 57,
+ "type": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
"pos": [
- -2.5766491043910378e-05,
- 1229.999928629805
+ 130,
+ 200
],
"size": [
400,
470
],
"flags": {},
- "order": 0,
+ "order": 1,
"mode": 0,
"inputs": [
{
@@ -44,6 +43,22 @@
},
"link": null
},
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
{
"name": "unet_name",
"type": "COMBO",
@@ -80,15 +95,15 @@
"properties": {
"proxyWidgets": [
[
- "-1",
+ "27",
"text"
],
[
- "-1",
+ "13",
"width"
],
[
- "-1",
+ "13",
"height"
],
[
@@ -97,19 +112,23 @@
],
[
"3",
- "control_after_generate"
+ "steps"
],
[
- "-1",
+ "28",
"unet_name"
],
[
- "-1",
+ "30",
"clip_name"
],
[
- "-1",
+ "29",
"vae_name"
+ ],
+ [
+ "3",
+ "control_after_generate"
]
],
"cnr_id": "comfy-core",
@@ -122,48 +141,40 @@
"secondTabOffset": 80,
"secondTabWidth": 65
},
- "widgets_values": [
- "",
- 1024,
- 1024,
- null,
- null,
- "z_image_turbo_bf16.safetensors",
- "qwen_3_4b.safetensors",
- "ae.safetensors"
- ]
+ "widgets_values": [],
+ "title": "Text to Image (Z-Image-Turbo)"
}
],
"links": [],
- "groups": [],
+ "version": 0.4,
"definitions": {
"subgraphs": [
{
- "id": "dfe9eb32-97c0-43a5-90d5-4fd37768d91b",
+ "id": "f2fdebf6-dfaf-43b6-9eb2-7f70613cfdc1",
"version": 1,
"state": {
"lastGroupId": 4,
- "lastNodeId": 34,
- "lastLinkId": 40,
+ "lastNodeId": 61,
+ "lastLinkId": 75,
"lastRerouteId": 0
},
"revision": 0,
"config": {},
- "name": "local-Text to Image (Z-Image-Turbo)",
+ "name": "Text to Image (Z-Image-Turbo)",
"inputNode": {
"id": -10,
"bounding": [
- -80,
- 425,
+ -560,
+ 480,
120,
- 160
+ 200
]
},
"outputNode": {
"id": -20,
"bounding": [
- 1490,
- 415,
+ 1670,
+ 320,
120,
60
]
@@ -178,8 +189,8 @@
],
"label": "prompt",
"pos": [
- 20,
- 445
+ -460,
+ 500
]
},
{
@@ -190,8 +201,8 @@
35
],
"pos": [
- 20,
- 465
+ -460,
+ 520
]
},
{
@@ -202,44 +213,68 @@
36
],
"pos": [
- 20,
- 485
+ -460,
+ 540
]
},
{
- "id": "23087d15-8412-4fbd-b71e-9b6d7ef76de1",
+ "id": "f77677f7-6bf6-4c19-a71f-c4a553d5981e",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 71
+ ],
+ "pos": [
+ -460,
+ 560
+ ]
+ },
+ {
+ "id": "ef9a9fb1-5983-4bc9-a60b-cf5aec48bff1",
+ "name": "steps",
+ "type": "INT",
+ "linkIds": [
+ 72
+ ],
+ "pos": [
+ -460,
+ 580
+ ]
+ },
+ {
+ "id": "a20a1b30-785f-4a04-bb6d-3d61adab9764",
"name": "unet_name",
"type": "COMBO",
"linkIds": [
- 38
+ 73
],
"pos": [
- 20,
- 505
+ -460,
+ 600
]
},
{
- "id": "0677f5c3-2a3f-43d4-98ac-a4c56d5efdc0",
+ "id": "4af8fc2b-4655-4086-8240-45f8cb38c6f6",
"name": "clip_name",
"type": "COMBO",
"linkIds": [
- 39
+ 74
],
"pos": [
- 20,
- 525
+ -460,
+ 620
]
},
{
- "id": "c85c0445-2641-48b1-bbca-95057edf2fcf",
+ "id": "4d518693-2807-439c-9cb6-cffd23ccba2c",
"name": "vae_name",
"type": "COMBO",
"linkIds": [
- 40
+ 75
],
"pos": [
- 20,
- 545
+ -460,
+ 640
]
}
],
@@ -253,8 +288,8 @@
],
"localized_name": "IMAGE",
"pos": [
- 1510,
- 435
+ 1690,
+ 340
]
}
],
@@ -264,15 +299,15 @@
"id": 30,
"type": "CLIPLoader",
"pos": [
- 109.99997264844609,
- 329.99999029608756
+ 30,
+ 420
],
"size": [
- 269.9869791666667,
- 106
+ 270,
+ 150
],
"flags": {},
- "order": 0,
+ "order": 7,
"mode": 0,
"inputs": [
{
@@ -282,7 +317,7 @@
"widget": {
"name": "clip_name"
},
- "link": 39
+ "link": 74
},
{
"localized_name": "type",
@@ -315,9 +350,9 @@
}
],
"properties": {
+ "Node name for S&R": "CLIPLoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
- "Node name for S&R": "CLIPLoader",
"models": [
{
"name": "qwen_3_4b.safetensors",
@@ -343,15 +378,15 @@
"id": 29,
"type": "VAELoader",
"pos": [
- 109.99997264844609,
- 479.9999847172637
+ 30,
+ 650
],
"size": [
- 269.9869791666667,
- 58
+ 270,
+ 110
],
"flags": {},
- "order": 1,
+ "order": 6,
"mode": 0,
"inputs": [
{
@@ -361,7 +396,7 @@
"widget": {
"name": "vae_name"
},
- "link": 40
+ "link": 75
}
],
"outputs": [
@@ -375,9 +410,9 @@
}
],
"properties": {
+ "Node name for S&R": "VAELoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
- "Node name for S&R": "VAELoader",
"models": [
{
"name": "ae.safetensors",
@@ -401,12 +436,12 @@
"id": 33,
"type": "ConditioningZeroOut",
"pos": [
- 639.9999103333332,
- 620.0000271257795
+ 630,
+ 960
],
"size": [
- 204.134765625,
- 26
+ 230,
+ 80
],
"flags": {},
"order": 8,
@@ -430,9 +465,9 @@
}
],
"properties": {
+ "Node name for S&R": "ConditioningZeroOut",
"cnr_id": "comfy-core",
"ver": "0.3.73",
- "Node name for S&R": "ConditioningZeroOut",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -440,22 +475,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 8,
"type": "VAEDecode",
"pos": [
- 1219.9999088104782,
- 160.00009184959066
+ 1320,
+ 230
],
"size": [
- 209.98697916666669,
- 46
+ 230,
+ 100
],
"flags": {},
- "order": 5,
+ "order": 1,
"mode": 0,
"inputs": [
{
@@ -483,9 +517,9 @@
}
],
"properties": {
+ "Node name for S&R": "VAEDecode",
"cnr_id": "comfy-core",
"ver": "0.3.64",
- "Node name for S&R": "VAEDecode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -493,22 +527,21 @@
"secondTabText": "Send Back",
"secondTabOffset": 80,
"secondTabWidth": 65
- },
- "widgets_values": []
+ }
},
{
"id": 28,
"type": "UNETLoader",
"pos": [
- 109.99997264844609,
- 200.0000502647102
+ 30,
+ 230
],
"size": [
- 269.9869791666667,
- 82
+ 270,
+ 110
],
"flags": {},
- "order": 2,
+ "order": 5,
"mode": 0,
"inputs": [
{
@@ -518,7 +551,7 @@
"widget": {
"name": "unet_name"
},
- "link": 38
+ "link": 73
},
{
"localized_name": "weight_dtype",
@@ -541,9 +574,9 @@
}
],
"properties": {
+ "Node name for S&R": "UNETLoader",
"cnr_id": "comfy-core",
"ver": "0.3.73",
- "Node name for S&R": "UNETLoader",
"models": [
{
"name": "z_image_turbo_bf16.safetensors",
@@ -568,15 +601,15 @@
"id": 27,
"type": "CLIPTextEncode",
"pos": [
- 429.99997828947767,
- 200.0000502647102
+ 400,
+ 230
],
"size": [
- 409.9869791666667,
- 319.9869791666667
+ 450,
+ 650
],
"flags": {},
- "order": 7,
+ "order": 4,
"mode": 0,
"inputs": [
{
@@ -607,9 +640,9 @@
}
],
"properties": {
+ "Node name for S&R": "CLIPTextEncode",
"cnr_id": "comfy-core",
"ver": "0.3.73",
- "Node name for S&R": "CLIPTextEncode",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -626,15 +659,15 @@
"id": 13,
"type": "EmptySD3LatentImage",
"pos": [
- 109.99997264844609,
- 629.9999791384399
+ 40,
+ 890
],
"size": [
- 259.9869791666667,
- 106
+ 260,
+ 170
],
"flags": {},
- "order": 6,
+ "order": 3,
"mode": 0,
"inputs": [
{
@@ -677,9 +710,9 @@
}
],
"properties": {
+ "Node name for S&R": "EmptySD3LatentImage",
"cnr_id": "comfy-core",
"ver": "0.3.64",
- "Node name for S&R": "EmptySD3LatentImage",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -694,19 +727,77 @@
1
]
},
+ {
+ "id": 11,
+ "type": "ModelSamplingAuraFlow",
+ "pos": [
+ 950,
+ 230
+ ],
+ "size": [
+ 310,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 26
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 13
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ModelSamplingAuraFlow",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 3
+ ]
+ },
{
"id": 3,
"type": "KSampler",
"pos": [
- 879.9999615530063,
- 269.9999774911694
+ 950,
+ 400
],
"size": [
- 314.9869791666667,
- 262
+ 320,
+ 350
],
"flags": {},
- "order": 4,
+ "order": 0,
"mode": 0,
"inputs": [
{
@@ -740,7 +831,7 @@
"widget": {
"name": "seed"
},
- "link": null
+ "link": 71
},
{
"localized_name": "steps",
@@ -749,7 +840,7 @@
"widget": {
"name": "steps"
},
- "link": null
+ "link": 72
},
{
"localized_name": "cfg",
@@ -800,9 +891,9 @@
}
],
"properties": {
+ "Node name for S&R": "KSampler",
"cnr_id": "comfy-core",
"ver": "0.3.64",
- "Node name for S&R": "KSampler",
"enableTabs": false,
"tabWidth": 65,
"tabXOffset": 10,
@@ -814,81 +905,23 @@
"widgets_values": [
0,
"randomize",
- 4,
+ 8,
1,
"res_multistep",
"simple",
1
]
- },
- {
- "id": 11,
- "type": "ModelSamplingAuraFlow",
- "pos": [
- 879.9999615530063,
- 160.00009184959066
- ],
- "size": [
- 309.9869791666667,
- 58
- ],
- "flags": {},
- "order": 3,
- "mode": 0,
- "inputs": [
- {
- "localized_name": "model",
- "name": "model",
- "type": "MODEL",
- "link": 26
- },
- {
- "localized_name": "shift",
- "name": "shift",
- "type": "FLOAT",
- "widget": {
- "name": "shift"
- },
- "link": null
- }
- ],
- "outputs": [
- {
- "localized_name": "MODEL",
- "name": "MODEL",
- "type": "MODEL",
- "slot_index": 0,
- "links": [
- 13
- ]
- }
- ],
- "properties": {
- "cnr_id": "comfy-core",
- "ver": "0.3.64",
- "Node name for S&R": "ModelSamplingAuraFlow",
- "enableTabs": false,
- "tabWidth": 65,
- "tabXOffset": 10,
- "hasSecondTab": false,
- "secondTabText": "Send Back",
- "secondTabOffset": 80,
- "secondTabWidth": 65
- },
- "widgets_values": [
- 3
- ]
}
],
"groups": [
{
"id": 2,
- "title": "Image size",
+ "title": "Step2 - Image size",
"bounding": [
- 100,
- 560,
- 290,
- 200
+ 10,
+ 820,
+ 320,
+ 280
],
"color": "#3f789e",
"font_size": 24,
@@ -896,12 +929,12 @@
},
{
"id": 3,
- "title": "Prompt",
+ "title": "Step3 - Prompt",
"bounding": [
- 410,
+ 360,
130,
- 450,
- 540
+ 530,
+ 970
],
"color": "#3f789e",
"font_size": 24,
@@ -909,12 +942,12 @@
},
{
"id": 4,
- "title": "Models",
+ "title": "Step1 - Load models",
"bounding": [
- 100,
+ 0,
130,
- 290,
- 413.6
+ 330,
+ 660
],
"color": "#3f789e",
"font_size": 24,
@@ -1027,25 +1060,41 @@
"type": "INT"
},
{
- "id": 38,
+ "id": 71,
"origin_id": -10,
"origin_slot": 3,
+ "target_id": 3,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 72,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 3,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 73,
+ "origin_id": -10,
+ "origin_slot": 5,
"target_id": 28,
"target_slot": 0,
"type": "COMBO"
},
{
- "id": 39,
+ "id": 74,
"origin_id": -10,
- "origin_slot": 4,
+ "origin_slot": 6,
"target_id": 30,
"target_slot": 0,
"type": "COMBO"
},
{
- "id": 40,
+ "id": 75,
"origin_id": -10,
- "origin_slot": 5,
+ "origin_slot": 7,
"target_id": 29,
"target_slot": 0,
"type": "COMBO"
@@ -1054,25 +1103,10 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image generation and editing/Text to image"
+ "category": "Image generation and editing/Text to image",
+ "description": "Generates images from text prompts using Z-Image-Turbo, Alibaba's distilled 6B DiT model."
}
]
},
- "config": {},
- "extra": {
- "frontendVersion": "1.37.10",
- "workflowRendererVersion": "LG",
- "VHS_latentpreview": false,
- "VHS_latentpreviewrate": 0,
- "VHS_MetadataImage": true,
- "VHS_KeepIntermediate": true,
- "ds": {
- "scale": 0.8401370345180755,
- "offset": [
- 940.0587067393087,
- -830.7121087564725
- ]
- }
- },
- "version": 0.4
-}
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Text to Image.json b/blueprints/Text to Image.json
new file mode 100644
index 000000000..ffe3682ff
--- /dev/null
+++ b/blueprints/Text to Image.json
@@ -0,0 +1,1132 @@
+{
+ "revision": 0,
+ "last_node_id": 71,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 71,
+ "type": "2d5985c9-deef-41ae-9c34-6353d3d7d1ef",
+ "pos": [
+ 90,
+ 800
+ ],
+ "size": [
+ 400,
+ 80
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": null
+ },
+ {
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ },
+ {
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ }
+ ],
+ "title": "Text to Image",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "67",
+ "text"
+ ],
+ [
+ "68",
+ "width"
+ ],
+ [
+ "68",
+ "height"
+ ],
+ [
+ "66",
+ "unet_name"
+ ],
+ [
+ "62",
+ "clip_name"
+ ],
+ [
+ "63",
+ "vae_name"
+ ],
+ [
+ "70",
+ "steps"
+ ],
+ [
+ "70",
+ "control_after_generate"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "text": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "2d5985c9-deef-41ae-9c34-6353d3d7d1ef",
+ "version": 1,
+ "state": {
+ "lastGroupId": 4,
+ "lastNodeId": 71,
+ "lastLinkId": 70,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Image",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -80,
+ 425,
+ 120,
+ 180
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 1490,
+ 415,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "fb178669-e742-4a53-8a69-7df59834dfd8",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 34
+ ],
+ "label": "prompt",
+ "pos": [
+ 20,
+ 445
+ ]
+ },
+ {
+ "id": "dd780b3c-23e9-46ff-8469-156008f42e5a",
+ "name": "width",
+ "type": "INT",
+ "linkIds": [
+ 35
+ ],
+ "pos": [
+ 20,
+ 465
+ ]
+ },
+ {
+ "id": "7b08d546-6bb0-4ef9-82e9-ffae5e1ee6bc",
+ "name": "height",
+ "type": "INT",
+ "linkIds": [
+ 36
+ ],
+ "pos": [
+ 20,
+ 485
+ ]
+ },
+ {
+ "id": "8ed4eb73-a2bf-4766-8bf4-c5890b560596",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 38
+ ],
+ "pos": [
+ 20,
+ 505
+ ]
+ },
+ {
+ "id": "f362d639-d412-4b5d-8490-1e9995dc5f82",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 39
+ ],
+ "pos": [
+ 20,
+ 525
+ ]
+ },
+ {
+ "id": "ee25ac16-de63-4b74-bbbb-5b29fdc1efcf",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 40
+ ],
+ "pos": [
+ 20,
+ 545
+ ]
+ },
+ {
+ "id": "51cbcd61-9218-4bcb-89ac-ecdfb1ef8892",
+ "name": "steps",
+ "type": "INT",
+ "linkIds": [
+ 70
+ ],
+ "pos": [
+ 20,
+ 565
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "1fa72a21-ce00-4952-814e-1f2ffbe87d1d",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 16
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ 1510,
+ 435
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 62,
+ "type": "CLIPLoader",
+ "pos": [
+ 110,
+ 330
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 39
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 28
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPLoader",
+ "models": [
+ {
+ "name": "qwen_3_4b.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/text_encoders/qwen_3_4b.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "qwen_3_4b.safetensors",
+ "lumina2",
+ "default"
+ ]
+ },
+ {
+ "id": 63,
+ "type": "VAELoader",
+ "pos": [
+ 110,
+ 480
+ ],
+ "size": [
+ 270,
+ 60
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 40
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 27
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAELoader",
+ "models": [
+ {
+ "name": "ae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/vae/ae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "ae.safetensors"
+ ]
+ },
+ {
+ "id": 64,
+ "type": "ConditioningZeroOut",
+ "pos": [
+ 640,
+ 620
+ ],
+ "size": [
+ 210,
+ 30
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "type": "CONDITIONING",
+ "link": 32
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 33
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ConditioningZeroOut",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 65,
+ "type": "VAEDecode",
+ "pos": [
+ 1220,
+ 160
+ ],
+ "size": [
+ 210,
+ 50
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 14
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 27
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 16
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "VAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 66,
+ "type": "UNETLoader",
+ "pos": [
+ 110,
+ 200
+ ],
+ "size": [
+ 270,
+ 90
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 38
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 26
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "UNETLoader",
+ "models": [
+ {
+ "name": "z_image_turbo_bf16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/z_image_turbo/resolve/main/split_files/diffusion_models/z_image_turbo_bf16.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "z_image_turbo_bf16.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 67,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 430,
+ 200
+ ],
+ "size": [
+ 410,
+ 370
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 28
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 34
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 30,
+ 32
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.73",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 68,
+ "type": "EmptySD3LatentImage",
+ "pos": [
+ 110,
+ 630
+ ],
+ "size": [
+ 260,
+ 110
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 35
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 36
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 17
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "EmptySD3LatentImage",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1024,
+ 1024,
+ 1
+ ]
+ },
+ {
+ "id": 69,
+ "type": "ModelSamplingAuraFlow",
+ "pos": [
+ 880,
+ 160
+ ],
+ "size": [
+ 310,
+ 60
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 26
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 13
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "ModelSamplingAuraFlow",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 3
+ ]
+ },
+ {
+ "id": 70,
+ "type": "KSampler",
+ "pos": [
+ 880,
+ 270
+ ],
+ "size": [
+ 320,
+ 270
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 13
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 30
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 33
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 17
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 70
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 14
+ ]
+ }
+ ],
+ "properties": {
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "Node name for S&R": "KSampler",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ "randomize",
+ 8,
+ 1,
+ "res_multistep",
+ "simple",
+ 1
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 2,
+ "title": "Step2 - Image size",
+ "bounding": [
+ 100,
+ 560,
+ 290,
+ 200
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Step3 - Prompt",
+ "bounding": [
+ 410,
+ 130,
+ 450,
+ 540
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Step1 - Load models",
+ "bounding": [
+ 100,
+ 130,
+ 290,
+ 413.6
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 32,
+ "origin_id": 67,
+ "origin_slot": 0,
+ "target_id": 64,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 26,
+ "origin_id": 66,
+ "origin_slot": 0,
+ "target_id": 69,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 14,
+ "origin_id": 70,
+ "origin_slot": 0,
+ "target_id": 65,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 27,
+ "origin_id": 63,
+ "origin_slot": 0,
+ "target_id": 65,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 13,
+ "origin_id": 69,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 30,
+ "origin_id": 67,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 33,
+ "origin_id": 64,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 17,
+ "origin_id": 68,
+ "origin_slot": 0,
+ "target_id": 70,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 28,
+ "origin_id": 62,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 16,
+ "origin_id": 65,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 34,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 67,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 35,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 68,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 36,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 68,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 38,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 66,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 39,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 62,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 40,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 63,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 70,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 70,
+ "target_slot": 5,
+ "type": "INT"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Image generation and editing/Text to image",
+ "description": "Generates images from text prompts using Z-Image-Turbo defaults with Qwen3 text encoder and VAE."
+ }
+ ]
+ },
+ "extra": {}
+}
diff --git a/blueprints/Text to Video (LTX-2.3).json b/blueprints/Text to Video (LTX-2.3).json
new file mode 100644
index 000000000..f44a216dd
--- /dev/null
+++ b/blueprints/Text to Video (LTX-2.3).json
@@ -0,0 +1,4297 @@
+{
+ "revision": 0,
+ "last_node_id": 324,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 324,
+ "type": "871cf29d-2726-43a5-b61e-01fa939d699d",
+ "pos": [
+ -300,
+ 4290
+ ],
+ "size": [
+ 400,
+ 170
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "width",
+ "name": "value_2",
+ "type": "INT",
+ "widget": {
+ "name": "value_2"
+ },
+ "link": null
+ },
+ {
+ "label": "height",
+ "name": "value_3",
+ "type": "INT",
+ "widget": {
+ "name": "value_3"
+ },
+ "link": null
+ },
+ {
+ "label": "duration",
+ "name": "value_4",
+ "type": "INT",
+ "widget": {
+ "name": "value_4"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ },
+ {
+ "label": "distilled_lora",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
+ },
+ {
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": null
+ },
+ {
+ "label": "latent_upscale_model",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": null
+ },
+ {
+ "label": "fps",
+ "name": "value_1",
+ "type": "INT",
+ "widget": {
+ "name": "value_1"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "320",
+ "value"
+ ],
+ [
+ "314",
+ "value"
+ ],
+ [
+ "301",
+ "value"
+ ],
+ [
+ "303",
+ "value"
+ ],
+ [
+ "318",
+ "ckpt_name"
+ ],
+ [
+ "287",
+ "lora_name"
+ ],
+ [
+ "319",
+ "text_encoder"
+ ],
+ [
+ "313",
+ "model_name"
+ ],
+ [
+ "302",
+ "value"
+ ],
+ [
+ "279",
+ "noise_seed"
+ ],
+ [
+ "279",
+ "control_after_generate"
+ ]
+ ],
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "value_1": true,
+ "value_2": true,
+ "value_3": true,
+ "value_4": true,
+ "lora_name": true,
+ "model_name": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Text to Video (LTX-2.3)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "871cf29d-2726-43a5-b61e-01fa939d699d",
+ "version": 1,
+ "state": {
+ "lastGroupId": 26,
+ "lastNodeId": 324,
+ "lastLinkId": 631,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Text to Video (LTX-2.3)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ 720,
+ 4240,
+ 162.162109375,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 6100,
+ 4160,
+ 120,
+ 60
+ ]
+ },
+ "inputs": [
+ {
+ "id": "9494c550-4172-49c6-930e-5b508f775e77",
+ "name": "value",
+ "type": "STRING",
+ "linkIds": [
+ 595
+ ],
+ "pos": [
+ 862.162109375,
+ 4260
+ ]
+ },
+ {
+ "id": "58dbb3f6-f924-4548-96ef-e0e34610bd4e",
+ "name": "value_2",
+ "type": "INT",
+ "linkIds": [
+ 597
+ ],
+ "label": "width",
+ "pos": [
+ 862.162109375,
+ 4280
+ ]
+ },
+ {
+ "id": "6086d5b8-2586-448c-a641-dd14d76dd102",
+ "name": "value_3",
+ "type": "INT",
+ "linkIds": [
+ 598
+ ],
+ "label": "height",
+ "pos": [
+ 862.162109375,
+ 4300
+ ]
+ },
+ {
+ "id": "feb8c2eb-ae48-4fa8-bc24-929552d656c3",
+ "name": "value_4",
+ "type": "INT",
+ "linkIds": [
+ 599
+ ],
+ "label": "duration",
+ "pos": [
+ 862.162109375,
+ 4320
+ ]
+ },
+ {
+ "id": "d7255058-319a-4880-8f9a-7e542c8f3c3c",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 601,
+ 604,
+ 605
+ ],
+ "pos": [
+ 862.162109375,
+ 4340
+ ]
+ },
+ {
+ "id": "4afce68d-8f65-4342-9d6d-ae0a7688c3e3",
+ "name": "lora_name",
+ "type": "COMBO",
+ "linkIds": [
+ 602
+ ],
+ "label": "distilled_lora",
+ "pos": [
+ 862.162109375,
+ 4360
+ ]
+ },
+ {
+ "id": "ab842b4b-c977-4679-b421-424722785b57",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "linkIds": [
+ 606
+ ],
+ "pos": [
+ 862.162109375,
+ 4380
+ ]
+ },
+ {
+ "id": "9e47372d-28d9-4311-91e9-e90d03f4eb43",
+ "name": "model_name",
+ "type": "COMBO",
+ "linkIds": [
+ 607
+ ],
+ "label": "latent_upscale_model",
+ "pos": [
+ 862.162109375,
+ 4400
+ ]
+ },
+ {
+ "id": "7951b137-465e-4844-b05f-88b89f0e1ba8",
+ "name": "value_1",
+ "type": "INT",
+ "linkIds": [
+ 627
+ ],
+ "label": "fps",
+ "pos": [
+ 862.162109375,
+ 4420
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "954ef307-c897-4eea-8b5c-5c6ce15a5357",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "linkIds": [
+ 536
+ ],
+ "localized_name": "VIDEO",
+ "pos": [
+ 6120,
+ 4180
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 278,
+ "type": "RandomNoise",
+ "pos": [
+ 4720,
+ 3750
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 490
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.75",
+ "Node name for S&R": "RandomNoise",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 42,
+ "fixed"
+ ]
+ },
+ {
+ "id": 279,
+ "type": "RandomNoise",
+ "pos": [
+ 3200,
+ 3900
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 483
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "RandomNoise",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 343011291748534,
+ "randomize"
+ ]
+ },
+ {
+ "id": 280,
+ "type": "LTXVConcatAVLatent",
+ "pos": [
+ 4730,
+ 4520
+ ],
+ "size": [
+ 280,
+ 100
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "link": 512
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "link": 513
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 494
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "LTXVConcatAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 281,
+ "type": "LTXVAudioVAELoader",
+ "pos": [
+ 1660,
+ 4140
+ ],
+ "size": [
+ 430,
+ 110
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 604
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio VAE",
+ "name": "Audio VAE",
+ "type": "VAE",
+ "links": [
+ 481,
+ 496
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "Node name for S&R": "LTXVAudioVAELoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-dev-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-dev-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-dev-fp8.safetensors"
+ ]
+ },
+ {
+ "id": 282,
+ "type": "KSamplerSelect",
+ "pos": [
+ 4720,
+ 4160
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 492
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.75",
+ "Node name for S&R": "KSamplerSelect",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "euler_cfg_pp"
+ ]
+ },
+ {
+ "id": 283,
+ "type": "ManualSigmas",
+ "pos": [
+ 4720,
+ 4340
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "STRING",
+ "widget": {
+ "name": "sigmas"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 493
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "ManualSigmas",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "0.85, 0.7250, 0.4219, 0.0"
+ ]
+ },
+ {
+ "id": 284,
+ "type": "CFGGuider",
+ "pos": [
+ 4720,
+ 3930
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 478
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 479
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 480
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "links": [
+ 491
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.71",
+ "Node name for S&R": "CFGGuider",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 285,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 3620,
+ 3990
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 483
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 484
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 485
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 544
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 487
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "links": [
+ 488
+ ]
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.60",
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 286,
+ "type": "LTXVCropGuides",
+ "pos": [
+ 3900,
+ 3700
+ ],
+ "size": [
+ 250,
+ 120
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 475
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 476
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 477
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 479
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 480
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "slot_index": 2,
+ "links": []
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "Node name for S&R": "LTXVCropGuides",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 287,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ 1660,
+ 3910
+ ],
+ "size": [
+ 430,
+ 140
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 520
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": 602
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 478,
+ 541
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.75",
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-distilled-lora-384.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3/resolve/main/ltx-2.3-22b-distilled-lora-384.safetensors",
+ "directory": "loras"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-distilled-lora-384.safetensors",
+ 0.5
+ ]
+ },
+ {
+ "id": 288,
+ "type": "ResizeImagesByLongerEdge",
+ "pos": [
+ 2120,
+ 5040
+ ],
+ "size": [
+ 310,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 523
+ },
+ {
+ "localized_name": "longer_edge",
+ "name": "longer_edge",
+ "type": "INT",
+ "widget": {
+ "name": "longer_edge"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 505
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "ResizeImagesByLongerEdge",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1536
+ ]
+ },
+ {
+ "id": 289,
+ "type": "LTXVLatentUpsampler",
+ "pos": [
+ 4270,
+ 3910
+ ],
+ "size": [
+ 330,
+ 120
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 547
+ },
+ {
+ "localized_name": "upscale_model",
+ "name": "upscale_model",
+ "type": "LATENT_UPSCALE_MODEL",
+ "link": 545
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 554
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 548
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "LTXVLatentUpsampler",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 290,
+ "type": "LTXVImgToVideoInplace",
+ "pos": [
+ 4280,
+ 4150
+ ],
+ "size": [
+ 300,
+ 180
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 552
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 515
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 548
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "bypass",
+ "name": "bypass",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "bypass"
+ },
+ "link": 543
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 512
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVImgToVideoInplace",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1,
+ false
+ ]
+ },
+ {
+ "id": 291,
+ "type": "LTXVPreprocess",
+ "pos": [
+ 2130,
+ 5190
+ ],
+ "size": [
+ 290,
+ 110
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 505
+ },
+ {
+ "localized_name": "img_compression",
+ "name": "img_compression",
+ "type": "INT",
+ "widget": {
+ "name": "img_compression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output_image",
+ "name": "output_image",
+ "type": "IMAGE",
+ "links": [
+ 510,
+ 515
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVPreprocess",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 18
+ ]
+ },
+ {
+ "id": 292,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ 1670,
+ 5040
+ ],
+ "size": [
+ 300,
+ 160
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 626
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "resize_type.width",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.width"
+ },
+ "link": 558
+ },
+ {
+ "localized_name": "height",
+ "name": "resize_type.height",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.height"
+ },
+ "link": 559
+ },
+ {
+ "localized_name": "crop",
+ "name": "resize_type.crop",
+ "type": "COMBO",
+ "widget": {
+ "name": "resize_type.crop"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 523
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "ResizeImageMaskNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale dimensions",
+ 1920,
+ 1088,
+ "center",
+ "lanczos"
+ ]
+ },
+ {
+ "id": 293,
+ "type": "KSamplerSelect",
+ "pos": [
+ 3200,
+ 4350
+ ],
+ "size": [
+ 280,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 485
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "KSamplerSelect",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "euler_ancestral_cfg_pp"
+ ]
+ },
+ {
+ "id": 294,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 2530,
+ 5070
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 560
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 561
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a/2"
+ ]
+ },
+ {
+ "id": 295,
+ "type": "Reroute",
+ "pos": [
+ 3930,
+ 4090
+ ],
+ "size": [
+ 80,
+ 30
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "",
+ "type": "*",
+ "link": 557
+ }
+ ],
+ "outputs": [
+ {
+ "name": "",
+ "type": "VAE",
+ "links": [
+ 552,
+ 553,
+ 554
+ ]
+ }
+ ],
+ "properties": {
+ "showOutputText": false,
+ "horizontal": false,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ }
+ },
+ {
+ "id": 296,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 2530,
+ 5130
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 562
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 563
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a/2"
+ ]
+ },
+ {
+ "id": 297,
+ "type": "EmptyLTXVLatentVideo",
+ "pos": [
+ 2980,
+ 5200
+ ],
+ "size": [
+ 280,
+ 200
+ ],
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 561
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 563
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 631
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 511
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.60",
+ "Node name for S&R": "EmptyLTXVLatentVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 512,
+ 97,
+ 1
+ ]
+ },
+ {
+ "id": 298,
+ "type": "LTXVImgToVideoInplace",
+ "pos": [
+ 3420,
+ 4990
+ ],
+ "size": [
+ 280,
+ 180
+ ],
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 556
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 510
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "link": 511
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "bypass",
+ "name": "bypass",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "bypass"
+ },
+ "link": 542
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 497
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVImgToVideoInplace",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0.7,
+ false
+ ]
+ },
+ {
+ "id": 299,
+ "type": "LTXVAudioVAEDecode",
+ "pos": [
+ 5770,
+ 3940
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 495
+ },
+ {
+ "label": "Audio VAE",
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 496
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Audio",
+ "name": "Audio",
+ "type": "AUDIO",
+ "links": [
+ 534
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVAudioVAEDecode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 300,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 2530,
+ 5270
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 564
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 566,
+ 591
+ ]
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 565
+ ]
+ }
+ ],
+ "title": "Math Expression (fps)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "a"
+ ]
+ },
+ {
+ "id": 301,
+ "type": "PrimitiveInt",
+ "pos": [
+ 1160,
+ 4530
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 26,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 598
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 559,
+ 562
+ ]
+ }
+ ],
+ "title": "Height",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 720,
+ "fixed"
+ ]
+ },
+ {
+ "id": 302,
+ "type": "PrimitiveInt",
+ "pos": [
+ 1160,
+ 4680
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 27,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 627
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 564,
+ 629
+ ]
+ }
+ ],
+ "title": "Frame Rate",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 25,
+ "fixed"
+ ]
+ },
+ {
+ "id": 303,
+ "type": "PrimitiveInt",
+ "pos": [
+ 1160,
+ 4230
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 28,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 599
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 628
+ ]
+ }
+ ],
+ "title": "Duration",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 5,
+ "fixed"
+ ]
+ },
+ {
+ "id": 304,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ 1170,
+ 4080
+ ],
+ "size": [
+ 370,
+ 100
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 542,
+ 543
+ ]
+ }
+ ],
+ "title": "Switch to Text to Video?",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.0",
+ "Node name for S&R": "PrimitiveBoolean",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 305,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 2170,
+ 3640
+ ],
+ "size": [
+ 550,
+ 740
+ ],
+ "flags": {},
+ "order": 29,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 615
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 623
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 526
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 306,
+ "type": "LTXVConditioning",
+ "pos": [
+ 2790,
+ 3670
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 30,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 526
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 527
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "FLOAT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 566
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 475,
+ 518
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 476,
+ 519
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "LTXVConditioning",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 24
+ ]
+ },
+ {
+ "id": 307,
+ "type": "LTXVEmptyLatentAudio",
+ "pos": [
+ 2970,
+ 4970
+ ],
+ "size": [
+ 280,
+ 170
+ ],
+ "flags": {},
+ "order": 31,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "audio_vae",
+ "name": "audio_vae",
+ "type": "VAE",
+ "link": 481
+ },
+ {
+ "localized_name": "frames_number",
+ "name": "frames_number",
+ "type": "INT",
+ "widget": {
+ "name": "frames_number"
+ },
+ "link": 630
+ },
+ {
+ "localized_name": "frame_rate",
+ "name": "frame_rate",
+ "type": "INT",
+ "widget": {
+ "name": "frame_rate"
+ },
+ "link": 565
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "Latent",
+ "name": "Latent",
+ "type": "LATENT",
+ "links": [
+ 498
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.68",
+ "Node name for S&R": "LTXVEmptyLatentAudio",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 97,
+ 25,
+ 1
+ ]
+ },
+ {
+ "id": 308,
+ "type": "ManualSigmas",
+ "pos": [
+ 3200,
+ 4550
+ ],
+ "size": [
+ 500,
+ 110
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "STRING",
+ "widget": {
+ "name": "sigmas"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 544
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "ManualSigmas",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "1.0, 0.99375, 0.9875, 0.98125, 0.975, 0.909375, 0.725, 0.421875, 0.0"
+ ]
+ },
+ {
+ "id": 309,
+ "type": "LTXVSeparateAVLatent",
+ "pos": [
+ 3890,
+ 3910
+ ],
+ "size": [
+ 250,
+ 100
+ ],
+ "flags": {},
+ "order": 32,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "av_latent",
+ "name": "av_latent",
+ "type": "LATENT",
+ "link": 488
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "links": [
+ 477,
+ 547
+ ]
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "links": [
+ 513
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "LTXVSeparateAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 310,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 5070,
+ 3750
+ ],
+ "size": [
+ 230,
+ 170
+ ],
+ "flags": {},
+ "order": 33,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 490
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 491
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 492
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 493
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 494
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "links": [
+ 578
+ ]
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.75",
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 311,
+ "type": "LTXVSeparateAVLatent",
+ "pos": [
+ 5410,
+ 3750
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 34,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "av_latent",
+ "name": "av_latent",
+ "type": "LATENT",
+ "link": 578
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "links": [
+ 539
+ ]
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "links": [
+ 495
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "LTXVSeparateAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 312,
+ "type": "CreateVideo",
+ "pos": [
+ 5740,
+ 4610
+ ],
+ "size": [
+ 280,
+ 130
+ ],
+ "flags": {},
+ "order": 35,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 538
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 534
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 591
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 536
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.5.1",
+ "Node name for S&R": "CreateVideo",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 24
+ ]
+ },
+ {
+ "id": 313,
+ "type": "LatentUpscaleModelLoader",
+ "pos": [
+ 1670,
+ 4600
+ ],
+ "size": [
+ 400,
+ 110
+ ],
+ "flags": {},
+ "order": 36,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 607
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT_UPSCALE_MODEL",
+ "name": "LATENT_UPSCALE_MODEL",
+ "type": "LATENT_UPSCALE_MODEL",
+ "links": [
+ 545
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LatentUpscaleModelLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-spatial-upscaler-x2-1.1.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3/resolve/main/ltx-2.3-spatial-upscaler-x2-1.1.safetensors",
+ "directory": "latent_upscale_models"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
+ ]
+ },
+ {
+ "id": 314,
+ "type": "PrimitiveInt",
+ "pos": [
+ 1160,
+ 4380
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 37,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 597
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 558,
+ 560
+ ]
+ }
+ ],
+ "title": "Width",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "PrimitiveInt",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1280,
+ "fixed"
+ ]
+ },
+ {
+ "id": 315,
+ "type": "CLIPTextEncode",
+ "pos": [
+ 2180,
+ 4480
+ ],
+ "size": [
+ 530,
+ 240
+ ],
+ "flags": {},
+ "order": 38,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 625
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 527
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "CLIPTextEncode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "pc game, console game, video game, cartoon, childish, ugly"
+ ],
+ "color": "#323",
+ "bgcolor": "#535"
+ },
+ {
+ "id": 316,
+ "type": "CFGGuider",
+ "pos": [
+ 3200,
+ 4100
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 39,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 541
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 518
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 519
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "links": [
+ 484
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.64",
+ "Node name for S&R": "CFGGuider",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 317,
+ "type": "VAEDecodeTiled",
+ "pos": [
+ 5760,
+ 3650
+ ],
+ "size": [
+ 280,
+ 200
+ ],
+ "flags": {},
+ "order": 40,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 539
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 553
+ },
+ {
+ "localized_name": "tile_size",
+ "name": "tile_size",
+ "type": "INT",
+ "widget": {
+ "name": "tile_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "overlap",
+ "name": "overlap",
+ "type": "INT",
+ "widget": {
+ "name": "overlap"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temporal_size",
+ "name": "temporal_size",
+ "type": "INT",
+ "widget": {
+ "name": "temporal_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "temporal_overlap",
+ "name": "temporal_overlap",
+ "type": "INT",
+ "widget": {
+ "name": "temporal_overlap"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 538
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "Node name for S&R": "VAEDecodeTiled",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 768,
+ 64,
+ 4096,
+ 4
+ ]
+ },
+ {
+ "id": 318,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ 1660,
+ 3660
+ ],
+ "size": [
+ 430,
+ 160
+ ],
+ "flags": {},
+ "order": 41,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 601
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 520
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 556,
+ 557
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.3.56",
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-dev-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-dev-fp8.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "ltx-2.3-22b-dev-fp8.safetensors"
+ ]
+ },
+ {
+ "id": 319,
+ "type": "LTXAVTextEncoderLoader",
+ "pos": [
+ 1660,
+ 4340
+ ],
+ "size": [
+ 430,
+ 170
+ ],
+ "flags": {},
+ "order": 42,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "text_encoder",
+ "name": "text_encoder",
+ "type": "COMBO",
+ "widget": {
+ "name": "text_encoder"
+ },
+ "link": 606
+ },
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 605
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 615,
+ 625
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXAVTextEncoderLoader",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "ltx-2.3-22b-dev-fp8.safetensors",
+ "url": "https://huggingface.co/Lightricks/LTX-2.3-fp8/resolve/main/ltx-2.3-22b-dev-fp8.safetensors",
+ "directory": "checkpoints"
+ },
+ {
+ "name": "gemma_3_12B_it_fp4_mixed.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/ltx-2/resolve/main/split_files/text_encoders/gemma_3_12B_it_fp4_mixed.safetensors",
+ "directory": "text_encoders"
+ }
+ ]
+ },
+ "widgets_values": [
+ "gemma_3_12B_it_fp4_mixed.safetensors",
+ "ltx-2.3-22b-dev-fp8.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 320,
+ "type": "PrimitiveStringMultiline",
+ "pos": [
+ 1160,
+ 3680
+ ],
+ "size": [
+ 370,
+ 350
+ ],
+ "flags": {},
+ "order": 43,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "STRING",
+ "widget": {
+ "name": "value"
+ },
+ "link": 595
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "STRING",
+ "name": "STRING",
+ "type": "STRING",
+ "links": [
+ 623
+ ]
+ }
+ ],
+ "title": "Prompt",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "PrimitiveStringMultiline",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 321,
+ "type": "LTXVConcatAVLatent",
+ "pos": [
+ 3820,
+ 4990
+ ],
+ "size": [
+ 240,
+ 100
+ ],
+ "flags": {},
+ "order": 44,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video_latent",
+ "name": "video_latent",
+ "type": "LATENT",
+ "link": 497
+ },
+ {
+ "localized_name": "audio_latent",
+ "name": "audio_latent",
+ "type": "LATENT",
+ "link": 498
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 487
+ ]
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.7.0",
+ "Node name for S&R": "LTXVConcatAVLatent",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 322,
+ "type": "LoadImage",
+ "pos": [
+ 1150,
+ 4940
+ ],
+ "size": [
+ 400,
+ 480
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "COMBO",
+ "widget": {
+ "name": "image"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "choose file to upload",
+ "name": "upload",
+ "type": "IMAGEUPLOAD",
+ "widget": {
+ "name": "upload"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 626
+ ]
+ },
+ {
+ "localized_name": "MASK",
+ "name": "MASK",
+ "type": "MASK",
+ "links": null
+ }
+ ],
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.16.3",
+ "Node name for S&R": "LoadImage",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "example.png",
+ "image"
+ ]
+ },
+ {
+ "id": 323,
+ "type": "ComfyMathExpression",
+ "pos": [
+ 2540,
+ 5370
+ ],
+ "size": [
+ 260,
+ 190
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 45,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 628
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": 629
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 630,
+ 631
+ ]
+ }
+ ],
+ "title": "Math Expression (length)",
+ "properties": {
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "input_ue_unconnectable": {}
+ },
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a * b + 1"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Model",
+ "bounding": [
+ 1630,
+ 3550,
+ 480,
+ 1270
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Generate Low Resolution",
+ "bounding": [
+ 3150,
+ 3550,
+ 1020,
+ 1270
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Prompt",
+ "bounding": [
+ 2140,
+ 3550,
+ 980,
+ 1270
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "Generate High Resolution",
+ "bounding": [
+ 4690,
+ 3550,
+ 960,
+ 1270
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 7,
+ "title": "Latent Upscale",
+ "bounding": [
+ 4200,
+ 3550,
+ 460,
+ 1270
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 19,
+ "title": "Video Settings",
+ "bounding": [
+ 1110,
+ 3550,
+ 490,
+ 1270
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 20,
+ "title": "Image Preprocess",
+ "bounding": [
+ 1630,
+ 4850,
+ 830,
+ 610
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 21,
+ "title": "Empty Latent",
+ "bounding": [
+ 2830,
+ 4850,
+ 1340,
+ 610
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 22,
+ "title": "Number conversion",
+ "bounding": [
+ 2490,
+ 4850,
+ 320,
+ 610
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ },
+ {
+ "id": 26,
+ "title": "Image will not affect the video",
+ "bounding": [
+ 1110,
+ 4850,
+ 490,
+ 610
+ ],
+ "color": "#3f789e",
+ "font_size": 24,
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 512,
+ "origin_id": 290,
+ "origin_slot": 0,
+ "target_id": 280,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 513,
+ "origin_id": 309,
+ "origin_slot": 1,
+ "target_id": 280,
+ "target_slot": 1,
+ "type": "LATENT"
+ },
+ {
+ "id": 478,
+ "origin_id": 287,
+ "origin_slot": 0,
+ "target_id": 284,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 479,
+ "origin_id": 286,
+ "origin_slot": 0,
+ "target_id": 284,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 480,
+ "origin_id": 286,
+ "origin_slot": 1,
+ "target_id": 284,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 541,
+ "origin_id": 287,
+ "origin_slot": 0,
+ "target_id": 316,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 518,
+ "origin_id": 306,
+ "origin_slot": 0,
+ "target_id": 316,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 519,
+ "origin_id": 306,
+ "origin_slot": 1,
+ "target_id": 316,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 483,
+ "origin_id": 279,
+ "origin_slot": 0,
+ "target_id": 285,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 484,
+ "origin_id": 316,
+ "origin_slot": 0,
+ "target_id": 285,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 485,
+ "origin_id": 293,
+ "origin_slot": 0,
+ "target_id": 285,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 544,
+ "origin_id": 308,
+ "origin_slot": 0,
+ "target_id": 285,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 487,
+ "origin_id": 321,
+ "origin_slot": 0,
+ "target_id": 285,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 475,
+ "origin_id": 306,
+ "origin_slot": 0,
+ "target_id": 286,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 476,
+ "origin_id": 306,
+ "origin_slot": 1,
+ "target_id": 286,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 477,
+ "origin_id": 309,
+ "origin_slot": 0,
+ "target_id": 286,
+ "target_slot": 2,
+ "type": "LATENT"
+ },
+ {
+ "id": 520,
+ "origin_id": 318,
+ "origin_slot": 0,
+ "target_id": 287,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 523,
+ "origin_id": 292,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 547,
+ "origin_id": 309,
+ "origin_slot": 0,
+ "target_id": 289,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 545,
+ "origin_id": 313,
+ "origin_slot": 0,
+ "target_id": 289,
+ "target_slot": 1,
+ "type": "LATENT_UPSCALE_MODEL"
+ },
+ {
+ "id": 554,
+ "origin_id": 295,
+ "origin_slot": 0,
+ "target_id": 289,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 552,
+ "origin_id": 295,
+ "origin_slot": 0,
+ "target_id": 290,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 515,
+ "origin_id": 291,
+ "origin_slot": 0,
+ "target_id": 290,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 548,
+ "origin_id": 289,
+ "origin_slot": 0,
+ "target_id": 290,
+ "target_slot": 2,
+ "type": "LATENT"
+ },
+ {
+ "id": 543,
+ "origin_id": 304,
+ "origin_slot": 0,
+ "target_id": 290,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 505,
+ "origin_id": 288,
+ "origin_slot": 0,
+ "target_id": 291,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 558,
+ "origin_id": 314,
+ "origin_slot": 0,
+ "target_id": 292,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 559,
+ "origin_id": 301,
+ "origin_slot": 0,
+ "target_id": 292,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 560,
+ "origin_id": 314,
+ "origin_slot": 0,
+ "target_id": 294,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 557,
+ "origin_id": 318,
+ "origin_slot": 2,
+ "target_id": 295,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 562,
+ "origin_id": 301,
+ "origin_slot": 0,
+ "target_id": 296,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 561,
+ "origin_id": 294,
+ "origin_slot": 1,
+ "target_id": 297,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 563,
+ "origin_id": 296,
+ "origin_slot": 1,
+ "target_id": 297,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 556,
+ "origin_id": 318,
+ "origin_slot": 2,
+ "target_id": 298,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 510,
+ "origin_id": 291,
+ "origin_slot": 0,
+ "target_id": 298,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 511,
+ "origin_id": 297,
+ "origin_slot": 0,
+ "target_id": 298,
+ "target_slot": 2,
+ "type": "LATENT"
+ },
+ {
+ "id": 542,
+ "origin_id": 304,
+ "origin_slot": 0,
+ "target_id": 298,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 495,
+ "origin_id": 311,
+ "origin_slot": 1,
+ "target_id": 299,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 496,
+ "origin_id": 281,
+ "origin_slot": 0,
+ "target_id": 299,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 564,
+ "origin_id": 302,
+ "origin_slot": 0,
+ "target_id": 300,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 526,
+ "origin_id": 305,
+ "origin_slot": 0,
+ "target_id": 306,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 527,
+ "origin_id": 315,
+ "origin_slot": 0,
+ "target_id": 306,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 566,
+ "origin_id": 300,
+ "origin_slot": 0,
+ "target_id": 306,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 497,
+ "origin_id": 298,
+ "origin_slot": 0,
+ "target_id": 321,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 498,
+ "origin_id": 307,
+ "origin_slot": 0,
+ "target_id": 321,
+ "target_slot": 1,
+ "type": "LATENT"
+ },
+ {
+ "id": 481,
+ "origin_id": 281,
+ "origin_slot": 0,
+ "target_id": 307,
+ "target_slot": 0,
+ "type": "VAE"
+ },
+ {
+ "id": 565,
+ "origin_id": 300,
+ "origin_slot": 1,
+ "target_id": 307,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 488,
+ "origin_id": 285,
+ "origin_slot": 0,
+ "target_id": 309,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 490,
+ "origin_id": 278,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 491,
+ "origin_id": 284,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 492,
+ "origin_id": 282,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 493,
+ "origin_id": 283,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 494,
+ "origin_id": 280,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 578,
+ "origin_id": 310,
+ "origin_slot": 0,
+ "target_id": 311,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 539,
+ "origin_id": 311,
+ "origin_slot": 0,
+ "target_id": 317,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 553,
+ "origin_id": 295,
+ "origin_slot": 0,
+ "target_id": 317,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 538,
+ "origin_id": 317,
+ "origin_slot": 0,
+ "target_id": 312,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 534,
+ "origin_id": 299,
+ "origin_slot": 0,
+ "target_id": 312,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 591,
+ "origin_id": 300,
+ "origin_slot": 0,
+ "target_id": 312,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 536,
+ "origin_id": 312,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 595,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 320,
+ "target_slot": 0,
+ "type": "STRING"
+ },
+ {
+ "id": 597,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 314,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 598,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 301,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 599,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 303,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 601,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 318,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 602,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 287,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 604,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 281,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 605,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 319,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 606,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 319,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 607,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 313,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 615,
+ "origin_id": 319,
+ "origin_slot": 0,
+ "target_id": 305,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 623,
+ "origin_id": 320,
+ "origin_slot": 0,
+ "target_id": 305,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 625,
+ "origin_id": 319,
+ "origin_slot": 0,
+ "target_id": 315,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 626,
+ "origin_id": 322,
+ "origin_slot": 0,
+ "target_id": 292,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 627,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 302,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 628,
+ "origin_id": 303,
+ "origin_slot": 0,
+ "target_id": 323,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 629,
+ "origin_id": 302,
+ "origin_slot": 0,
+ "target_id": 323,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 630,
+ "origin_id": 323,
+ "origin_slot": 1,
+ "target_id": 307,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 631,
+ "origin_id": 323,
+ "origin_slot": 1,
+ "target_id": 297,
+ "target_slot": 2,
+ "type": "INT"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "Vue-corrected"
+ },
+ "category": "Video generation and editing/Text to video",
+ "description": "Generates video from text prompts using LTX-2.3, Lightricks' video diffusion model."
+ }
+ ]
+ },
+ "extra": {
+ "ue_links": []
+ }
+}
\ No newline at end of file
diff --git a/blueprints/Text to Video (Wan 2.2).json b/blueprints/Text to Video (Wan 2.2).json
index 0ce485b67..a264a490d 100644
--- a/blueprints/Text to Video (Wan 2.2).json
+++ b/blueprints/Text to Video (Wan 2.2).json
@@ -1572,7 +1572,8 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Video generation and editing/Text to video"
+ "category": "Video generation and editing/Text to video",
+ "description": "Generates video from text prompts using Wan2.2, Alibaba's diffusion video model."
}
]
},
@@ -1586,4 +1587,4 @@
"VHS_KeepIntermediate": true
},
"version": 0.4
-}
+}
\ No newline at end of file
diff --git a/blueprints/Unsharp Mask.json b/blueprints/Unsharp Mask.json
index b673eb703..79a4c954f 100644
--- a/blueprints/Unsharp Mask.json
+++ b/blueprints/Unsharp Mask.json
@@ -383,7 +383,7 @@
"Node name for S&R": "GLSLShader"
},
"widgets_values": [
- "#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform vec2 u_resolution;\nuniform float u_float0; // amount [0.0 - 3.0] typical: 0.5-1.5\nuniform float u_float1; // radius [0.5 - 10.0] blur radius in pixels\nuniform float u_float2; // threshold [0.0 - 0.1] min difference to sharpen\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nfloat gaussian(float x, float sigma) {\n return exp(-(x * x) / (2.0 * sigma * sigma));\n}\n\nfloat getLuminance(vec3 color) {\n return dot(color, vec3(0.2126, 0.7152, 0.0722));\n}\n\nvoid main() {\n vec2 texel = 1.0 / u_resolution;\n float radius = max(u_float1, 0.5);\n float amount = u_float0;\n float threshold = u_float2;\n\n vec4 original = texture(u_image0, v_texCoord);\n\n // Gaussian blur for the \"unsharp\" mask\n int samples = int(ceil(radius));\n float sigma = radius / 2.0;\n\n vec4 blurred = vec4(0.0);\n float totalWeight = 0.0;\n\n for (int x = -samples; x <= samples; x++) {\n for (int y = -samples; y <= samples; y++) {\n vec2 offset = vec2(float(x), float(y)) * texel;\n vec4 sample_color = texture(u_image0, v_texCoord + offset);\n\n float dist = length(vec2(float(x), float(y)));\n float weight = gaussian(dist, sigma);\n blurred += sample_color * weight;\n totalWeight += weight;\n }\n }\n blurred /= totalWeight;\n\n // Unsharp mask = original - blurred\n vec3 mask = original.rgb - blurred.rgb;\n\n // Luminance-based threshold with smooth falloff\n float lumaDelta = abs(getLuminance(original.rgb) - getLuminance(blurred.rgb));\n float thresholdScale = smoothstep(0.0, threshold, lumaDelta);\n mask *= thresholdScale;\n\n // Sharpen: original + mask * amount\n vec3 sharpened = original.rgb + mask * amount;\n\n fragColor0 = vec4(clamp(sharpened, 0.0, 1.0), original.a);\n}\n",
+ "#version 300 es\nprecision highp float;\n\nuniform sampler2D u_image0;\nuniform float u_float0; // amount [0.0 - 3.0] typical: 0.5-1.5\nuniform float u_float1; // radius [0.5 - 10.0] blur radius in pixels\nuniform float u_float2; // threshold [0.0 - 0.1] min difference to sharpen\n\nin vec2 v_texCoord;\nlayout(location = 0) out vec4 fragColor0;\n\nfloat gaussian(float x, float sigma) {\n return exp(-(x * x) / (2.0 * sigma * sigma));\n}\n\nfloat getLuminance(vec3 color) {\n return dot(color, vec3(0.2126, 0.7152, 0.0722));\n}\n\nvoid main() {\n vec2 texel = 1.0 / vec2(textureSize(u_image0, 0));\n float radius = max(u_float1, 0.5);\n float amount = u_float0;\n float threshold = u_float2;\n\n vec4 original = texture(u_image0, v_texCoord);\n\n // Gaussian blur for the \"unsharp\" mask\n int samples = int(ceil(radius));\n float sigma = radius / 2.0;\n\n vec4 blurred = vec4(0.0);\n float totalWeight = 0.0;\n\n for (int x = -samples; x <= samples; x++) {\n for (int y = -samples; y <= samples; y++) {\n vec2 offset = vec2(float(x), float(y)) * texel;\n vec4 sample_color = texture(u_image0, v_texCoord + offset);\n\n float dist = length(vec2(float(x), float(y)));\n float weight = gaussian(dist, sigma);\n blurred += sample_color * weight;\n totalWeight += weight;\n }\n }\n blurred /= totalWeight;\n\n // Unsharp mask = original - blurred\n vec3 mask = original.rgb - blurred.rgb;\n\n // Luminance-based threshold with smooth falloff\n float lumaDelta = abs(getLuminance(original.rgb) - getLuminance(blurred.rgb));\n float thresholdScale = smoothstep(0.0, threshold, lumaDelta);\n mask *= thresholdScale;\n\n // Sharpen: original + mask * amount\n vec3 sharpened = original.rgb + mask * amount;\n\n fragColor0 = vec4(clamp(sharpened, 0.0, 1.0), original.a);\n}\n",
"from_input"
]
}
@@ -434,8 +434,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Image Tools/Sharpen"
+ "category": "Image Tools/Sharpen",
+ "description": "Enhances edge contrast via unsharp masking for a sharper image appearance."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Video Captioning (Gemini).json b/blueprints/Video Captioning (Gemini).json
index ea6dc8bee..54a7d6e78 100644
--- a/blueprints/Video Captioning (Gemini).json
+++ b/blueprints/Video Captioning (Gemini).json
@@ -307,8 +307,9 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Text generation/Video Captioning"
+ "category": "Video Tools",
+ "description": "Generates descriptive captions for video input using Google's Gemini multimodal LLM."
}
]
}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Video Depth Estimation (MoGe).json b/blueprints/Video Depth Estimation (MoGe).json
new file mode 100644
index 000000000..025e20cda
--- /dev/null
+++ b/blueprints/Video Depth Estimation (MoGe).json
@@ -0,0 +1,1226 @@
+{
+ "revision": 0,
+ "last_node_id": 72,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 72,
+ "type": "7ff83f68-6848-47a8-aa43-9036ca6c46e8",
+ "pos": [
+ -4440,
+ 4550
+ ],
+ "size": [
+ 430,
+ 330
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "inference_resolution",
+ "name": "inference_resolution",
+ "type": "INT",
+ "widget": {
+ "name": "inference_resolution"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "inference_batch_size",
+ "name": "inference_batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "inference_batch_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "moge_model",
+ "name": "moge_model",
+ "type": "COMBO",
+ "widget": {
+ "name": "moge_model"
+ },
+ "link": null
+ },
+ {
+ "label": "auto_resize_input",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": null
+ },
+ {
+ "name": "video",
+ "type": "VIDEO",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "depth_colored",
+ "name": "depth_colored",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "localized_name": "depth",
+ "name": "depth",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "name": "MASK",
+ "type": "MASK",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "13",
+ "resolution_level"
+ ],
+ [
+ "13",
+ "batch_size"
+ ],
+ [
+ "32",
+ "model_name"
+ ],
+ [
+ "53",
+ "switch"
+ ]
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [],
+ "title": "Video Depth Estimation (MoGe)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "7ff83f68-6848-47a8-aa43-9036ca6c46e8",
+ "version": 1,
+ "state": {
+ "lastGroupId": 1,
+ "lastNodeId": 72,
+ "lastLinkId": 96,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Video Depth Estimation (MoGe)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -5320,
+ 5320,
+ 167.337890625,
+ 148
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -3090,
+ 4966,
+ 129,
+ 108
+ ]
+ },
+ "inputs": [
+ {
+ "id": "06eefa21-8e60-49f3-9a34-35b081f4ae52",
+ "name": "inference_resolution",
+ "type": "INT",
+ "linkIds": [
+ 73
+ ],
+ "localized_name": "inference_resolution",
+ "pos": [
+ -5176.662109375,
+ 5344
+ ]
+ },
+ {
+ "id": "616638fe-f603-4d10-bae9-fc87c134380f",
+ "name": "inference_batch_size",
+ "type": "INT",
+ "linkIds": [
+ 74
+ ],
+ "localized_name": "inference_batch_size",
+ "pos": [
+ -5176.662109375,
+ 5364
+ ]
+ },
+ {
+ "id": "65694805-186e-4181-a721-df8b5af49d31",
+ "name": "moge_model",
+ "type": "COMBO",
+ "linkIds": [
+ 79
+ ],
+ "localized_name": "moge_model",
+ "pos": [
+ -5176.662109375,
+ 5384
+ ]
+ },
+ {
+ "id": "badf1be1-53c6-4fc1-b5cd-79ad3daf1674",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 83
+ ],
+ "label": "auto_resize_input",
+ "pos": [
+ -5176.662109375,
+ 5404
+ ]
+ },
+ {
+ "id": "749bad18-d00a-4ec4-a5ff-e45b1d0cf089",
+ "name": "video",
+ "type": "VIDEO",
+ "linkIds": [
+ 91
+ ],
+ "pos": [
+ -5176.662109375,
+ 5424
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "59c37b52-074f-49fc-9731-483f899c12c4",
+ "name": "depth_colored",
+ "type": "IMAGE",
+ "linkIds": [
+ 36
+ ],
+ "localized_name": "depth_colored",
+ "pos": [
+ -3066,
+ 4990
+ ]
+ },
+ {
+ "id": "f583e936-da5c-4630-9901-391fa605c1f8",
+ "name": "depth",
+ "type": "IMAGE",
+ "linkIds": [
+ 40
+ ],
+ "localized_name": "depth",
+ "pos": [
+ -3066,
+ 5010
+ ]
+ },
+ {
+ "id": "6845b6a1-1980-454a-9451-314f24495c1d",
+ "name": "MASK",
+ "type": "MASK",
+ "linkIds": [
+ 86
+ ],
+ "pos": [
+ -3066,
+ 5030
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 13,
+ "type": "MoGeInference",
+ "pos": [
+ -3790,
+ 5180
+ ],
+ "size": [
+ 270,
+ 230
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_model",
+ "name": "moge_model",
+ "type": "MOGE_MODEL",
+ "link": 58
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 81
+ },
+ {
+ "localized_name": "resolution_level",
+ "name": "resolution_level",
+ "type": "INT",
+ "widget": {
+ "name": "resolution_level"
+ },
+ "link": 73
+ },
+ {
+ "localized_name": "fov_x_degrees",
+ "name": "fov_x_degrees",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fov_x_degrees"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": 74
+ },
+ {
+ "localized_name": "force_projection",
+ "name": "force_projection",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "force_projection"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "apply_mask",
+ "name": "apply_mask",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "apply_mask"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "links": [
+ 35,
+ 39,
+ 61
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeInference",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [
+ 9,
+ 0,
+ 4,
+ true,
+ true
+ ]
+ },
+ {
+ "id": 23,
+ "type": "MoGeRender",
+ "pos": [
+ -3430,
+ 4870
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 35
+ },
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "COMBO",
+ "widget": {
+ "name": "output"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 36
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeRender",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [
+ "depth_colored"
+ ]
+ },
+ {
+ "id": 25,
+ "type": "MoGeRender",
+ "pos": [
+ -3430,
+ 5030
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 39
+ },
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "COMBO",
+ "widget": {
+ "name": "output"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 40
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeRender",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [
+ "depth"
+ ]
+ },
+ {
+ "id": 32,
+ "type": "LoadMoGeModel",
+ "pos": [
+ -4180,
+ 4880
+ ],
+ "size": [
+ 270,
+ 140
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 79
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MOGE_MODEL",
+ "name": "MOGE_MODEL",
+ "type": "MOGE_MODEL",
+ "links": [
+ 58
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadMoGeModel",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "models": [
+ {
+ "name": "moge_2_vitl_normal_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/MoGe/resolve/main/geometry_estimation/moge_2_vitl_normal_fp16.safetensors",
+ "directory": "geometry_estimation"
+ }
+ ]
+ },
+ "widgets_values": [
+ "moge_2_vitl_normal_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 36,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -4720,
+ 4910
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 49
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": [
+ 53
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [
+ "a > 2048"
+ ]
+ },
+ {
+ "id": 37,
+ "type": "GetImageSize",
+ "pos": [
+ -4980,
+ 4910
+ ],
+ "size": [
+ 230,
+ 160
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 92
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 49
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ }
+ },
+ {
+ "id": 40,
+ "type": "ResizeImagesByLongerEdge",
+ "pos": [
+ -4650,
+ 5210
+ ],
+ "size": [
+ 310,
+ 110
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 93
+ },
+ {
+ "localized_name": "longer_edge",
+ "name": "longer_edge",
+ "type": "INT",
+ "widget": {
+ "name": "longer_edge"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 54
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ResizeImagesByLongerEdge",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [
+ 2048
+ ]
+ },
+ {
+ "id": 42,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -4180,
+ 5060
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 94
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 54
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 53
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 80
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 45,
+ "type": "MoGeRender",
+ "pos": [
+ -3430,
+ 5200
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "moge_geometry",
+ "name": "moge_geometry",
+ "type": "MOGE_GEOMETRY",
+ "link": 61
+ },
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "COMBO",
+ "widget": {
+ "name": "output"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 85
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MoGeRender",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [
+ "mask"
+ ]
+ },
+ {
+ "id": 53,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -4160,
+ 5340
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 95
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 80
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 83
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 81
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1"
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 68,
+ "type": "ImageToMask",
+ "pos": [
+ -3420,
+ 5360
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 85
+ },
+ {
+ "localized_name": "channel",
+ "name": "channel",
+ "type": "COMBO",
+ "widget": {
+ "name": "channel"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MASK",
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 86
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageToMask",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0"
+ },
+ "widgets_values": [
+ "red"
+ ]
+ },
+ {
+ "id": 70,
+ "type": "GetVideoComponents",
+ "pos": [
+ -4920,
+ 5490
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 91
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 92,
+ 93,
+ 94,
+ 95
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": null
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0"
+ }
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "auto_resize_if_width_gt_2048",
+ "bounding": [
+ -5000,
+ 4840,
+ 690,
+ 280
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 58,
+ "origin_id": 32,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 0,
+ "type": "MOGE_MODEL"
+ },
+ {
+ "id": 35,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 23,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 39,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 25,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 49,
+ "origin_id": 37,
+ "origin_slot": 0,
+ "target_id": 36,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 54,
+ "origin_id": 40,
+ "origin_slot": 0,
+ "target_id": 42,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 53,
+ "origin_id": 36,
+ "origin_slot": 2,
+ "target_id": 42,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 61,
+ "origin_id": 13,
+ "origin_slot": 0,
+ "target_id": 45,
+ "target_slot": 0,
+ "type": "MOGE_GEOMETRY"
+ },
+ {
+ "id": 36,
+ "origin_id": 23,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 40,
+ "origin_id": 25,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 73,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 74,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 13,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 79,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 32,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 80,
+ "origin_id": 42,
+ "origin_slot": 0,
+ "target_id": 53,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 81,
+ "origin_id": 53,
+ "origin_slot": 0,
+ "target_id": 13,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 83,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 53,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 85,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 68,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 86,
+ "origin_id": 68,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "MASK"
+ },
+ {
+ "id": 91,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 70,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 92,
+ "origin_id": 70,
+ "origin_slot": 0,
+ "target_id": 37,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 93,
+ "origin_id": 70,
+ "origin_slot": 0,
+ "target_id": 40,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 94,
+ "origin_id": 70,
+ "origin_slot": 0,
+ "target_id": 42,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 95,
+ "origin_id": 70,
+ "origin_slot": 0,
+ "target_id": 53,
+ "target_slot": 0,
+ "type": "IMAGE"
+ }
+ ],
+ "extra": {},
+ "category": "Conditioning & Preprocessors/Depth",
+ "description": "Estimates monocular depth from an input video using MoGe, outputting both raw and colorized depth maps plus a mask."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Video Face Detection (Mediapipe).json b/blueprints/Video Face Detection (Mediapipe).json
new file mode 100644
index 000000000..c70352481
--- /dev/null
+++ b/blueprints/Video Face Detection (Mediapipe).json
@@ -0,0 +1,1109 @@
+{
+ "revision": 0,
+ "last_node_id": 167,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 167,
+ "type": "ca14b151-8f5e-4386-aab7-d2ec84eaf43c",
+ "pos": [
+ -3410,
+ 6100
+ ],
+ "size": [
+ 420,
+ 481.3125
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "video",
+ "type": "VIDEO",
+ "link": null
+ },
+ {
+ "label": "trim_audio",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": null
+ },
+ {
+ "name": "start_time",
+ "type": "FLOAT",
+ "widget": {
+ "name": "start_time"
+ },
+ "link": null
+ },
+ {
+ "name": "duration",
+ "type": "FLOAT",
+ "widget": {
+ "name": "duration"
+ },
+ "link": null
+ },
+ {
+ "label": "face_landmarker",
+ "name": "face_landmarker_1",
+ "type": "FACE_LANDMARKER",
+ "link": null
+ },
+ {
+ "label": "detector_variant",
+ "name": "detector_variant_1",
+ "type": "COMBO",
+ "widget": {
+ "name": "detector_variant_1"
+ },
+ "link": null
+ },
+ {
+ "label": "num_faces",
+ "name": "num_faces_1",
+ "type": "INT",
+ "widget": {
+ "name": "num_faces_1"
+ },
+ "link": null
+ },
+ {
+ "label": "face_oval",
+ "name": "regions.face_oval",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.face_oval"
+ },
+ "link": null
+ },
+ {
+ "label": "face_lips",
+ "name": "regions.lips",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.lips"
+ },
+ "link": null
+ },
+ {
+ "label": "left_eye",
+ "name": "regions.left_eye",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.left_eye"
+ },
+ "link": null
+ },
+ {
+ "label": "right_eye",
+ "name": "regions.right_eye_1",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.right_eye_1"
+ },
+ "link": null
+ },
+ {
+ "label": "irises",
+ "name": "regions.irises_1",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.irises_1"
+ },
+ "link": null
+ },
+ {
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "label": "mask",
+ "name": "MASK_1",
+ "type": "MASK",
+ "links": []
+ },
+ {
+ "label": "bboxes",
+ "name": "bboxes_1",
+ "type": "BOUNDING_BOX",
+ "links": null
+ },
+ {
+ "name": "face_landmarks",
+ "type": "FACE_LANDMARKS",
+ "links": null
+ }
+ ],
+ "title": "Video Face Detection (Mediapipe)",
+ "properties": {
+ "proxyWidgets": [
+ [
+ "165",
+ "switch"
+ ],
+ [
+ "164",
+ "start_time"
+ ],
+ [
+ "164",
+ "duration"
+ ],
+ [
+ "11",
+ "detector_variant"
+ ],
+ [
+ "11",
+ "num_faces"
+ ],
+ [
+ "20",
+ "regions.face_oval"
+ ],
+ [
+ "20",
+ "regions.lips"
+ ],
+ [
+ "20",
+ "regions.left_eye"
+ ],
+ [
+ "20",
+ "regions.right_eye"
+ ],
+ [
+ "20",
+ "regions.irises"
+ ],
+ [
+ "2",
+ "model_name"
+ ]
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0"
+ },
+ "widgets_values": []
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "ca14b151-8f5e-4386-aab7-d2ec84eaf43c",
+ "version": 1,
+ "state": {
+ "lastGroupId": 2,
+ "lastNodeId": 167,
+ "lastLinkId": 168,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Video Face Detection (Mediapipe)",
+ "description": "Detects facial landmarks from a video using MediaPipe, outputting landmark data, face bounding boxes, and an optional face-region mask.",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1060,
+ 4350,
+ 142.587890625,
+ 308
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 470,
+ 4460,
+ 137.677734375,
+ 108
+ ]
+ },
+ "inputs": [
+ {
+ "id": "16e5a20f-22bc-4960-a67b-e32c64409c49",
+ "name": "video",
+ "type": "VIDEO",
+ "linkIds": [
+ 150,
+ 153
+ ],
+ "pos": [
+ -941.412109375,
+ 4374
+ ]
+ },
+ {
+ "id": "cc7fc7d4-24ec-4c00-878e-1af1b6809b4b",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 154
+ ],
+ "label": "trim_audio",
+ "pos": [
+ -941.412109375,
+ 4394
+ ]
+ },
+ {
+ "id": "efa9ab9f-ca70-449c-be43-5ca60c7f0d59",
+ "name": "start_time",
+ "type": "FLOAT",
+ "linkIds": [
+ 155
+ ],
+ "pos": [
+ -941.412109375,
+ 4414
+ ]
+ },
+ {
+ "id": "45050127-4089-4b85-bf81-73b725196c2e",
+ "name": "duration",
+ "type": "FLOAT",
+ "linkIds": [
+ 156
+ ],
+ "pos": [
+ -941.412109375,
+ 4434
+ ]
+ },
+ {
+ "id": "239fcd3b-6324-4824-8255-98199ae58914",
+ "name": "face_landmarker_1",
+ "type": "FACE_LANDMARKER",
+ "linkIds": [
+ 157
+ ],
+ "label": "face_landmarker",
+ "pos": [
+ -941.412109375,
+ 4454
+ ]
+ },
+ {
+ "id": "f79f67b9-5bcb-4cab-9101-8b9dee461bca",
+ "name": "detector_variant_1",
+ "type": "COMBO",
+ "linkIds": [
+ 158
+ ],
+ "label": "detector_variant",
+ "pos": [
+ -941.412109375,
+ 4474
+ ]
+ },
+ {
+ "id": "3369790b-e730-41bf-b5b2-dc1f5fafbe11",
+ "name": "num_faces_1",
+ "type": "INT",
+ "linkIds": [
+ 159
+ ],
+ "label": "num_faces",
+ "pos": [
+ -941.412109375,
+ 4494
+ ]
+ },
+ {
+ "id": "964f6b5f-44ac-456e-ba3a-a3039dfe0729",
+ "name": "regions.face_oval",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 160
+ ],
+ "label": "face_oval",
+ "pos": [
+ -941.412109375,
+ 4514
+ ]
+ },
+ {
+ "id": "d6e89b51-65a2-4f37-a561-8cec3a5040fd",
+ "name": "regions.lips",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 161
+ ],
+ "label": "face_lips",
+ "pos": [
+ -941.412109375,
+ 4534
+ ]
+ },
+ {
+ "id": "49f02319-ea4a-4a69-88f8-589d2ef7c97a",
+ "name": "regions.left_eye",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 162
+ ],
+ "label": "left_eye",
+ "pos": [
+ -941.412109375,
+ 4554
+ ]
+ },
+ {
+ "id": "89179a19-aca6-4469-a0b9-2a4bd21bceea",
+ "name": "regions.right_eye_1",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 163
+ ],
+ "label": "right_eye",
+ "pos": [
+ -941.412109375,
+ 4574
+ ]
+ },
+ {
+ "id": "f5667690-24b5-4df9-9210-b8610c68ff5f",
+ "name": "regions.irises_1",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 164
+ ],
+ "label": "irises",
+ "pos": [
+ -941.412109375,
+ 4594
+ ]
+ },
+ {
+ "id": "66c805f6-6ccd-41f9-8a77-fc934b7f4713",
+ "name": "model_name",
+ "type": "COMBO",
+ "linkIds": [
+ 165
+ ],
+ "pos": [
+ -941.412109375,
+ 4614
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "f6309e1d-6397-4363-b38f-778a122abc51",
+ "name": "MASK_1",
+ "type": "MASK",
+ "linkIds": [
+ 83
+ ],
+ "label": "mask",
+ "pos": [
+ 494,
+ 4484
+ ]
+ },
+ {
+ "id": "59669f0a-b4b2-49d1-85f8-fc2a88059b1a",
+ "name": "bboxes_1",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 166
+ ],
+ "label": "bboxes",
+ "pos": [
+ 494,
+ 4504
+ ]
+ },
+ {
+ "id": "57f66731-e106-4f8b-a0a0-aed3c620b37b",
+ "name": "face_landmarks",
+ "type": "FACE_LANDMARKS",
+ "linkIds": [
+ 167
+ ],
+ "pos": [
+ 494,
+ 4524
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 11,
+ "type": "MediaPipeFaceLandmarker",
+ "pos": [
+ -60,
+ 4380
+ ],
+ "size": [
+ 350,
+ 220
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "face_detection_model",
+ "name": "face_detection_model",
+ "type": "FACE_DETECTION_MODEL",
+ "link": 66
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 149
+ },
+ {
+ "localized_name": "detector_variant",
+ "name": "detector_variant",
+ "type": "COMBO",
+ "widget": {
+ "name": "detector_variant"
+ },
+ "link": 158
+ },
+ {
+ "localized_name": "num_faces",
+ "name": "num_faces",
+ "type": "INT",
+ "widget": {
+ "name": "num_faces"
+ },
+ "link": 159
+ },
+ {
+ "localized_name": "min_confidence",
+ "name": "min_confidence",
+ "type": "FLOAT",
+ "widget": {
+ "name": "min_confidence"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "missing_frame_fallback",
+ "name": "missing_frame_fallback",
+ "type": "COMBO",
+ "widget": {
+ "name": "missing_frame_fallback"
+ },
+ "link": null
+ },
+ {
+ "name": "face_landmarker",
+ "type": "FACE_LANDMARKER",
+ "link": 157
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "face_landmarks",
+ "name": "face_landmarks",
+ "type": "FACE_LANDMARKS",
+ "links": [
+ 46,
+ 167
+ ]
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 166
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MediaPipeFaceLandmarker",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0"
+ },
+ "widgets_values": [
+ "full",
+ 0,
+ 0.5,
+ "empty"
+ ]
+ },
+ {
+ "id": 2,
+ "type": "LoadMediaPipeFaceLandmarker",
+ "pos": [
+ -70,
+ 4160
+ ],
+ "size": [
+ 350,
+ 140
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 165
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FACE_DETECTION_MODEL",
+ "name": "FACE_DETECTION_MODEL",
+ "type": "FACE_DETECTION_MODEL",
+ "links": [
+ 66
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoadMediaPipeFaceLandmarker",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0",
+ "models": [
+ {
+ "name": "mediapipe_face_fp32.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/mediapipe/resolve/main/detection/mediapipe_face_fp32.safetensors",
+ "directory": "detection"
+ }
+ ]
+ },
+ "widgets_values": [
+ "mediapipe_face_fp32.safetensors"
+ ]
+ },
+ {
+ "id": 20,
+ "type": "MediaPipeFaceMask",
+ "pos": [
+ -70,
+ 4660
+ ],
+ "size": [
+ 360,
+ 180
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "face_landmarks",
+ "name": "face_landmarks",
+ "type": "FACE_LANDMARKS",
+ "link": 46
+ },
+ {
+ "localized_name": "regions",
+ "name": "regions",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "regions"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "regions.face_oval",
+ "name": "regions.face_oval",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.face_oval"
+ },
+ "link": 160
+ },
+ {
+ "localized_name": "regions.lips",
+ "name": "regions.lips",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.lips"
+ },
+ "link": 161
+ },
+ {
+ "localized_name": "regions.left_eye",
+ "name": "regions.left_eye",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.left_eye"
+ },
+ "link": 162
+ },
+ {
+ "localized_name": "regions.right_eye",
+ "name": "regions.right_eye",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.right_eye"
+ },
+ "link": 163
+ },
+ {
+ "localized_name": "regions.irises",
+ "name": "regions.irises",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "regions.irises"
+ },
+ "link": 164
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MASK",
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 83
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MediaPipeFaceMask",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0"
+ },
+ "widgets_values": [
+ "custom",
+ true,
+ false,
+ false,
+ false,
+ false
+ ]
+ },
+ {
+ "id": 160,
+ "type": "GetVideoComponents",
+ "pos": [
+ -420,
+ 4360
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 152
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 149
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": null
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0"
+ }
+ },
+ {
+ "id": 164,
+ "type": "Video Slice",
+ "pos": [
+ -780,
+ 4330
+ ],
+ "size": [
+ 270,
+ 170
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 150
+ },
+ {
+ "localized_name": "start_time",
+ "name": "start_time",
+ "type": "FLOAT",
+ "widget": {
+ "name": "start_time"
+ },
+ "link": 155
+ },
+ {
+ "localized_name": "duration",
+ "name": "duration",
+ "type": "FLOAT",
+ "widget": {
+ "name": "duration"
+ },
+ "link": 156
+ },
+ {
+ "localized_name": "strict_duration",
+ "name": "strict_duration",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "strict_duration"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 151
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "Video Slice",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0"
+ },
+ "widgets_values": [
+ 0,
+ 0,
+ false
+ ]
+ },
+ {
+ "id": 165,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -420,
+ 4590
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 153
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 151
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 154
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 152
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "cnr_id": "comfy-core",
+ "ver": "0.22.0"
+ },
+ "widgets_values": [
+ false
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 66,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 0,
+ "type": "FACE_DETECTION_MODEL"
+ },
+ {
+ "id": 46,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": 20,
+ "target_slot": 0,
+ "type": "FACE_LANDMARKS"
+ },
+ {
+ "id": 83,
+ "origin_id": 20,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 149,
+ "origin_id": 160,
+ "origin_slot": 0,
+ "target_id": 11,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 150,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 164,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 151,
+ "origin_id": 164,
+ "origin_slot": 0,
+ "target_id": 165,
+ "target_slot": 1,
+ "type": "VIDEO"
+ },
+ {
+ "id": 152,
+ "origin_id": 165,
+ "origin_slot": 0,
+ "target_id": 160,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 153,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 165,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 154,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 165,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 155,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 164,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 156,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 164,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 157,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 11,
+ "target_slot": 6,
+ "type": "FACE_LANDMARKER"
+ },
+ {
+ "id": 158,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 11,
+ "target_slot": 2,
+ "type": "COMBO"
+ },
+ {
+ "id": 159,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 11,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 160,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 20,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 161,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 20,
+ "target_slot": 3,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 162,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 20,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 163,
+ "origin_id": -10,
+ "origin_slot": 10,
+ "target_id": 20,
+ "target_slot": 5,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 164,
+ "origin_id": -10,
+ "origin_slot": 11,
+ "target_id": 20,
+ "target_slot": 6,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 165,
+ "origin_id": -10,
+ "origin_slot": 12,
+ "target_id": 2,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 166,
+ "origin_id": 11,
+ "origin_slot": 1,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 167,
+ "origin_id": 11,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "FACE_LANDMARKS"
+ }
+ ],
+ "extra": {},
+ "category": "Conditioning & Preprocessors/Face Detection"
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Video Inpaint (VOID).json b/blueprints/Video Inpaint (VOID).json
new file mode 100644
index 000000000..a7cc806b5
--- /dev/null
+++ b/blueprints/Video Inpaint (VOID).json
@@ -0,0 +1,4340 @@
+{
+ "revision": 0,
+ "last_node_id": 167,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 167,
+ "type": "c3157b75-484a-459e-b8de-57823bef5130",
+ "pos": [
+ -430,
+ 690
+ ],
+ "size": [
+ 590,
+ 723.9375
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "Source video",
+ "localized_name": "source_video",
+ "name": "source_video",
+ "type": "VIDEO",
+ "link": null
+ },
+ {
+ "label": "Positive prompt (inpaint fill)",
+ "localized_name": "positive_prompt",
+ "name": "positive_prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "positive_prompt"
+ },
+ "link": null
+ },
+ {
+ "label": "Negative prompt",
+ "localized_name": "negative_prompt",
+ "name": "negative_prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "negative_prompt"
+ },
+ "link": null
+ },
+ {
+ "label": "SAM3 object mask prompt",
+ "localized_name": "sam3_text_prompt",
+ "name": "sam3_text_prompt",
+ "type": "STRING",
+ "widget": {
+ "name": "sam3_text_prompt"
+ },
+ "link": null
+ },
+ {
+ "label": "Start frame index",
+ "localized_name": "start_frame_index",
+ "name": "start_frame_index",
+ "type": "INT",
+ "widget": {
+ "name": "start_frame_index"
+ },
+ "link": null
+ },
+ {
+ "label": "Clip duration (seconds)",
+ "localized_name": "duration_seconds",
+ "name": "duration_seconds",
+ "type": "INT",
+ "widget": {
+ "name": "duration_seconds"
+ },
+ "link": null
+ },
+ {
+ "label": "Width (pass 2)",
+ "localized_name": "latent_width",
+ "name": "latent_width",
+ "type": "INT",
+ "widget": {
+ "name": "latent_width"
+ },
+ "link": null
+ },
+ {
+ "label": "Height (pass 2)",
+ "localized_name": "latent_height",
+ "name": "latent_height",
+ "type": "INT",
+ "widget": {
+ "name": "latent_height"
+ },
+ "link": null
+ },
+ {
+ "label": "Skip pass 2 (reuse pass 1)",
+ "localized_name": "skip_pass_2",
+ "name": "skip_pass_2",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "skip_pass_2"
+ },
+ "link": null
+ },
+ {
+ "label": "Noise seed",
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": null
+ },
+ {
+ "label": "SAM3 checkpoint",
+ "localized_name": "sam3_checkpoint",
+ "name": "sam3_checkpoint",
+ "type": "COMBO",
+ "widget": {
+ "name": "sam3_checkpoint"
+ },
+ "link": null
+ },
+ {
+ "label": "VOID UNet — pass 1",
+ "localized_name": "void_unet_pass1",
+ "name": "void_unet_pass1",
+ "type": "COMBO",
+ "widget": {
+ "name": "void_unet_pass1"
+ },
+ "link": null
+ },
+ {
+ "label": "VOID UNet — pass 2",
+ "localized_name": "void_unet_pass2",
+ "name": "void_unet_pass2",
+ "type": "COMBO",
+ "widget": {
+ "name": "void_unet_pass2"
+ },
+ "link": null
+ },
+ {
+ "label": "Optical flow model",
+ "localized_name": "optical_flow_model",
+ "name": "optical_flow_model",
+ "type": "COMBO",
+ "widget": {
+ "name": "optical_flow_model"
+ },
+ "link": null
+ },
+ {
+ "label": "CLIP / T5 weights",
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "label": "VAE weights",
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "label": "Pass 1 (intermediate)",
+ "localized_name": "pass_1_video",
+ "name": "pass_1_video",
+ "type": "VIDEO",
+ "links": []
+ },
+ {
+ "label": "Pass 2 (final)",
+ "localized_name": "final_pass_2_video",
+ "name": "final_pass_2_video",
+ "type": "VIDEO",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "6",
+ "text"
+ ],
+ [
+ "7",
+ "text"
+ ],
+ [
+ "149",
+ "text"
+ ],
+ [
+ "168",
+ "value"
+ ],
+ [
+ "163",
+ "value"
+ ],
+ [
+ "147",
+ "value"
+ ],
+ [
+ "148",
+ "value"
+ ],
+ [
+ "153",
+ "value"
+ ],
+ [
+ "141",
+ "noise_seed"
+ ],
+ [
+ "149",
+ "ckpt_name"
+ ],
+ [
+ "144",
+ "unet_name"
+ ],
+ [
+ "143",
+ "unet_name"
+ ],
+ [
+ "142",
+ "model_name"
+ ],
+ [
+ "2",
+ "clip_name"
+ ],
+ [
+ "3",
+ "vae_name"
+ ]
+ ]
+ },
+ "widgets_values": [],
+ "title": "Video Inpaint (VOID)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "c3157b75-484a-459e-b8de-57823bef5130",
+ "version": 1,
+ "state": {
+ "lastGroupId": 13,
+ "lastNodeId": 171,
+ "lastLinkId": 406,
+ "lastRerouteId": 0
+ },
+ "revision": 5,
+ "config": {},
+ "name": "Video Inpaint (VOID)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -1530,
+ 800,
+ 203.1796875,
+ 368
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 2030,
+ 710,
+ 166.130859375,
+ 88
+ ]
+ },
+ "inputs": [
+ {
+ "id": "1865ea29-14b1-4471-b5e0-d35bba595b9c",
+ "name": "source_video",
+ "type": "VIDEO",
+ "linkIds": [
+ 373
+ ],
+ "localized_name": "source_video",
+ "label": "Source video",
+ "pos": [
+ -1350.8203125,
+ 824
+ ]
+ },
+ {
+ "id": "f1b2b2c4-bc2e-4e72-b16c-7e560e58d2d6",
+ "name": "positive_prompt",
+ "type": "STRING",
+ "linkIds": [
+ 377
+ ],
+ "localized_name": "positive_prompt",
+ "label": "Positive prompt (inpaint fill)",
+ "pos": [
+ -1350.8203125,
+ 844
+ ]
+ },
+ {
+ "id": "931ac4dd-3cb6-4555-a1f0-619be81d64f6",
+ "name": "negative_prompt",
+ "type": "STRING",
+ "linkIds": [
+ 387
+ ],
+ "localized_name": "negative_prompt",
+ "label": "Negative prompt",
+ "pos": [
+ -1350.8203125,
+ 864
+ ]
+ },
+ {
+ "id": "7a0963c3-bf2f-464d-80c2-6a6c90569883",
+ "name": "sam3_text_prompt",
+ "type": "STRING",
+ "linkIds": [
+ 388
+ ],
+ "localized_name": "sam3_text_prompt",
+ "label": "SAM3 object mask prompt",
+ "pos": [
+ -1350.8203125,
+ 884
+ ]
+ },
+ {
+ "id": "f53f340f-2031-401d-b613-157622ef336f",
+ "name": "start_frame_index",
+ "type": "INT",
+ "linkIds": [
+ 389
+ ],
+ "localized_name": "start_frame_index",
+ "label": "Start frame index",
+ "pos": [
+ -1350.8203125,
+ 904
+ ]
+ },
+ {
+ "id": "d5b8704b-7c8c-4cf0-87cd-26b293f65f83",
+ "name": "duration_seconds",
+ "type": "INT",
+ "linkIds": [
+ 390
+ ],
+ "localized_name": "duration_seconds",
+ "label": "Clip duration (seconds)",
+ "pos": [
+ -1350.8203125,
+ 924
+ ]
+ },
+ {
+ "id": "7140209f-5058-4933-ae06-438256f77f23",
+ "name": "latent_width",
+ "type": "INT",
+ "linkIds": [
+ 391
+ ],
+ "localized_name": "latent_width",
+ "label": "Width (pass 2)",
+ "pos": [
+ -1350.8203125,
+ 944
+ ]
+ },
+ {
+ "id": "084a140a-6fa9-4676-9483-ad30e0b14947",
+ "name": "latent_height",
+ "type": "INT",
+ "linkIds": [
+ 392
+ ],
+ "localized_name": "latent_height",
+ "label": "Height (pass 2)",
+ "pos": [
+ -1350.8203125,
+ 964
+ ]
+ },
+ {
+ "id": "a8109321-e101-4ed8-b6f3-8ad1c815f35c",
+ "name": "skip_pass_2",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 393
+ ],
+ "localized_name": "skip_pass_2",
+ "label": "Skip pass 2 (reuse pass 1)",
+ "pos": [
+ -1350.8203125,
+ 984
+ ]
+ },
+ {
+ "id": "6964ab42-0662-47f2-9c2a-96782fdcb883",
+ "name": "noise_seed",
+ "type": "INT",
+ "linkIds": [
+ 400
+ ],
+ "localized_name": "noise_seed",
+ "label": "Noise seed",
+ "pos": [
+ -1350.8203125,
+ 1004
+ ]
+ },
+ {
+ "id": "dccde360-461d-417e-b3f5-e1a4d6cece39",
+ "name": "sam3_checkpoint",
+ "type": "COMBO",
+ "linkIds": [
+ 401
+ ],
+ "localized_name": "sam3_checkpoint",
+ "label": "SAM3 checkpoint",
+ "pos": [
+ -1350.8203125,
+ 1024
+ ]
+ },
+ {
+ "id": "5ce0d036-be08-4539-9ec6-e923fcdb8825",
+ "name": "void_unet_pass1",
+ "type": "COMBO",
+ "linkIds": [
+ 402
+ ],
+ "localized_name": "void_unet_pass1",
+ "label": "VOID UNet — pass 1",
+ "pos": [
+ -1350.8203125,
+ 1044
+ ]
+ },
+ {
+ "id": "c1de695a-a08a-40bc-b9e4-d156fef73cd0",
+ "name": "void_unet_pass2",
+ "type": "COMBO",
+ "linkIds": [
+ 403
+ ],
+ "localized_name": "void_unet_pass2",
+ "label": "VOID UNet — pass 2",
+ "pos": [
+ -1350.8203125,
+ 1064
+ ]
+ },
+ {
+ "id": "99da50bc-db57-4a21-9831-0f77b3c4fe99",
+ "name": "optical_flow_model",
+ "type": "COMBO",
+ "linkIds": [
+ 404
+ ],
+ "localized_name": "optical_flow_model",
+ "label": "Optical flow model",
+ "pos": [
+ -1350.8203125,
+ 1084
+ ]
+ },
+ {
+ "id": "c756ce20-cfa6-4fe0-9eb0-543d56781cb7",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 405
+ ],
+ "localized_name": "clip_name",
+ "label": "CLIP / T5 weights",
+ "pos": [
+ -1350.8203125,
+ 1104
+ ]
+ },
+ {
+ "id": "d8eb12ad-a805-42d9-86b4-6f2c2cc5a231",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 406
+ ],
+ "localized_name": "vae_name",
+ "label": "VAE weights",
+ "pos": [
+ -1350.8203125,
+ 1124
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "a21e83df-8c95-43a3-bd73-feeea67e90cd",
+ "name": "pass_1_video",
+ "type": "VIDEO",
+ "linkIds": [
+ 77
+ ],
+ "localized_name": "pass_1_video",
+ "label": "Pass 1 (intermediate)",
+ "pos": [
+ 2054,
+ 734
+ ]
+ },
+ {
+ "id": "02c265f3-012f-499f-a4e8-a6d6aaf72885",
+ "name": "final_pass_2_video",
+ "type": "VIDEO",
+ "linkIds": [
+ 362
+ ],
+ "localized_name": "final_pass_2_video",
+ "label": "Pass 2 (final)",
+ "pos": [
+ 2054,
+ 754
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 2,
+ "type": "CLIPLoader",
+ "pos": [
+ -710,
+ 30
+ ],
+ "size": [
+ 320,
+ 150
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 405
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "slot_index": 0,
+ "links": [
+ 2,
+ 3
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "models": [
+ {
+ "name": "t5xxl_fp16.safetensors",
+ "url": "https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "t5xxl_fp16.safetensors",
+ "cogvideox",
+ "default"
+ ]
+ },
+ {
+ "id": 3,
+ "type": "VAELoader",
+ "pos": [
+ -710,
+ 220
+ ],
+ "size": [
+ 320,
+ 90
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 406
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 4,
+ 45,
+ 70
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAELoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "models": [
+ {
+ "name": "cogvideox_vae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/vae/cogvideox_vae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "cogvideox_vae.safetensors"
+ ]
+ },
+ {
+ "id": 7,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -260,
+ 200
+ ],
+ "size": [
+ 590,
+ 180
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 3
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 387
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 9
+ ]
+ }
+ ],
+ "title": "Negative Prompt",
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#223",
+ "bgcolor": "#335"
+ },
+ {
+ "id": 136,
+ "type": "CFGGuider",
+ "pos": [
+ 410,
+ 1640
+ ],
+ "size": [
+ 300,
+ 130
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 322
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 309
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 310
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "links": [
+ 311
+ ]
+ }
+ ],
+ "title": "CFGGuider (Pass 2 cfg=6)",
+ "properties": {
+ "Node name for S&R": "CFGGuider",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 6
+ ]
+ },
+ {
+ "id": 138,
+ "type": "BasicScheduler",
+ "pos": [
+ 410,
+ 160
+ ],
+ "size": [
+ 270,
+ 150
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 324
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 315
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "BasicScheduler",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "simple",
+ 30,
+ 1
+ ]
+ },
+ {
+ "id": 140,
+ "type": "CFGGuider",
+ "pos": [
+ 410,
+ -30
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 325
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 317
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 318
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "GUIDER",
+ "name": "GUIDER",
+ "type": "GUIDER",
+ "links": [
+ 319
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CFGGuider",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 6
+ ]
+ },
+ {
+ "id": 141,
+ "type": "RandomNoise",
+ "pos": [
+ 410,
+ -180
+ ],
+ "size": [
+ 270,
+ 90
+ ],
+ "flags": {},
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise_seed",
+ "name": "noise_seed",
+ "type": "INT",
+ "widget": {
+ "name": "noise_seed"
+ },
+ "link": 400
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "links": [
+ 320
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "RandomNoise",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 43,
+ "fixed"
+ ]
+ },
+ {
+ "id": 31,
+ "type": "VOIDWarpedNoise",
+ "pos": [
+ 410,
+ 1090
+ ],
+ "size": [
+ 300,
+ 200
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "optical_flow",
+ "name": "optical_flow",
+ "type": "OPTICAL_FLOW",
+ "link": 321
+ },
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "IMAGE",
+ "link": 72
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 333
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 335
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 67
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "warped_noise",
+ "name": "warped_noise",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 53
+ ]
+ }
+ ],
+ "title": "Warped Noise (from Pass 1 output)",
+ "properties": {
+ "Node name for S&R": "VOIDWarpedNoise",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 672,
+ 384,
+ 45,
+ 1
+ ]
+ },
+ {
+ "id": 35,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 870,
+ 1110
+ ],
+ "size": [
+ 250,
+ 170
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 54
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 311
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 305
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 313
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 48
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 49
+ ]
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "slot_index": 1,
+ "links": []
+ }
+ ],
+ "title": "Pass 2 Sample",
+ "properties": {
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 132,
+ "type": "MaskPreview",
+ "pos": [
+ 390,
+ 560
+ ],
+ "size": [
+ 790,
+ 430
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 4,
+ "inputs": [
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "link": 340
+ }
+ ],
+ "outputs": [],
+ "properties": {
+ "Node name for S&R": "MaskPreview",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 142,
+ "type": "OpticalFlowLoader",
+ "pos": [
+ -710,
+ 410
+ ],
+ "size": [
+ 320,
+ 90
+ ],
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model_name",
+ "name": "model_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "model_name"
+ },
+ "link": 404
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "OPTICAL_FLOW",
+ "name": "OPTICAL_FLOW",
+ "type": "OPTICAL_FLOW",
+ "links": [
+ 321
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "OpticalFlowLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "models": [
+ {
+ "name": "raft_large_C_T_SKHT_V2-ff5fadd5.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/optical_flow/raft_large_C_T_SKHT_V2-ff5fadd5.safetensors",
+ "directory": "optical_flow"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "raft_large_C_T_SKHT_V2-ff5fadd5.safetensors"
+ ]
+ },
+ {
+ "id": 10,
+ "type": "VOIDInpaintConditioning",
+ "pos": [
+ -110,
+ 430
+ ],
+ "size": [
+ 300,
+ 280
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 8
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 9
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 4
+ },
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "IMAGE",
+ "link": 326
+ },
+ {
+ "localized_name": "quadmask",
+ "name": "quadmask",
+ "type": "MASK",
+ "link": 339
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 332
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 334
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 63
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 309,
+ 317
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "slot_index": 1,
+ "links": [
+ 310,
+ 318
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "slot_index": 2,
+ "links": [
+ 48,
+ 82
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VOIDInpaintConditioning",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 672,
+ 384,
+ 45,
+ 1
+ ]
+ },
+ {
+ "id": 32,
+ "type": "VOIDWarpedNoiseSource",
+ "pos": [
+ 410,
+ 1350
+ ],
+ "size": [
+ 300,
+ 50
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "warped_noise",
+ "name": "warped_noise",
+ "type": "LATENT",
+ "link": 53
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "NOISE",
+ "name": "NOISE",
+ "type": "NOISE",
+ "slot_index": 0,
+ "links": [
+ 54
+ ]
+ }
+ ],
+ "title": "Warped Noise → NOISE",
+ "properties": {
+ "Node name for S&R": "VOIDWarpedNoiseSource",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 137,
+ "type": "BasicScheduler",
+ "pos": [
+ 410,
+ 1470
+ ],
+ "size": [
+ 300,
+ 150
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 323
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "SIGMAS",
+ "name": "SIGMAS",
+ "type": "SIGMAS",
+ "links": [
+ 313
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "BasicScheduler",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "simple",
+ 30,
+ 1
+ ]
+ },
+ {
+ "id": 134,
+ "type": "VOIDSampler",
+ "pos": [
+ 410,
+ 1800
+ ],
+ "size": [
+ 300,
+ 50
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 305
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VOIDSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 143,
+ "type": "UNETLoader",
+ "pos": [
+ -710,
+ 550
+ ],
+ "size": [
+ 320,
+ 120
+ ],
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 403
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 322,
+ 323
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "models": [
+ {
+ "name": "void_pass2.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/diffusion_models/void_pass2.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "void_pass2.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 144,
+ "type": "UNETLoader",
+ "pos": [
+ -720,
+ -150
+ ],
+ "size": [
+ 320,
+ 120
+ ],
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 402
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 324,
+ 325
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "models": [
+ {
+ "name": "void_pass1.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/void-model/resolve/main/diffusion_models/void_pass1.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "void_pass1.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 46,
+ "type": "CreateVideo",
+ "pos": [
+ 1230,
+ -20
+ ],
+ "size": [
+ 240,
+ 110
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 73
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 355
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 368
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 77
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CreateVideo",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 133,
+ "type": "VOIDSampler",
+ "pos": [
+ 410,
+ 370
+ ],
+ "size": [
+ 280,
+ 50
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [
+ {
+ "localized_name": "SAMPLER",
+ "name": "SAMPLER",
+ "type": "SAMPLER",
+ "links": [
+ 304
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VOIDSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 49,
+ "type": "SamplerCustomAdvanced",
+ "pos": [
+ 880,
+ -180
+ ],
+ "size": [
+ 250,
+ 270
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "noise",
+ "name": "noise",
+ "type": "NOISE",
+ "link": 320
+ },
+ {
+ "localized_name": "guider",
+ "name": "guider",
+ "type": "GUIDER",
+ "link": 319
+ },
+ {
+ "localized_name": "sampler",
+ "name": "sampler",
+ "type": "SAMPLER",
+ "link": 304
+ },
+ {
+ "localized_name": "sigmas",
+ "name": "sigmas",
+ "type": "SIGMAS",
+ "link": 315
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 82
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "LATENT",
+ "links": [
+ 83
+ ]
+ },
+ {
+ "localized_name": "denoised_output",
+ "name": "denoised_output",
+ "type": "LATENT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SamplerCustomAdvanced",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 45,
+ "type": "VAEDecode",
+ "pos": [
+ 1230,
+ -180
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 83
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 70
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 72,
+ 73,
+ 342
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 6,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -260,
+ -180
+ ],
+ "size": [
+ 580,
+ 310
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 2
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 377
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 8
+ ]
+ }
+ ],
+ "title": "Positive Prompt",
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 145,
+ "type": "ImageFromBatch",
+ "pos": [
+ -410,
+ 850
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 366
+ },
+ {
+ "localized_name": "batch_index",
+ "name": "batch_index",
+ "type": "INT",
+ "widget": {
+ "name": "batch_index"
+ },
+ "link": 384
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 361
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 326,
+ 327,
+ 336
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageFromBatch",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 197
+ ]
+ },
+ {
+ "id": 36,
+ "type": "VAEDecode",
+ "pos": [
+ 1220,
+ 1110
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 49
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 45
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 341
+ ]
+ }
+ ],
+ "title": "Pass 2 VAE Decode",
+ "properties": {
+ "Node name for S&R": "VAEDecode",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 149,
+ "type": "c3e0d783-9aa3-4e75-a94d-19937968ef86",
+ "pos": [
+ -20,
+ 840
+ ],
+ "size": [
+ 290,
+ 370
+ ],
+ "flags": {},
+ "order": 27,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 336
+ },
+ {
+ "label": "object",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 388
+ },
+ {
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": null
+ },
+ {
+ "name": "positive_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": null
+ },
+ {
+ "name": "negative_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": null
+ },
+ {
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": null
+ },
+ {
+ "name": "refine_iterations",
+ "type": "INT",
+ "widget": {
+ "name": "refine_iterations"
+ },
+ "link": null
+ },
+ {
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "individual_masks"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 401
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "masks",
+ "name": "masks",
+ "type": "MASK",
+ "links": [
+ 339,
+ 340
+ ]
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "78",
+ "text"
+ ],
+ [
+ "75",
+ "threshold"
+ ],
+ [
+ "75",
+ "refine_iterations"
+ ],
+ [
+ "75",
+ "individual_masks"
+ ],
+ [
+ "77",
+ "ckpt_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "text": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": []
+ },
+ {
+ "id": 43,
+ "type": "GetImageSize",
+ "pos": [
+ -410,
+ 1140
+ ],
+ "size": [
+ 230,
+ 160
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 327
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": null
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": [
+ 63,
+ 67
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize",
+ "cnr_id": "comfy-core",
+ "ver": "0.20.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 147,
+ "type": "PrimitiveInt",
+ "pos": [
+ -570,
+ 1660
+ ],
+ "size": [
+ 270,
+ 90
+ ],
+ "flags": {},
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 391
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 332,
+ 333
+ ]
+ }
+ ],
+ "title": "Int (Width)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 672,
+ "fixed"
+ ]
+ },
+ {
+ "id": 148,
+ "type": "PrimitiveInt",
+ "pos": [
+ -570,
+ 1790
+ ],
+ "size": [
+ 270,
+ 90
+ ],
+ "flags": {},
+ "order": 26,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 392
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 334,
+ 335
+ ]
+ }
+ ],
+ "title": "Int (Height)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 384,
+ "fixed"
+ ]
+ },
+ {
+ "id": 150,
+ "type": "ComfySwitchNode",
+ "pos": [
+ 1510,
+ 1080
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 28,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 342
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 341
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 346
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 363
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 153,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -580,
+ 1440
+ ],
+ "size": [
+ 270,
+ 80
+ ],
+ "flags": {},
+ "order": 29,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 393
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 346
+ ]
+ }
+ ],
+ "title": "Boolean (Skip Pass 2?)",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 158,
+ "type": "TrimAudioDuration",
+ "pos": [
+ -10,
+ 1580
+ ],
+ "size": [
+ 270,
+ 120
+ ],
+ "flags": {},
+ "order": 30,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "link": 367
+ },
+ {
+ "localized_name": "start_index",
+ "name": "start_index",
+ "type": "FLOAT",
+ "widget": {
+ "name": "start_index"
+ },
+ "link": 386
+ },
+ {
+ "localized_name": "duration",
+ "name": "duration",
+ "type": "FLOAT",
+ "widget": {
+ "name": "duration"
+ },
+ "link": 385
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "AUDIO",
+ "name": "AUDIO",
+ "type": "AUDIO",
+ "links": [
+ 355,
+ 364
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TrimAudioDuration",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 60
+ ]
+ },
+ {
+ "id": 163,
+ "type": "PrimitiveInt",
+ "pos": [
+ -740,
+ 1170
+ ],
+ "size": [
+ 230,
+ 90
+ ],
+ "flags": {},
+ "order": 31,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 390
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 360
+ ]
+ }
+ ],
+ "title": "Int (Video duration)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 5,
+ "fixed"
+ ]
+ },
+ {
+ "id": 164,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -740,
+ 1300
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 32,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 360
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 371
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 385
+ ]
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 361
+ ]
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a * b"
+ ]
+ },
+ {
+ "id": 165,
+ "type": "CreateVideo",
+ "pos": [
+ 1510,
+ 1270
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 33,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 363
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 364
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 372
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 362
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CreateVideo"
+ },
+ "widgets_values": [
+ 24
+ ]
+ },
+ {
+ "id": 166,
+ "type": "GetVideoComponents",
+ "pos": [
+ -740,
+ 840
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {},
+ "order": 34,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 373
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 366
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": [
+ 367
+ ]
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": [
+ 368,
+ 371,
+ 372,
+ 383
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents"
+ }
+ },
+ {
+ "id": 168,
+ "type": "PrimitiveInt",
+ "pos": [
+ -740,
+ 980
+ ],
+ "size": [
+ 230,
+ 90
+ ],
+ "flags": {},
+ "order": 35,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 389
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 382
+ ]
+ }
+ ],
+ "title": "Int (Index)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ "fixed"
+ ]
+ },
+ {
+ "id": 169,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -740,
+ 1110
+ ],
+ "size": [
+ 230,
+ 100
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 36,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 382
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 383
+ },
+ {
+ "label": "c",
+ "localized_name": "values.c",
+ "name": "values.c",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 386
+ ]
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 384
+ ]
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a * b"
+ ]
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Models",
+ "bounding": [
+ -790,
+ -260,
+ 470,
+ 990
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Input videos (place files in ComfyUI/input/)",
+ "bounding": [
+ -790,
+ 760,
+ 660,
+ 560
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Shared: Text & Mask Conditioning",
+ "bounding": [
+ -290,
+ -260,
+ 640,
+ 990
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 4,
+ "title": "Pass 1: Sample (Random Noise → DDIM)",
+ "bounding": [
+ 380,
+ -260,
+ 810,
+ 750
+ ],
+ "color": "#8A8",
+ "flags": {}
+ },
+ {
+ "id": 6,
+ "title": "Pass 2: Sample (Warped Noise → DDIM)",
+ "bounding": [
+ 380,
+ 1020,
+ 810,
+ 880
+ ],
+ "color": "#8A8",
+ "flags": {}
+ },
+ {
+ "id": 8,
+ "title": "Create Mask",
+ "bounding": [
+ -100,
+ 760,
+ 450,
+ 560
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 9,
+ "title": "Pass 1",
+ "bounding": [
+ -730,
+ -220,
+ 360,
+ 210
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 10,
+ "title": "Pass 2",
+ "bounding": [
+ -720,
+ 340,
+ 340,
+ 340
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 11,
+ "title": "Output Video Size",
+ "bounding": [
+ -790,
+ 1580,
+ 660,
+ 320
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 12,
+ "title": "Skip Pass 2",
+ "bounding": [
+ -790,
+ 1350,
+ 660,
+ 200
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 13,
+ "title": "Trim Audio",
+ "bounding": [
+ -100,
+ 1350,
+ 450,
+ 550
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 3,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": 7,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 322,
+ "origin_id": 143,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 309,
+ "origin_id": 10,
+ "origin_slot": 0,
+ "target_id": 136,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 310,
+ "origin_id": 10,
+ "origin_slot": 1,
+ "target_id": 136,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 324,
+ "origin_id": 144,
+ "origin_slot": 0,
+ "target_id": 138,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 325,
+ "origin_id": 144,
+ "origin_slot": 0,
+ "target_id": 140,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 317,
+ "origin_id": 10,
+ "origin_slot": 0,
+ "target_id": 140,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 318,
+ "origin_id": 10,
+ "origin_slot": 1,
+ "target_id": 140,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 321,
+ "origin_id": 142,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 0,
+ "type": "OPTICAL_FLOW"
+ },
+ {
+ "id": 72,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 333,
+ "origin_id": 147,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 335,
+ "origin_id": 148,
+ "origin_slot": 0,
+ "target_id": 31,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 67,
+ "origin_id": 43,
+ "origin_slot": 2,
+ "target_id": 31,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 54,
+ "origin_id": 32,
+ "origin_slot": 0,
+ "target_id": 35,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 311,
+ "origin_id": 136,
+ "origin_slot": 0,
+ "target_id": 35,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 305,
+ "origin_id": 134,
+ "origin_slot": 0,
+ "target_id": 35,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 313,
+ "origin_id": 137,
+ "origin_slot": 0,
+ "target_id": 35,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 48,
+ "origin_id": 10,
+ "origin_slot": 2,
+ "target_id": 35,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 340,
+ "origin_id": 149,
+ "origin_slot": 0,
+ "target_id": 132,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 8,
+ "origin_id": 6,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 9,
+ "origin_id": 7,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 4,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 326,
+ "origin_id": 145,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 339,
+ "origin_id": 149,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 4,
+ "type": "MASK"
+ },
+ {
+ "id": 332,
+ "origin_id": 147,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 334,
+ "origin_id": 148,
+ "origin_slot": 0,
+ "target_id": 10,
+ "target_slot": 6,
+ "type": "INT"
+ },
+ {
+ "id": 63,
+ "origin_id": 43,
+ "origin_slot": 2,
+ "target_id": 10,
+ "target_slot": 7,
+ "type": "INT"
+ },
+ {
+ "id": 53,
+ "origin_id": 31,
+ "origin_slot": 0,
+ "target_id": 32,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 323,
+ "origin_id": 143,
+ "origin_slot": 0,
+ "target_id": 137,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 73,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 46,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 355,
+ "origin_id": 158,
+ "origin_slot": 0,
+ "target_id": 46,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 368,
+ "origin_id": 166,
+ "origin_slot": 2,
+ "target_id": 46,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 320,
+ "origin_id": 141,
+ "origin_slot": 0,
+ "target_id": 49,
+ "target_slot": 0,
+ "type": "NOISE"
+ },
+ {
+ "id": 319,
+ "origin_id": 140,
+ "origin_slot": 0,
+ "target_id": 49,
+ "target_slot": 1,
+ "type": "GUIDER"
+ },
+ {
+ "id": 304,
+ "origin_id": 133,
+ "origin_slot": 0,
+ "target_id": 49,
+ "target_slot": 2,
+ "type": "SAMPLER"
+ },
+ {
+ "id": 315,
+ "origin_id": 138,
+ "origin_slot": 0,
+ "target_id": 49,
+ "target_slot": 3,
+ "type": "SIGMAS"
+ },
+ {
+ "id": 82,
+ "origin_id": 10,
+ "origin_slot": 2,
+ "target_id": 49,
+ "target_slot": 4,
+ "type": "LATENT"
+ },
+ {
+ "id": 83,
+ "origin_id": 49,
+ "origin_slot": 0,
+ "target_id": 45,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 70,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 45,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 2,
+ "origin_id": 2,
+ "origin_slot": 0,
+ "target_id": 6,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 366,
+ "origin_id": 166,
+ "origin_slot": 0,
+ "target_id": 145,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 361,
+ "origin_id": 164,
+ "origin_slot": 1,
+ "target_id": 145,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 49,
+ "origin_id": 35,
+ "origin_slot": 0,
+ "target_id": 36,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 45,
+ "origin_id": 3,
+ "origin_slot": 0,
+ "target_id": 36,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 336,
+ "origin_id": 145,
+ "origin_slot": 0,
+ "target_id": 149,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 327,
+ "origin_id": 145,
+ "origin_slot": 0,
+ "target_id": 43,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 342,
+ "origin_id": 45,
+ "origin_slot": 0,
+ "target_id": 150,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 341,
+ "origin_id": 36,
+ "origin_slot": 0,
+ "target_id": 150,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 346,
+ "origin_id": 153,
+ "origin_slot": 0,
+ "target_id": 150,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 367,
+ "origin_id": 166,
+ "origin_slot": 1,
+ "target_id": 158,
+ "target_slot": 0,
+ "type": "AUDIO"
+ },
+ {
+ "id": 360,
+ "origin_id": 163,
+ "origin_slot": 0,
+ "target_id": 164,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 371,
+ "origin_id": 166,
+ "origin_slot": 2,
+ "target_id": 164,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 363,
+ "origin_id": 150,
+ "origin_slot": 0,
+ "target_id": 165,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 364,
+ "origin_id": 158,
+ "origin_slot": 0,
+ "target_id": 165,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 372,
+ "origin_id": 166,
+ "origin_slot": 2,
+ "target_id": 165,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 373,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 166,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 77,
+ "origin_id": 46,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 362,
+ "origin_id": 165,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "VIDEO"
+ },
+ {
+ "id": 377,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 6,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 382,
+ "origin_id": 168,
+ "origin_slot": 0,
+ "target_id": 169,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 383,
+ "origin_id": 166,
+ "origin_slot": 2,
+ "target_id": 169,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 384,
+ "origin_id": 169,
+ "origin_slot": 1,
+ "target_id": 145,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 385,
+ "origin_id": 164,
+ "origin_slot": 0,
+ "target_id": 158,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 386,
+ "origin_id": 169,
+ "origin_slot": 0,
+ "target_id": 158,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 387,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 7,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 388,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 149,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 389,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 168,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 390,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 163,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 391,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 147,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 392,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 148,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 393,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 153,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 400,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 141,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 401,
+ "origin_id": -10,
+ "origin_slot": 10,
+ "target_id": 149,
+ "target_slot": 8,
+ "type": "COMBO"
+ },
+ {
+ "id": 402,
+ "origin_id": -10,
+ "origin_slot": 11,
+ "target_id": 144,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 403,
+ "origin_id": -10,
+ "origin_slot": 12,
+ "target_id": 143,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 404,
+ "origin_id": -10,
+ "origin_slot": 13,
+ "target_id": 142,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 405,
+ "origin_id": -10,
+ "origin_slot": 14,
+ "target_id": 2,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 406,
+ "origin_id": -10,
+ "origin_slot": 15,
+ "target_id": 3,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Video generation and editing/Inpaint video",
+ "description": "Removes objects from video by inpainting masked regions using VOID (CogVideoX), with SAM3 text-guided segmentation and optional two-pass optical-flow refinement."
+ },
+ {
+ "id": "c3e0d783-9aa3-4e75-a94d-19937968ef86",
+ "version": 1,
+ "state": {
+ "lastGroupId": 13,
+ "lastNodeId": 171,
+ "lastLinkId": 406,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Segmentation (SAM3)",
+ "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes.",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -2260,
+ -3450,
+ 144.369140625,
+ 228
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -1130,
+ -3305,
+ 128,
+ 88
+ ]
+ },
+ "inputs": [
+ {
+ "id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 264
+ ],
+ "localized_name": "image",
+ "label": "image",
+ "pos": [
+ -2139.630859375,
+ -3426
+ ]
+ },
+ {
+ "id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 265
+ ],
+ "label": "object",
+ "pos": [
+ -2139.630859375,
+ -3406
+ ]
+ },
+ {
+ "id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 266
+ ],
+ "pos": [
+ -2139.630859375,
+ -3386
+ ]
+ },
+ {
+ "id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899",
+ "name": "positive_coords",
+ "type": "STRING",
+ "linkIds": [
+ 267
+ ],
+ "pos": [
+ -2139.630859375,
+ -3366
+ ]
+ },
+ {
+ "id": "c65f8b87-9bd7-48be-9fc2-823431e95019",
+ "name": "negative_coords",
+ "type": "STRING",
+ "linkIds": [
+ 268
+ ],
+ "pos": [
+ -2139.630859375,
+ -3346
+ ]
+ },
+ {
+ "id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb",
+ "name": "threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 269
+ ],
+ "pos": [
+ -2139.630859375,
+ -3326
+ ]
+ },
+ {
+ "id": "b1439668-b050-490b-a5dc-fc4052c55666",
+ "name": "refine_iterations",
+ "type": "INT",
+ "linkIds": [
+ 270
+ ],
+ "pos": [
+ -2139.630859375,
+ -3306
+ ]
+ },
+ {
+ "id": "86e239e5-c098-4302-b54d-d42a38bc0f89",
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 271
+ ],
+ "pos": [
+ -2139.630859375,
+ -3286
+ ]
+ },
+ {
+ "id": "f9e0b9d4-b2f1-4907-a4a5-305656576706",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 272
+ ],
+ "pos": [
+ -2139.630859375,
+ -3266
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
+ "name": "masks",
+ "type": "MASK",
+ "linkIds": [
+ 231
+ ],
+ "localized_name": "masks",
+ "pos": [
+ -1106,
+ -3281
+ ]
+ },
+ {
+ "id": "8f622e40-8528-4078-b7d3-147e9f872194",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 232
+ ],
+ "localized_name": "bboxes",
+ "pos": [
+ -1106,
+ -3261
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 75,
+ "type": "SAM3_Detect",
+ "pos": [
+ -1470,
+ -3460
+ ],
+ "size": [
+ 270,
+ 260
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "model",
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 237
+ },
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 264
+ },
+ {
+ "label": "conditioning",
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "shape": 7,
+ "type": "CONDITIONING",
+ "link": 200
+ },
+ {
+ "label": "bboxes",
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": 266
+ },
+ {
+ "label": "positive_coords",
+ "localized_name": "positive_coords",
+ "name": "positive_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": 267
+ },
+ {
+ "label": "negative_coords",
+ "localized_name": "negative_coords",
+ "name": "negative_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": 268
+ },
+ {
+ "localized_name": "threshold",
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": 269
+ },
+ {
+ "localized_name": "refine_iterations",
+ "name": "refine_iterations",
+ "type": "INT",
+ "widget": {
+ "name": "refine_iterations"
+ },
+ "link": 270
+ },
+ {
+ "localized_name": "individual_masks",
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "individual_masks"
+ },
+ "link": 271
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "masks",
+ "name": "masks",
+ "type": "MASK",
+ "links": [
+ 231
+ ]
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 232
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SAM3_Detect",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 0.5,
+ 2,
+ false
+ ]
+ },
+ {
+ "id": 77,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -1970,
+ -3200
+ ],
+ "size": [
+ 330,
+ 140
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 272
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 237
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 240
+ ]
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "models": [
+ {
+ "name": "sam3.1_multiplex_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
+ "directory": "checkpoints"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "sam3.1_multiplex_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 78,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -2000,
+ -3000
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 240
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 265
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 200
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ ""
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 237,
+ "origin_id": 77,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 200,
+ "origin_id": 78,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 240,
+ "origin_id": 77,
+ "origin_slot": 1,
+ "target_id": 78,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 231,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 232,
+ "origin_id": 75,
+ "origin_slot": 1,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 264,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 265,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 78,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 266,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 75,
+ "target_slot": 3,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 267,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 75,
+ "target_slot": 4,
+ "type": "STRING"
+ },
+ {
+ "id": 268,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 75,
+ "target_slot": 5,
+ "type": "STRING"
+ },
+ {
+ "id": 269,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 75,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 270,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 75,
+ "target_slot": 7,
+ "type": "INT"
+ },
+ {
+ "id": 271,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 75,
+ "target_slot": 8,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 272,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 77,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "ue_links": []
+ }
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Video Inpainting (Wan2.1 VACE).json b/blueprints/Video Inpainting (Wan2.1 VACE).json
new file mode 100644
index 000000000..7460f3d44
--- /dev/null
+++ b/blueprints/Video Inpainting (Wan2.1 VACE).json
@@ -0,0 +1,4196 @@
+{
+ "revision": 0,
+ "last_node_id": 306,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 306,
+ "type": "bd7f73a0-ec67-4f46-8671-17088d8e31b7",
+ "pos": [
+ -2950,
+ -410
+ ],
+ "size": [
+ 440,
+ 650
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "source_video",
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": null
+ },
+ {
+ "label": "reference_image",
+ "name": "reference_image_1",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "label": "prompt",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "label": "width",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ },
+ {
+ "label": "height",
+ "name": "value_1",
+ "type": "INT",
+ "widget": {
+ "name": "value_1"
+ },
+ "link": null
+ },
+ {
+ "label": "frame_counts",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": null
+ },
+ {
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": null
+ },
+ {
+ "label": "wan_vace_model",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "label": "clip_model",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": null
+ },
+ {
+ "label": "vae_model",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": null
+ },
+ {
+ "label": "enable_turbo_mode",
+ "name": "value_2",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value_2"
+ },
+ "link": null
+ },
+ {
+ "label": "lightning_lora",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": null
+ },
+ {
+ "label": "sam3_mask_object",
+ "name": "text_1",
+ "type": "STRING",
+ "widget": {
+ "name": "text_1"
+ },
+ "link": null
+ },
+ {
+ "label": "mask_expand",
+ "name": "expand",
+ "type": "INT",
+ "widget": {
+ "name": "expand"
+ },
+ "link": null
+ },
+ {
+ "label": "sam3_model",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "280",
+ "text"
+ ],
+ [
+ "297",
+ "value"
+ ],
+ [
+ "290",
+ "value"
+ ],
+ [
+ "289",
+ "length"
+ ],
+ [
+ "288",
+ "seed"
+ ],
+ [
+ "299",
+ "unet_name"
+ ],
+ [
+ "277",
+ "clip_name"
+ ],
+ [
+ "278",
+ "vae_name"
+ ],
+ [
+ "300",
+ "value"
+ ],
+ [
+ "272",
+ "lora_name"
+ ],
+ [
+ "268",
+ "text"
+ ],
+ [
+ "269",
+ "expand"
+ ],
+ [
+ "268",
+ "ckpt_name"
+ ],
+ [
+ "312",
+ "$$canvas-image-preview"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Video Inpainting (Wan2.1 VACE)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "bd7f73a0-ec67-4f46-8671-17088d8e31b7",
+ "version": 1,
+ "state": {
+ "lastGroupId": 31,
+ "lastNodeId": 315,
+ "lastLinkId": 499,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Video Inpainting (Wan2.1 VACE)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -3450,
+ 3170,
+ 159.744140625,
+ 348
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ 900,
+ 2840,
+ 128,
+ 68
+ ]
+ },
+ "inputs": [
+ {
+ "id": "a636746e-5b9f-4b91-96f0-7f2657415b93",
+ "name": "video",
+ "type": "VIDEO",
+ "linkIds": [
+ 473
+ ],
+ "localized_name": "video",
+ "label": "source_video",
+ "pos": [
+ -3314.255859375,
+ 3194
+ ]
+ },
+ {
+ "id": "46275350-98b8-4d7c-8ca4-c452dc40a6bd",
+ "name": "reference_image_1",
+ "type": "IMAGE",
+ "linkIds": [
+ 478
+ ],
+ "label": "reference_image",
+ "pos": [
+ -3314.255859375,
+ 3214
+ ]
+ },
+ {
+ "id": "0f5bee71-3485-4e10-81a7-2b9f85851353",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 479
+ ],
+ "label": "prompt",
+ "pos": [
+ -3314.255859375,
+ 3234
+ ]
+ },
+ {
+ "id": "16675512-c229-43ed-944e-190a7f61b571",
+ "name": "value",
+ "type": "INT",
+ "linkIds": [
+ 480
+ ],
+ "label": "width",
+ "pos": [
+ -3314.255859375,
+ 3254
+ ]
+ },
+ {
+ "id": "84330129-a0c7-44cd-91fe-c033946749db",
+ "name": "value_1",
+ "type": "INT",
+ "linkIds": [
+ 481
+ ],
+ "label": "height",
+ "pos": [
+ -3314.255859375,
+ 3274
+ ]
+ },
+ {
+ "id": "3bd895e6-cba9-477b-bf6e-8c77dd56bb4a",
+ "name": "length",
+ "type": "INT",
+ "linkIds": [
+ 494
+ ],
+ "label": "frame_counts",
+ "pos": [
+ -3314.255859375,
+ 3294
+ ]
+ },
+ {
+ "id": "dbc2e9c5-f86a-48ba-874a-2991c75d1ae7",
+ "name": "seed",
+ "type": "INT",
+ "linkIds": [
+ 483
+ ],
+ "pos": [
+ -3314.255859375,
+ 3314
+ ]
+ },
+ {
+ "id": "572db94d-e64d-464f-bf3c-23a23aeb79f1",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 485
+ ],
+ "label": "wan_vace_model",
+ "pos": [
+ -3314.255859375,
+ 3334
+ ]
+ },
+ {
+ "id": "32185180-f627-47c2-971b-6ef3007e9455",
+ "name": "clip_name",
+ "type": "COMBO",
+ "linkIds": [
+ 486
+ ],
+ "label": "clip_model",
+ "pos": [
+ -3314.255859375,
+ 3354
+ ]
+ },
+ {
+ "id": "2af354d3-108a-42a9-acfc-7bad158715aa",
+ "name": "vae_name",
+ "type": "COMBO",
+ "linkIds": [
+ 487
+ ],
+ "label": "vae_model",
+ "pos": [
+ -3314.255859375,
+ 3374
+ ]
+ },
+ {
+ "id": "c9777a8c-267f-4c5e-b4d5-e9727d822e50",
+ "name": "value_2",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 489
+ ],
+ "label": "enable_turbo_mode",
+ "pos": [
+ -3314.255859375,
+ 3394
+ ]
+ },
+ {
+ "id": "84a258a3-4f25-4edb-9f50-6fcd8411394e",
+ "name": "lora_name",
+ "type": "COMBO",
+ "linkIds": [
+ 490
+ ],
+ "label": "lightning_lora",
+ "pos": [
+ -3314.255859375,
+ 3414
+ ]
+ },
+ {
+ "id": "9c5fb6f8-407b-4a13-94d8-cbbba546a082",
+ "name": "text_1",
+ "type": "STRING",
+ "linkIds": [
+ 491
+ ],
+ "label": "sam3_mask_object",
+ "pos": [
+ -3314.255859375,
+ 3434
+ ]
+ },
+ {
+ "id": "598323c9-2256-44bd-9745-492a74628300",
+ "name": "expand",
+ "type": "INT",
+ "linkIds": [
+ 496
+ ],
+ "label": "mask_expand",
+ "pos": [
+ -3314.255859375,
+ 3454
+ ]
+ },
+ {
+ "id": "856c1937-8caa-4d85-9d8a-6a900234d6d6",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 497
+ ],
+ "label": "sam3_model",
+ "pos": [
+ -3314.255859375,
+ 3474
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "be46c9d5-ced7-445b-996f-fff59d9b684d",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "linkIds": [
+ 474
+ ],
+ "localized_name": "VIDEO",
+ "pos": [
+ 924,
+ 2864
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 266,
+ "type": "ModelSamplingSD3",
+ "pos": [
+ -560,
+ 1940
+ ],
+ "size": [
+ 320,
+ 110
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 422
+ },
+ {
+ "localized_name": "shift",
+ "name": "shift",
+ "type": "FLOAT",
+ "widget": {
+ "name": "shift"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 454
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ModelSamplingSD3",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ 5
+ ]
+ },
+ {
+ "id": 267,
+ "type": "CreateVideo",
+ "pos": [
+ 530,
+ 2590
+ ],
+ "size": [
+ 310,
+ 130
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 423
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 424
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 425
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 474
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CreateVideo",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ 16
+ ]
+ },
+ {
+ "id": 268,
+ "type": "17df2eeb-d89e-46ee-9480-a4ca2494b207",
+ "pos": [
+ -1960,
+ 3220
+ ],
+ "size": [
+ 290,
+ 370
+ ],
+ "flags": {},
+ "order": 7,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 426
+ },
+ {
+ "label": "object",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 491
+ },
+ {
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": null
+ },
+ {
+ "name": "positive_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": null
+ },
+ {
+ "name": "negative_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": null
+ },
+ {
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": null
+ },
+ {
+ "name": "refine_iterations",
+ "type": "INT",
+ "widget": {
+ "name": "refine_iterations"
+ },
+ "link": null
+ },
+ {
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "individual_masks"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 497
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "masks",
+ "name": "masks",
+ "type": "MASK",
+ "links": [
+ 427
+ ]
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "237",
+ "text"
+ ],
+ [
+ "75",
+ "threshold"
+ ],
+ [
+ "75",
+ "refine_iterations"
+ ],
+ [
+ "75",
+ "individual_masks"
+ ],
+ [
+ "236",
+ "ckpt_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {
+ "text": true
+ },
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": []
+ },
+ {
+ "id": 269,
+ "type": "GrowMask",
+ "pos": [
+ -1530,
+ 3220
+ ],
+ "size": [
+ 270,
+ 140
+ ],
+ "flags": {},
+ "order": 8,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "link": 427
+ },
+ {
+ "localized_name": "expand",
+ "name": "expand",
+ "type": "INT",
+ "widget": {
+ "name": "expand"
+ },
+ "link": 496
+ },
+ {
+ "localized_name": "tapered_corners",
+ "name": "tapered_corners",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "tapered_corners"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MASK",
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 441,
+ 445,
+ 449,
+ 498
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GrowMask",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 20,
+ true
+ ]
+ },
+ {
+ "id": 270,
+ "type": "PrimitiveInt",
+ "pos": [
+ -1350,
+ 1980
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 466
+ ]
+ }
+ ],
+ "title": "Int (Steps)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 20,
+ "fixed"
+ ]
+ },
+ {
+ "id": 271,
+ "type": "PrimitiveFloat",
+ "pos": [
+ -1340,
+ 2140
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 432
+ ]
+ }
+ ],
+ "title": "Float (CFG)",
+ "properties": {
+ "Node name for S&R": "PrimitiveFloat",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 6
+ ]
+ },
+ {
+ "id": 272,
+ "type": "LoraLoaderModelOnly",
+ "pos": [
+ -1380,
+ 2390
+ ],
+ "size": [
+ 350,
+ 140
+ ],
+ "flags": {},
+ "order": 9,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 428
+ },
+ {
+ "localized_name": "lora_name",
+ "name": "lora_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "lora_name"
+ },
+ "link": 490
+ },
+ {
+ "localized_name": "strength_model",
+ "name": "strength_model",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength_model"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 430
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "LoraLoaderModelOnly",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "models": [
+ {
+ "name": "Wan21_CausVid_14B_T2V_lora_rank32.safetensors",
+ "url": "https://huggingface.co/Kijai/WanVideo_comfy/resolve/main/Wan21_CausVid_14B_T2V_lora_rank32.safetensors",
+ "directory": "loras"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "Wan21_CausVid_14B_T2V_lora_rank32.safetensors",
+ 0.30000000000000004
+ ]
+ },
+ {
+ "id": 273,
+ "type": "PrimitiveInt",
+ "pos": [
+ -1340,
+ 2600
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 467
+ ]
+ }
+ ],
+ "title": "Int (Steps)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 6,
+ "fixed"
+ ]
+ },
+ {
+ "id": 274,
+ "type": "PrimitiveFloat",
+ "pos": [
+ -1340,
+ 2760
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "FLOAT",
+ "widget": {
+ "name": "value"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": [
+ 433
+ ]
+ }
+ ],
+ "title": "Float (CFG)",
+ "properties": {
+ "Node name for S&R": "PrimitiveFloat",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 1
+ ]
+ },
+ {
+ "id": 275,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -960,
+ 2530
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 10,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 429
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 430
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 431
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 422
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 276,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -960,
+ 2340
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 11,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 432
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 433
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 434
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 459
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 277,
+ "type": "CLIPLoader",
+ "pos": [
+ -2710,
+ 2210
+ ],
+ "size": [
+ 360,
+ 170
+ ],
+ "flags": {},
+ "order": 12,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip_name",
+ "name": "clip_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "clip_name"
+ },
+ "link": 486
+ },
+ {
+ "localized_name": "type",
+ "name": "type",
+ "type": "COMBO",
+ "widget": {
+ "name": "type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "device",
+ "name": "device",
+ "shape": 7,
+ "type": "COMBO",
+ "widget": {
+ "name": "device"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "slot_index": 0,
+ "links": [
+ 435,
+ 436
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "models": [
+ {
+ "name": "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/text_encoders/umt5_xxl_fp8_e4m3fn_scaled.safetensors?download=true",
+ "directory": "text_encoders"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "umt5_xxl_fp8_e4m3fn_scaled.safetensors",
+ "wan",
+ "default"
+ ]
+ },
+ {
+ "id": 278,
+ "type": "VAELoader",
+ "pos": [
+ -2700,
+ 2500
+ ],
+ "size": [
+ 360,
+ 110
+ ],
+ "flags": {},
+ "order": 13,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "vae_name",
+ "name": "vae_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "vae_name"
+ },
+ "link": 487
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "slot_index": 0,
+ "links": [
+ 439,
+ 471
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAELoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "models": [
+ {
+ "name": "wan_2.1_vae.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/vae/wan_2.1_vae.safetensors",
+ "directory": "vae"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "wan_2.1_vae.safetensors"
+ ]
+ },
+ {
+ "id": 279,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -2280,
+ 2410
+ ],
+ "size": [
+ 430,
+ 190
+ ],
+ "flags": {},
+ "order": 14,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 435
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 438
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Negative Prompt)",
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走,过曝,"
+ ],
+ "color": "#223",
+ "bgcolor": "#335"
+ },
+ {
+ "id": 280,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -2270,
+ 1940
+ ],
+ "size": [
+ 420,
+ 420
+ ],
+ "flags": {},
+ "order": 15,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 436
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 479
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "slot_index": 0,
+ "links": [
+ 437
+ ]
+ }
+ ],
+ "title": "CLIP Text Encode (Positive Prompt)",
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ ""
+ ],
+ "color": "#232",
+ "bgcolor": "#353"
+ },
+ {
+ "id": 281,
+ "type": "WanVaceToVideo",
+ "pos": [
+ -1780,
+ 1940
+ ],
+ "size": [
+ 320,
+ 360
+ ],
+ "flags": {},
+ "order": 16,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 437
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 438
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 439
+ },
+ {
+ "localized_name": "control_video",
+ "name": "control_video",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 440
+ },
+ {
+ "localized_name": "control_masks",
+ "name": "control_masks",
+ "shape": 7,
+ "type": "MASK",
+ "link": 441
+ },
+ {
+ "localized_name": "reference_image",
+ "name": "reference_image",
+ "shape": 7,
+ "type": "IMAGE",
+ "link": 478
+ },
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "widget": {
+ "name": "width"
+ },
+ "link": 442
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "widget": {
+ "name": "height"
+ },
+ "link": 443
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 444
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "strength",
+ "name": "strength",
+ "type": "FLOAT",
+ "widget": {
+ "name": "strength"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "links": [
+ 455
+ ]
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "links": [
+ 456
+ ]
+ },
+ {
+ "localized_name": "latent",
+ "name": "latent",
+ "type": "LATENT",
+ "links": [
+ 457
+ ]
+ },
+ {
+ "localized_name": "trim_latent",
+ "name": "trim_latent",
+ "type": "INT",
+ "links": [
+ 453
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "WanVaceToVideo",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {
+ "width": true,
+ "height": true,
+ "length": true
+ }
+ },
+ "widgets_values": [
+ 720,
+ 720,
+ 81,
+ 1,
+ 1
+ ]
+ },
+ {
+ "id": 282,
+ "type": "InvertMask",
+ "pos": [
+ -1510,
+ 3410
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 17,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "link": 445
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MASK",
+ "name": "MASK",
+ "type": "MASK",
+ "links": [
+ 446
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "InvertMask",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 283,
+ "type": "MaskToImage",
+ "pos": [
+ -1510,
+ 3550
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 18,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "link": 446
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 448
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "MaskToImage",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 284,
+ "type": "ImageCompositeMasked",
+ "pos": [
+ -1210,
+ 3210
+ ],
+ "size": [
+ 230,
+ 220
+ ],
+ "flags": {},
+ "order": 19,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "destination",
+ "name": "destination",
+ "type": "IMAGE",
+ "link": 447
+ },
+ {
+ "localized_name": "source",
+ "name": "source",
+ "type": "IMAGE",
+ "link": 448
+ },
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "shape": 7,
+ "type": "MASK",
+ "link": 449
+ },
+ {
+ "localized_name": "x",
+ "name": "x",
+ "type": "INT",
+ "widget": {
+ "name": "x"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "y",
+ "name": "y",
+ "type": "INT",
+ "widget": {
+ "name": "y"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "resize_source",
+ "name": "resize_source",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "resize_source"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 440,
+ 499
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageCompositeMasked",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 0,
+ true
+ ]
+ },
+ {
+ "id": 287,
+ "type": "TrimVideoLatent",
+ "pos": [
+ -220,
+ 1950
+ ],
+ "size": [
+ 320,
+ 110
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 20,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 452
+ },
+ {
+ "localized_name": "trim_amount",
+ "name": "trim_amount",
+ "type": "INT",
+ "widget": {
+ "name": "trim_amount"
+ },
+ "link": 453
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "links": [
+ 470
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "TrimVideoLatent",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {
+ "trim_amount": true
+ }
+ },
+ "widgets_values": [
+ 0
+ ]
+ },
+ {
+ "id": 288,
+ "type": "KSampler",
+ "pos": [
+ -560,
+ 2120
+ ],
+ "size": [
+ 320,
+ 350
+ ],
+ "flags": {},
+ "order": 21,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 454
+ },
+ {
+ "localized_name": "positive",
+ "name": "positive",
+ "type": "CONDITIONING",
+ "link": 455
+ },
+ {
+ "localized_name": "negative",
+ "name": "negative",
+ "type": "CONDITIONING",
+ "link": 456
+ },
+ {
+ "localized_name": "latent_image",
+ "name": "latent_image",
+ "type": "LATENT",
+ "link": 457
+ },
+ {
+ "localized_name": "seed",
+ "name": "seed",
+ "type": "INT",
+ "widget": {
+ "name": "seed"
+ },
+ "link": 483
+ },
+ {
+ "localized_name": "steps",
+ "name": "steps",
+ "type": "INT",
+ "widget": {
+ "name": "steps"
+ },
+ "link": 458
+ },
+ {
+ "localized_name": "cfg",
+ "name": "cfg",
+ "type": "FLOAT",
+ "widget": {
+ "name": "cfg"
+ },
+ "link": 459
+ },
+ {
+ "localized_name": "sampler_name",
+ "name": "sampler_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "sampler_name"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scheduler",
+ "name": "scheduler",
+ "type": "COMBO",
+ "widget": {
+ "name": "scheduler"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "denoise",
+ "name": "denoise",
+ "type": "FLOAT",
+ "widget": {
+ "name": "denoise"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "LATENT",
+ "name": "LATENT",
+ "type": "LATENT",
+ "slot_index": 0,
+ "links": [
+ 452
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "KSampler",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ 832378512055965,
+ "fixed",
+ 4,
+ 1,
+ "uni_pc",
+ "simple",
+ 1
+ ]
+ },
+ {
+ "id": 289,
+ "type": "ImageFromBatch",
+ "pos": [
+ -2360,
+ 3410
+ ],
+ "size": [
+ 270,
+ 140
+ ],
+ "flags": {},
+ "order": 22,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 460
+ },
+ {
+ "localized_name": "batch_index",
+ "name": "batch_index",
+ "type": "INT",
+ "widget": {
+ "name": "batch_index"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": 494
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 463
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageFromBatch",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 81
+ ]
+ },
+ {
+ "id": 290,
+ "type": "PrimitiveInt",
+ "pos": [
+ -2690,
+ 3540
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 23,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 481
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 461
+ ]
+ }
+ ],
+ "title": "Int (Height)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 720,
+ "fixed"
+ ]
+ },
+ {
+ "id": 291,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -2650,
+ 3700
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 24,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 461
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": []
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 465
+ ]
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": []
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "floor(a/16)*16"
+ ]
+ },
+ {
+ "id": 292,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -2650,
+ 3500
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {
+ "collapsed": true
+ },
+ "order": 25,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": 462
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT,BOOLEAN",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": []
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 464
+ ]
+ },
+ {
+ "localized_name": "BOOL",
+ "name": "BOOL",
+ "type": "BOOLEAN",
+ "links": []
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "floor(a/16)*16"
+ ]
+ },
+ {
+ "id": 293,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ -2360,
+ 3590
+ ],
+ "size": [
+ 280,
+ 160
+ ],
+ "flags": {},
+ "order": 26,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 463
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "resize_type.width",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.width"
+ },
+ "link": 464
+ },
+ {
+ "localized_name": "height",
+ "name": "resize_type.height",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.height"
+ },
+ "link": 465
+ },
+ {
+ "localized_name": "crop",
+ "name": "resize_type.crop",
+ "type": "COMBO",
+ "widget": {
+ "name": "resize_type.crop"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 426,
+ 447,
+ 469
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ResizeImageMaskNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale dimensions",
+ 512,
+ 512,
+ "center",
+ "area"
+ ]
+ },
+ {
+ "id": 294,
+ "type": "ComfySwitchNode",
+ "pos": [
+ -960,
+ 2150
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 27,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "on_false",
+ "name": "on_false",
+ "type": "*",
+ "link": 466
+ },
+ {
+ "localized_name": "on_true",
+ "name": "on_true",
+ "type": "*",
+ "link": 467
+ },
+ {
+ "localized_name": "switch",
+ "name": "switch",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "switch"
+ },
+ "link": 468
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "output",
+ "name": "output",
+ "type": "*",
+ "links": [
+ 458
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfySwitchNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ false
+ ]
+ },
+ {
+ "id": 295,
+ "type": "GetImageSize",
+ "pos": [
+ -2010,
+ 2920
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 28,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 469
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 442
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 443
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": [
+ 444
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 296,
+ "type": "VAEDecode",
+ "pos": [
+ 520,
+ 2450
+ ],
+ "size": [
+ 320,
+ 100
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 29,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "samples",
+ "name": "samples",
+ "type": "LATENT",
+ "link": 470
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 471
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "slot_index": 0,
+ "links": [
+ 423
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "VAEDecode",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ }
+ },
+ {
+ "id": 297,
+ "type": "PrimitiveInt",
+ "pos": [
+ -2690,
+ 3350
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 30,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "INT",
+ "widget": {
+ "name": "value"
+ },
+ "link": 480
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 462
+ ]
+ }
+ ],
+ "title": "Int (Width)",
+ "properties": {
+ "Node name for S&R": "PrimitiveInt",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 720,
+ "fixed"
+ ]
+ },
+ {
+ "id": 298,
+ "type": "GetVideoComponents",
+ "pos": [
+ -2330,
+ 3210
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {
+ "collapsed": false
+ },
+ "order": 31,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 473
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 460
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": [
+ 424
+ ]
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": [
+ 425
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.40",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 299,
+ "type": "UNETLoader",
+ "pos": [
+ -2720,
+ 1980
+ ],
+ "size": [
+ 370,
+ 140
+ ],
+ "flags": {},
+ "order": 32,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 485
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "slot_index": 0,
+ "links": [
+ 428,
+ 429
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.3.34",
+ "models": [
+ {
+ "name": "wan2.1_vace_14B_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/Wan_2.1_ComfyUI_repackaged/resolve/main/split_files/diffusion_models/wan2.1_vace_14B_fp16.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "widget_ue_connectable": {}
+ },
+ "widgets_values": [
+ "wan2.1_vace_14B_fp16.safetensors",
+ "fp8_e4m3fn_fast"
+ ]
+ },
+ {
+ "id": 300,
+ "type": "PrimitiveBoolean",
+ "pos": [
+ -1390,
+ 2980
+ ],
+ "size": [
+ 270,
+ 100
+ ],
+ "flags": {},
+ "order": 33,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "value",
+ "name": "value",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "value"
+ },
+ "link": 489
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "BOOLEAN",
+ "name": "BOOLEAN",
+ "type": "BOOLEAN",
+ "links": [
+ 431,
+ 434,
+ 468
+ ]
+ }
+ ],
+ "title": "Boolean (Enable Lightning LoRA)",
+ "properties": {
+ "Node name for S&R": "PrimitiveBoolean",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ true
+ ]
+ },
+ {
+ "id": 308,
+ "type": "ImageFromBatch",
+ "pos": [
+ -2360,
+ 3410
+ ],
+ "size": [
+ 270,
+ 140
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": null
+ },
+ {
+ "localized_name": "batch_index",
+ "name": "batch_index",
+ "type": "INT",
+ "widget": {
+ "name": "batch_index"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "length",
+ "name": "length",
+ "type": "INT",
+ "widget": {
+ "name": "length"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ImageFromBatch",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0,
+ 1
+ ]
+ },
+ {
+ "id": 310,
+ "type": "MaskPreview",
+ "pos": [
+ -900,
+ 3230
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 34,
+ "mode": 4,
+ "inputs": [
+ {
+ "localized_name": "mask",
+ "name": "mask",
+ "type": "MASK",
+ "link": 498
+ }
+ ],
+ "outputs": [],
+ "properties": {
+ "Node name for S&R": "MaskPreview",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 312,
+ "type": "PreviewImage",
+ "pos": [
+ -520,
+ 3230
+ ],
+ "size": [
+ 230,
+ 80
+ ],
+ "flags": {},
+ "order": 35,
+ "mode": 4,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 499
+ }
+ ],
+ "outputs": [],
+ "properties": {
+ "Node name for S&R": "PreviewImage",
+ "cnr_id": "comfy-core",
+ "ver": "0.21.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ }
+ ],
+ "groups": [
+ {
+ "id": 1,
+ "title": "Models",
+ "bounding": [
+ -2750,
+ 1860,
+ 430,
+ 770
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 2,
+ "title": "Prompt",
+ "bounding": [
+ -2290,
+ 1860,
+ 460,
+ 770
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 3,
+ "title": "Sampling",
+ "bounding": [
+ -590,
+ 1860,
+ 700,
+ 620
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 20,
+ "title": "Create Video Mask",
+ "bounding": [
+ -2030,
+ 3110,
+ 440,
+ 550
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 23,
+ "title": "Conditioning",
+ "bounding": [
+ -1800,
+ 1860,
+ 370,
+ 450
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 26,
+ "title": "Apply Mask to Video",
+ "bounding": [
+ -1560,
+ 3110,
+ 1320,
+ 550
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 29,
+ "title": "Switch Logic",
+ "bounding": [
+ -1400,
+ 1860,
+ 780,
+ 1060
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 27,
+ "title": "Lightning LoRA",
+ "bounding": [
+ -1390,
+ 2290,
+ 370,
+ 620
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 28,
+ "title": "Original",
+ "bounding": [
+ -1390,
+ 1900,
+ 370,
+ 370
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 31,
+ "title": "Video Size Preprocessing",
+ "bounding": [
+ -2740,
+ 3110,
+ 680,
+ 770
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ },
+ {
+ "id": 30,
+ "title": "Size",
+ "bounding": [
+ -2710,
+ 3270,
+ 330,
+ 470
+ ],
+ "color": "#3f789e",
+ "flags": {}
+ }
+ ],
+ "links": [
+ {
+ "id": 422,
+ "origin_id": 275,
+ "origin_slot": 0,
+ "target_id": 266,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 423,
+ "origin_id": 296,
+ "origin_slot": 0,
+ "target_id": 267,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 424,
+ "origin_id": 298,
+ "origin_slot": 1,
+ "target_id": 267,
+ "target_slot": 1,
+ "type": "AUDIO"
+ },
+ {
+ "id": 425,
+ "origin_id": 298,
+ "origin_slot": 2,
+ "target_id": 267,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 426,
+ "origin_id": 293,
+ "origin_slot": 0,
+ "target_id": 268,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 427,
+ "origin_id": 268,
+ "origin_slot": 0,
+ "target_id": 269,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 428,
+ "origin_id": 299,
+ "origin_slot": 0,
+ "target_id": 272,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 429,
+ "origin_id": 299,
+ "origin_slot": 0,
+ "target_id": 275,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 430,
+ "origin_id": 272,
+ "origin_slot": 0,
+ "target_id": 275,
+ "target_slot": 1,
+ "type": "MODEL"
+ },
+ {
+ "id": 431,
+ "origin_id": 300,
+ "origin_slot": 0,
+ "target_id": 275,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 432,
+ "origin_id": 271,
+ "origin_slot": 0,
+ "target_id": 276,
+ "target_slot": 0,
+ "type": "FLOAT"
+ },
+ {
+ "id": 433,
+ "origin_id": 274,
+ "origin_slot": 0,
+ "target_id": 276,
+ "target_slot": 1,
+ "type": "FLOAT"
+ },
+ {
+ "id": 434,
+ "origin_id": 300,
+ "origin_slot": 0,
+ "target_id": 276,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 435,
+ "origin_id": 277,
+ "origin_slot": 0,
+ "target_id": 279,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 436,
+ "origin_id": 277,
+ "origin_slot": 0,
+ "target_id": 280,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 437,
+ "origin_id": 280,
+ "origin_slot": 0,
+ "target_id": 281,
+ "target_slot": 0,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 438,
+ "origin_id": 279,
+ "origin_slot": 0,
+ "target_id": 281,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 439,
+ "origin_id": 278,
+ "origin_slot": 0,
+ "target_id": 281,
+ "target_slot": 2,
+ "type": "VAE"
+ },
+ {
+ "id": 440,
+ "origin_id": 284,
+ "origin_slot": 0,
+ "target_id": 281,
+ "target_slot": 3,
+ "type": "IMAGE"
+ },
+ {
+ "id": 441,
+ "origin_id": 269,
+ "origin_slot": 0,
+ "target_id": 281,
+ "target_slot": 4,
+ "type": "MASK"
+ },
+ {
+ "id": 442,
+ "origin_id": 295,
+ "origin_slot": 0,
+ "target_id": 281,
+ "target_slot": 6,
+ "type": "INT"
+ },
+ {
+ "id": 443,
+ "origin_id": 295,
+ "origin_slot": 1,
+ "target_id": 281,
+ "target_slot": 7,
+ "type": "INT"
+ },
+ {
+ "id": 444,
+ "origin_id": 295,
+ "origin_slot": 2,
+ "target_id": 281,
+ "target_slot": 8,
+ "type": "INT"
+ },
+ {
+ "id": 445,
+ "origin_id": 269,
+ "origin_slot": 0,
+ "target_id": 282,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 446,
+ "origin_id": 282,
+ "origin_slot": 0,
+ "target_id": 283,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 447,
+ "origin_id": 293,
+ "origin_slot": 0,
+ "target_id": 284,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 448,
+ "origin_id": 283,
+ "origin_slot": 0,
+ "target_id": 284,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 449,
+ "origin_id": 269,
+ "origin_slot": 0,
+ "target_id": 284,
+ "target_slot": 2,
+ "type": "MASK"
+ },
+ {
+ "id": 452,
+ "origin_id": 288,
+ "origin_slot": 0,
+ "target_id": 287,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 453,
+ "origin_id": 281,
+ "origin_slot": 3,
+ "target_id": 287,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 454,
+ "origin_id": 266,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 455,
+ "origin_id": 281,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 1,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 456,
+ "origin_id": 281,
+ "origin_slot": 1,
+ "target_id": 288,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 457,
+ "origin_id": 281,
+ "origin_slot": 2,
+ "target_id": 288,
+ "target_slot": 3,
+ "type": "LATENT"
+ },
+ {
+ "id": 458,
+ "origin_id": 294,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 459,
+ "origin_id": 276,
+ "origin_slot": 0,
+ "target_id": 288,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 460,
+ "origin_id": 298,
+ "origin_slot": 0,
+ "target_id": 289,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 461,
+ "origin_id": 290,
+ "origin_slot": 0,
+ "target_id": 291,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 462,
+ "origin_id": 297,
+ "origin_slot": 0,
+ "target_id": 292,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 463,
+ "origin_id": 289,
+ "origin_slot": 0,
+ "target_id": 293,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 464,
+ "origin_id": 292,
+ "origin_slot": 1,
+ "target_id": 293,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 465,
+ "origin_id": 291,
+ "origin_slot": 1,
+ "target_id": 293,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 466,
+ "origin_id": 270,
+ "origin_slot": 0,
+ "target_id": 294,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 467,
+ "origin_id": 273,
+ "origin_slot": 0,
+ "target_id": 294,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 468,
+ "origin_id": 300,
+ "origin_slot": 0,
+ "target_id": 294,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 469,
+ "origin_id": 293,
+ "origin_slot": 0,
+ "target_id": 295,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 470,
+ "origin_id": 287,
+ "origin_slot": 0,
+ "target_id": 296,
+ "target_slot": 0,
+ "type": "LATENT"
+ },
+ {
+ "id": 471,
+ "origin_id": 278,
+ "origin_slot": 0,
+ "target_id": 296,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 473,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 298,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 474,
+ "origin_id": 267,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 478,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 281,
+ "target_slot": 5,
+ "type": "IMAGE"
+ },
+ {
+ "id": 479,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 280,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 480,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 297,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 481,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 290,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 494,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 289,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 483,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 288,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 485,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 299,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 486,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 277,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 487,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 278,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 489,
+ "origin_id": -10,
+ "origin_slot": 10,
+ "target_id": 300,
+ "target_slot": 0,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 490,
+ "origin_id": -10,
+ "origin_slot": 11,
+ "target_id": 272,
+ "target_slot": 1,
+ "type": "COMBO"
+ },
+ {
+ "id": 491,
+ "origin_id": -10,
+ "origin_slot": 12,
+ "target_id": 268,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 496,
+ "origin_id": -10,
+ "origin_slot": 13,
+ "target_id": 269,
+ "target_slot": 1,
+ "type": "INT"
+ },
+ {
+ "id": 497,
+ "origin_id": -10,
+ "origin_slot": 14,
+ "target_id": 268,
+ "target_slot": 8,
+ "type": "COMBO"
+ },
+ {
+ "id": 498,
+ "origin_id": 269,
+ "origin_slot": 0,
+ "target_id": 310,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 499,
+ "origin_id": 284,
+ "origin_slot": 0,
+ "target_id": 312,
+ "target_slot": 0,
+ "type": "IMAGE"
+ }
+ ],
+ "extra": {},
+ "category": "Video generation and editing/Inpaint video",
+ "description": "Removes objects from video by inpainting masked regions using Wan 2.1 VACE, with SAM3 text-guided segmentation and optional Lightning LoRA turbo mode."
+ },
+ {
+ "id": "17df2eeb-d89e-46ee-9480-a4ca2494b207",
+ "version": 1,
+ "state": {
+ "lastGroupId": 31,
+ "lastNodeId": 315,
+ "lastLinkId": 499,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Image Segmentation (SAM3)",
+ "description": "Segments images into masks using Meta SAM3 from text prompts, points, or boxes.",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -2260,
+ -3450,
+ 136.369140625,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -1130,
+ -3305,
+ 120,
+ 80
+ ]
+ },
+ "inputs": [
+ {
+ "id": "a6e75fa2-162a-4af0-a2fd-1e9c899a5ab6",
+ "name": "image",
+ "type": "IMAGE",
+ "linkIds": [
+ 264
+ ],
+ "localized_name": "image",
+ "label": "image",
+ "pos": [
+ -2143.630859375,
+ -3430
+ ]
+ },
+ {
+ "id": "3cefd304-7631-4ff6-a5a0-5a0ffb120745",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 265
+ ],
+ "label": "object",
+ "pos": [
+ -2143.630859375,
+ -3410
+ ]
+ },
+ {
+ "id": "1aec91c5-d8d2-441c-928c-49c14e7e80ed",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 266
+ ],
+ "pos": [
+ -2143.630859375,
+ -3390
+ ]
+ },
+ {
+ "id": "1ec7ce1a-8257-4719-8a81-60ebc8a98899",
+ "name": "positive_coords",
+ "type": "STRING",
+ "linkIds": [
+ 267
+ ],
+ "pos": [
+ -2143.630859375,
+ -3370
+ ]
+ },
+ {
+ "id": "c65f8b87-9bd7-48be-9fc2-823431e95019",
+ "name": "negative_coords",
+ "type": "STRING",
+ "linkIds": [
+ 268
+ ],
+ "pos": [
+ -2143.630859375,
+ -3350
+ ]
+ },
+ {
+ "id": "bb4ba35a-ccfe-4c37-98e5-d9b0d69585fb",
+ "name": "threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 269
+ ],
+ "pos": [
+ -2143.630859375,
+ -3330
+ ]
+ },
+ {
+ "id": "b1439668-b050-490b-a5dc-fc4052c55666",
+ "name": "refine_iterations",
+ "type": "INT",
+ "linkIds": [
+ 270
+ ],
+ "pos": [
+ -2143.630859375,
+ -3310
+ ]
+ },
+ {
+ "id": "86e239e5-c098-4302-b54d-d42a38bc0f89",
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 271
+ ],
+ "pos": [
+ -2143.630859375,
+ -3290
+ ]
+ },
+ {
+ "id": "f9e0b9d4-b2f1-4907-a4a5-305656576706",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 272
+ ],
+ "pos": [
+ -2143.630859375,
+ -3270
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
+ "name": "masks",
+ "type": "MASK",
+ "linkIds": [
+ 231
+ ],
+ "localized_name": "masks",
+ "pos": [
+ -1110,
+ -3285
+ ]
+ },
+ {
+ "id": "8f622e40-8528-4078-b7d3-147e9f872194",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 232
+ ],
+ "localized_name": "bboxes",
+ "pos": [
+ -1110,
+ -3265
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 75,
+ "type": "SAM3_Detect",
+ "pos": [
+ -1470,
+ -3460
+ ],
+ "size": [
+ 270,
+ 260
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "model",
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 237
+ },
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 264
+ },
+ {
+ "label": "conditioning",
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "shape": 7,
+ "type": "CONDITIONING",
+ "link": 200
+ },
+ {
+ "label": "bboxes",
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": 266
+ },
+ {
+ "label": "positive_coords",
+ "localized_name": "positive_coords",
+ "name": "positive_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": 267
+ },
+ {
+ "label": "negative_coords",
+ "localized_name": "negative_coords",
+ "name": "negative_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": 268
+ },
+ {
+ "localized_name": "threshold",
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": 269
+ },
+ {
+ "localized_name": "refine_iterations",
+ "name": "refine_iterations",
+ "type": "INT",
+ "widget": {
+ "name": "refine_iterations"
+ },
+ "link": 270
+ },
+ {
+ "localized_name": "individual_masks",
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "individual_masks"
+ },
+ "link": 271
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "masks",
+ "name": "masks",
+ "type": "MASK",
+ "links": [
+ 231
+ ]
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 232
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SAM3_Detect",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ 0.5,
+ 2,
+ false
+ ]
+ },
+ {
+ "id": 236,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -1970,
+ -3200
+ ],
+ "size": [
+ 330,
+ 140
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 272
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 237
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 240
+ ]
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "models": [
+ {
+ "name": "sam3.1_multiplex_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
+ "directory": "checkpoints"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ "sam3.1_multiplex_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 237,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -2000,
+ -3000
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 240
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 265
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 200
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "ue_properties": {
+ "widget_ue_connectable": {},
+ "version": "7.7",
+ "input_ue_unconnectable": {}
+ }
+ },
+ "widgets_values": [
+ ""
+ ]
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 237,
+ "origin_id": 236,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 200,
+ "origin_id": 237,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 240,
+ "origin_id": 236,
+ "origin_slot": 1,
+ "target_id": 237,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 231,
+ "origin_id": 75,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 232,
+ "origin_id": 75,
+ "origin_slot": 1,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 264,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 75,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 265,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 237,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 266,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 75,
+ "target_slot": 3,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 267,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 75,
+ "target_slot": 4,
+ "type": "STRING"
+ },
+ {
+ "id": 268,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 75,
+ "target_slot": 5,
+ "type": "STRING"
+ },
+ {
+ "id": 269,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 75,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 270,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 75,
+ "target_slot": 7,
+ "type": "INT"
+ },
+ {
+ "id": 271,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 75,
+ "target_slot": 8,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 272,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 236,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {
+ "ue_links": []
+ }
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Video Segmentation (SAM3).json b/blueprints/Video Segmentation (SAM3).json
new file mode 100644
index 000000000..4c7253869
--- /dev/null
+++ b/blueprints/Video Segmentation (SAM3).json
@@ -0,0 +1,827 @@
+{
+ "revision": 0,
+ "last_node_id": 130,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 130,
+ "type": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
+ "pos": [
+ -1210,
+ -2780
+ ],
+ "size": [
+ 300,
+ 370
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "name": "video",
+ "type": "VIDEO",
+ "link": null
+ },
+ {
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": null
+ },
+ {
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "link": null
+ },
+ {
+ "name": "positive_coords",
+ "type": "STRING",
+ "link": null
+ },
+ {
+ "name": "negative_coords",
+ "type": "STRING",
+ "link": null
+ },
+ {
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": null
+ },
+ {
+ "name": "refine_iterations",
+ "type": "INT",
+ "widget": {
+ "name": "refine_iterations"
+ },
+ "link": null
+ },
+ {
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "individual_masks"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "masks",
+ "name": "masks",
+ "type": "MASK",
+ "links": []
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": []
+ },
+ {
+ "name": "audio",
+ "type": "AUDIO",
+ "links": null
+ },
+ {
+ "name": "fps",
+ "type": "FLOAT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "125",
+ "text"
+ ],
+ [
+ "126",
+ "threshold"
+ ],
+ [
+ "126",
+ "refine_iterations"
+ ],
+ [
+ "126",
+ "individual_masks"
+ ],
+ [
+ "127",
+ "ckpt_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Video Segmentation (SAM3)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "7937cf78-b52b-40a3-93b2-b4e2e5f98df1",
+ "version": 1,
+ "state": {
+ "lastGroupId": 0,
+ "lastNodeId": 130,
+ "lastLinkId": 299,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Video Segmentation (SAM3)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -2260,
+ -3450,
+ 136.369140625,
+ 220
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -1050,
+ -3510,
+ 120,
+ 120
+ ]
+ },
+ "inputs": [
+ {
+ "id": "680ffd88-32fe-48be-88d6-91ea44d5eaee",
+ "name": "video",
+ "type": "VIDEO",
+ "linkIds": [
+ 252
+ ],
+ "pos": [
+ -2143.630859375,
+ -3430
+ ]
+ },
+ {
+ "id": "ceaf249c-32d7-4624-8bf6-e590e347ed90",
+ "name": "text",
+ "type": "STRING",
+ "linkIds": [
+ 254
+ ],
+ "pos": [
+ -2143.630859375,
+ -3410
+ ]
+ },
+ {
+ "id": "1ffbff36-da0c-4854-8cb4-88ad31e64f99",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 255
+ ],
+ "pos": [
+ -2143.630859375,
+ -3390
+ ]
+ },
+ {
+ "id": "67b7f4c7-cec0-4e00-b154-23cc1abf880e",
+ "name": "positive_coords",
+ "type": "STRING",
+ "linkIds": [
+ 256
+ ],
+ "pos": [
+ -2143.630859375,
+ -3370
+ ]
+ },
+ {
+ "id": "b090a498-2bde-46b9-9554-18501401d687",
+ "name": "negative_coords",
+ "type": "STRING",
+ "linkIds": [
+ 257
+ ],
+ "pos": [
+ -2143.630859375,
+ -3350
+ ]
+ },
+ {
+ "id": "1a76dfcf-ce95-46af-bba5-c42160c683dd",
+ "name": "threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 261
+ ],
+ "pos": [
+ -2143.630859375,
+ -3330
+ ]
+ },
+ {
+ "id": "999523fa-c476-4c53-80c3-0a2f554d18ab",
+ "name": "refine_iterations",
+ "type": "INT",
+ "linkIds": [
+ 262
+ ],
+ "pos": [
+ -2143.630859375,
+ -3310
+ ]
+ },
+ {
+ "id": "d2371011-7fe5-4a39-b0c1-df2e0bbd6ece",
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 263
+ ],
+ "pos": [
+ -2143.630859375,
+ -3290
+ ]
+ },
+ {
+ "id": "675a8b37-17db-48d1-853c-2fe5d6a74582",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 273
+ ],
+ "pos": [
+ -2143.630859375,
+ -3270
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "ff50da09-1e59-4a58-9b7f-be1a00aa5913",
+ "name": "masks",
+ "type": "MASK",
+ "linkIds": [
+ 231
+ ],
+ "localized_name": "masks",
+ "pos": [
+ -1030,
+ -3490
+ ]
+ },
+ {
+ "id": "8f622e40-8528-4078-b7d3-147e9f872194",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 232
+ ],
+ "localized_name": "bboxes",
+ "pos": [
+ -1030,
+ -3470
+ ]
+ },
+ {
+ "id": "6c9924ec-f0fa-4509-83ea-8f97f5889bcc",
+ "name": "audio",
+ "type": "AUDIO",
+ "linkIds": [
+ 259
+ ],
+ "pos": [
+ -1030,
+ -3450
+ ]
+ },
+ {
+ "id": "82c1cddc-ab11-44eb-9e2f-1a5c7ea5645b",
+ "name": "fps",
+ "type": "FLOAT",
+ "linkIds": [
+ 260
+ ],
+ "pos": [
+ -1030,
+ -3430
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 125,
+ "type": "CLIPTextEncode",
+ "pos": [
+ -2010,
+ -3040
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "clip",
+ "name": "clip",
+ "type": "CLIP",
+ "link": 240
+ },
+ {
+ "localized_name": "text",
+ "name": "text",
+ "type": "STRING",
+ "widget": {
+ "name": "text"
+ },
+ "link": 254
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "CONDITIONING",
+ "name": "CONDITIONING",
+ "type": "CONDITIONING",
+ "links": [
+ 200
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CLIPTextEncode",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ ""
+ ]
+ },
+ {
+ "id": 126,
+ "type": "SAM3_Detect",
+ "pos": [
+ -1520,
+ -3520
+ ],
+ "size": [
+ 270,
+ 290
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "model",
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 237
+ },
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 253
+ },
+ {
+ "label": "conditioning",
+ "localized_name": "conditioning",
+ "name": "conditioning",
+ "shape": 7,
+ "type": "CONDITIONING",
+ "link": 200
+ },
+ {
+ "label": "bboxes",
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": 255
+ },
+ {
+ "label": "positive_coords",
+ "localized_name": "positive_coords",
+ "name": "positive_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": 256
+ },
+ {
+ "label": "negative_coords",
+ "localized_name": "negative_coords",
+ "name": "negative_coords",
+ "shape": 7,
+ "type": "STRING",
+ "link": 257
+ },
+ {
+ "localized_name": "threshold",
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": 261
+ },
+ {
+ "localized_name": "refine_iterations",
+ "name": "refine_iterations",
+ "type": "INT",
+ "widget": {
+ "name": "refine_iterations"
+ },
+ "link": 262
+ },
+ {
+ "localized_name": "individual_masks",
+ "name": "individual_masks",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "individual_masks"
+ },
+ "link": 263
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "masks",
+ "name": "masks",
+ "type": "MASK",
+ "links": [
+ 231
+ ]
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 232
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SAM3_Detect",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0.5,
+ 2,
+ false
+ ]
+ },
+ {
+ "id": 127,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -1970,
+ -3310
+ ],
+ "size": [
+ 330,
+ 160
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 273
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 237
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": [
+ 240
+ ]
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65,
+ "models": [
+ {
+ "name": "sam3.1_multiplex_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/sam3.1/resolve/main/checkpoints/sam3.1_multiplex_fp16.safetensors",
+ "directory": "checkpoints"
+ }
+ ]
+ },
+ "widgets_values": [
+ "sam3.1_multiplex_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 128,
+ "type": "GetVideoComponents",
+ "pos": [
+ -1910,
+ -3540
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 252
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 253
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": [
+ 259
+ ]
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": [
+ 260
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents",
+ "cnr_id": "comfy-core",
+ "ver": "0.19.3",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ },
+ {
+ "id": 129,
+ "type": "Note",
+ "pos": [
+ -1980,
+ -2790
+ ],
+ "size": [
+ 370,
+ 250
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [],
+ "outputs": [],
+ "title": "Note: Prompt format",
+ "properties": {},
+ "widgets_values": [
+ "The maximum prompt length for this model is only 32 tokens. To prompt multiple subjects separately, separate the prompts with commas, and set the maximum number of objects detected for each prompt with :N\n\nFor example, the test prompt above finds 2 cakes, one apron, and 4 window panels"
+ ],
+ "color": "#432",
+ "bgcolor": "#653"
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 237,
+ "origin_id": 127,
+ "origin_slot": 0,
+ "target_id": 126,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 200,
+ "origin_id": 125,
+ "origin_slot": 0,
+ "target_id": 126,
+ "target_slot": 2,
+ "type": "CONDITIONING"
+ },
+ {
+ "id": 240,
+ "origin_id": 127,
+ "origin_slot": 1,
+ "target_id": 125,
+ "target_slot": 0,
+ "type": "CLIP"
+ },
+ {
+ "id": 231,
+ "origin_id": 126,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "MASK"
+ },
+ {
+ "id": 232,
+ "origin_id": 126,
+ "origin_slot": 1,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 252,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 128,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 253,
+ "origin_id": 128,
+ "origin_slot": 0,
+ "target_id": 126,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 254,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 125,
+ "target_slot": 1,
+ "type": "STRING"
+ },
+ {
+ "id": 255,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 126,
+ "target_slot": 3,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 256,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 126,
+ "target_slot": 4,
+ "type": "STRING"
+ },
+ {
+ "id": 257,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 126,
+ "target_slot": 5,
+ "type": "STRING"
+ },
+ {
+ "id": 259,
+ "origin_id": 128,
+ "origin_slot": 1,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "AUDIO"
+ },
+ {
+ "id": 260,
+ "origin_id": 128,
+ "origin_slot": 2,
+ "target_id": -20,
+ "target_slot": 3,
+ "type": "FLOAT"
+ },
+ {
+ "id": 261,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 126,
+ "target_slot": 6,
+ "type": "FLOAT"
+ },
+ {
+ "id": 262,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 126,
+ "target_slot": 7,
+ "type": "INT"
+ },
+ {
+ "id": 263,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 126,
+ "target_slot": 8,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 273,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 127,
+ "target_slot": 0,
+ "type": "COMBO"
+ }
+ ],
+ "extra": {},
+ "category": "Conditioning & Preprocessors/Segmentation & Mask",
+ "description": "Segments video into temporally consistent masks using Meta SAM3 from text or interactive prompts."
+ }
+ ]
+ },
+ "extra": {}
+}
diff --git a/blueprints/Video Stitch.json b/blueprints/Video Stitch.json
index 020896d78..2ac78b328 100644
--- a/blueprints/Video Stitch.json
+++ b/blueprints/Video Stitch.json
@@ -1,21 +1,21 @@
{
"revision": 0,
- "last_node_id": 84,
+ "last_node_id": 85,
"last_link_id": 0,
"nodes": [
{
- "id": 84,
- "type": "8e8aa94a-647e-436d-8440-8ee4691864de",
+ "id": 85,
+ "type": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
"pos": [
- -6100,
- 2620
+ -880,
+ -2260
],
"size": [
290,
160
],
"flags": {},
- "order": 0,
+ "order": 2,
"mode": 0,
"inputs": [
{
@@ -76,31 +76,26 @@
"properties": {
"proxyWidgets": [
[
- "-1",
+ "79",
"direction"
],
[
- "-1",
+ "79",
"match_image_size"
],
[
- "-1",
+ "79",
"spacing_width"
],
[
- "-1",
+ "79",
"spacing_color"
]
],
"cnr_id": "comfy-core",
"ver": "0.13.0"
},
- "widgets_values": [
- "right",
- true,
- 0,
- "white"
- ],
+ "widgets_values": [],
"title": "Video Stitch"
}
],
@@ -109,12 +104,12 @@
"definitions": {
"subgraphs": [
{
- "id": "8e8aa94a-647e-436d-8440-8ee4691864de",
+ "id": "637913e7-0206-46ba-8ded-70ae3a7c2e19",
"version": 1,
"state": {
"lastGroupId": 1,
- "lastNodeId": 84,
- "lastLinkId": 262,
+ "lastNodeId": 97,
+ "lastLinkId": 282,
"lastRerouteId": 0
},
"revision": 0,
@@ -123,8 +118,8 @@
"inputNode": {
"id": -10,
"bounding": [
- -6580,
- 2649,
+ -6810,
+ 2580,
143.55859375,
160
]
@@ -132,8 +127,8 @@
"outputNode": {
"id": -20,
"bounding": [
- -5720,
- 2659,
+ -4770,
+ 2600,
120,
60
]
@@ -149,8 +144,8 @@
"localized_name": "video",
"label": "Before Video",
"pos": [
- -6456.44140625,
- 2669
+ -6686.44140625,
+ 2600
]
},
{
@@ -163,8 +158,8 @@
"localized_name": "video_1",
"label": "After Video",
"pos": [
- -6456.44140625,
- 2689
+ -6686.44140625,
+ 2620
]
},
{
@@ -175,8 +170,8 @@
259
],
"pos": [
- -6456.44140625,
- 2709
+ -6686.44140625,
+ 2640
]
},
{
@@ -187,8 +182,8 @@
260
],
"pos": [
- -6456.44140625,
- 2729
+ -6686.44140625,
+ 2660
]
},
{
@@ -199,8 +194,8 @@
261
],
"pos": [
- -6456.44140625,
- 2749
+ -6686.44140625,
+ 2680
]
},
{
@@ -211,8 +206,8 @@
262
],
"pos": [
- -6456.44140625,
- 2769
+ -6686.44140625,
+ 2700
]
}
],
@@ -226,8 +221,8 @@
],
"localized_name": "VIDEO",
"pos": [
- -5700,
- 2679
+ -4750,
+ 2620
]
}
],
@@ -238,11 +233,11 @@
"type": "GetVideoComponents",
"pos": [
-6390,
- 2560
+ 2600
],
"size": [
- 193.530859375,
- 66
+ 230,
+ 120
],
"flags": {},
"order": 1,
@@ -278,9 +273,9 @@
}
],
"properties": {
+ "Node name for S&R": "GetVideoComponents",
"cnr_id": "comfy-core",
- "ver": "0.13.0",
- "Node name for S&R": "GetVideoComponents"
+ "ver": "0.13.0"
}
},
{
@@ -291,8 +286,8 @@
2420
],
"size": [
- 193.530859375,
- 66
+ 230,
+ 120
],
"flags": {},
"order": 0,
@@ -332,21 +327,254 @@
}
],
"properties": {
+ "Node name for S&R": "GetVideoComponents",
"cnr_id": "comfy-core",
- "ver": "0.13.0",
- "Node name for S&R": "GetVideoComponents"
+ "ver": "0.13.0"
}
},
+ {
+ "id": 90,
+ "type": "GetImageSize",
+ "pos": [
+ -6390,
+ 3030
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 266
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "width",
+ "name": "width",
+ "type": "INT",
+ "links": [
+ 274
+ ]
+ },
+ {
+ "localized_name": "height",
+ "name": "height",
+ "type": "INT",
+ "links": [
+ 276
+ ]
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "links": null
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetImageSize"
+ }
+ },
+ {
+ "id": 80,
+ "type": "CreateVideo",
+ "pos": [
+ -5190,
+ 2420
+ ],
+ "size": [
+ 270,
+ 130
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "link": 282
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "shape": 7,
+ "type": "AUDIO",
+ "link": 251
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "widget": {
+ "name": "fps"
+ },
+ "link": 252
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "VIDEO",
+ "name": "VIDEO",
+ "type": "VIDEO",
+ "links": [
+ 255
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CreateVideo",
+ "cnr_id": "comfy-core",
+ "ver": "0.13.0"
+ },
+ "widgets_values": [
+ 30
+ ]
+ },
+ {
+ "id": 95,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -6040,
+ 3020
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 274
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 279
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a & ~1"
+ ]
+ },
+ {
+ "id": 96,
+ "type": "ComfyMathExpression",
+ "pos": [
+ -6040,
+ 3290
+ ],
+ "size": [
+ 400,
+ 200
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "a",
+ "localized_name": "values.a",
+ "name": "values.a",
+ "type": "FLOAT,INT",
+ "link": 276
+ },
+ {
+ "label": "b",
+ "localized_name": "values.b",
+ "name": "values.b",
+ "shape": 7,
+ "type": "FLOAT,INT",
+ "link": null
+ },
+ {
+ "localized_name": "expression",
+ "name": "expression",
+ "type": "STRING",
+ "widget": {
+ "name": "expression"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "FLOAT",
+ "name": "FLOAT",
+ "type": "FLOAT",
+ "links": null
+ },
+ {
+ "localized_name": "INT",
+ "name": "INT",
+ "type": "INT",
+ "links": [
+ 280
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ComfyMathExpression"
+ },
+ "widgets_values": [
+ "a & ~1"
+ ]
+ },
{
"id": 79,
"type": "ImageStitch",
"pos": [
-6390,
- 2700
+ 2780
],
"size": [
270,
- 150
+ 160
],
"flags": {},
"order": 2,
@@ -408,14 +636,15 @@
"name": "IMAGE",
"type": "IMAGE",
"links": [
- 250
+ 266,
+ 281
]
}
],
"properties": {
+ "Node name for S&R": "ImageStitch",
"cnr_id": "comfy-core",
- "ver": "0.13.0",
- "Node name for S&R": "ImageStitch"
+ "ver": "0.13.0"
},
"widgets_values": [
"right",
@@ -425,60 +654,91 @@
]
},
{
- "id": 80,
- "type": "CreateVideo",
+ "id": 97,
+ "type": "ResizeImageMaskNode",
"pos": [
- -6040,
- 2610
+ -5560,
+ 2790
],
"size": [
270,
- 78
+ 160
],
"flags": {},
- "order": 3,
+ "order": 7,
"mode": 0,
"inputs": [
{
- "localized_name": "images",
- "name": "images",
- "type": "IMAGE",
- "link": 250
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 281
},
{
- "localized_name": "audio",
- "name": "audio",
- "shape": 7,
- "type": "AUDIO",
- "link": 251
- },
- {
- "localized_name": "fps",
- "name": "fps",
- "type": "FLOAT",
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
"widget": {
- "name": "fps"
+ "name": "resize_type"
},
- "link": 252
+ "link": null
+ },
+ {
+ "localized_name": "width",
+ "name": "resize_type.width",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.width"
+ },
+ "link": 279
+ },
+ {
+ "localized_name": "height",
+ "name": "resize_type.height",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.height"
+ },
+ "link": 280
+ },
+ {
+ "localized_name": "crop",
+ "name": "resize_type.crop",
+ "type": "COMBO",
+ "widget": {
+ "name": "resize_type.crop"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
}
],
"outputs": [
{
- "localized_name": "VIDEO",
- "name": "VIDEO",
- "type": "VIDEO",
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
"links": [
- 255
+ 282
]
}
],
"properties": {
- "cnr_id": "comfy-core",
- "ver": "0.13.0",
- "Node name for S&R": "CreateVideo"
+ "Node name for S&R": "ResizeImageMaskNode"
},
"widgets_values": [
- 30
+ "scale dimensions",
+ 512,
+ 512,
+ "center",
+ "area"
]
}
],
@@ -500,14 +760,6 @@
"target_slot": 1,
"type": "IMAGE"
},
- {
- "id": 250,
- "origin_id": 79,
- "origin_slot": 0,
- "target_id": 80,
- "target_slot": 0,
- "type": "IMAGE"
- },
{
"id": 251,
"origin_id": 77,
@@ -579,13 +831,71 @@
"target_id": 79,
"target_slot": 5,
"type": "COMBO"
+ },
+ {
+ "id": 266,
+ "origin_id": 79,
+ "origin_slot": 0,
+ "target_id": 90,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 274,
+ "origin_id": 90,
+ "origin_slot": 0,
+ "target_id": 95,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 276,
+ "origin_id": 90,
+ "origin_slot": 1,
+ "target_id": 96,
+ "target_slot": 0,
+ "type": "INT"
+ },
+ {
+ "id": 279,
+ "origin_id": 95,
+ "origin_slot": 1,
+ "target_id": 97,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 280,
+ "origin_id": 96,
+ "origin_slot": 1,
+ "target_id": 97,
+ "target_slot": 3,
+ "type": "INT"
+ },
+ {
+ "id": 281,
+ "origin_id": 79,
+ "origin_slot": 0,
+ "target_id": 97,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 282,
+ "origin_id": 97,
+ "origin_slot": 0,
+ "target_id": 80,
+ "target_slot": 0,
+ "type": "IMAGE"
}
],
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Video Tools/Stitch videos"
+ "category": "Video Tools/Stitch videos",
+ "description": "Stitches two videos frame by frame into a single combined video, side by side by default."
}
]
- }
-}
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/blueprints/Video Upscale(GAN x4).json b/blueprints/Video Upscale(GAN x4).json
index b61dc88d7..fc291ac41 100644
--- a/blueprints/Video Upscale(GAN x4).json
+++ b/blueprints/Video Upscale(GAN x4).json
@@ -412,9 +412,10 @@
"extra": {
"workflowRendererVersion": "LG"
},
- "category": "Video generation and editing/Enhance video"
+ "category": "Video generation and editing/Upscale",
+ "description": "Upscales video to 4× resolution using a GAN-based upscaling model."
}
]
},
"extra": {}
-}
+}
\ No newline at end of file
diff --git a/blueprints/Video to Pose Map (SDPose Multi-Person).json b/blueprints/Video to Pose Map (SDPose Multi-Person).json
new file mode 100644
index 000000000..64ef6e524
--- /dev/null
+++ b/blueprints/Video to Pose Map (SDPose Multi-Person).json
@@ -0,0 +1,1323 @@
+{
+ "revision": 0,
+ "last_node_id": 675,
+ "last_link_id": 0,
+ "nodes": [
+ {
+ "id": 675,
+ "type": "01b6a731-fb78-4070-9a38-c87146da9604",
+ "pos": [
+ -2480,
+ 3400
+ ],
+ "size": [
+ 370,
+ 638.625
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "resize_target_longer_size",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.longer_size"
+ },
+ "link": null
+ },
+ {
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_body"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_hands"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_face"
+ },
+ "link": null
+ },
+ {
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_feet"
+ },
+ "link": null
+ },
+ {
+ "name": "stick_width",
+ "type": "INT",
+ "widget": {
+ "name": "stick_width"
+ },
+ "link": null
+ },
+ {
+ "name": "face_point_size",
+ "type": "INT",
+ "widget": {
+ "name": "face_point_size"
+ },
+ "link": null
+ },
+ {
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "score_threshold"
+ },
+ "link": null
+ },
+ {
+ "label": "detect_threshold",
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": null
+ },
+ {
+ "label": "detect_class",
+ "name": "class_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "class_name"
+ },
+ "link": null
+ },
+ {
+ "name": "max_detections",
+ "type": "INT",
+ "widget": {
+ "name": "max_detections"
+ },
+ "link": null
+ },
+ {
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": null
+ },
+ {
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": null
+ },
+ {
+ "name": "video",
+ "type": "VIDEO",
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": []
+ },
+ {
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "links": null
+ },
+ {
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": []
+ },
+ {
+ "name": "audio",
+ "type": "AUDIO",
+ "links": []
+ },
+ {
+ "name": "fps",
+ "type": "FLOAT",
+ "links": []
+ }
+ ],
+ "properties": {
+ "proxyWidgets": [
+ [
+ "674",
+ "resize_type.longer_size"
+ ],
+ [
+ "674",
+ "scale_method"
+ ],
+ [
+ "672",
+ "draw_body"
+ ],
+ [
+ "672",
+ "draw_hands"
+ ],
+ [
+ "672",
+ "draw_face"
+ ],
+ [
+ "672",
+ "draw_feet"
+ ],
+ [
+ "672",
+ "stick_width"
+ ],
+ [
+ "672",
+ "face_point_size"
+ ],
+ [
+ "672",
+ "score_threshold"
+ ],
+ [
+ "678",
+ "threshold"
+ ],
+ [
+ "678",
+ "class_name"
+ ],
+ [
+ "678",
+ "max_detections"
+ ],
+ [
+ "673",
+ "ckpt_name"
+ ],
+ [
+ "677",
+ "unet_name"
+ ]
+ ],
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [],
+ "title": "Video to Pose Map (SDPose Multi-Person)"
+ }
+ ],
+ "links": [],
+ "version": 0.4,
+ "definitions": {
+ "subgraphs": [
+ {
+ "id": "01b6a731-fb78-4070-9a38-c87146da9604",
+ "version": 1,
+ "state": {
+ "lastGroupId": 2,
+ "lastNodeId": 699,
+ "lastLinkId": 1754,
+ "lastRerouteId": 0
+ },
+ "revision": 0,
+ "config": {},
+ "name": "Video to Pose Map (SDPose Multi-Person)",
+ "inputNode": {
+ "id": -10,
+ "bounding": [
+ -3570,
+ 3300,
+ 182.8984375,
+ 340
+ ]
+ },
+ "outputNode": {
+ "id": -20,
+ "bounding": [
+ -1890,
+ 3730,
+ 120,
+ 140
+ ]
+ },
+ "inputs": [
+ {
+ "id": "088eefc1-cd8a-4573-993f-9e4da008a12d",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "linkIds": [
+ 1704
+ ],
+ "label": "resize_target_longer_size",
+ "pos": [
+ -3407.1015625,
+ 3320
+ ]
+ },
+ {
+ "id": "b6449bd3-73d4-41c8-b81f-cf8d33f76a2e",
+ "name": "scale_method",
+ "type": "COMBO",
+ "linkIds": [
+ 1705
+ ],
+ "pos": [
+ -3407.1015625,
+ 3340
+ ]
+ },
+ {
+ "id": "4cff52ad-ed07-4c97-8803-fcbd89554fd0",
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1706
+ ],
+ "pos": [
+ -3407.1015625,
+ 3360
+ ]
+ },
+ {
+ "id": "7af63dce-f7df-4d7e-8215-d7c7f60bf81c",
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1707
+ ],
+ "pos": [
+ -3407.1015625,
+ 3380
+ ]
+ },
+ {
+ "id": "af3a9bce-61f9-4aca-b530-9f65e028b35e",
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1708
+ ],
+ "pos": [
+ -3407.1015625,
+ 3400
+ ]
+ },
+ {
+ "id": "4620f6a3-2c85-4b79-ad8f-35d0326b568f",
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "linkIds": [
+ 1709
+ ],
+ "pos": [
+ -3407.1015625,
+ 3420
+ ]
+ },
+ {
+ "id": "fee5d0c9-8d4b-4934-81d8-ba2206dc56cb",
+ "name": "stick_width",
+ "type": "INT",
+ "linkIds": [
+ 1710
+ ],
+ "pos": [
+ -3407.1015625,
+ 3440
+ ]
+ },
+ {
+ "id": "aafdd060-ba81-4324-a9cc-b656e1ebc133",
+ "name": "face_point_size",
+ "type": "INT",
+ "linkIds": [
+ 1711
+ ],
+ "pos": [
+ -3407.1015625,
+ 3460
+ ]
+ },
+ {
+ "id": "514c5503-f9e6-4d23-b1ae-1d3291acb2a3",
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 1712
+ ],
+ "pos": [
+ -3407.1015625,
+ 3480
+ ]
+ },
+ {
+ "id": "4eb3e4ea-7a36-4511-8483-0d12aadd32f7",
+ "name": "threshold",
+ "type": "FLOAT",
+ "linkIds": [
+ 1718
+ ],
+ "label": "detect_threshold",
+ "pos": [
+ -3407.1015625,
+ 3500
+ ]
+ },
+ {
+ "id": "c76a7a05-81e6-4b17-a9e0-85f47a5844f2",
+ "name": "class_name",
+ "type": "COMBO",
+ "linkIds": [
+ 1719
+ ],
+ "label": "detect_class",
+ "pos": [
+ -3407.1015625,
+ 3520
+ ]
+ },
+ {
+ "id": "4417e988-6e80-4236-be31-4c179037f5a2",
+ "name": "max_detections",
+ "type": "INT",
+ "linkIds": [
+ 1720
+ ],
+ "pos": [
+ -3407.1015625,
+ 3540
+ ]
+ },
+ {
+ "id": "7d7c4a0b-0d1b-4c98-942b-f90548d2a492",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "linkIds": [
+ 1721
+ ],
+ "pos": [
+ -3407.1015625,
+ 3560
+ ]
+ },
+ {
+ "id": "4d75122c-2c14-452a-98fe-d1545d3e012a",
+ "name": "unet_name",
+ "type": "COMBO",
+ "linkIds": [
+ 1722
+ ],
+ "pos": [
+ -3407.1015625,
+ 3580
+ ]
+ },
+ {
+ "id": "6c46c988-4dd1-41a2-957e-03caf60d7657",
+ "name": "video",
+ "type": "VIDEO",
+ "linkIds": [
+ 1741
+ ],
+ "pos": [
+ -3407.1015625,
+ 3600
+ ]
+ }
+ ],
+ "outputs": [
+ {
+ "id": "f05ed8cc-9403-4f14-8085-4364b06f8a48",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "linkIds": [
+ 1701
+ ],
+ "localized_name": "IMAGE",
+ "pos": [
+ -1870,
+ 3750
+ ]
+ },
+ {
+ "id": "4b64118e-3cef-4eeb-9dad-4cd09cfd63a2",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "linkIds": [
+ 1725
+ ],
+ "pos": [
+ -1870,
+ 3770
+ ]
+ },
+ {
+ "id": "a27f7e34-dcbc-4fb0-a4e1-2c5fc423ca5f",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "linkIds": [
+ 1726
+ ],
+ "pos": [
+ -1870,
+ 3790
+ ]
+ },
+ {
+ "id": "b7fe351d-2b38-41ea-9f4d-3be1a0aad275",
+ "name": "audio",
+ "type": "AUDIO",
+ "linkIds": [
+ 1743
+ ],
+ "pos": [
+ -1870,
+ 3810
+ ]
+ },
+ {
+ "id": "ae187b6f-c9ca-4487-b5c1-3ad775fe945e",
+ "name": "fps",
+ "type": "FLOAT",
+ "linkIds": [
+ 1744
+ ],
+ "pos": [
+ -1870,
+ 3830
+ ]
+ }
+ ],
+ "widgets": [],
+ "nodes": [
+ {
+ "id": 671,
+ "type": "SDPoseKeypointExtractor",
+ "pos": [
+ -2550,
+ 3080
+ ],
+ "size": [
+ 270,
+ 180
+ ],
+ "flags": {},
+ "order": 0,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 1696
+ },
+ {
+ "localized_name": "vae",
+ "name": "vae",
+ "type": "VAE",
+ "link": 1697
+ },
+ {
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 1698
+ },
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "shape": 7,
+ "type": "BOUNDING_BOX",
+ "link": 1717
+ },
+ {
+ "localized_name": "batch_size",
+ "name": "batch_size",
+ "type": "INT",
+ "widget": {
+ "name": "batch_size"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "keypoints",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "links": [
+ 1699,
+ 1725
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SDPoseKeypointExtractor",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 16
+ ]
+ },
+ {
+ "id": 674,
+ "type": "ResizeImageMaskNode",
+ "pos": [
+ -3010,
+ 3880
+ ],
+ "size": [
+ 270,
+ 110
+ ],
+ "flags": {},
+ "order": 3,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "input",
+ "name": "input",
+ "type": "IMAGE,MASK",
+ "link": 1742
+ },
+ {
+ "localized_name": "resize_type",
+ "name": "resize_type",
+ "type": "COMFY_DYNAMICCOMBO_V3",
+ "widget": {
+ "name": "resize_type"
+ },
+ "link": null
+ },
+ {
+ "localized_name": "resize_type.longer_size",
+ "name": "resize_type.longer_size",
+ "type": "INT",
+ "widget": {
+ "name": "resize_type.longer_size"
+ },
+ "link": 1704
+ },
+ {
+ "localized_name": "scale_method",
+ "name": "scale_method",
+ "type": "COMBO",
+ "widget": {
+ "name": "scale_method"
+ },
+ "link": 1705
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "resized",
+ "name": "resized",
+ "type": "*",
+ "links": [
+ 1698,
+ 1716
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "ResizeImageMaskNode",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "scale longer dimension",
+ 1024,
+ "lanczos"
+ ]
+ },
+ {
+ "id": 672,
+ "type": "SDPoseDrawKeypoints",
+ "pos": [
+ -2540,
+ 3590
+ ],
+ "size": [
+ 270,
+ 280
+ ],
+ "flags": {},
+ "order": 1,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "keypoints",
+ "name": "keypoints",
+ "type": "POSE_KEYPOINT",
+ "link": 1699
+ },
+ {
+ "localized_name": "draw_body",
+ "name": "draw_body",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_body"
+ },
+ "link": 1706
+ },
+ {
+ "localized_name": "draw_hands",
+ "name": "draw_hands",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_hands"
+ },
+ "link": 1707
+ },
+ {
+ "localized_name": "draw_face",
+ "name": "draw_face",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_face"
+ },
+ "link": 1708
+ },
+ {
+ "localized_name": "draw_feet",
+ "name": "draw_feet",
+ "type": "BOOLEAN",
+ "widget": {
+ "name": "draw_feet"
+ },
+ "link": 1709
+ },
+ {
+ "localized_name": "stick_width",
+ "name": "stick_width",
+ "type": "INT",
+ "widget": {
+ "name": "stick_width"
+ },
+ "link": 1710
+ },
+ {
+ "localized_name": "face_point_size",
+ "name": "face_point_size",
+ "type": "INT",
+ "widget": {
+ "name": "face_point_size"
+ },
+ "link": 1711
+ },
+ {
+ "localized_name": "score_threshold",
+ "name": "score_threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "score_threshold"
+ },
+ "link": 1712
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "IMAGE",
+ "name": "IMAGE",
+ "type": "IMAGE",
+ "links": [
+ 1701
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "SDPoseDrawKeypoints",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ true,
+ true,
+ true,
+ true,
+ 4,
+ 2,
+ 0.5
+ ]
+ },
+ {
+ "id": 673,
+ "type": "CheckpointLoaderSimple",
+ "pos": [
+ -3040,
+ 3080
+ ],
+ "size": [
+ 390,
+ 160
+ ],
+ "flags": {},
+ "order": 2,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "ckpt_name",
+ "name": "ckpt_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "ckpt_name"
+ },
+ "link": 1721
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 1696
+ ]
+ },
+ {
+ "localized_name": "CLIP",
+ "name": "CLIP",
+ "type": "CLIP",
+ "links": []
+ },
+ {
+ "localized_name": "VAE",
+ "name": "VAE",
+ "type": "VAE",
+ "links": [
+ 1697
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "CheckpointLoaderSimple",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.0",
+ "models": [
+ {
+ "name": "sdpose_wholebody_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/checkpoints/sdpose_wholebody_fp16.safetensors",
+ "directory": "checkpoints"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "sdpose_wholebody_fp16.safetensors"
+ ]
+ },
+ {
+ "id": 677,
+ "type": "UNETLoader",
+ "pos": [
+ -3030,
+ 3300
+ ],
+ "size": [
+ 370,
+ 110
+ ],
+ "flags": {},
+ "order": 4,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "unet_name",
+ "name": "unet_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "unet_name"
+ },
+ "link": 1722
+ },
+ {
+ "localized_name": "weight_dtype",
+ "name": "weight_dtype",
+ "type": "COMBO",
+ "widget": {
+ "name": "weight_dtype"
+ },
+ "link": null
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "MODEL",
+ "name": "MODEL",
+ "type": "MODEL",
+ "links": [
+ 1715
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "UNETLoader",
+ "cnr_id": "comfy-core",
+ "ver": "0.14.1",
+ "models": [
+ {
+ "name": "rt_detr_v4-x-hgnet_fp16.safetensors",
+ "url": "https://huggingface.co/Comfy-Org/SDPose/resolve/main/diffusion_models/rt_detr_v4-x-hgnet_fp16.safetensors",
+ "directory": "diffusion_models"
+ }
+ ],
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ "rt_detr_v4-x-hgnet_fp16.safetensors",
+ "default"
+ ]
+ },
+ {
+ "id": 678,
+ "type": "RTDETR_detect",
+ "pos": [
+ -2540,
+ 3320
+ ],
+ "size": [
+ 270,
+ 200
+ ],
+ "flags": {},
+ "order": 5,
+ "mode": 0,
+ "inputs": [
+ {
+ "label": "model",
+ "localized_name": "model",
+ "name": "model",
+ "type": "MODEL",
+ "link": 1715
+ },
+ {
+ "label": "image",
+ "localized_name": "image",
+ "name": "image",
+ "type": "IMAGE",
+ "link": 1716
+ },
+ {
+ "localized_name": "threshold",
+ "name": "threshold",
+ "type": "FLOAT",
+ "widget": {
+ "name": "threshold"
+ },
+ "link": 1718
+ },
+ {
+ "localized_name": "class_name",
+ "name": "class_name",
+ "type": "COMBO",
+ "widget": {
+ "name": "class_name"
+ },
+ "link": 1719
+ },
+ {
+ "localized_name": "max_detections",
+ "name": "max_detections",
+ "type": "INT",
+ "widget": {
+ "name": "max_detections"
+ },
+ "link": 1720
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "bboxes",
+ "name": "bboxes",
+ "type": "BOUNDING_BOX",
+ "links": [
+ 1717,
+ 1726
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "RTDETR_detect",
+ "cnr_id": "comfy-core",
+ "ver": "0.15.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ },
+ "widgets_values": [
+ 0.5,
+ "person",
+ 2
+ ]
+ },
+ {
+ "id": 692,
+ "type": "GetVideoComponents",
+ "pos": [
+ -3010,
+ 4100
+ ],
+ "size": [
+ 230,
+ 120
+ ],
+ "flags": {},
+ "order": 6,
+ "mode": 0,
+ "inputs": [
+ {
+ "localized_name": "video",
+ "name": "video",
+ "type": "VIDEO",
+ "link": 1741
+ }
+ ],
+ "outputs": [
+ {
+ "localized_name": "images",
+ "name": "images",
+ "type": "IMAGE",
+ "links": [
+ 1742
+ ]
+ },
+ {
+ "localized_name": "audio",
+ "name": "audio",
+ "type": "AUDIO",
+ "links": [
+ 1743
+ ]
+ },
+ {
+ "localized_name": "fps",
+ "name": "fps",
+ "type": "FLOAT",
+ "links": [
+ 1744
+ ]
+ }
+ ],
+ "properties": {
+ "Node name for S&R": "GetVideoComponents",
+ "cnr_id": "comfy-core",
+ "ver": "0.18.1",
+ "enableTabs": false,
+ "tabWidth": 65,
+ "tabXOffset": 10,
+ "hasSecondTab": false,
+ "secondTabText": "Send Back",
+ "secondTabOffset": 80,
+ "secondTabWidth": 65
+ }
+ }
+ ],
+ "groups": [],
+ "links": [
+ {
+ "id": 1696,
+ "origin_id": 673,
+ "origin_slot": 0,
+ "target_id": 671,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 1697,
+ "origin_id": 673,
+ "origin_slot": 2,
+ "target_id": 671,
+ "target_slot": 1,
+ "type": "VAE"
+ },
+ {
+ "id": 1698,
+ "origin_id": 674,
+ "origin_slot": 0,
+ "target_id": 671,
+ "target_slot": 2,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1699,
+ "origin_id": 671,
+ "origin_slot": 0,
+ "target_id": 672,
+ "target_slot": 0,
+ "type": "POSE_KEYPOINT"
+ },
+ {
+ "id": 1701,
+ "origin_id": 672,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1704,
+ "origin_id": -10,
+ "origin_slot": 0,
+ "target_id": 674,
+ "target_slot": 2,
+ "type": "INT"
+ },
+ {
+ "id": 1705,
+ "origin_id": -10,
+ "origin_slot": 1,
+ "target_id": 674,
+ "target_slot": 3,
+ "type": "COMBO"
+ },
+ {
+ "id": 1706,
+ "origin_id": -10,
+ "origin_slot": 2,
+ "target_id": 672,
+ "target_slot": 1,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1707,
+ "origin_id": -10,
+ "origin_slot": 3,
+ "target_id": 672,
+ "target_slot": 2,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1708,
+ "origin_id": -10,
+ "origin_slot": 4,
+ "target_id": 672,
+ "target_slot": 3,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1709,
+ "origin_id": -10,
+ "origin_slot": 5,
+ "target_id": 672,
+ "target_slot": 4,
+ "type": "BOOLEAN"
+ },
+ {
+ "id": 1710,
+ "origin_id": -10,
+ "origin_slot": 6,
+ "target_id": 672,
+ "target_slot": 5,
+ "type": "INT"
+ },
+ {
+ "id": 1711,
+ "origin_id": -10,
+ "origin_slot": 7,
+ "target_id": 672,
+ "target_slot": 6,
+ "type": "INT"
+ },
+ {
+ "id": 1712,
+ "origin_id": -10,
+ "origin_slot": 8,
+ "target_id": 672,
+ "target_slot": 7,
+ "type": "FLOAT"
+ },
+ {
+ "id": 1715,
+ "origin_id": 677,
+ "origin_slot": 0,
+ "target_id": 678,
+ "target_slot": 0,
+ "type": "MODEL"
+ },
+ {
+ "id": 1716,
+ "origin_id": 674,
+ "origin_slot": 0,
+ "target_id": 678,
+ "target_slot": 1,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1717,
+ "origin_id": 678,
+ "origin_slot": 0,
+ "target_id": 671,
+ "target_slot": 3,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 1718,
+ "origin_id": -10,
+ "origin_slot": 9,
+ "target_id": 678,
+ "target_slot": 2,
+ "type": "FLOAT"
+ },
+ {
+ "id": 1719,
+ "origin_id": -10,
+ "origin_slot": 10,
+ "target_id": 678,
+ "target_slot": 3,
+ "type": "COMBO"
+ },
+ {
+ "id": 1720,
+ "origin_id": -10,
+ "origin_slot": 11,
+ "target_id": 678,
+ "target_slot": 4,
+ "type": "INT"
+ },
+ {
+ "id": 1721,
+ "origin_id": -10,
+ "origin_slot": 12,
+ "target_id": 673,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 1722,
+ "origin_id": -10,
+ "origin_slot": 13,
+ "target_id": 677,
+ "target_slot": 0,
+ "type": "COMBO"
+ },
+ {
+ "id": 1725,
+ "origin_id": 671,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 1,
+ "type": "POSE_KEYPOINT"
+ },
+ {
+ "id": 1726,
+ "origin_id": 678,
+ "origin_slot": 0,
+ "target_id": -20,
+ "target_slot": 2,
+ "type": "BOUNDING_BOX"
+ },
+ {
+ "id": 1741,
+ "origin_id": -10,
+ "origin_slot": 14,
+ "target_id": 692,
+ "target_slot": 0,
+ "type": "VIDEO"
+ },
+ {
+ "id": 1742,
+ "origin_id": 692,
+ "origin_slot": 0,
+ "target_id": 674,
+ "target_slot": 0,
+ "type": "IMAGE"
+ },
+ {
+ "id": 1743,
+ "origin_id": 692,
+ "origin_slot": 1,
+ "target_id": -20,
+ "target_slot": 3,
+ "type": "AUDIO"
+ },
+ {
+ "id": 1744,
+ "origin_id": 692,
+ "origin_slot": 2,
+ "target_id": -20,
+ "target_slot": 4,
+ "type": "FLOAT"
+ }
+ ],
+ "extra": {
+ "workflowRendererVersion": "LG"
+ },
+ "category": "Conditioning & Preprocessors/Pose",
+ "description": "Extracts multi-person pose keypoints and skeleton frame sequences from video using SDPose with built-in person detection."
+ }
+ ]
+ },
+ "extra": {}
+}
\ No newline at end of file
diff --git a/comfy/background_removal/birefnet.json b/comfy/background_removal/birefnet.json
new file mode 100644
index 000000000..f0960af39
--- /dev/null
+++ b/comfy/background_removal/birefnet.json
@@ -0,0 +1,7 @@
+{
+ "model_type": "birefnet",
+ "image_std": [1.0, 1.0, 1.0],
+ "image_mean": [0.0, 0.0, 0.0],
+ "image_size": 1024,
+ "resize_to_original": true
+}
diff --git a/comfy/background_removal/birefnet.py b/comfy/background_removal/birefnet.py
new file mode 100644
index 000000000..df54b2b90
--- /dev/null
+++ b/comfy/background_removal/birefnet.py
@@ -0,0 +1,689 @@
+import torch
+import comfy.ops
+import numpy as np
+import torch.nn as nn
+from functools import partial
+import torch.nn.functional as F
+from torchvision.ops import deform_conv2d
+from comfy.ldm.modules.attention import optimized_attention_for_device
+
+# Evaluates to [384, 768, 1536]: drop the first entry, reverse, then keep the last three.
+CXT = [3072, 1536, 768, 384][1:][::-1][-3:]
+
+class Attention(nn.Module):
+    """Multi-head self-attention with a separate query projection and a fused key/value projection."""
+
+    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, device=None, dtype=None, operations=None):
+        super().__init__()
+
+        self.dim = dim
+        self.num_heads = num_heads
+        # Kept as an attribute only; forward() never hands it to the attention kernel.
+        self.scale = qk_scale or (dim // num_heads) ** -0.5
+
+        factory = {"device": device, "dtype": dtype}
+        self.q = operations.Linear(dim, dim, bias=qkv_bias, **factory)
+        self.kv = operations.Linear(dim, dim * 2, bias=qkv_bias, **factory)
+        self.proj = operations.Linear(dim, dim, **factory)
+
+    def forward(self, x):
+        """Run self-attention over ``x`` of shape (B, N, C) and return a tensor of the same shape."""
+        batch, tokens, channels = x.shape
+        heads = self.num_heads
+        per_head = channels // heads
+        attend = optimized_attention_for_device(x.device, mask=False, small_input=True)
+
+        # (B, heads, N, per_head)
+        query = self.q(x).reshape(batch, tokens, heads, per_head).permute(0, 2, 1, 3)
+        # (2, B, heads, N, per_head): index 0 holds the keys, index 1 the values
+        key_value = self.kv(x).reshape(batch, -1, 2, heads, per_head).permute(2, 0, 3, 1, 4)
+        key, value = key_value[0], key_value[1]
+
+        attended = attend(query, key, value, heads=heads, skip_output_reshape=True, skip_reshape=True)
+        merged = attended.transpose(1, 2).reshape(batch, tokens, channels)
+        return self.proj(merged)
+
+class Mlp(nn.Module):
+    """Two-layer feed-forward block: Linear -> GELU -> Linear."""
+
+    def __init__(self, in_features, hidden_features=None, out_features=None, device=None, dtype=None, operations=None):
+        super().__init__()
+        # Both widths fall back to the input width when left unset (or falsy).
+        hidden_features = hidden_features or in_features
+        out_features = out_features or in_features
+        self.fc1 = operations.Linear(in_features, hidden_features, device=device, dtype=dtype)
+        self.act = nn.GELU()
+        self.fc2 = operations.Linear(hidden_features, out_features, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Apply fc1, the GELU activation and fc2 in sequence."""
+        return self.fc2(self.act(self.fc1(x)))
+
+
+def window_partition(x, window_size):
+    """Cut a (B, H, W, C) tensor into non-overlapping square windows.
+
+    Returns a tensor of shape (B * H/ws * W/ws, ws, ws, C), windows ordered
+    row-major within each image. H and W must be multiples of ``window_size``
+    for the reshape to succeed.
+    """
+    batch, height, width, channels = x.shape
+    rows, cols = height // window_size, width // window_size
+    tiled = x.view(batch, rows, window_size, cols, window_size, channels)
+    return tiled.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, channels)
+
+
+def window_reverse(windows, window_size, H, W):
+    """Reassemble windows produced by ``window_partition`` into a (B, H, W, C) tensor.
+
+    Args:
+        windows: tensor of shape (B * H/ws * W/ws, ws, ws, C).
+        window_size: side length ``ws`` of each square window.
+        H, W: height and width of the grid to rebuild; both must be multiples
+            of ``window_size``.
+
+    Returns:
+        Tensor of shape (B, H, W, C).
+    """
+    # Exact integer arithmetic: no float round-trip when recovering the batch size.
+    windows_per_image = (H // window_size) * (W // window_size)
+    B = windows.shape[0] // windows_per_image
+    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
+    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
+    return x
+
+
+class WindowAttention(nn.Module):
+    """Multi-head self-attention inside one window, with a learned relative position bias.
+
+    ``window_size`` is a (Wh, Ww) pair. The bias table has one row per possible
+    (row offset, column offset) between two positions of the window and one
+    column per head.
+    """
+
+    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, device=None, dtype=None, operations=None):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+
+        # (2*Wh-1) * (2*Ww-1) distinct relative offsets, one bias value per head.
+        self.relative_position_bias_table = nn.Parameter(
+            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads, device=device, dtype=dtype))
+
+        # Build, for every pair of positions in the window, the flat row index into the bias table.
+        coords_h = torch.arange(self.window_size[0])
+        coords_w = torch.arange(self.window_size[1])
+        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing='ij'))  # 2, Wh, Ww
+        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+        # Shift both offsets so they start at 0 ...
+        relative_coords[:, :, 0] += self.window_size[0] - 1
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        # ... then scale the row offset by the number of column offsets so the sum is a unique flat index.
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+        self.register_buffer("relative_position_index", relative_position_index)
+
+        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, device=device, dtype=dtype)
+        self.proj = operations.Linear(dim, dim, device=device, dtype=dtype)
+        self.softmax = nn.Softmax(dim=-1)
+
+    def forward(self, x, mask=None):
+        """Attend within each window.
+
+        Args:
+            x: tensor of shape (B_, N, C), one row per window.
+            mask: optional additive mask whose first dimension is the number of
+                windows per image (nW); it is broadcast over batch and heads.
+                Presumably shaped (nW, N, N) - confirm against the caller.
+
+        Returns:
+            Tensor of shape (B_, N, C).
+        """
+        B_, N, C = x.shape
+        # (3, B_, heads, N, C // heads)
+        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))
+
+        # Look up the bias for every position pair and move the head axis to the front.
+        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.long().view(-1)].view(
+            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1)  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            # Expose the per-image window axis so the mask lines up with it, then flatten back.
+            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.view(-1, self.num_heads, N, N)
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
+        x = self.proj(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Module):
+    """One Swin block: (optionally shifted) window attention followed by an MLP, both residual.
+
+    ``self.H`` and ``self.W`` start as ``None`` and are read by ``forward``, so
+    the caller has to assign the current grid size before calling the block.
+    """
+
+    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
+                 mlp_ratio=4., qkv_bias=True, qk_scale=None,
+                 norm_layer=nn.LayerNorm, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+
+        self.norm1 = norm_layer(dim, device=device, dtype=dtype)
+        self.attn = WindowAttention(
+            dim, window_size=(self.window_size, self.window_size), num_heads=num_heads,
+            qkv_bias=qkv_bias, qk_scale=qk_scale, device=device, dtype=dtype, operations=operations)
+
+        self.norm2 = norm_layer(dim, device=device, dtype=dtype)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, device=device, dtype=dtype, operations=operations)
+
+        # Grid size of the token sequence; set externally before each forward pass.
+        self.H = None
+        self.W = None
+
+    def forward(self, x, mask_matrix):
+        """Process tokens ``x`` of shape (B, H*W, C).
+
+        Args:
+            x: token tensor; ``self.H * self.W`` must equal its second dimension.
+            mask_matrix: attention mask handed to the window attention; only
+                used when ``shift_size > 0``.
+
+        Returns:
+            Tensor of shape (B, H*W, C).
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.view(B, H, W, C)
+
+        # Pad right/bottom so both spatial sizes become multiples of the window size.
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
+        _, Hp, Wp, _ = x.shape
+
+        # Cyclic shift for the shifted-window variant; unshifted blocks skip the mask.
+        if self.shift_size > 0:
+            shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+
+        x_windows = window_partition(shifted_x, self.window_size)
+        x_windows = x_windows.view(-1, self.window_size * self.window_size, C)
+
+        attn_windows = self.attn(x_windows, mask=attn_mask)
+
+        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp)  # B H' W' C
+
+        # Undo the cyclic shift.
+        if self.shift_size > 0:
+            x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
+        else:
+            x = shifted_x
+
+        # Crop away the padding added above.
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :].contiguous()
+
+        x = x.view(B, H * W, C)
+
+        x = shortcut + x
+        x = x + self.mlp(self.norm2(x))
+
+        return x
+
+
+class PatchMerging(nn.Module):
+    """Halve the token grid by concatenating each 2x2 neighbourhood and projecting 4*dim -> 2*dim."""
+
+    def __init__(self, dim, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.dim = dim
+        self.reduction = operations.Linear(4 * dim, 2 * dim, bias=False, device=device, dtype=dtype)
+        self.norm = operations.LayerNorm(4 * dim, device=device, dtype=dtype)
+
+    def forward(self, x, H, W):
+        """Merge tokens ``x`` of shape (B, H*W, C) into (B, ceil(H/2)*ceil(W/2), 2*C)."""
+        batch, _, channels = x.shape
+        grid = x.view(batch, H, W, channels)
+
+        # Odd sizes get one extra row/column of padding so every 2x2 cell is complete.
+        if H % 2 == 1 or W % 2 == 1:
+            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))
+
+        # Channel order: (even row, even col), (odd row, even col), (even row, odd col), (odd row, odd col).
+        corners = [grid[:, row::2, col::2, :] for col in (0, 1) for row in (0, 1)]
+        merged = torch.cat(corners, -1).view(batch, -1, 4 * channels)
+
+        return self.reduction(self.norm(merged))
+
+
+class BasicLayer(nn.Module):
+    """One Swin stage: ``depth`` transformer blocks followed by an optional downsample layer.
+
+    Blocks alternate between unshifted (even index) and shifted (odd index,
+    shift of ``window_size // 2``) window attention.
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+
+        # build blocks
+        self.blocks = nn.ModuleList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                norm_layer=norm_layer,
+                device=device, dtype=dtype, operations=operations)
+            for i in range(depth)])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, device=device, dtype=dtype, operations=operations)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """Run the stage on tokens ``x`` of shape (B, H*W, C).
+
+        Returns:
+            A 6-tuple ``(x, H, W, x_next, H_next, W_next)``: the stage output
+            with its grid size, then the input for the next stage with its
+            grid size. Without a downsample layer both triples are identical.
+        """
+        # Round the grid up to a multiple of the window size using integer arithmetic.
+        Hp = -(-H // self.window_size) * self.window_size
+        Wp = -(-W // self.window_size) * self.window_size
+        # Build the mask in the activation dtype so adding it to the attention
+        # scores does not promote them to float32.
+        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device, dtype=x.dtype)  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        # Label the nine regions of the padded grid with distinct ids.
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+                cnt += 1
+
+        # Positions of one window carrying different region ids get -100, equal ids get 0.
+        mask_windows = window_partition(img_mask, self.window_size)
+        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
+
+        for blk in self.blocks:
+            # Blocks read the grid size from attributes rather than arguments.
+            blk.H, blk.W = H, W
+            x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Module):
+    """Turn an image into a grid of patch embeddings using a strided convolution."""
+
+    def __init__(self, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.patch_size = (patch_size, patch_size)
+        self.in_channels = in_channels
+        self.embed_dim = embed_dim
+
+        self.proj = operations.Conv2d(in_channels, embed_dim, kernel_size=self.patch_size, stride=self.patch_size, device=device, dtype=dtype)
+        self.norm = None if norm_layer is None else norm_layer(embed_dim, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Embed ``x`` of shape (B, C, H, W); returns (B, embed_dim, Wh, Ww)."""
+        patch_h, patch_w = self.patch_size
+        _, _, height, width = x.size()
+
+        # Pad right, then bottom, up to the next multiple of the patch size.
+        overhang_w = width % patch_w
+        if overhang_w != 0:
+            x = F.pad(x, (0, patch_w - overhang_w))
+        overhang_h = height % patch_h
+        if overhang_h != 0:
+            x = F.pad(x, (0, 0, 0, patch_h - overhang_h))
+
+        x = self.proj(x)  # B C Wh Ww
+        if self.norm is None:
+            return x
+
+        # Normalise over channels in token layout, then restore the spatial layout.
+        grid_h, grid_w = x.size(2), x.size(3)
+        tokens = self.norm(x.flatten(2).transpose(1, 2))
+        return tokens.transpose(1, 2).view(-1, self.embed_dim, grid_h, grid_w)
+
+
+class SwinTransformer(nn.Module):
+    """Swin Transformer backbone returning one normalised feature map per entry of ``out_indices``."""
+
+    def __init__(self,
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_channels=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),
+                 frozen_stages=-1,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+
+        norm_layer = partial(operations.LayerNorm, device=device, dtype=dtype)
+        self.pretrain_img_size = pretrain_img_size
+        self.num_layers = len(depths)
+        self.embed_dim = embed_dim
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim,
+            device=device, dtype=dtype, operations=operations,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        # Channel width doubles at every stage; every stage but the last downsamples.
+        last_stage = self.num_layers - 1
+        self.layers = nn.ModuleList([
+            BasicLayer(
+                dim=int(embed_dim * 2 ** stage),
+                depth=depths[stage],
+                num_heads=num_heads[stage],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                norm_layer=norm_layer,
+                downsample=PatchMerging if stage < last_stage else None,
+                device=device, dtype=dtype, operations=operations)
+            for stage in range(self.num_layers)
+        ])
+
+        self.num_features = [int(embed_dim * 2 ** stage) for stage in range(self.num_layers)]
+
+        # One output norm per requested stage, registered as norm0, norm1, ...
+        for stage in out_indices:
+            self.add_module(f'norm{stage}', norm_layer(self.num_features[stage]))
+
+    def forward(self, x):
+        """Return a tuple of (B, C_i, H_i, W_i) feature maps, one per stage in ``out_indices``."""
+        x = self.patch_embed(x)
+        Wh, Ww = x.size(2), x.size(3)
+        x = x.flatten(2).transpose(1, 2)
+
+        outs = []
+        for stage, layer in enumerate(self.layers):
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+            if stage not in self.out_indices:
+                continue
+
+            normed = getattr(self, f'norm{stage}')(x_out)
+            # Tokens back to channels-first image layout.
+            outs.append(normed.view(-1, H, W, self.num_features[stage]).permute(0, 3, 1, 2).contiguous())
+
+        return tuple(outs)
+
+class DeformableConv2d(nn.Module):
+    """Modulated deformable convolution.
+
+    Two ordinary convolutions predict the sampling offsets and a modulation
+    mask from the input; ``regular_conv`` only stores the weight (and optional
+    bias) that ``torchvision.ops.deform_conv2d`` is then applied with.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 bias=False, device=None, dtype=None, operations=None):
+
+        super(DeformableConv2d, self).__init__()
+
+        kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
+        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
+        self.padding = padding
+
+        # Two offset values (one per spatial axis) for every kernel tap.
+        self.offset_conv = operations.Conv2d(in_channels,
+                                             2 * kernel_size[0] * kernel_size[1],
+                                             kernel_size=kernel_size,
+                                             stride=stride,
+                                             padding=self.padding,
+                                             bias=True, device=device, dtype=dtype)
+
+        # One modulation scalar per kernel tap.
+        self.modulator_conv = operations.Conv2d(in_channels,
+                                                1 * kernel_size[0] * kernel_size[1],
+                                                kernel_size=kernel_size,
+                                                stride=stride,
+                                                padding=self.padding,
+                                                bias=True, device=device, dtype=dtype)
+
+        self.regular_conv = operations.Conv2d(in_channels,
+                                              out_channels=out_channels,
+                                              kernel_size=kernel_size,
+                                              stride=stride,
+                                              padding=self.padding,
+                                              bias=bias, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Apply the deformable convolution to ``x``."""
+        offset = self.offset_conv(x)
+        # Sigmoid scaled by 2, so the modulation lies in (0, 2).
+        modulator = 2. * torch.sigmoid(self.modulator_conv(x))
+        weight, bias, offload_info = comfy.ops.cast_bias_weight(self.regular_conv, x, offloadable=True)
+
+        x = deform_conv2d(
+            input=x,
+            offset=offset,
+            weight=weight,
+            # Forward the cast bias so a layer built with bias=True actually applies it.
+            # NOTE(review): assumes cast_bias_weight returns None for a bias-free layer - confirm.
+            bias=bias,
+            padding=self.padding,
+            mask=modulator,
+            stride=self.stride,
+        )
+        comfy.ops.uncast_bias_weight(self.regular_conv, weight, bias, offload_info)
+        return x
+
+class BasicDecBlk(nn.Module):
+    """Decoder block: 3x3 conv -> BatchNorm -> ReLU -> ASPPDeformable -> 3x3 conv -> BatchNorm.
+
+    Args:
+        in_channels: channels of the input feature map.
+        out_channels: channels of the returned feature map.
+        inter_channels: width of the hidden feature map between the two
+            convolutions (default 64).
+    """
+
+    def __init__(self, in_channels=64, out_channels=64, inter_channels=64, device=None, dtype=None, operations=None):
+        super(BasicDecBlk, self).__init__()
+        # inter_channels is used as passed; the default of 64 applies when callers omit it.
+        self.conv_in = operations.Conv2d(in_channels, inter_channels, 3, 1, padding=1, device=device, dtype=dtype)
+        self.relu_in = nn.ReLU(inplace=True)
+        self.dec_att = ASPPDeformable(in_channels=inter_channels, device=device, dtype=dtype, operations=operations)
+        self.conv_out = operations.Conv2d(inter_channels, out_channels, 3, 1, padding=1, device=device, dtype=dtype)
+        self.bn_in = operations.BatchNorm2d(inter_channels, device=device, dtype=dtype)
+        self.bn_out = operations.BatchNorm2d(out_channels, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Run the block; spatial size is unchanged by the two padded 3x3 convolutions."""
+        x = self.conv_in(x)
+        x = self.bn_in(x)
+        x = self.relu_in(x)
+        x = self.dec_att(x)
+        x = self.conv_out(x)
+        x = self.bn_out(x)
+        return x
+
+
+class BasicLatBlk(nn.Module):
+    """Lateral connection consisting of a single 1x1 convolution."""
+
+    def __init__(self, in_channels=64, out_channels=64, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.conv = operations.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, device=device, dtype=dtype)
+
+    def forward(self, x):
+        """Project ``x`` from ``in_channels`` to ``out_channels``."""
+        return self.conv(x)
+
+
+class _ASPPModuleDeformable(nn.Module):
+    """One ASPP branch: deformable convolution -> BatchNorm -> ReLU."""
+
+    def __init__(self, in_channels, planes, kernel_size, padding, device, dtype, operations):
+        super().__init__()
+        self.atrous_conv = DeformableConv2d(
+            in_channels, planes,
+            kernel_size=kernel_size, stride=1, padding=padding, bias=False,
+            device=device, dtype=dtype, operations=operations,
+        )
+        self.bn = operations.BatchNorm2d(planes, device=device, dtype=dtype)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Apply the branch to ``x``."""
+        normed = self.bn(self.atrous_conv(x))
+        return self.relu(normed)
+
+
+class ASPPDeformable(nn.Module):
+    """ASPP-style head whose parallel branches are deformable convolutions.
+
+    Branches: one 1x1 deformable conv, one deformable conv per entry of
+    ``parallel_block_sizes``, and a global-average-pool branch. Their outputs
+    are concatenated on the channel axis and fused by a 1x1 conv + BN + ReLU.
+    """
+
+    def __init__(self, in_channels, out_channels=None, parallel_block_sizes=[1, 3, 7], device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.down_scale = 1
+        self.in_channelster = 256 // self.down_scale
+        branch_channels = self.in_channelster
+        if out_channels is None:
+            out_channels = in_channels
+
+        self.aspp1 = _ASPPModuleDeformable(in_channels, branch_channels, 1, padding=0, operations=operations, **factory)
+        self.aspp_deforms = nn.ModuleList([
+            _ASPPModuleDeformable(in_channels, branch_channels, size, padding=int(size // 2), operations=operations, **factory)
+            for size in parallel_block_sizes
+        ])
+
+        self.global_avg_pool = nn.Sequential(
+            nn.AdaptiveAvgPool2d((1, 1)),
+            operations.Conv2d(in_channels, branch_channels, 1, stride=1, bias=False, **factory),
+            operations.BatchNorm2d(branch_channels, **factory),
+            nn.ReLU(inplace=True),
+        )
+        # aspp1 and the pooled branch account for the "+ 2".
+        branch_count = 2 + len(self.aspp_deforms)
+        self.conv1 = operations.Conv2d(branch_channels * branch_count, out_channels, 1, bias=False, **factory)
+        self.bn1 = operations.BatchNorm2d(out_channels, **factory)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        """Fuse all branches computed from ``x`` into one feature map."""
+        pointwise = self.aspp1(x)
+        deformed = [branch(x) for branch in self.aspp_deforms]
+        # Broadcast the pooled 1x1 descriptor back to the spatial size of the other branches.
+        pooled = F.interpolate(self.global_avg_pool(x), size=pointwise.size()[2:], mode='bilinear', align_corners=True)
+
+        stacked = torch.cat((pointwise, *deformed, pooled), dim=1)
+        return self.relu(self.bn1(self.conv1(stacked)))
+
+class BiRefNet(nn.Module):
+    """BiRefNet segmentation network: Swin backbone, squeeze block and decoder.
+
+    ``forward`` returns the decoder's single-channel prediction map; no
+    sigmoid is applied inside this class.
+    """
+
+    def __init__(self, config=None, dtype=None, device=None, operations=None):
+        super(BiRefNet, self).__init__()
+        self.bb = SwinTransformer(embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12, device=device, dtype=dtype, operations=operations)
+
+        # Backbone widths, doubled because forward_enc concatenates the features
+        # of the full-resolution and the half-resolution pass.
+        channels = [1536, 768, 384, 192]
+        channels = [c * 2 for c in channels]
+        # Widths of the shallower stages that are appended to the deepest feature map.
+        self.cxt = channels[1:][::-1][-3:]
+        self.squeeze_module = nn.Sequential(*[
+            BasicDecBlk(channels[0]+sum(self.cxt), channels[0], device=device, dtype=dtype, operations=operations)
+            for _ in range(1)
+        ])
+
+        self.decoder = Decoder(channels, device=device, dtype=dtype, operations=operations)
+
+    def forward_enc(self, x):
+        """Encode ``x`` (B, C, H, W) into four feature maps, shallowest first.
+
+        The backbone runs on the input and on a half-resolution copy; matching
+        stages are concatenated on the channel axis. The deepest map also
+        receives resized copies of the shallower maps as context.
+        """
+        x1, x2, x3, x4 = self.bb(x)
+        _, _, H, W = x.shape
+        x1_, x2_, x3_, x4_ = self.bb(F.interpolate(x, size=(H//2, W//2), mode='bilinear', align_corners=True))
+        x1 = torch.cat([x1, F.interpolate(x1_, size=x1.shape[2:], mode='bilinear', align_corners=True)], dim=1)
+        x2 = torch.cat([x2, F.interpolate(x2_, size=x2.shape[2:], mode='bilinear', align_corners=True)], dim=1)
+        x3 = torch.cat([x3, F.interpolate(x3_, size=x3.shape[2:], mode='bilinear', align_corners=True)], dim=1)
+        x4 = torch.cat([x4, F.interpolate(x4_, size=x4.shape[2:], mode='bilinear', align_corners=True)], dim=1)
+        # Use the instance's own context list so the number of context maps
+        # always agrees with the input width squeeze_module was built for.
+        context = [
+            F.interpolate(x1, size=x4.shape[2:], mode='bilinear', align_corners=True),
+            F.interpolate(x2, size=x4.shape[2:], mode='bilinear', align_corners=True),
+            F.interpolate(x3, size=x4.shape[2:], mode='bilinear', align_corners=True),
+        ][-len(self.cxt):]
+        x4 = torch.cat((*context, x4), dim=1)
+        return (x1, x2, x3, x4)
+
+    def forward_ori(self, x):
+        """Encode ``x``, squeeze the deepest feature map and decode to a prediction."""
+        (x1, x2, x3, x4) = self.forward_enc(x)
+        x4 = self.squeeze_module(x4)
+        features = [x, x1, x2, x3, x4]
+        scaled_preds = self.decoder(features)
+        return scaled_preds
+
+    def forward(self, pixel_values, intermediate_output=None):
+        """Return the prediction for ``pixel_values``; ``intermediate_output`` is accepted but unused."""
+        scaled_preds = self.forward_ori(pixel_values)
+        return scaled_preds
+
+
+class Decoder(nn.Module):
+    """Top-down decoder that refines features stage by stage and re-injects image patches.
+
+    ``channels`` lists the encoder widths from deepest to shallowest. Several
+    submodules (``conv_ms_spvn_*``, ``gdt_convs_pred_*``) are created here but
+    not used by ``forward``.
+    """
+
+    def __init__(self, channels, device, dtype, operations):
+        super(Decoder, self).__init__()
+        # factory kwargs
+        fk = {"device":device, "dtype":dtype, "operations":operations}
+        DecoderBlock = partial(BasicDecBlk, **fk)
+        LateralBlock = partial(BasicLatBlk, **fk)
+        DBlock = partial(SimpleConvs, **fk)
+
+        self.split = True
+        N_dec_ipt = 64
+        ic = 64
+        ipt_cha_opt = 1
+        # Image-patch input blocks: their input width is 3 * the number of patches stacked per stage.
+        self.ipt_blk5 = DBlock(2**10*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
+        self.ipt_blk4 = DBlock(2**8*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
+        self.ipt_blk3 = DBlock(2**6*3 if self.split else 3, [N_dec_ipt, channels[1]//8][ipt_cha_opt], inter_channels=ic)
+        self.ipt_blk2 = DBlock(2**4*3 if self.split else 3, [N_dec_ipt, channels[2]//8][ipt_cha_opt], inter_channels=ic)
+        self.ipt_blk1 = DBlock(2**0*3 if self.split else 3, [N_dec_ipt, channels[3]//8][ipt_cha_opt], inter_channels=ic)
+
+        self.decoder_block4 = DecoderBlock(channels[0]+([N_dec_ipt, channels[0]//8][ipt_cha_opt]), channels[1])
+        self.decoder_block3 = DecoderBlock(channels[1]+([N_dec_ipt, channels[0]//8][ipt_cha_opt]), channels[2])
+        self.decoder_block2 = DecoderBlock(channels[2]+([N_dec_ipt, channels[1]//8][ipt_cha_opt]), channels[3])
+        self.decoder_block1 = DecoderBlock(channels[3]+([N_dec_ipt, channels[2]//8][ipt_cha_opt]), channels[3]//2)
+
+        # From here on only plain operations.* layers are built, which take no `operations` kwarg.
+        fk = {"device":device, "dtype":dtype}
+
+        self.conv_out1 = nn.Sequential(operations.Conv2d(channels[3]//2+([N_dec_ipt, channels[3]//8][ipt_cha_opt]), 1, 1, 1, 0, **fk))
+
+        self.lateral_block4 = LateralBlock(channels[1], channels[1])
+        self.lateral_block3 = LateralBlock(channels[2], channels[2])
+        self.lateral_block2 = LateralBlock(channels[3], channels[3])
+
+        self.conv_ms_spvn_4 = operations.Conv2d(channels[1], 1, 1, 1, 0, **fk)
+        self.conv_ms_spvn_3 = operations.Conv2d(channels[2], 1, 1, 1, 0, **fk)
+        self.conv_ms_spvn_2 = operations.Conv2d(channels[3], 1, 1, 1, 0, **fk)
+
+        _N = 16
+
+        self.gdt_convs_4 = nn.Sequential(operations.Conv2d(channels[0] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
+        self.gdt_convs_3 = nn.Sequential(operations.Conv2d(channels[1] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
+        self.gdt_convs_2 = nn.Sequential(operations.Conv2d(channels[2] // 2, _N, 3, 1, 1, **fk), operations.BatchNorm2d(_N, **fk), nn.ReLU(inplace=True))
+
+        # Plain loops instead of comprehensions run for their side effects;
+        # registration order (all pred heads, then all attn heads) is unchanged.
+        for i in range(2, 5):
+            setattr(self, f"gdt_convs_pred_{i}", nn.Sequential(operations.Conv2d(_N, 1, 1, 1, 0, **fk)))
+        for i in range(2, 5):
+            setattr(self, f"gdt_convs_attn_{i}", nn.Sequential(operations.Conv2d(_N, 1, 1, 1, 0, **fk)))
+
+    def get_patches_batch(self, x, p):
+        """Tile ``x`` into patches the spatial size of ``p`` and stack them on the channel axis.
+
+        Args:
+            x: tensor of shape (B, C, H, W).
+            p: tensor whose last two dimensions give the patch height and width.
+
+        Returns:
+            Tensor of shape (B, C * n_patches, patch_h, patch_w). Patches are
+            ordered column by column, top to bottom within each column.
+        """
+        _size_h, _size_w = p.shape[2:]
+        patches_batch = []
+        for idx in range(x.shape[0]):
+            columns_x = torch.split(x[idx], split_size_or_sections=_size_w, dim=-1)
+            patches_x = []
+            for column_x in columns_x:
+                patches_x += [patch.unsqueeze(0) for patch in torch.split(column_x, split_size_or_sections=_size_h, dim=-2)]
+            patch_sample = torch.cat(patches_x, dim=1)
+            patches_batch.append(patch_sample)
+        return torch.cat(patches_batch, dim=0)
+
+    def forward(self, features):
+        """Decode ``features = [x, x1, x2, x3, x4]`` (image, then encoder maps shallow to deep).
+
+        Returns a single-channel map at the spatial size of ``x``.
+        """
+        x, x1, x2, x3, x4 = features
+
+        # Stage 4: deepest features plus image patches, gated by a learned attention map.
+        patches_batch = self.get_patches_batch(x, x4) if self.split else x
+        x4 = torch.cat((x4, self.ipt_blk5(F.interpolate(patches_batch, size=x4.shape[2:], mode='bilinear', align_corners=True))), 1)
+        p4 = self.decoder_block4(x4)
+        p4_gdt = self.gdt_convs_4(p4)
+        gdt_attn_4 = self.gdt_convs_attn_4(p4_gdt).sigmoid()
+        p4 = p4 * gdt_attn_4
+        _p4 = F.interpolate(p4, size=x3.shape[2:], mode='bilinear', align_corners=True)
+        _p3 = _p4 + self.lateral_block4(x3)
+
+        # Stage 3
+        patches_batch = self.get_patches_batch(x, _p3) if self.split else x
+        _p3 = torch.cat((_p3, self.ipt_blk4(F.interpolate(patches_batch, size=x3.shape[2:], mode='bilinear', align_corners=True))), 1)
+        p3 = self.decoder_block3(_p3)
+
+        p3_gdt = self.gdt_convs_3(p3)
+        gdt_attn_3 = self.gdt_convs_attn_3(p3_gdt).sigmoid()
+        p3 = p3 * gdt_attn_3
+        _p3 = F.interpolate(p3, size=x2.shape[2:], mode='bilinear', align_corners=True)
+        _p2 = _p3 + self.lateral_block3(x2)
+
+        # Stage 2
+        patches_batch = self.get_patches_batch(x, _p2) if self.split else x
+        _p2 = torch.cat((_p2, self.ipt_blk3(F.interpolate(patches_batch, size=x2.shape[2:], mode='bilinear', align_corners=True))), 1)
+        p2 = self.decoder_block2(_p2)
+
+        p2_gdt = self.gdt_convs_2(p2)
+        gdt_attn_2 = self.gdt_convs_attn_2(p2_gdt).sigmoid()
+        p2 = p2 * gdt_attn_2
+
+        _p2 = F.interpolate(p2, size=x1.shape[2:], mode='bilinear', align_corners=True)
+        _p1 = _p2 + self.lateral_block2(x1)
+
+        # Stage 1: no attention gate; upsample to the input resolution.
+        patches_batch = self.get_patches_batch(x, _p1) if self.split else x
+        _p1 = torch.cat((_p1, self.ipt_blk2(F.interpolate(patches_batch, size=x1.shape[2:], mode='bilinear', align_corners=True))), 1)
+        _p1 = self.decoder_block1(_p1)
+        _p1 = F.interpolate(_p1, size=x.shape[2:], mode='bilinear', align_corners=True)
+
+        # Final head: concatenate the image branch once more and project to one channel.
+        patches_batch = self.get_patches_batch(x, _p1) if self.split else x
+        _p1 = torch.cat((_p1, self.ipt_blk1(F.interpolate(patches_batch, size=x.shape[2:], mode='bilinear', align_corners=True))), 1)
+        p1_out = self.conv_out1(_p1)
+        return p1_out
+
+
+class SimpleConvs(nn.Module):
+    """Two stacked 3x3 convolutions with no activation between them."""
+
+    def __init__(
+        self, in_channels: int, out_channels: int, inter_channels=64, device=None, dtype=None, operations=None
+    ) -> None:
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.conv1 = operations.Conv2d(in_channels, inter_channels, kernel_size=3, stride=1, padding=1, **factory)
+        self.conv_out = operations.Conv2d(inter_channels, out_channels, kernel_size=3, stride=1, padding=1, **factory)
+
+    def forward(self, x):
+        """Map ``x`` from ``in_channels`` to ``out_channels``; spatial size is preserved."""
+        hidden = self.conv1(x)
+        return self.conv_out(hidden)
diff --git a/comfy/bg_removal_model.py b/comfy/bg_removal_model.py
new file mode 100644
index 000000000..6dec65e63
--- /dev/null
+++ b/comfy/bg_removal_model.py
@@ -0,0 +1,85 @@
+from .utils import load_torch_file
+import os
+import json
+import torch
+import logging
+
+import comfy.ops
+import comfy.model_patcher
+import comfy.model_management
+import comfy.clip_model
+import comfy.background_removal.birefnet
+
+# Registry mapping the "model_type" value of a JSON config to the class implementing that model.
+BG_REMOVAL_MODELS = {
+    "birefnet": comfy.background_removal.birefnet.BiRefNet
+}
+
+class BackgroundRemovalModel():
+    """Wraps a background-removal network described by a JSON config file.
+
+    The config is read for ``image_size``, ``image_mean``, ``image_std`` and
+    ``model_type``. The network is created on the offload device and wrapped
+    in a model patcher that moves it to the load device on demand.
+
+    Raises:
+        ValueError: if ``model_type`` has no entry in ``BG_REMOVAL_MODELS``.
+    """
+
+    def __init__(self, json_config):
+        with open(json_config) as f:
+            config = json.load(f)
+
+        self.image_size = config.get("image_size", 1024)
+        self.image_mean = config.get("image_mean", [0.0, 0.0, 0.0])
+        self.image_std = config.get("image_std", [1.0, 1.0, 1.0])
+        self.model_type = config.get("model_type", "birefnet")
+        self.config = config.copy()
+        model_class = BG_REMOVAL_MODELS.get(self.model_type)
+        if model_class is None:
+            # Fail with a clear message instead of "'NoneType' object is not callable".
+            raise ValueError("unsupported background removal model_type: {!r} (known: {})".format(
+                self.model_type, sorted(BG_REMOVAL_MODELS)))
+
+        # Device and dtype selection reuses the text-encoder policy.
+        self.load_device = comfy.model_management.text_encoder_device()
+        offload_device = comfy.model_management.text_encoder_offload_device()
+        self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
+        self.model = model_class(config, self.dtype, offload_device, comfy.ops.manual_cast)
+        self.model.eval()
+
+        self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+
+    def load_sd(self, sd):
+        """Load state dict ``sd`` non-strictly and return the result of ``load_state_dict``."""
+        return self.model.load_state_dict(sd, strict=False, assign=self.patcher.is_dynamic())
+
+    def get_sd(self):
+        """Return the state dict of the wrapped network."""
+        return self.model.state_dict()
+
+    def encode_image(self, image):
+        """Predict a foreground mask for ``image``.
+
+        Args:
+            image: batch of images; dimensions 1 and 2 are taken as height and
+                width (presumably a (B, H, W, C) tensor - confirm with callers).
+
+        Returns:
+            Sigmoid-activated mask resized to the input height and width, on
+            the intermediate device and dtype.
+        """
+        comfy.model_management.load_model_gpu(self.patcher)
+        H, W = image.shape[1], image.shape[2]
+        pixel_values = comfy.clip_model.clip_preprocess(image.to(self.load_device), size=self.image_size, mean=self.image_mean, std=self.image_std, crop=False)
+
+        # Batches with more than one image are run one image at a time.
+        if pixel_values.shape[0] > 1:
+            out = torch.cat([
+                self.model(pixel_values=pixel_values[i:i+1])
+                for i in range(pixel_values.shape[0])
+            ], dim=0)
+        else:
+            out = self.model(pixel_values=pixel_values)
+        # Resize the raw prediction back to the original resolution before the sigmoid.
+        out = torch.nn.functional.interpolate(out, size=(H, W), mode="bicubic", antialias=False)
+
+        mask = out.sigmoid().to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
+        # Normalise the layout so dimension 1 is a single channel.
+        if mask.ndim == 3:
+            mask = mask.unsqueeze(0)
+        if mask.shape[1] != 1:
+            mask = mask.movedim(-1, 1)
+
+        return mask
+
+
+def load_background_removal_model(sd):
+    """Build a BackgroundRemovalModel from state dict ``sd``.
+
+    Returns ``None`` when ``sd`` lacks the key used to recognise a BiRefNet
+    checkpoint. Otherwise the weights are loaded and ``sd`` is pruned in place
+    so that only the keys reported as unexpected by the load remain.
+    """
+    if "bb.layers.1.blocks.0.attn.relative_position_index" not in sd:
+        return None
+
+    config_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "background_removal")
+    json_config = os.path.join(config_dir, "birefnet.json")
+
+    bg_model = BackgroundRemovalModel(json_config)
+    missing, unexpected = bg_model.load_sd(sd)
+    if len(missing) > 0:
+        logging.warning("missing background removal: {}".format(missing))
+
+    # Drop every entry the model consumed; whatever is left was not used.
+    leftover = set(unexpected)
+    for key in list(sd.keys()):
+        if key not in leftover:
+            sd.pop(key)
+    return bg_model
+
+def load(ckpt_path):
+    """Read the checkpoint at ``ckpt_path`` and hand its state dict to ``load_background_removal_model``."""
+    return load_background_removal_model(load_torch_file(ckpt_path))
diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index dbaadf723..9bda414d1 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -49,7 +49,7 @@ parser.add_argument("--temp-directory", type=str, default=None, help="Set the Co
parser.add_argument("--input-directory", type=str, default=None, help="Set the ComfyUI input directory. Overrides --base-directory.")
parser.add_argument("--auto-launch", action="store_true", help="Automatically launch ComfyUI in the default browser.")
parser.add_argument("--disable-auto-launch", action="store_true", help="Disable auto launching the browser.")
-parser.add_argument("--cuda-device", type=int, default=None, metavar="DEVICE_ID", help="Set the id of the cuda device this instance will use. All other devices will not be visible.")
+parser.add_argument("--cuda-device", type=str, default=None, metavar="DEVICE_ID", help="Set the ids of cuda devices this instance will use, as a comma-separated list (e.g. '0' or '0,1'). All other devices will not be visible.")
parser.add_argument("--default-device", type=int, default=None, metavar="DEFAULT_DEVICE_ID", help="Set the id of the default device, all other devices will stay visible.")
cm_group = parser.add_mutually_exclusive_group()
cm_group.add_argument("--cuda-malloc", action="store_true", help="Enable cudaMallocAsync (enabled by default for torch 2.0 and up).")
@@ -90,8 +90,8 @@ parser.add_argument("--force-channels-last", action="store_true", help="Force ch
parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")
parser.add_argument("--oneapi-device-selector", type=str, default=None, metavar="SELECTOR_STRING", help="Sets the oneAPI device(s) this instance will use.")
-parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize default when loading models with Intel's Extension for Pytorch.")
parser.add_argument("--supports-fp8-compute", action="store_true", help="ComfyUI will act like if the device supports fp8 compute.")
+parser.add_argument("--enable-triton-backend", action="store_true", help="ComfyUI will enable the use of Triton backend in comfy-kitchen. Is disabled at launch by default.")
class LatentPreviewMethod(enum.Enum):
NoPreviews = "none"
@@ -110,13 +110,11 @@ parser.add_argument("--preview-method", type=LatentPreviewMethod, default=Latent
parser.add_argument("--preview-size", type=int, default=512, help="Sets the maximum preview size for sampler nodes.")
-CACHE_RAM_AUTO_GB = -1.0
-
cache_group = parser.add_mutually_exclusive_group()
+cache_group.add_argument("--cache-ram", nargs='*', type=float, default=[], metavar="GB", help="Use RAM pressure caching with the specified headroom thresholds. This is the default caching mode. The first value sets the active-cache threshold; the optional second value sets the inactive-cache/pin threshold. Defaults when no values are provided: active 10%% of system RAM (min 2GB, max 10GB), inactive 100%% of system RAM (max 96GB).")
cache_group.add_argument("--cache-classic", action="store_true", help="Use the old style (aggressive) caching.")
cache_group.add_argument("--cache-lru", type=int, default=0, help="Use LRU caching with a maximum of N node results cached. May use more RAM/VRAM.")
cache_group.add_argument("--cache-none", action="store_true", help="Reduced RAM/VRAM usage at the expense of executing every node for each run.")
-cache_group.add_argument("--cache-ram", nargs='?', const=CACHE_RAM_AUTO_GB, type=float, default=0, help="Use RAM pressure caching with the specified headroom threshold. If available RAM drops below the threshold the cache removes large items to free RAM. Default (when no value is provided): 25%% of system RAM (min 4GB, max 32GB).")
attn_group = parser.add_mutually_exclusive_group()
attn_group.add_argument("--use-split-cross-attention", action="store_true", help="Use the split cross attention optimization. Ignored when xformers is used.")
@@ -141,8 +139,7 @@ manager_group.add_argument("--enable-manager-legacy-ui", action="store_true", he
vram_group = parser.add_mutually_exclusive_group()
vram_group.add_argument("--gpu-only", action="store_true", help="Store and run everything (text encoders/CLIP models, etc... on the GPU).")
vram_group.add_argument("--highvram", action="store_true", help="By default models will be unloaded to CPU memory after being used. This option keeps them in GPU memory.")
-vram_group.add_argument("--normalvram", action="store_true", help="Used to force normal vram use if lowvram gets automatically enabled.")
-vram_group.add_argument("--lowvram", action="store_true", help="Split the unet in parts to use less vram.")
+vram_group.add_argument("--lowvram", action="store_true", help="Doesn't do anything if dynamic vram is enabled. If dynamic vram isn't being used this option makes the text encoders run on the CPU.")
vram_group.add_argument("--novram", action="store_true", help="When lowvram isn't enough.")
vram_group.add_argument("--cpu", action="store_true", help="To use the CPU for everything (slow).")
@@ -238,12 +235,17 @@ database_default_path = os.path.abspath(
)
parser.add_argument("--database-url", type=str, default=f"sqlite:///{database_default_path}", help="Specify the database URL, e.g. for an in-memory database you can use 'sqlite:///:memory:'.")
parser.add_argument("--enable-assets", action="store_true", help="Enable the assets system (API routes, database synchronization, and background scanning).")
+parser.add_argument("--feature-flag", type=str, action='append', default=[], metavar="KEY[=VALUE]", help="Set a server feature flag. Use KEY=VALUE to set an explicit value, or bare KEY to set it to true. Can be specified multiple times. Boolean values (true/false) and numbers are auto-converted. Examples: --feature-flag show_signin_button=true or --feature-flag show_signin_button")
+parser.add_argument("--list-feature-flags", action="store_true", help="Print the registry of known CLI-settable feature flags as JSON and exit.")
if comfy.options.args_parsing:
args = parser.parse_args()
else:
args = parser.parse_args([])
+if args.cache_ram is not None and len(args.cache_ram) > 2:
+ parser.error("--cache-ram accepts at most two values: active GB and inactive GB")
+
if args.windows_standalone_build:
args.auto_launch = True
diff --git a/comfy/comfy_types/node_typing.py b/comfy/comfy_types/node_typing.py
index 57126fa4a..bb21eb1d1 100644
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
@@ -1,6 +1,5 @@
"""Comfy-specific type hinting"""
-from __future__ import annotations
from typing import Literal, TypedDict, Optional
from typing_extensions import NotRequired
from abc import ABC, abstractmethod
diff --git a/comfy/context_windows.py b/comfy/context_windows.py
index cb44ee6e8..db57537a2 100644
--- a/comfy/context_windows.py
+++ b/comfy/context_windows.py
@@ -63,7 +63,11 @@ class IndexListContextWindow(ContextWindowABC):
dim = self.dim
if dim == 0 and full.shape[dim] == 1:
return full
- idx = tuple([slice(None)] * dim + [self.index_list])
+ indices = self.index_list
+ anchor_idx = getattr(self, 'causal_anchor_index', None)
+ if anchor_idx is not None and anchor_idx >= 0:
+ indices = [anchor_idx] + list(indices)
+ idx = tuple([slice(None)] * dim + [indices])
window = full[idx]
if retain_index_list:
idx = tuple([slice(None)] * dim + [retain_index_list])
@@ -113,7 +117,14 @@ def slice_cond(cond_value, window: IndexListContextWindow, x_in: torch.Tensor, d
# skip leading latent positions that have no corresponding conditioning (e.g. reference frames)
if temporal_offset > 0:
- indices = [i - temporal_offset for i in window.index_list[temporal_offset:]]
+ anchor_idx = getattr(window, 'causal_anchor_index', None)
+ if anchor_idx is not None and anchor_idx >= 0:
+ # anchor occupies one of the no-cond positions, so skip one fewer from window.index_list
+ skip_count = temporal_offset - 1
+ else:
+ skip_count = temporal_offset
+
+ indices = [i - temporal_offset for i in window.index_list[skip_count:]]
indices = [i for i in indices if 0 <= i]
else:
indices = list(window.index_list)
@@ -150,7 +161,8 @@ class ContextFuseMethod:
ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
class IndexListContextHandler(ContextHandlerABC):
def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1,
- closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False):
+ closed_loop: bool=False, dim:int=0, freenoise: bool=False, cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False,
+ causal_window_fix: bool=True):
self.context_schedule = context_schedule
self.fuse_method = fuse_method
self.context_length = context_length
@@ -162,6 +174,7 @@ class IndexListContextHandler(ContextHandlerABC):
self.freenoise = freenoise
self.cond_retain_index_list = [int(x.strip()) for x in cond_retain_index_list.split(",")] if cond_retain_index_list else []
self.split_conds_to_windows = split_conds_to_windows
+ self.causal_window_fix = causal_window_fix
self.callbacks = {}
@@ -318,6 +331,14 @@ class IndexListContextHandler(ContextHandlerABC):
# allow processing to end between context window executions for faster Cancel
comfy.model_management.throw_exception_if_processing_interrupted()
+ # causal_window_fix: prepend a pre-window frame that will be stripped post-forward
+ anchor_applied = False
+ if self.causal_window_fix:
+ anchor_idx = window.index_list[0] - 1
+ if 0 <= anchor_idx < x_in.size(self.dim):
+ window.causal_anchor_index = anchor_idx
+ anchor_applied = True
+
for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EVALUATE_CONTEXT_WINDOWS, self.callbacks):
callback(self, model, x_in, conds, timestep, model_options, window_idx, window, model_options, device, first_device)
@@ -332,6 +353,12 @@ class IndexListContextHandler(ContextHandlerABC):
if device is not None:
for i in range(len(sub_conds_out)):
sub_conds_out[i] = sub_conds_out[i].to(x_in.device)
+
+ # strip causal_window_fix anchor if applied
+ if anchor_applied:
+ for i in range(len(sub_conds_out)):
+ sub_conds_out[i] = sub_conds_out[i].narrow(self.dim, 1, sub_conds_out[i].shape[self.dim] - 1)
+
results.append(ContextResults(window_idx, sub_conds_out, sub_conds, window))
return results
diff --git a/comfy/controlnet.py b/comfy/controlnet.py
index ba670b16d..6dbbaa959 100644
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
@@ -15,13 +15,14 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see .
"""
-
+from __future__ import annotations
import torch
from enum import Enum
import math
import os
import logging
+import copy
import comfy.utils
import comfy.model_management
import comfy.model_detection
@@ -38,7 +39,7 @@ import comfy.ldm.hydit.controlnet
import comfy.ldm.flux.controlnet
import comfy.ldm.qwen_image.controlnet
import comfy.cldm.dit_embedder
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Union
if TYPE_CHECKING:
from comfy.hooks import HookGroup
@@ -64,6 +65,18 @@ class StrengthType(Enum):
CONSTANT = 1
LINEAR_UP = 2
+class ControlIsolation:
+ '''Temporarily set a ControlBase object's previous_controlnet to None to prevent cascading calls.'''
+ def __init__(self, control: ControlBase):
+ self.control = control
+ self.orig_previous_controlnet = control.previous_controlnet
+
+ def __enter__(self):
+ self.control.previous_controlnet = None
+
+ def __exit__(self, *args):
+ self.control.previous_controlnet = self.orig_previous_controlnet
+
class ControlBase:
def __init__(self):
self.cond_hint_original = None
@@ -77,7 +90,7 @@ class ControlBase:
self.compression_ratio = 8
self.upscale_algorithm = 'nearest-exact'
self.extra_args = {}
- self.previous_controlnet = None
+ self.previous_controlnet: Union[ControlBase, None] = None
self.extra_conds = []
self.strength_type = StrengthType.CONSTANT
self.concat_mask = False
@@ -85,6 +98,7 @@ class ControlBase:
self.extra_concat = None
self.extra_hooks: HookGroup = None
self.preprocess_image = lambda a: a
+ self.multigpu_clones: dict[torch.device, ControlBase] = {}
def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
self.cond_hint_original = cond_hint
@@ -111,17 +125,38 @@ class ControlBase:
def cleanup(self):
if self.previous_controlnet is not None:
self.previous_controlnet.cleanup()
-
+ for device_cnet in self.multigpu_clones.values():
+ with ControlIsolation(device_cnet):
+ device_cnet.cleanup()
self.cond_hint = None
self.extra_concat = None
self.timestep_range = None
def get_models(self):
out = []
+ for device_cnet in self.multigpu_clones.values():
+ out += device_cnet.get_models_only_self()
if self.previous_controlnet is not None:
out += self.previous_controlnet.get_models()
return out
+ def get_models_only_self(self):
+ 'Calls get_models, but temporarily sets previous_controlnet to None.'
+ with ControlIsolation(self):
+ return self.get_models()
+
+ def get_instance_for_device(self, device):
+ 'Returns instance of this Control object intended for selected device.'
+ return self.multigpu_clones.get(device, self)
+
+ def deepclone_multigpu(self, load_device, autoregister=False):
+ '''
+ Create deep clone of Control object where model(s) is set to other devices.
+
+ When autoregister is set to True, the deep clone is also added to multigpu_clones dict.
+ '''
+ raise NotImplementedError("Classes inheriting from ControlBase should define their own deepclone_multigpu funtion.")
+
def get_extra_hooks(self):
out = []
if self.extra_hooks is not None:
@@ -130,7 +165,7 @@ class ControlBase:
out += self.previous_controlnet.get_extra_hooks()
return out
- def copy_to(self, c):
+ def copy_to(self, c: ControlBase):
c.cond_hint_original = self.cond_hint_original
c.strength = self.strength
c.timestep_percent_range = self.timestep_percent_range
@@ -284,6 +319,14 @@ class ControlNet(ControlBase):
self.copy_to(c)
return c
+ def deepclone_multigpu(self, load_device, autoregister=False):
+ c = self.copy()
+ c.control_model = copy.deepcopy(c.control_model)
+ c.control_model_wrapped = comfy.model_patcher.ModelPatcher(c.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
+ if autoregister:
+ self.multigpu_clones[load_device] = c
+ return c
+
def get_models(self):
out = super().get_models()
out.append(self.control_model_wrapped)
@@ -314,6 +357,10 @@ class QwenFunControlNet(ControlNet):
super().pre_run(model, percent_to_timestep_function)
self.set_extra_arg("base_model", model.diffusion_model)
+ def cleanup(self):
+ self.extra_args.pop("base_model", None)
+ super().cleanup()
+
def copy(self):
c = QwenFunControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
c.control_model = self.control_model
@@ -906,6 +953,14 @@ class T2IAdapter(ControlBase):
self.copy_to(c)
return c
+ def deepclone_multigpu(self, load_device, autoregister=False):
+ c = self.copy()
+ c.t2i_model = copy.deepcopy(c.t2i_model)
+ c.device = load_device
+ if autoregister:
+ self.multigpu_clones[load_device] = c
+ return c
+
def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
compression_ratio = 8
upscale_algorithm = 'nearest-exact'
diff --git a/comfy/deploy_environment.py b/comfy/deploy_environment.py
new file mode 100644
index 000000000..8c99a3584
--- /dev/null
+++ b/comfy/deploy_environment.py
@@ -0,0 +1,34 @@
+import functools
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_DEPLOY_ENV = "local-git"
+_ENV_FILENAME = ".comfy_environment"
+
+# Resolve the ComfyUI install directory (the parent of this `comfy/` package).
+# We deliberately avoid `folder_paths.base_path` here because that is overridden
+# by the `--base-directory` CLI arg to a user-supplied path, whereas the
+# `.comfy_environment` marker is written by launchers/installers next to the
+# ComfyUI install itself.
+_COMFY_INSTALL_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+
+
+@functools.cache
+def get_deploy_environment() -> str:
+ env_file = os.path.join(_COMFY_INSTALL_DIR, _ENV_FILENAME)
+ try:
+ with open(env_file, encoding="utf-8") as f:
+ # Cap the read so a malformed or maliciously crafted file (e.g.
+ # a single huge line with no newline) can't blow up memory.
+ first_line = f.readline(128).strip()
+ value = "".join(c for c in first_line if 32 <= ord(c) < 127)
+ if value:
+ return value
+ except FileNotFoundError:
+ pass
+ except Exception as e:
+ logger.error("Failed to read %s: %s", env_file, e)
+
+ return _DEFAULT_DEPLOY_ENV
diff --git a/comfy/hooks.py b/comfy/hooks.py
index 1a76c7ba4..5458fc3d8 100644
--- a/comfy/hooks.py
+++ b/comfy/hooks.py
@@ -93,7 +93,7 @@ class Hook:
self.hook_scope = hook_scope
'''Scope of where this hook should apply in terms of the conds used in sampling run.'''
self.custom_should_register = default_should_register
- '''Can be overriden with a compatible function to decide if this hook should be registered without the need to override .should_register'''
+ '''Can be overridden with a compatible function to decide if this hook should be registered without the need to override .should_register'''
@property
def strength(self):
diff --git a/comfy/image_encoders/dino2.py b/comfy/image_encoders/dino2.py
index 9b6dace9d..ee86f8309 100644
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
@@ -106,6 +106,7 @@ class Dino2Encoder(torch.nn.Module):
class Dino2PatchEmbeddings(torch.nn.Module):
def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
super().__init__()
+ self.patch_size = patch_size
self.projection = operations.Conv2d(
in_channels=num_channels,
out_channels=dim,
@@ -125,17 +126,37 @@ class Dino2Embeddings(torch.nn.Module):
super().__init__()
patch_size = 14
image_size = 518
+ self.patch_size = patch_size
self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
self.position_embeddings = torch.nn.Parameter(torch.empty(1, (image_size // patch_size) ** 2 + 1, dim, dtype=dtype, device=device))
- self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
+ self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device)) # mask_token is a pre-training param, kept only so strict loading accepts the key.
self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
+ def interpolate_pos_encoding(self, x, h_pixels, w_pixels):
+ pos_embed = comfy.model_management.cast_to_device(self.position_embeddings, x.device, torch.float32)
+
+ class_pos = pos_embed[:, 0:1]
+ patch_pos = pos_embed[:, 1:]
+ N = patch_pos.shape[1]
+ M = int(N ** 0.5)
+ h0 = h_pixels // self.patch_size
+ w0 = w_pixels // self.patch_size
+ scale_factor = ((h0 + 0.1) / M, (w0 + 0.1) / M) # +0.1 matches upstream DINOv2's FP-rounding workaround so the interpolate output size lands on (h0, w0).
+
+ patch_pos = patch_pos.reshape(1, M, M, -1).permute(0, 3, 1, 2)
+ patch_pos = torch.nn.functional.interpolate(patch_pos, scale_factor=scale_factor, mode="bicubic", antialias=False)
+ patch_pos = patch_pos.permute(0, 2, 3, 1).flatten(1, 2)
+ return torch.cat((class_pos, patch_pos), dim=1).to(x.dtype)
+
def forward(self, pixel_values):
x = self.patch_embeddings(pixel_values)
- # TODO: mask_token?
x = torch.cat((self.cls_token.to(device=x.device, dtype=x.dtype).expand(x.shape[0], -1, -1), x), dim=1)
- x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
+ if x.shape[1] - 1 == self.position_embeddings.shape[1] - 1:
+ x = x + comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
+ else:
+ h, w = pixel_values.shape[-2:]
+ x = x + self.interpolate_pos_encoding(x, h, w)
return x
@@ -158,3 +179,21 @@ class Dinov2Model(torch.nn.Module):
x = self.layernorm(x)
pooled_output = x[:, 0, :]
return x, i, pooled_output, None
+
+ def get_intermediate_layers(self, pixel_values, indices, apply_norm=True):
+ x = self.embeddings(pixel_values)
+ optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
+ n_layers = len(self.encoder.layer)
+ resolved = [(i if i >= 0 else n_layers + i) for i in indices]
+ target = set(resolved)
+ max_idx = max(resolved)
+ n_skip = 1 # skip cls token
+ cache = {}
+ for i, layer in enumerate(self.encoder.layer):
+ x = layer(x, optimized_attention)
+ if i in target:
+ normed = self.layernorm(x) if apply_norm else x
+ cache[i] = (normed[:, n_skip:], normed[:, 0])
+ if i >= max_idx:
+ break
+ return [cache[i] for i in resolved]
diff --git a/comfy/k_diffusion/sampling.py b/comfy/k_diffusion/sampling.py
index 6978eb717..11db46d94 100644
--- a/comfy/k_diffusion/sampling.py
+++ b/comfy/k_diffusion/sampling.py
@@ -242,6 +242,7 @@ def sample_euler_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+ s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
@@ -373,6 +374,7 @@ def sample_dpm_2_ancestral_RF(model, x, sigmas, extra_args=None, callback=None,
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+ s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
@@ -686,6 +688,7 @@ def sample_dpmpp_2s_ancestral_RF(model, x, sigmas, extra_args=None, callback=Non
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+ s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
sigma_fn = lambda lbda: (lbda.exp() + 1) ** -1
lambda_fn = lambda sigma: ((1-sigma)/sigma).log()
@@ -747,6 +750,7 @@ def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=N
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
+ s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
for i in trange(len(sigmas) - 1, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
@@ -832,6 +836,7 @@ def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
+ s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
old_denoised = None
h, h_last = None, None
@@ -889,6 +894,7 @@ def sample_dpmpp_3m_sde(model, x, sigmas, extra_args=None, callback=None, disabl
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
+ s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
denoised_1, denoised_2 = None, None
h, h_1, h_2 = None, None, None
@@ -1006,23 +1012,39 @@ def sample_ddpm(model, x, sigmas, extra_args=None, callback=None, disable=None,
return generic_step_sampler(model, x, sigmas, extra_args, callback, disable, noise_sampler, DDPMSampler_step)
@torch.no_grad()
-def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None):
+def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, noise_sampler=None, s_noise=1.0, s_noise_end=None, noise_clip_std=0.0):
+
+ # s_noise / s_noise_end: per-step noise multiplier, linearly interpolated across steps
+ # noise_clip_std: clamp injected noise to +/- N stddevs (0 disables).
+
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
- for i in trange(len(sigmas) - 1, disable=disable):
+ n_steps = max(1, len(sigmas) - 1)
+ model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
+
+ s_start = float(s_noise)
+ s_end = s_start if s_noise_end is None else float(s_noise_end)
+ for i in trange(n_steps, disable=disable):
denoised = model(x, sigmas[i] * s_in, **extra_args)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised})
x = denoised
if sigmas[i + 1] > 0:
- x = model.inner_model.inner_model.model_sampling.noise_scaling(sigmas[i + 1], noise_sampler(sigmas[i], sigmas[i + 1]), x)
+ noise = noise_sampler(sigmas[i], sigmas[i + 1])
+ if noise_clip_std > 0:
+ clip_val = noise_clip_std * noise.std()
+ noise = noise.clamp(min=-clip_val, max=clip_val)
+ t = (i / (n_steps - 1)) if n_steps > 1 else 0.0
+ s_noise_i = s_start + (s_end - s_start) * t
+ if s_noise_i != 1.0:
+ noise = noise * s_noise_i
+ x = model_sampling.noise_scaling(sigmas[i + 1], noise, x)
return x
-
@torch.no_grad()
def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
# From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
@@ -1249,6 +1271,7 @@ def sample_euler_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback=No
model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
+ s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
uncond_denoised = None
@@ -1296,6 +1319,7 @@ def sample_dpmpp_2s_ancestral_cfg_pp(model, x, sigmas, extra_args=None, callback
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+ s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
temp = [0]
def post_cfg_function(args):
@@ -1371,6 +1395,7 @@ def res_multistep(model, x, sigmas, extra_args=None, callback=None, disable=None
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+ s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
sigma_fn = lambda t: t.neg().exp()
t_fn = lambda sigma: sigma.log().neg()
@@ -1504,6 +1529,7 @@ def sample_er_sde(model, x, sigmas, extra_args=None, callback=None, disable=None
extra_args = {} if extra_args is None else extra_args
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
+ s_noise = s_noise * getattr(model.inner_model.model_patcher.get_model_object('model_sampling'), "noise_scale", 1.0)
s_in = x.new_ones([x.shape[0]])
def default_er_sde_noise_scaler(x):
@@ -1574,9 +1600,10 @@ def sample_seeds_2(model, x, sigmas, extra_args=None, callback=None, disable=Non
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
- inject_noise = eta > 0 and s_noise > 0
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
+ s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
+ inject_noise = eta > 0 and s_noise > 0
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
@@ -1645,9 +1672,10 @@ def sample_seeds_3(model, x, sigmas, extra_args=None, callback=None, disable=Non
seed = extra_args.get("seed", None)
noise_sampler = default_noise_sampler(x, seed=seed) if noise_sampler is None else noise_sampler
s_in = x.new_ones([x.shape[0]])
- inject_noise = eta > 0 and s_noise > 0
model_sampling = model.inner_model.model_patcher.get_model_object('model_sampling')
+ s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
+ inject_noise = eta > 0 and s_noise > 0
sigma_fn = partial(half_log_snr_to_sigma, model_sampling=model_sampling)
lambda_fn = partial(sigma_to_half_log_snr, model_sampling=model_sampling)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
@@ -1713,6 +1741,7 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
s_in = x.new_ones([x.shape[0]])
model_sampling = model.inner_model.model_patcher.get_model_object("model_sampling")
+ s_noise = s_noise * getattr(model_sampling, "noise_scale", 1.0)
sigmas = offset_first_sigma_for_snr(sigmas, model_sampling)
lambdas = sigma_to_half_log_snr(sigmas, model_sampling=model_sampling)
@@ -1810,3 +1839,119 @@ def sample_sa_solver(model, x, sigmas, extra_args=None, callback=None, disable=F
def sample_sa_solver_pece(model, x, sigmas, extra_args=None, callback=None, disable=False, tau_func=None, s_noise=1.0, noise_sampler=None, predictor_order=3, corrector_order=4, simple_order_2=False):
"""Stochastic Adams Solver with PECE (Predict–Evaluate–Correct–Evaluate) mode (NeurIPS 2023)."""
return sample_sa_solver(model, x, sigmas, extra_args=extra_args, callback=callback, disable=disable, tau_func=tau_func, s_noise=s_noise, noise_sampler=noise_sampler, predictor_order=predictor_order, corrector_order=corrector_order, use_pece=True, simple_order_2=simple_order_2)
+
+
+@torch.no_grad()
+def sample_ar_video(model, x, sigmas, extra_args=None, callback=None, disable=None,
+ num_frame_per_block=1):
+ """
+ Autoregressive video sampler: block-by-block denoising with KV cache
+ and flow-match re-noising for Causal Forcing / Self-Forcing models.
+
+ Requires a Causal-WAN compatible model (diffusion_model must expose
+ init_kv_caches / init_crossattn_caches) and 5-D latents [B,C,T,H,W].
+
+ All AR-loop parameters are passed via the SamplerARVideo node, not read
+ from the checkpoint or transformer_options.
+ """
+ extra_args = {} if extra_args is None else extra_args
+ model_options = extra_args.get("model_options", {})
+ transformer_options = model_options.get("transformer_options", {})
+
+ if x.ndim != 5:
+ raise ValueError(
+ f"ar_video sampler requires 5-D video latents [B,C,T,H,W], got {x.ndim}-D tensor with shape {x.shape}. "
+ "This sampler is only compatible with autoregressive video models (e.g. Causal-WAN)."
+ )
+
+ inner_model = model.inner_model.inner_model
+ causal_model = inner_model.diffusion_model
+
+ if not (hasattr(causal_model, "init_kv_caches") and hasattr(causal_model, "init_crossattn_caches")):
+ raise TypeError(
+ "ar_video sampler requires a Causal-WAN compatible model whose diffusion_model "
+ "exposes init_kv_caches() and init_crossattn_caches(). The loaded checkpoint "
+ "does not support this interface — choose a different sampler."
+ )
+
+ seed = extra_args.get("seed", 0)
+
+ bs, c, lat_t, lat_h, lat_w = x.shape
+ frame_seq_len = -(-lat_h // 2) * -(-lat_w // 2) # ceiling division
+ num_blocks = -(-lat_t // num_frame_per_block) # ceiling division
+ device = x.device
+ model_dtype = inner_model.get_dtype()
+
+ kv_caches = causal_model.init_kv_caches(bs, lat_t * frame_seq_len, device, model_dtype)
+ crossattn_caches = causal_model.init_crossattn_caches(bs, device, model_dtype)
+
+ output = torch.zeros_like(x)
+ s_in = x.new_ones([x.shape[0]])
+ current_start_frame = 0
+
+ # I2V: seed KV cache with the initial image latent before the denoising loop
+ initial_latent = transformer_options.get("ar_config", {}).get("initial_latent", None)
+ if initial_latent is not None:
+ initial_latent = inner_model.process_latent_in(initial_latent).to(device=device, dtype=model_dtype)
+ n_init = initial_latent.shape[2]
+ output[:, :, :n_init] = initial_latent
+
+ ar_state = {"start_frame": 0, "kv_caches": kv_caches, "crossattn_caches": crossattn_caches}
+ transformer_options["ar_state"] = ar_state
+ zero_sigma = sigmas.new_zeros([1])
+ _ = model(initial_latent, zero_sigma * s_in, **extra_args)
+
+ current_start_frame = n_init
+ remaining = lat_t - n_init
+ num_blocks = -(-remaining // num_frame_per_block)
+
+ num_sigma_steps = len(sigmas) - 1
+ total_real_steps = num_blocks * num_sigma_steps
+ step_count = 0
+
+ try:
+ for block_idx in trange(num_blocks, disable=disable):
+ bf = min(num_frame_per_block, lat_t - current_start_frame)
+ fs, fe = current_start_frame, current_start_frame + bf
+ noisy_input = x[:, :, fs:fe]
+
+ ar_state = {
+ "start_frame": current_start_frame,
+ "kv_caches": kv_caches,
+ "crossattn_caches": crossattn_caches,
+ }
+ transformer_options["ar_state"] = ar_state
+
+ for i in range(num_sigma_steps):
+ denoised = model(noisy_input, sigmas[i] * s_in, **extra_args)
+
+ if callback is not None:
+ scaled_i = step_count * num_sigma_steps // total_real_steps
+ callback({"x": noisy_input, "i": scaled_i, "sigma": sigmas[i],
+ "sigma_hat": sigmas[i], "denoised": denoised})
+
+ if sigmas[i + 1] == 0:
+ noisy_input = denoised
+ else:
+ sigma_next = sigmas[i + 1]
+ torch.manual_seed(seed + block_idx * 1000 + i)
+ fresh_noise = torch.randn_like(denoised)
+ noisy_input = (1.0 - sigma_next) * denoised + sigma_next * fresh_noise
+
+ for cache in kv_caches:
+ cache["end"] -= bf * frame_seq_len
+
+ step_count += 1
+
+ output[:, :, fs:fe] = noisy_input
+
+ for cache in kv_caches:
+ cache["end"] -= bf * frame_seq_len
+ zero_sigma = sigmas.new_zeros([1])
+ _ = model(noisy_input, zero_sigma * s_in, **extra_args)
+
+ current_start_frame += bf
+ finally:
+ transformer_options.pop("ar_state", None)
+
+ return output
diff --git a/comfy/latent_formats.py b/comfy/latent_formats.py
index 6a57bca1c..12a934d71 100644
--- a/comfy/latent_formats.py
+++ b/comfy/latent_formats.py
@@ -9,6 +9,7 @@ class LatentFormat:
latent_rgb_factors_reshape = None
taesd_decoder_name = None
spacial_downscale_ratio = 8
+ temporal_downscale_ratio = 1
def process_in(self, latent):
return latent * self.scale_factor
@@ -149,6 +150,12 @@ class SD3(LatentFormat):
class StableAudio1(LatentFormat):
latent_channels = 64
latent_dimensions = 1
+ temporal_downscale_ratio = 2048
+
+class StableAudio3(LatentFormat):
+ latent_channels = 256
+ latent_dimensions = 1
+ temporal_downscale_ratio = 4096
class Flux(SD3):
latent_channels = 16
@@ -224,6 +231,7 @@ class Flux2(LatentFormat):
self.latent_rgb_factors_bias = [-0.0329, -0.0718, -0.0851]
self.latent_rgb_factors_reshape = lambda t: t.reshape(t.shape[0], 32, 2, 2, t.shape[-2], t.shape[-1]).permute(0, 1, 4, 2, 5, 3).reshape(t.shape[0], 32, t.shape[-2] * 2, t.shape[-1] * 2)
+ self.taesd_decoder_name = "taef2_decoder"
def process_in(self, latent):
return latent
@@ -234,6 +242,7 @@ class Flux2(LatentFormat):
class Mochi(LatentFormat):
latent_channels = 12
latent_dimensions = 3
+ temporal_downscale_ratio = 6
def __init__(self):
self.scale_factor = 1.0
@@ -277,6 +286,7 @@ class LTXV(LatentFormat):
latent_channels = 128
latent_dimensions = 3
spacial_downscale_ratio = 32
+ temporal_downscale_ratio = 8
def __init__(self):
self.latent_rgb_factors = [
@@ -420,6 +430,7 @@ class LTXAV(LTXV):
class HunyuanVideo(LatentFormat):
latent_channels = 16
latent_dimensions = 3
+ temporal_downscale_ratio = 4
scale_factor = 0.476986
latent_rgb_factors = [
[-0.0395, -0.0331, 0.0445],
@@ -446,6 +457,7 @@ class HunyuanVideo(LatentFormat):
class Cosmos1CV8x8x8(LatentFormat):
latent_channels = 16
latent_dimensions = 3
+ temporal_downscale_ratio = 8
latent_rgb_factors = [
[ 0.1817, 0.2284, 0.2423],
@@ -471,6 +483,7 @@ class Cosmos1CV8x8x8(LatentFormat):
class Wan21(LatentFormat):
latent_channels = 16
latent_dimensions = 3
+ temporal_downscale_ratio = 4
latent_rgb_factors = [
[-0.1299, -0.1692, 0.2932],
@@ -733,6 +746,7 @@ class HunyuanVideo15(LatentFormat):
latent_channels = 32
latent_dimensions = 3
spacial_downscale_ratio = 16
+ temporal_downscale_ratio = 4
scale_factor = 1.03682
taesd_decoder_name = "lighttaehy1_5"
@@ -758,6 +772,7 @@ class ACEAudio(LatentFormat):
class ACEAudio15(LatentFormat):
latent_channels = 64
latent_dimensions = 1
+ temporal_downscale_ratio = 1764
class ChromaRadiance(LatentFormat):
latent_channels = 3
@@ -783,3 +798,38 @@ class ZImagePixelSpace(ChromaRadiance):
No VAE encoding/decoding — the model operates directly on RGB pixels.
"""
pass
+
+class HiDreamO1Pixel(ChromaRadiance):
+ """Pixel-space latent format for HiDream-O1.
+ No VAE — model patches/unpatches raw RGB internally with patch_size=32.
+ """
+ pass
+
+class PixelDiTPixel(ChromaRadiance):
+ pass
+
+class CogVideoX(LatentFormat):
+ """Latent format for CogVideoX-2b (THUDM/CogVideoX-2b).
+
+ scale_factor matches the vae/config.json scaling_factor for the 2b variant.
+ The 5b-class checkpoints (CogVideoX-5b, CogVideoX-1.5-5B, CogVideoX-Fun-V1.5-*)
+ use a different value; see CogVideoX1_5 below.
+ """
+ latent_channels = 16
+ latent_dimensions = 3
+ temporal_downscale_ratio = 4
+
+ def __init__(self):
+ self.scale_factor = 1.15258426
+
+
+class CogVideoX1_5(CogVideoX):
+ """Latent format for 5b-class CogVideoX checkpoints.
+
+ Covers THUDM/CogVideoX-5b, THUDM/CogVideoX-1.5-5B, and the CogVideoX-Fun
+ V1.5-5b family (including VOID inpainting). All of these have
+ scaling_factor=0.7 in their vae/config.json. Auto-selected in
+ supported_models.CogVideoX_T2V based on transformer hidden dim.
+ """
+ def __init__(self):
+ self.scale_factor = 0.7
diff --git a/comfy/ldm/audio/dit.py b/comfy/ldm/audio/dit.py
index ca865189e..c28be5b49 100644
--- a/comfy/ldm/audio/dit.py
+++ b/comfy/ldm/audio/dit.py
@@ -10,6 +10,17 @@ from torch import nn
from torch.nn import functional as F
import math
import comfy.ops
+from .embedders import ExpoFourierFeatures
+
+
+def _left_pad_to_match(emb, target_len):
+ emb_len = emb.shape[-2]
+ if emb_len < target_len:
+ return F.pad(emb, (0, 0, target_len - emb_len, 0), value=0.)
+ elif emb_len > target_len:
+ return emb[:, -target_len:, :]
+ return emb
+
class FourierFeatures(nn.Module):
def __init__(self, in_features, out_features, std=1., dtype=None, device=None):
@@ -22,6 +33,7 @@ class FourierFeatures(nn.Module):
f = 2 * math.pi * input @ comfy.ops.cast_to_input(self.weight.T, input)
return torch.cat([f.cos(), f.sin()], dim=-1)
+
# norms
class LayerNorm(nn.Module):
def __init__(self, dim, bias=False, fix_scale=False, dtype=None, device=None):
@@ -43,6 +55,16 @@ class LayerNorm(nn.Module):
beta = comfy.ops.cast_to_input(beta, x)
return F.layer_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x), bias=beta)
+
+class RMSNorm(nn.Module):
+ def __init__(self, dim, dtype=None, device=None):
+ super().__init__()
+ self.gamma = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
+
+ def forward(self, x):
+ return F.rms_norm(x, x.shape[-1:], weight=comfy.ops.cast_to_input(self.gamma, x))
+
+
class GLU(nn.Module):
def __init__(
self,
@@ -236,13 +258,6 @@ class FeedForward(nn.Module):
linear_out = operations.Linear(inner_dim, dim_out, bias = not no_bias, dtype=dtype, device=device) if not use_conv else operations.Conv1d(inner_dim, dim_out, conv_kernel_size, padding = (conv_kernel_size // 2), bias = not no_bias, dtype=dtype, device=device)
- # # init last linear layer to 0
- # if zero_init_output:
- # nn.init.zeros_(linear_out.weight)
- # if not no_bias:
- # nn.init.zeros_(linear_out.bias)
-
-
self.ff = nn.Sequential(
linear_in,
rearrange('b d n -> b n d') if use_conv else nn.Identity(),
@@ -261,8 +276,10 @@ class Attention(nn.Module):
dim_context = None,
causal = False,
zero_init_output=True,
- qk_norm = False,
+ qk_norm = "none",
+ differential = False,
natten_kernel_size = None,
+ feat_scale = False,
dtype=None,
device=None,
operations=None,
@@ -271,6 +288,7 @@ class Attention(nn.Module):
self.dim = dim
self.dim_heads = dim_heads
self.causal = causal
+ self.differential = differential
dim_kv = dim_context if dim_context is not None else dim
@@ -278,18 +296,37 @@ class Attention(nn.Module):
self.kv_heads = dim_kv // dim_heads
if dim_context is not None:
- self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
- self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
+ if differential:
+ self.to_q = operations.Linear(dim, dim * 2, bias=False, dtype=dtype, device=device)
+ self.to_kv = operations.Linear(dim_kv, dim_kv * 3, bias=False, dtype=dtype, device=device)
+ else:
+ self.to_q = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
+ self.to_kv = operations.Linear(dim_kv, dim_kv * 2, bias=False, dtype=dtype, device=device)
else:
- self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)
+ if differential:
+ self.to_qkv = operations.Linear(dim, dim * 5, bias=False, dtype=dtype, device=device)
+ else:
+ self.to_qkv = operations.Linear(dim, dim * 3, bias=False, dtype=dtype, device=device)
self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
- # if zero_init_output:
- # nn.init.zeros_(self.to_out.weight)
-
+ # Accept bool for backward compat
+ if isinstance(qk_norm, bool):
+ qk_norm = "l2" if qk_norm else "none"
self.qk_norm = qk_norm
+ if self.qk_norm == "ln":
+ self.q_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
+ self.k_norm = operations.LayerNorm(dim_heads, elementwise_affine=True, eps=1.0e-6, dtype=dtype, device=device)
+ elif self.qk_norm == "rms":
+ self.q_norm = RMSNorm(dim_heads, dtype=dtype, device=device)
+ self.k_norm = RMSNorm(dim_heads, dtype=dtype, device=device)
+
+ self.feat_scale = feat_scale
+
+ if self.feat_scale:
+ self.lambda_dc = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
+ self.lambda_hf = nn.Parameter(torch.empty(dim, dtype=dtype, device=device))
def forward(
self,
@@ -306,22 +343,51 @@ class Attention(nn.Module):
kv_input = context if has_context else x
if hasattr(self, 'to_q'):
- # Use separate linear projections for q and k/v
- q = self.to_q(x)
- q = rearrange(q, 'b n (h d) -> b h n d', h = h)
+ if self.differential:
+ # cross-attention differential: to_q → (q, q_diff), to_kv → (k, k_diff, v)
+ q, q_diff = self.to_q(x).chunk(2, dim=-1)
+ q = rearrange(q, 'b n (h d) -> b h n d', h=h)
+ q_diff = rearrange(q_diff, 'b n (h d) -> b h n d', h=h)
+ q = torch.stack([q, q_diff], dim=1) # (B, 2, H, N, D)
+ k, k_diff, v = self.to_kv(kv_input).chunk(3, dim=-1)
+ k = rearrange(k, 'b n (h d) -> b h n d', h=kv_h)
+ k_diff = rearrange(k_diff, 'b n (h d) -> b h n d', h=kv_h)
+ v = rearrange(v, 'b n (h d) -> b h n d', h=kv_h)
+ k = torch.stack([k, k_diff], dim=1) # (B, 2, H, M, D)
+ else:
+ # Use separate linear projections for q and k/v
+ q = self.to_q(x)
+ q = rearrange(q, 'b n (h d) -> b h n d', h = h)
- k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+ k, v = self.to_kv(kv_input).chunk(2, dim=-1)
- k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
+ k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = kv_h), (k, v))
else:
- # Use fused linear projection
- q, k, v = self.to_qkv(x).chunk(3, dim=-1)
- q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
+ if self.differential:
+ # self-attention differential: to_qkv → (q, k, v, q_diff, k_diff)
+ q, k, v, q_diff, k_diff = self.to_qkv(x).chunk(5, dim=-1)
+ q, k, v, q_diff, k_diff = map(
+ lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h),
+ (q, k, v, q_diff, k_diff)
+ )
+ q = torch.stack([q, q_diff], dim=1) # (B, 2, H, N, D)
+ k = torch.stack([k, k_diff], dim=1)
+ else:
+ # Use fused linear projection
+ q, k, v = self.to_qkv(x).chunk(3, dim=-1)
+ q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
# Normalize q and k for cosine sim attention
- if self.qk_norm:
+ if self.qk_norm == "l2":
q = F.normalize(q, dim=-1)
k = F.normalize(k, dim=-1)
+ elif self.qk_norm == "rms":
+ q_type, k_type = q.dtype, k.dtype
+ q = self.q_norm(q).to(q_type)
+ k = self.k_norm(k).to(k_type)
+ elif self.qk_norm != 'none':
+ q = self.q_norm(q)
+ k = self.k_norm(k)
if rotary_pos_emb is not None and not has_context:
freqs, _ = rotary_pos_emb
@@ -364,9 +430,24 @@ class Attention(nn.Module):
heads_per_kv_head = h // kv_h
k, v = map(lambda t: t.repeat_interleave(heads_per_kv_head, dim = 1), (k, v))
- out = optimized_attention(q, k, v, h, skip_reshape=True, transformer_options=transformer_options)
+ if self.differential:
+ q, q_diff = q.unbind(dim=1)
+ k, k_diff = k.unbind(dim=1)
+ out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)
+ out_diff = optimized_attention(q_diff, k_diff, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)
+ out = out - out_diff
+ else:
+ out = optimized_attention(q, k, v, h, skip_reshape=True, low_precision_attention=False, transformer_options=transformer_options)
+
out = self.to_out(out)
+ if self.feat_scale:
+ out_dc = out.mean(dim=-2, keepdim=True)
+ out_hf = out - out_dc
+
+ # Selectively modulate DC and high frequency components
+ out = out + comfy.ops.cast_to_input(self.lambda_dc, out) * out_dc + comfy.ops.cast_to_input(self.lambda_hf, out) * out_hf
+
if mask is not None:
mask = rearrange(mask, 'b n -> b n 1')
out = out.masked_fill(~mask, 0.)
@@ -417,11 +498,14 @@ class TransformerBlock(nn.Module):
cross_attend = False,
dim_context = None,
global_cond_dim = None,
+ global_cond_shared_embed = False,
+ local_add_cond_dim = None,
causal = False,
zero_init_branch_outputs = True,
conformer = False,
layer_ix = -1,
remove_norms = False,
+ norm_type = "layer_norm",
attn_kwargs = {},
ff_kwargs = {},
norm_kwargs = {},
@@ -436,8 +520,20 @@ class TransformerBlock(nn.Module):
self.cross_attend = cross_attend
self.dim_context = dim_context
self.causal = causal
+ self.global_cond_shared_embed = global_cond_shared_embed
- self.pre_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
+ norm_layer_map = {
+ "layer_norm": LayerNorm,
+ "rms_norm": RMSNorm,
+ }
+ norm_cls = norm_layer_map.get(norm_type, LayerNorm)
+
+ def make_norm():
+ if remove_norms:
+ return nn.Identity()
+ return norm_cls(dim, dtype=dtype, device=device, **norm_kwargs)
+
+ self.pre_norm = make_norm()
self.self_attn = Attention(
dim,
@@ -451,7 +547,7 @@ class TransformerBlock(nn.Module):
)
if cross_attend:
- self.cross_attend_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
+ self.cross_attend_norm = make_norm()
self.cross_attn = Attention(
dim,
dim_heads = dim_heads,
@@ -464,37 +560,56 @@ class TransformerBlock(nn.Module):
**attn_kwargs
)
- self.ff_norm = LayerNorm(dim, dtype=dtype, device=device, **norm_kwargs) if not remove_norms else nn.Identity()
- self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations,**ff_kwargs)
+ self.ff_norm = make_norm()
+ self.ff = FeedForward(dim, zero_init_output=zero_init_branch_outputs, dtype=dtype, device=device, operations=operations, **ff_kwargs)
self.layer_ix = layer_ix
self.conformer = ConformerModule(dim, norm_kwargs=norm_kwargs) if conformer else None
- self.global_cond_dim = global_cond_dim
+ # Global conditioning
+ self.has_global_cond = (global_cond_dim is not None) or global_cond_shared_embed
- if global_cond_dim is not None:
+ if global_cond_shared_embed:
+ # SA3 style: learnable per-block additive bias; global_cond is pre-projected to (B, dim*6)
+ self.to_scale_shift_gate = nn.Parameter(torch.empty(dim * 6, device=device, dtype=dtype))
+ elif global_cond_dim is not None:
+ # SA1 style: per-block MLP projects global_cond → (B, dim*6)
self.to_scale_shift_gate = nn.Sequential(
nn.SiLU(),
- nn.Linear(global_cond_dim, dim * 6, bias=False)
+ operations.Linear(global_cond_dim, dim * 6, bias=False, device=device, dtype=dtype)
)
- nn.init.zeros_(self.to_scale_shift_gate[1].weight)
- #nn.init.zeros_(self.to_scale_shift_gate_self[1].bias)
+ # Local additive conditioning (e.g. inpaint mask + masked latent)
+ self.local_add_cond_dim = local_add_cond_dim
+ if local_add_cond_dim is not None:
+ self.to_local_embed = nn.Sequential(
+ operations.Linear(local_add_cond_dim, dim, bias=True, dtype=dtype, device=device),
+ nn.SiLU(),
+ operations.Linear(dim, dim, bias=True, dtype=dtype, device=device),
+ )
+ else:
+ self.to_local_embed = None
def forward(
self,
x,
context = None,
global_cond=None,
+ local_add_cond=None,
mask = None,
context_mask = None,
rotary_pos_emb = None,
transformer_options={}
):
- if self.global_cond_dim is not None and self.global_cond_dim > 0 and global_cond is not None:
+ if self.has_global_cond and global_cond is not None:
+ if self.global_cond_shared_embed:
+ # global_cond already has shape (B, dim*6)
+ ssg = (comfy.ops.cast_to_input(self.to_scale_shift_gate, global_cond) + global_cond).unsqueeze(1)
+ else:
+ ssg = self.to_scale_shift_gate(global_cond).unsqueeze(1)
- scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = self.to_scale_shift_gate(global_cond).unsqueeze(1).chunk(6, dim = -1)
+ scale_self, shift_self, gate_self, scale_ff, shift_ff, gate_ff = ssg.chunk(6, dim = -1)
# self-attention with adaLN
residual = x
@@ -510,6 +625,9 @@ class TransformerBlock(nn.Module):
if self.conformer is not None:
x = x + self.conformer(x)
+ if local_add_cond is not None and self.to_local_embed is not None:
+ x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2])
+
# feedforward with adaLN
residual = x
x = self.ff_norm(x)
@@ -527,6 +645,9 @@ class TransformerBlock(nn.Module):
if self.conformer is not None:
x = x + self.conformer(x)
+ if local_add_cond is not None and self.to_local_embed is not None:
+ x = x + _left_pad_to_match(self.to_local_embed(local_add_cond), x.shape[-2])
+
x = x + self.ff(self.ff_norm(x))
return x
@@ -543,6 +664,8 @@ class ContinuousTransformer(nn.Module):
cross_attend=False,
cond_token_dim=None,
global_cond_dim=None,
+ global_cond_shared_embed=False,
+ local_add_cond_dim=None,
causal=False,
rotary_pos_emb=True,
zero_init_branch_outputs=True,
@@ -550,6 +673,7 @@ class ContinuousTransformer(nn.Module):
use_sinusoidal_emb=False,
use_abs_pos_emb=False,
abs_pos_emb_max_length=10000,
+ num_memory_tokens=0,
dtype=None,
device=None,
operations=None,
@@ -562,6 +686,8 @@ class ContinuousTransformer(nn.Module):
self.depth = depth
self.causal = causal
self.layers = nn.ModuleList([])
+ self.num_memory_tokens = num_memory_tokens
+ self.global_cond_shared_embed = global_cond_shared_embed
self.project_in = operations.Linear(dim_in, dim, bias=False, dtype=dtype, device=device) if dim_in is not None else nn.Identity()
self.project_out = operations.Linear(dim, dim_out, bias=False, dtype=dtype, device=device) if dim_out is not None else nn.Identity()
@@ -577,7 +703,22 @@ class ContinuousTransformer(nn.Module):
self.use_abs_pos_emb = use_abs_pos_emb
if use_abs_pos_emb:
- self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length)
+ self.pos_emb = AbsolutePositionalEmbedding(dim, abs_pos_emb_max_length + num_memory_tokens)
+
+ if num_memory_tokens > 0:
+ self.memory_tokens = nn.Parameter(torch.empty(num_memory_tokens, dim, device=device, dtype=dtype))
+
+ # Shared global-cond embedder (SA3 style): projects (B, global_cond_dim) → (B, dim*6)
+ self.global_cond_embedder = None
+ if global_cond_shared_embed and global_cond_dim is not None:
+ self.global_cond_embedder = nn.Sequential(
+ operations.Linear(global_cond_dim, dim, bias=True, dtype=dtype, device=device),
+ nn.SiLU(),
+ operations.Linear(dim, dim * 6, bias=True, dtype=dtype, device=device),
+ )
+
+ # When using shared embed, TransformerBlocks use per-block Parameter (not per-block MLP)
+ block_global_cond_dim = None if global_cond_shared_embed else global_cond_dim
for i in range(depth):
self.layers.append(
@@ -586,7 +727,9 @@ class ContinuousTransformer(nn.Module):
dim_heads = dim_heads,
cross_attend = cross_attend,
dim_context = cond_token_dim,
- global_cond_dim = global_cond_dim,
+ global_cond_dim = block_global_cond_dim,
+ global_cond_shared_embed = global_cond_shared_embed,
+ local_add_cond_dim = local_add_cond_dim,
causal = causal,
zero_init_branch_outputs = zero_init_branch_outputs,
conformer=conformer,
@@ -605,6 +748,7 @@ class ContinuousTransformer(nn.Module):
prepend_embeds = None,
prepend_mask = None,
global_cond = None,
+ local_add_cond = None,
return_info = False,
**kwargs
):
@@ -632,7 +776,9 @@ class ContinuousTransformer(nn.Module):
mask = torch.cat((prepend_mask, mask), dim = -1)
- # Attention layers
+ if self.num_memory_tokens > 0:
+ memory_tokens = comfy.ops.cast_to_input(self.memory_tokens, x).expand(batch, -1, -1)
+ x = torch.cat((memory_tokens, x), dim=1)
if self.rotary_pos_emb is not None:
rotary_pos_emb = self.rotary_pos_emb.forward_from_seq_len(x.shape[1], dtype=torch.float, device=x.device)
@@ -642,6 +788,10 @@ class ContinuousTransformer(nn.Module):
if self.use_sinusoidal_emb or self.use_abs_pos_emb:
x = x + self.pos_emb(x)
+ # Project global_cond once (SA3 shared-embed path)
+ if global_cond is not None and self.global_cond_embedder is not None:
+ global_cond = self.global_cond_embedder(global_cond)
+
blocks_replace = patches_replace.get("dit", {})
# Iterate over the transformer layers
for i, layer in enumerate(self.layers):
@@ -654,12 +804,17 @@ class ContinuousTransformer(nn.Module):
out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": global_cond, "pe": rotary_pos_emb, "transformer_options": transformer_options}, {"original_block": block_wrap})
x = out["img"]
else:
- x = layer(x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, context=context, transformer_options=transformer_options)
- # x = checkpoint(layer, x, rotary_pos_emb = rotary_pos_emb, global_cond=global_cond, **kwargs)
+ x = layer(x, rotary_pos_emb=rotary_pos_emb, global_cond=global_cond,
+ local_add_cond=local_add_cond, context=context,
+ transformer_options=transformer_options)
if return_info:
info["hidden_states"].append(x)
+ # Strip memory tokens before projecting out
+ if self.num_memory_tokens > 0:
+ x = x[:, self.num_memory_tokens:, :]
+
x = self.project_out(x)
if return_info:
@@ -682,6 +837,7 @@ class AudioDiffusionTransformer(nn.Module):
num_heads=24,
transformer_type: tp.Literal["continuous_transformer"] = "continuous_transformer",
global_cond_type: tp.Literal["prepend", "adaLN"] = "prepend",
+ timestep_features_type: str = "learned",
audio_model="",
dtype=None,
device=None,
@@ -696,7 +852,10 @@ class AudioDiffusionTransformer(nn.Module):
# Timestep embeddings
timestep_features_dim = 256
- self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)
+ if timestep_features_type == "expo":
+ self.timestep_features = ExpoFourierFeatures(timestep_features_dim, 0.5, 10000.0)
+ else:
+ self.timestep_features = FourierFeatures(1, timestep_features_dim, dtype=dtype, device=device)
self.to_timestep_embed = nn.Sequential(
operations.Linear(timestep_features_dim, embed_dim, bias=True, dtype=dtype, device=device),
@@ -781,6 +940,7 @@ class AudioDiffusionTransformer(nn.Module):
cross_attn_cond=None,
cross_attn_cond_mask=None,
input_concat_cond=None,
+ local_add_cond=None,
global_embed=None,
prepend_cond=None,
prepend_cond_mask=None,
@@ -802,9 +962,13 @@ class AudioDiffusionTransformer(nn.Module):
prepend_cond = self.to_prepend_embed(prepend_cond)
prepend_inputs = prepend_cond
+ prepend_length = prepend_cond.shape[1]
if prepend_cond_mask is not None:
prepend_mask = prepend_cond_mask
+ if local_add_cond is not None and local_add_cond.dim() == 3:
+ local_add_cond = local_add_cond.permute(0, 2, 1)
+
if input_concat_cond is not None:
# Interpolate input_concat_cond to the same length as x
@@ -850,7 +1014,7 @@ class AudioDiffusionTransformer(nn.Module):
if self.transformer_type == "x-transformers":
output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, **extra_args, **kwargs)
elif self.transformer_type == "continuous_transformer":
- output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, **extra_args, **kwargs)
+ output = self.transformer(x, prepend_embeds=prepend_inputs, context=cross_attn_cond, context_mask=cross_attn_cond_mask, mask=mask, prepend_mask=prepend_mask, return_info=return_info, local_add_cond=local_add_cond, **extra_args, **kwargs)
if return_info:
output, info = output
@@ -876,6 +1040,7 @@ class AudioDiffusionTransformer(nn.Module):
context=None,
context_mask=None,
input_concat_cond=None,
+ local_add_cond=None,
global_embed=None,
negative_global_embed=None,
prepend_cond=None,
@@ -890,6 +1055,7 @@ class AudioDiffusionTransformer(nn.Module):
cross_attn_cond=context,
cross_attn_cond_mask=context_mask,
input_concat_cond=input_concat_cond,
+ local_add_cond=local_add_cond,
global_embed=global_embed,
prepend_cond=prepend_cond,
prepend_cond_mask=prepend_cond_mask,
diff --git a/comfy/ldm/audio/embedders.py b/comfy/ldm/audio/embedders.py
index 20edb365a..ba9a62837 100644
--- a/comfy/ldm/audio/embedders.py
+++ b/comfy/ldm/audio/embedders.py
@@ -31,15 +31,39 @@ def TimePositionalEmbedding(dim: int, out_features: int) -> nn.Module:
)
+class ExpoFourierFeatures(nn.Module):
+ """Exponentially-spaced Fourier features (no learnable parameters)."""
+ def __init__(self, dim, min_freq=0.5, max_freq=10000.0):
+ super().__init__()
+ self.dim = dim
+ self.min_freq = min_freq
+ self.max_freq = max_freq
+
+ def forward(self, t):
+ in_dtype = t.dtype
+ t = t.float()
+ if t.dim() == 1:
+ t = t.unsqueeze(-1)
+ half_dim = self.dim // 2
+ ramp = torch.linspace(0, 1, half_dim, device=t.device, dtype=torch.float32)
+ freqs = torch.exp(ramp * (math.log(self.max_freq) - math.log(self.min_freq)) + math.log(self.min_freq))
+ args = t * freqs * 2 * math.pi
+ return torch.cat([args.cos(), args.sin()], dim=-1).to(in_dtype)
+
+
class NumberEmbedder(nn.Module):
def __init__(
self,
features: int,
dim: int = 256,
+ fourier_features_type="learned",
):
super().__init__()
self.features = features
- self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
+ if fourier_features_type == "expo":
+ self.embedding = nn.Sequential(ExpoFourierFeatures(dim=dim), comfy.ops.manual_cast.Linear(in_features=dim, out_features=features))
+ else:
+ self.embedding = TimePositionalEmbedding(dim=dim, out_features=features)
def forward(self, x: Union[List[float], Tensor]) -> Tensor:
if not torch.is_tensor(x):
@@ -77,14 +101,15 @@ class NumberConditioner(Conditioner):
def __init__(self,
output_dim: int,
min_val: float=0,
- max_val: float=1
+ max_val: float=1,
+ fourier_features_type: str = "learned",
):
super().__init__(output_dim, output_dim)
self.min_val = min_val
self.max_val = max_val
- self.embedder = NumberEmbedder(features=output_dim)
+ self.embedder = NumberEmbedder(features=output_dim, fourier_features_type=fourier_features_type)
def forward(self, floats, device=None):
# Cast the inputs to floats
diff --git a/comfy/ldm/audio/vae_sa3.py b/comfy/ldm/audio/vae_sa3.py
new file mode 100644
index 000000000..8be36d6ee
--- /dev/null
+++ b/comfy/ldm/audio/vae_sa3.py
@@ -0,0 +1,533 @@
+import torch
+import torch.nn as nn
+
+import comfy.ops
+import comfy.model_management
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.audio.autoencoder import WNConv1d
+
+ops = comfy.ops.disable_weight_init
+
+class Transpose(nn.Module):
+    """Module wrapper around ``x.transpose(-2, -1)`` so the swap can sit in a layer list."""
+    def forward(self, x, **kwargs):
+        # Extra keyword arguments are accepted and ignored.
+        swapped = x.transpose(-2, -1)
+        return swapped
+
+
+def _zero_pad_modulo_sequence(x, size, dim=-2):
+    """Right-pad ``x`` with zeros along ``dim`` up to the next multiple of ``size``.
+
+    Returns ``x`` unchanged when its length along ``dim`` is already a multiple
+    of ``size``.
+    """
+    remainder = x.shape[dim] % size
+    pad_len = (size - remainder) % size
+    if pad_len <= 0:
+        return x
+    filler_shape = list(x.shape)
+    filler_shape[dim] = pad_len
+    filler = torch.zeros(filler_shape, device=x.device, dtype=x.dtype)
+    return torch.cat([x, filler], dim=dim)
+
+
+def _sliding_window_mask(seq_len, window, device, dtype):
+    """Additive attention mask enforcing a ±window local window (matches flash_attn window_size).
+
+    The result is ``(seq_len, seq_len)``: entries with ``|j - i| <= window`` are
+    0, every other entry is ``finfo(dtype).min / 4``.
+    """
+    positions = torch.arange(seq_len, device=device)
+    distance = (positions.unsqueeze(0) - positions.unsqueeze(1)).abs()
+    blocked = torch.full((1,), torch.finfo(dtype).min / 4, device=device, dtype=dtype)
+    allowed = torch.zeros(1, device=device, dtype=dtype)
+    return torch.where(distance > window, blocked, allowed)
+
+
+class DynamicTanh(nn.Module):
+    """Element-wise ``gamma * tanh(alpha * x) + beta`` with learned parameters.
+
+    The three parameters are allocated uninitialised (``torch.empty``), so their
+    values have to be loaded before use. ``init_alpha`` and any extra keyword
+    arguments are accepted but not used here.
+    """
+    def __init__(self, dim, init_alpha=4.0, dtype=None, device=None, **kwargs):
+        super().__init__()
+        factory = {"dtype": dtype, "device": device}
+        self.alpha = nn.Parameter(torch.empty(1, **factory))
+        self.gamma = nn.Parameter(torch.empty(dim, **factory))
+        self.beta = nn.Parameter(torch.empty(dim, **factory))
+
+    def forward(self, x):
+        # Each parameter is cast to match the input before it is used.
+        cast = comfy.ops.cast_to_input
+        squashed = torch.tanh(cast(self.alpha, x) * x)
+        return cast(self.gamma, x) * squashed + cast(self.beta, x)
+
+
+class RotaryEmbedding(nn.Module):
+    """Rotary position angles backed by an ``inv_freq`` buffer.
+
+    ``inv_freq`` is registered uninitialised with ``dim // 2`` entries;
+    presumably it is filled from loaded weights (confirm against the checkpoint).
+    ``base`` and ``base_rescale_factor`` stay in the signature for
+    compatibility, but nothing in this module derives frequencies from them.
+    """
+    def __init__(self, dim, base=10000, base_rescale_factor=1., dtype=None, device=None):
+        super().__init__()
+        # The rescaled base used to be computed here and then discarded. That
+        # dead expression divided by (dim - 2) and raised ZeroDivisionError
+        # for dim == 2, so it has been removed.
+        self.register_buffer("inv_freq", torch.empty(dim // 2, dtype=dtype, device=device))
+
+    def forward_from_seq_len(self, seq_len, device, dtype=None):
+        """Return ``forward`` applied to positions ``0 .. seq_len - 1`` (``dtype`` is unused)."""
+        t = torch.arange(seq_len, device=device, dtype=torch.float32)
+        return self.forward(t)
+
+    def forward(self, t):
+        """Return ``(freqs, 1.)``; ``freqs`` is ``outer(t, inv_freq)`` repeated twice along the last axis."""
+        inv_freq = comfy.model_management.cast_to(self.inv_freq, dtype=torch.float32, device=t.device)
+        freqs = torch.outer(t.float(), inv_freq)
+        freqs = torch.cat((freqs, freqs), dim=-1)
+        return freqs, 1.
+
+
+def _rotate_half(x):
+    """Split the last axis into halves ``[a, b]`` and return ``[-b, a]``."""
+    half = x.shape[-1] // 2
+    front, back = x[..., :half], x[..., half:]
+    return torch.cat((-back, front), dim=-1)
+
+
+def _apply_rotary_pos_emb(t, freqs):
+    """Rotate the first ``freqs.shape[-1]`` features of ``t`` by the angles in ``freqs``.
+
+    Only the last ``t.shape[-2]`` rows of ``freqs`` are used. Features beyond
+    the rotary width pass through unchanged, and the result has ``t``'s dtype.
+    """
+    result_dtype = t.dtype
+    rotary_width = freqs.shape[-1]
+    freqs = freqs[-t.shape[-2]:]
+    rotated_part = t[..., :rotary_width]
+    untouched_part = t[..., rotary_width:]
+    # Half-rotation written inline: [a, b] -> [-b, a] over the rotary features.
+    half = rotated_part.shape[-1] // 2
+    swapped = torch.cat((-rotated_part[..., half:], rotated_part[..., :half]), dim=-1)
+    rotated_part = rotated_part * freqs.cos() + swapped * freqs.sin()
+    return torch.cat((rotated_part.to(result_dtype), untouched_part.to(result_dtype)), dim=-1)
+
+
+class Attention(nn.Module):
+ """Multi-head self-attention with optional q/k normalisation, rotary
+ embedding and a "differential" mode.
+
+ A single fused projection produces q, k, v (plus q_diff, k_diff when
+ ``differential`` is true). In differential mode the output is the
+ difference of two attention results that share the same ``v``.
+ ``zero_init_output`` and extra keyword arguments are accepted but unused.
+ """
+ def __init__(self, dim, dim_heads=64, qk_norm="none", qk_norm_eps=1e-6,
+ differential=False, zero_init_output=True,
+ dtype=None, device=None, operations=None, **kwargs):
+ super().__init__()
+ self.num_heads = dim // dim_heads
+ self.differential = differential
+ self.qk_norm = qk_norm
+
+ # Fused projection: 3 * dim features normally, 5 * dim in differential mode.
+ self.to_qkv = operations.Linear(
+ dim, dim * (5 if differential else 3), bias=False, dtype=dtype, device=device)
+ self.to_out = operations.Linear(dim, dim, bias=False, dtype=dtype, device=device)
+
+ # q_norm / k_norm exist only for "dyt" and "rms". forward() applies them for
+ # every value other than "none", so any other string fails there.
+ if qk_norm == "dyt":
+ self.q_norm = DynamicTanh(dim_heads, dtype=dtype, device=device)
+ self.k_norm = DynamicTanh(dim_heads, dtype=dtype, device=device)
+ elif qk_norm == "rms":
+ self.q_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device)
+ self.k_norm = operations.RMSNorm(dim_heads, eps=qk_norm_eps, dtype=dtype, device=device)
+
+ def forward(self, x, rotary_pos_emb=None, mask=None, **kwargs):
+ # x is unpacked as (B, N, features); the result comes from to_out.
+ B, N, _ = x.shape
+ h = self.num_heads
+
+ qkv = self.to_qkv(x)
+ if self.differential:
+ q, k, v, q_diff, k_diff = qkv.chunk(5, dim=-1)
+ del qkv
+ # Split heads: (B, N, h * d) -> (B, h, N, d).
+ q = q.view(B, N, h, -1).transpose(1, 2)
+ k = k.view(B, N, h, -1).transpose(1, 2)
+ v = v.view(B, N, h, -1).transpose(1, 2)
+ q_diff = q_diff.view(B, N, h, -1).transpose(1, 2)
+ k_diff = k_diff.view(B, N, h, -1).transpose(1, 2)
+ else:
+ q, k, v = qkv.chunk(3, dim=-1)
+ del qkv
+ q = q.view(B, N, h, -1).transpose(1, 2)
+ k = k.view(B, N, h, -1).transpose(1, 2)
+ v = v.view(B, N, h, -1).transpose(1, 2)
+
+ if self.qk_norm != "none":
+ # The same q_norm / k_norm modules are reused for the differential pair;
+ # results are cast back to the pre-norm dtypes.
+ q_dtype, k_dtype = q.dtype, k.dtype
+ q = self.q_norm(q).to(q_dtype)
+ k = self.k_norm(k).to(k_dtype)
+ if self.differential:
+ q_diff = self.q_norm(q_diff).to(q_dtype)
+ k_diff = self.k_norm(k_diff).to(k_dtype)
+
+ if rotary_pos_emb is not None:
+ # rotary_pos_emb is a (freqs, scale) pair; only freqs is used. The rotation
+ # runs on float32 copies and is cast back afterwards.
+ freqs, _ = rotary_pos_emb
+ q_dtype, k_dtype = q.dtype, k.dtype
+ q = _apply_rotary_pos_emb(q.float(), freqs).to(q_dtype)
+ k = _apply_rotary_pos_emb(k.float(), freqs).to(k_dtype)
+ if self.differential:
+ q_diff = _apply_rotary_pos_emb(q_diff.float(), freqs).to(q_dtype)
+ k_diff = _apply_rotary_pos_emb(k_diff.float(), freqs).to(k_dtype)
+
+ if self.differential:
+ # Differential attention: subtract the second attention map's output from the first.
+ out = (optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False)
+ - optimized_attention(q_diff, k_diff, v, h, mask=mask, skip_reshape=True, low_precision_attention=False))
+ del q, k, v, q_diff, k_diff
+ else:
+ out = optimized_attention(q, k, v, h, mask=mask, skip_reshape=True, low_precision_attention=False)
+ del q, k, v
+
+ return self.to_out(out)
+
+
+class _Sin(nn.Module):
+    """Activation computing ``sin(3.14159265359 * x)``."""
+    def forward(self, x):
+        scaled = 3.14159265359 * x
+        return torch.sin(scaled)
+
+
+class _GLU(nn.Module):
+    """Gated linear unit.
+
+    One projection maps ``dim_in`` to ``2 * dim_out`` features. The first half
+    is the value and the second half, passed through ``activation``, is the gate.
+    """
+    def __init__(self, dim_in, dim_out, activation, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.act = activation
+        self.proj = operations.Linear(dim_in, dim_out * 2, dtype=dtype, device=device)
+
+    def forward(self, x):
+        value, gate = self.proj(x).chunk(2, dim=-1)
+        return value * self.act(gate)
+
+
+class FeedForward(nn.Module):
+    """Gated feed-forward: a ``_GLU`` up to ``int(dim * mult)`` features, then a
+    linear projection back to ``dim``.
+
+    The gate activation is ``_Sin`` when ``sinusoidal`` is true and SiLU
+    otherwise. ``zero_init_output`` and extra keyword arguments are unused here.
+    """
+    def __init__(self, dim, mult=4, no_bias=False, zero_init_output=True,
+                 sinusoidal=False, dtype=None, device=None, operations=None, **kwargs):
+        super().__init__()
+        hidden = int(dim * mult)
+        if sinusoidal:
+            activation = _Sin()
+        else:
+            activation = nn.SiLU()
+        gated = _GLU(dim, hidden, activation, dtype=dtype, device=device, operations=operations)
+        project_out = operations.Linear(hidden, dim, bias=not no_bias, dtype=dtype, device=device)
+        # The nn.Identity() entries keep the two parametrised layers at
+        # Sequential indices 0 and 2.
+        self.ff = nn.Sequential(gated, nn.Identity(), project_out, nn.Identity())
+
+    def forward(self, x, **kwargs):
+        return self.ff(x)
+
+
+class TransformerBlock(nn.Module):
+    """Pre-norm transformer block: self-attention, then feed-forward, each added
+    back onto the residual stream.
+
+    ``norm_type == "dyt"`` selects ``DynamicTanh`` for both norms; any other
+    value selects ``operations.RMSNorm``. With ``add_rope`` a
+    ``RotaryEmbedding`` of width ``dim_heads // 2`` supplies rotary angles to
+    the attention. ``causal`` and extra keyword arguments are unused here.
+    """
+    def __init__(self, dim, dim_heads=64, causal=False, zero_init_branch_outputs=True,
+                 norm_type="dyt", add_rope=False, attn_kwargs=None, ff_kwargs=None,
+                 norm_kwargs=None, dtype=None, device=None, operations=None, **kwargs):
+        super().__init__()
+        attn_kwargs = {} if attn_kwargs is None else attn_kwargs
+        ff_kwargs = {} if ff_kwargs is None else ff_kwargs
+        norm_kwargs = {} if norm_kwargs is None else norm_kwargs
+        # A head can never be wider than the model itself.
+        dim_heads = min(dim_heads, dim)
+
+        if norm_type == "dyt":
+            norm_cls = DynamicTanh
+        else:
+            norm_cls = operations.RMSNorm
+        norm_args = {**norm_kwargs, "dtype": dtype, "device": device}
+
+        self.pre_norm = norm_cls(dim, **norm_args)
+        self.self_attn = Attention(
+            dim,
+            dim_heads=dim_heads,
+            zero_init_output=zero_init_branch_outputs,
+            dtype=dtype,
+            device=device,
+            operations=operations,
+            **attn_kwargs,
+        )
+        self.ff_norm = norm_cls(dim, **norm_args)
+        self.ff = FeedForward(
+            dim,
+            zero_init_output=zero_init_branch_outputs,
+            dtype=dtype,
+            device=device,
+            operations=operations,
+            **ff_kwargs,
+        )
+        if add_rope:
+            self.rope = RotaryEmbedding(dim_heads // 2, dtype=dtype, device=device)
+        else:
+            self.rope = None
+
+    def forward(self, x, mask=None, **kwargs):
+        if self.rope is None:
+            rope = None
+        else:
+            # Rotary angles are rebuilt for the current sequence length on every call.
+            rope = self.rope.forward_from_seq_len(x.shape[-2], device=x.device)
+        attn_out = self.self_attn(self.pre_norm(x), rotary_pos_emb=rope, mask=mask)
+        x = x + attn_out
+        ff_out = self.ff(self.ff_norm(x))
+        return x + ff_out
+
+
+class TransformerResamplingBlock(nn.Module):
+ """Transformer-based resampler that changes sequence length by ``stride``.
+
+ The input sequence is cut into segments, learned "new tokens" are appended
+ to each segment, the transformer stack runs over the interleaved sequence,
+ and only the new-token positions are kept. As an ``"encoder"`` each
+ segment of ``stride`` inputs yields 1 output; as a ``"decoder"`` each single
+ input yields ``stride`` outputs.
+ """
+ def __init__(self, in_channels, out_channels, stride, type="encoder",
+ transformer_depth=3, dim_heads=128, differential=True,
+ sliding_window=None, chunk_size=128, chunk_midpoint_shift=False,
+ dyt=True, ff_mult=3, mapping_bias=True, variable_stride=False,
+ sinusoidal_blocks=0, conv_mapping=False, dtype=None, device=None, operations=None, **kwargs):
+ super().__init__()
+ if type not in ("encoder", "decoder"):
+ raise ValueError(f"type must be 'encoder' or 'decoder', got {type!r}")
+
+ self.type = type
+ self.stride = stride
+ self.chunk_size = chunk_size
+ self.chunk_midpoint_shift = chunk_midpoint_shift
+ self.variable_stride = variable_stride
+ self.transformer_depth = transformer_depth
+
+ # The transformer runs at the wider side: after the mapping for an encoder,
+ # before it for a decoder (see forward()).
+ transformer_dim = out_channels if type == "encoder" else in_channels
+
+ # Channel mapping: kernel size 3 when conv_mapping else 1; skipped when widths match.
+ self.mapping = (WNConv1d(in_channels, out_channels, 3 if conv_mapping else 1, padding="same", bias=mapping_bias)
+ if in_channels != out_channels else nn.Identity())
+
+ self.sliding_window_latents = sliding_window
+ self.sliding_window_seq = self._get_sliding_window_size(sliding_window, stride)
+ self.input_seg_size, self.output_seg_size, self.sub_chunk_size = self._get_seg_sizes(stride)
+
+ # With variable_stride a single learned token is stored and expanded in forward();
+ # otherwise one token per output position of a segment is stored.
+ token_seq = 1 if variable_stride else self.output_seg_size
+ self.new_tokens = nn.Parameter(torch.empty(1, token_seq, transformer_dim, dtype=dtype, device=device))
+
+ norm_type = "dyt" if dyt else "rms_norm"
+ attn_kwargs = {"qk_norm": "dyt" if dyt else "rms", "qk_norm_eps": 1e-3,
+ "differential": differential}
+ norm_kwargs = {"eps": 1e-3}
+ transformers = []
+ for i in range(transformer_depth):
+ # NOTE(review): as written this is true for the final (sinusoidal_blocks - 1)
+ # blocks, not sinusoidal_blocks of them — confirm against the reference model.
+ sinusoidal = (transformer_depth - i) < sinusoidal_blocks
+ transformers.append(TransformerBlock(
+ transformer_dim,
+ dim_heads=dim_heads,
+ causal=False,
+ zero_init_branch_outputs=True,
+ norm_type=norm_type,
+ add_rope=True,
+ attn_kwargs=attn_kwargs,
+ ff_kwargs={"mult": ff_mult, "no_bias": False, "sinusoidal": sinusoidal},
+ norm_kwargs=norm_kwargs,
+ dtype=dtype, device=device, operations=operations,
+ ))
+ self.transformers = nn.ModuleList(transformers)
+
+ def _get_sliding_window_size(self, window, stride, prepend_cond_length=0):
+ # Scale each window entry by the tokens per segment (stride + 1 + prepend_cond_length).
+ if window is None:
+ return None
+ return [w * (stride + 1 + prepend_cond_length) for w in window]
+
+ def _get_seg_sizes(self, stride, prepend_cond_length=0):
+ # Returns (input tokens per segment, new tokens per segment, total tokens per segment).
+ sub_chunk_size = stride + 1 + prepend_cond_length
+ input_seg_size = stride if self.type == "encoder" else 1
+ output_seg_size = 1 if self.type == "encoder" else stride
+ return input_seg_size, output_seg_size, sub_chunk_size
+
+ def forward(self, x, stride=None, **kwargs):
+ # assumes x is (batch, channels, time) on entry and on exit — TODO confirm with callers
+ B = x.shape[0]
+
+ # A per-call stride overrides the segment sizes precomputed in __init__.
+ if stride is None:
+ input_seg = self.input_seg_size
+ output_seg = self.output_seg_size
+ sub_chunk = self.sub_chunk_size
+ sliding_window = self.sliding_window_seq
+ else:
+ input_seg, output_seg, sub_chunk = self._get_seg_sizes(stride)
+ sliding_window = self._get_sliding_window_size(self.sliding_window_latents, stride)
+
+ if self.type == "encoder":
+ if self.transformer_depth > 0:
+ # Pad the last axis so it divides into whole chunks (or whole segments
+ # when a sliding window is used).
+ pad_mod = self.chunk_size if sliding_window is None else input_seg
+ x = _zero_pad_modulo_sequence(x, pad_mod, dim=-1)
+ x = self.mapping(x)
+
+ if self.transformer_depth > 0:
+ x = x.permute(0, 2, 1)
+
+ if self.type != "encoder":
+ pad_mod = 1 if sliding_window is not None else (
+ self.chunk_size // (stride if stride is not None else self.stride))
+ x = _zero_pad_modulo_sequence(x, pad_mod)
+
+ C = x.shape[2]
+ # One row per segment, then append the learned tokens to every segment.
+ x = x.reshape(-1, input_seg, C)
+
+ new_tokens = self.new_tokens.expand(x.shape[0], output_seg, -1)
+ x = torch.cat([x, comfy.ops.cast_to_input(new_tokens, x)], dim=-2)
+ del new_tokens
+
+ x = x.reshape(B, -1, C)
+
+ if sliding_window is None:
+ # Chunk length in tokens: chunk_size plus chunk_size // stride.
+ eff_chunk = self.chunk_size + self.chunk_size // (stride if stride is not None else self.stride)
+
+ if sliding_window is None and self.chunk_midpoint_shift:
+ # First half of the layers sees the regular chunking; the second half
+ # sees chunks offset by half a chunk.
+ split = self.transformer_depth // 2
+ shift = eff_chunk // 2
+
+ x = x.reshape(-1, eff_chunk, C)
+ for layer in self.transformers[:split]:
+ x = layer(x)
+ x = x.reshape(B, -1, C)
+
+ # Extend both ends with copies of the edge tokens so the chunk grid
+ # moves by `shift`; the extension is trimmed again below.
+ shifted = torch.cat([x[:, :shift, :], x, x[:, -shift:, :]], dim=1)
+ del x
+ x = shifted.reshape(-1, eff_chunk, C)
+ del shifted
+ for layer in self.transformers[split:]:
+ x = layer(x)
+ x = x.reshape(B, -1, C)
+ x = x[:, shift:-shift, :]
+ elif sliding_window is None:
+ # Independent fixed-size chunks, folded into the batch axis.
+ x = x.reshape(-1, eff_chunk, C)
+ for layer in self.transformers:
+ x = layer(x)
+ x = x.reshape(B, -1, C)
+ else:
+ # Whole-sequence attention restricted by a mask; only sliding_window[0] is used.
+ attn_mask = _sliding_window_mask(x.shape[1], sliding_window[0], x.device, x.dtype)
+ for layer in self.transformers:
+ x = layer(x, mask=attn_mask)
+
+ # Keep only the appended tokens (the last output_seg of every segment).
+ x = x.reshape(-1, sub_chunk, C)
+ x = x[:, -output_seg:, :]
+ x = x.reshape(B, -1, C).transpose(1, 2)
+
+ if self.type == "decoder":
+ x = self.mapping(x)
+
+ return x
+
+
+class SAMEEncoder(nn.Module):
+    """Stack of encoder-type ``TransformerResamplingBlock`` stages followed by a
+    linear projection to ``latent_dim``.
+
+    Stage ``i`` maps ``channels * c_mults[i - 1]`` (or ``in_channels`` for the
+    first stage) to ``channels * c_mults[i]``. The final projection is wrapped
+    in two ``Transpose`` modules. Extra keyword arguments are forwarded to
+    every resampling block.
+    """
+    def __init__(self, in_channels=2, channels=128, latent_dim=32,
+                 c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
+                 transformer_depths=(3, 3, 3, 3),
+                 dtype=None, device=None, operations=None, **kwargs):
+        super().__init__()
+        widths = [in_channels] + [channels * mult for mult in c_mults]
+        stages = [
+            TransformerResamplingBlock(
+                in_channels=widths[idx],
+                out_channels=widths[idx + 1],
+                stride=strides[idx],
+                type="encoder",
+                transformer_depth=transformer_depths[idx],
+                dtype=dtype, device=device, operations=operations, **kwargs)
+            for idx in range(len(c_mults))
+        ]
+        stages.extend([
+            Transpose(),
+            operations.Linear(widths[-1], latent_dim, dtype=dtype, device=device),
+            Transpose(),
+        ])
+        self.layers = nn.ModuleList(stages)
+
+    def forward(self, x, **kwargs):
+        # Layers are applied strictly in order; extra keyword arguments are ignored.
+        out = x
+        for stage in self.layers:
+            out = stage(out)
+        return out
+
+
+class SAMEDecoder(nn.Module):
+    """Linear projection from ``latent_dim`` followed by decoder-type
+    ``TransformerResamplingBlock`` stages, built from the last level to the first.
+
+    ``sinusoidal_blocks`` holds one count per level and defaults to all zeros.
+    Extra keyword arguments are forwarded to every resampling block.
+    """
+    def __init__(self, out_channels=2, channels=128, latent_dim=32,
+                 c_mults=(1, 2, 4, 8), strides=(2, 4, 8, 8),
+                 transformer_depths=(3, 3, 3, 3), sinusoidal_blocks=None,
+                 dtype=None, device=None, operations=None, **kwargs):
+        super().__init__()
+        if sinusoidal_blocks is None:
+            sinusoidal_blocks = [0] * len(c_mults)
+        widths = [out_channels] + [channels * mult for mult in c_mults]
+        stages = [
+            Transpose(),
+            operations.Linear(latent_dim, widths[-1], dtype=dtype, device=device),
+            Transpose(),
+        ]
+        # Walk the levels backwards so the widest stage comes first.
+        for level in reversed(range(len(c_mults))):
+            stages.append(TransformerResamplingBlock(
+                in_channels=widths[level + 1],
+                out_channels=widths[level],
+                stride=strides[level],
+                type="decoder",
+                transformer_depth=transformer_depths[level],
+                sinusoidal_blocks=sinusoidal_blocks[level],
+                dtype=dtype, device=device, operations=operations, **kwargs))
+        self.layers = nn.ModuleList(stages)
+
+    def forward(self, x, **kwargs):
+        # Layers are applied strictly in order; extra keyword arguments are ignored.
+        out = x
+        for stage in self.layers:
+            out = stage(out)
+        return out
+
+
+class SoftNormBottleneck(nn.Module):
+    """Per-channel affine bottleneck with optional running-std scaling and noise.
+
+    ``encode`` applies ``x * scaling_factor + bias`` and, when a ``running_std``
+    parameter exists (``auto_scale``), divides by it. ``decode`` multiplies by
+    ``running_std`` again, optionally adds small noise (``noise_regularize``)
+    and optionally appends ``noise_augment_dim`` noise channels. All parameters
+    are allocated uninitialised.
+    """
+    def __init__(self, dim=32, noise_augment_dim=0, noise_regularize=False,
+                 auto_scale=False, freeze=False, dtype=None, device=None, **kwargs):
+        super().__init__()
+        self.noise_augment_dim = noise_augment_dim
+        self.noise_regularize = noise_regularize
+        factory = {"dtype": dtype, "device": device}
+        self.scaling_factor = nn.Parameter(torch.empty(1, dim, 1, **factory))
+        self.bias = nn.Parameter(torch.empty(1, dim, 1, **factory))
+        self.noise_scaling_factor = nn.Parameter(torch.empty(1, noise_augment_dim, 1, **factory))
+        if auto_scale:
+            running_std = nn.Parameter(torch.empty(1, **factory), requires_grad=False)
+            self.register_parameter("running_std", running_std)
+        if freeze:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def encode(self, x, return_info=False, **kwargs):
+        scale = comfy.ops.cast_to_input(self.scaling_factor, x)
+        shift = comfy.ops.cast_to_input(self.bias, x)
+        x = x * scale + shift
+        if hasattr(self, "running_std"):
+            x = x / comfy.ops.cast_to_input(self.running_std, x)
+        # The info dict is always empty; it is returned only on request.
+        return (x, {}) if return_info else x
+
+    def decode(self, x, **kwargs):
+        has_running_std = hasattr(self, "running_std")
+        if has_running_std:
+            x = x * comfy.ops.cast_to_input(self.running_std, x)
+        if self.noise_regularize:
+            # Noise magnitude is 1e-3 of running_std, or of the per-row std
+            # along the last axis when no running_std exists.
+            if has_running_std:
+                scaling = self.running_std
+            else:
+                scaling = x.std(dim=-1, keepdim=True)
+            noise = torch.randn_like(x) * comfy.ops.cast_to_input(scaling, x) * 1e-3
+            x = x + noise
+        if self.noise_augment_dim > 0:
+            extra = comfy.ops.cast_to_input(self.noise_scaling_factor, x) * torch.randn(
+                x.shape[0], self.noise_augment_dim, x.shape[-1], device=x.device, dtype=x.dtype)
+            x = torch.cat([x, extra], dim=1)
+        return x
+
+
+class PatchedPretransform(nn.Module):
+    """Parameter-free patching of the last axis into channels, and its inverse.
+
+    ``encode`` folds each run of ``patch_size`` consecutive samples into the
+    channel axis (``b c (l h) -> b (c h) l``), zero-padding the tail first.
+    ``decode`` undoes the fold; the padding is not removed.
+    """
+    def __init__(self, channels, patch_size, **kwargs):
+        super().__init__()
+        self.channels = channels
+        self.patch_size = patch_size
+        self.enable_grad = False
+
+    def _pad(self, x):
+        """Right-pad the last axis of ``x`` with zeros to a multiple of ``patch_size``."""
+        pad_len = (self.patch_size - x.shape[-1] % self.patch_size) % self.patch_size
+        if pad_len > 0:
+            # Allocate the padding explicitly. Slicing the input for a template
+            # (zeros_like(x[:, :, :pad_len])) produces too few zeros whenever
+            # the input is shorter than pad_len, which broke the reshape in encode().
+            padding = x.new_zeros((*x.shape[:-1], pad_len))
+            x = torch.cat([x, padding], dim=-1)
+        return x
+
+    def encode(self, x):
+        """Map ``(B, C, T)`` to ``(B, C * patch_size, ceil(T / patch_size))``."""
+        x = self._pad(x)
+        B, C, T = x.shape
+        h = self.patch_size
+        L = T // h
+        # b c (l h) -> b (c h) l
+        return x.reshape(B, C, L, h).permute(0, 1, 3, 2).reshape(B, C * h, L)
+
+    def decode(self, x):
+        """Map ``(B, C * patch_size, L)`` back to ``(B, C, L * patch_size)``."""
+        B, Ch, L = x.shape
+        h = self.patch_size
+        C = Ch // h
+        # b (c h) l -> b c (l h)
+        return x.reshape(B, C, h, L).permute(0, 1, 3, 2).reshape(B, C, L * h)
+
+
+class SA3AudioVAE(nn.Module):
+    """SA3 VAE. State dict keys match checkpoint after stripping 'pretransform.model.'
+
+    Pipeline: ``PatchedPretransform`` (2 channels, patch size 256), then a
+    single-stage ``SAMEEncoder`` / ``SAMEDecoder`` pair (stride 16, latent
+    width 256) with a ``SoftNormBottleneck`` in between. ``operations``
+    defaults to the module-level ``ops``.
+    """
+
+    def __init__(self, channels=256, transformer_depths=12, sinusoidal_blocks=8,
+                 sliding_window=None, decoder_conv_mapping=False,
+                 chunk_size=128, chunk_midpoint_shift=False,
+                 dtype=None, device=None, operations=None):
+        super().__init__()
+        if operations is None:
+            operations = ops
+
+        self.pretransform = PatchedPretransform(channels=2, patch_size=256)
+
+        # Settings shared by the encoder and the decoder.
+        shared = {
+            "differential": True,
+            "dyt": True,
+            "dim_heads": 64,
+            "sliding_window": sliding_window,
+            "variable_stride": True,
+            "chunk_size": chunk_size,
+            "chunk_midpoint_shift": chunk_midpoint_shift,
+            "dtype": dtype,
+            "device": device,
+            "operations": operations,
+        }
+        self.encoder = SAMEEncoder(
+            in_channels=512,
+            channels=channels,
+            c_mults=[6],
+            strides=[16],
+            latent_dim=256,
+            transformer_depths=[transformer_depths],
+            conv_mapping=False,
+            **shared,
+        )
+        self.decoder = SAMEDecoder(
+            out_channels=512,
+            channels=channels,
+            c_mults=[6],
+            strides=[16],
+            latent_dim=256,
+            transformer_depths=[transformer_depths],
+            sinusoidal_blocks=[sinusoidal_blocks],
+            conv_mapping=decoder_conv_mapping,
+            **shared,
+        )
+        self.bottleneck = SoftNormBottleneck(
+            dim=256,
+            noise_augment_dim=0,
+            noise_regularize=True,
+            auto_scale=True,
+            freeze=True,
+            dtype=dtype,
+            device=device,
+        )
+
+    @torch.no_grad()
+    def _pretransform_encode(self, x):
+        return self.pretransform.encode(x)
+
+    @torch.no_grad()
+    def _pretransform_decode(self, x):
+        return self.pretransform.decode(x)
+
+    def encode(self, x):
+        """Patch the input, run the encoder, then apply the bottleneck's ``encode``."""
+        patched = self._pretransform_encode(x)
+        encoded = self.encoder(patched)
+        return self.bottleneck.encode(encoded)
+
+    def decode(self, x):
+        """Apply the bottleneck's ``decode``, run the decoder, then un-patch."""
+        latents = self.bottleneck.decode(x)
+        decoded = self.decoder(latents)
+        return self._pretransform_decode(decoded)
diff --git a/comfy/ldm/cogvideo/__init__.py b/comfy/ldm/cogvideo/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/comfy/ldm/cogvideo/model.py b/comfy/ldm/cogvideo/model.py
new file mode 100644
index 000000000..fb475ed53
--- /dev/null
+++ b/comfy/ldm/cogvideo/model.py
@@ -0,0 +1,573 @@
+# CogVideoX 3D Transformer - ported to ComfyUI native ops
+# Architecture reference: diffusers CogVideoXTransformer3DModel
+# Style reference: comfy/ldm/wan/model.py
+
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from comfy.ldm.modules.attention import optimized_attention
+import comfy.patcher_extension
+import comfy.ldm.common_dit
+
+
+def _get_1d_rotary_pos_embed(dim, pos, theta=10000.0):
+    """Build rotary ``(cos, sin)`` tables, each of shape ``[seq_len, dim]``.
+
+    ``dim // 2`` frequencies are computed and every column is then repeated
+    twice in place (``repeat_interleave``), matching CogVideoX's interleaved
+    (real, imag) pair format.
+    """
+    exponents = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
+    inv_freq = 1.0 / (theta ** exponents)
+    angles = torch.outer(pos.float(), inv_freq.float())
+    cos_table = angles.cos().repeat_interleave(2, dim=-1).float()
+    sin_table = angles.sin().repeat_interleave(2, dim=-1).float()
+    return (cos_table, sin_table)
+
+
+def apply_rotary_emb(x, freqs_cos_sin):
+    """Apply CogVideoX rotary embedding to a query or key tensor.
+
+    x: [B, heads, seq_len, head_dim]
+    freqs_cos_sin: (cos, sin), each [seq_len, head_dim]. They are multiplied
+        element-wise with ``x``, so their last axis must broadcast against head_dim.
+
+    Uses interleaved pair rotation: head_dim is viewed as (real, imag) pairs,
+    each pair becomes (-imag, real) for the sine term, and the result is cast
+    back to ``x``'s dtype.
+    """
+    cos, sin = freqs_cos_sin
+    cos = cos[None, None, :, :].to(x.device)
+    sin = sin[None, None, :, :].to(x.device)
+
+    # [B, H, S, D] -> [B, H, S, D//2, 2]; the last axis holds (real, imag).
+    pairs = x.reshape(*x.shape[:-1], -1, 2)
+    x_real, x_imag = pairs.unbind(-1)
+    quarter_turn = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+
+    rotated = x.float() * cos + quarter_turn.float() * sin
+    return rotated.to(x.dtype)
+
+
+def get_timestep_embedding(timesteps, dim, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1, max_period=10000):
+    """Sinusoidal timestep embedding.
+
+    Args:
+        timesteps: tensor of timestep values; it is indexed as
+            ``timesteps[:, None]``, so a 1-D tensor is expected.
+        dim: width of the returned embedding; an odd ``dim`` gets one trailing
+            zero column.
+        flip_sin_to_cos: when true the cosine half comes first, otherwise the
+            sine half comes first.
+        downscale_freq_shift: subtracted from ``dim // 2`` in the denominator
+            of the frequency exponent; 0 leaves the spacing unchanged.
+        scale: multiplier applied to the sinusoid arguments.
+        max_period: base of the geometric frequency spacing.
+
+    Returns:
+        Float32 tensor of shape ``[len(timesteps), dim]``.
+    """
+    half = dim // 2
+    exponent = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32, device=timesteps.device)
+    # downscale_freq_shift used to be accepted and silently ignored; apply it
+    # the way the diffusers reference does.
+    freqs = torch.exp(exponent / (half - downscale_freq_shift))
+    args = timesteps[:, None].float() * freqs[None] * scale
+    embedding = torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
+    if flip_sin_to_cos:
+        embedding = torch.cat([embedding[:, half:], embedding[:, :half]], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    return embedding
+
+
+def get_3d_sincos_pos_embed(embed_dim, spatial_size, temporal_size, spatial_interpolation_scale=1.0, temporal_interpolation_scale=1.0, device=None):
+    """Build a ``[T, H, W, D]`` sin/cos positional embedding for a video grid.
+
+    ``spatial_size`` is ``(width, height)`` or a single int used for both.
+    ``embed_dim // 3`` features encode time and ``2 * (embed_dim // 3)`` encode
+    space, so ``D`` is ``3 * (embed_dim // 3)``. The temporal part comes first
+    along the last axis. Coordinates are divided by the interpolation scales.
+    """
+    if isinstance(spatial_size, int):
+        spatial_size = (spatial_size, spatial_size)
+    width, height = spatial_size[0], spatial_size[1]
+
+    coords_w = torch.arange(width, dtype=torch.float32, device=device) / spatial_interpolation_scale
+    coords_h = torch.arange(height, dtype=torch.float32, device=device) / spatial_interpolation_scale
+    coords_t = torch.arange(temporal_size, dtype=torch.float32, device=device) / temporal_interpolation_scale
+
+    grid_t, grid_h, grid_w = torch.meshgrid(coords_t, coords_h, coords_w, indexing="ij")
+
+    temporal_dim = embed_dim // 3
+    spatial_dim = 2 * (embed_dim // 3)
+
+    spatial_part = _get_2d_sincos_pos_embed(spatial_dim, grid_h, grid_w, device=device)
+    # One temporal row per frame, taken from the first spatial location.
+    temporal_part = _get_1d_sincos_pos_embed(temporal_dim, grid_t[:, 0, 0], device=device)
+
+    _, H, W = grid_t.shape
+    temporal_part = temporal_part.unsqueeze(1).unsqueeze(1).expand(-1, H, W, -1)
+    return torch.cat([temporal_part, spatial_part], dim=-1)
+
+
+def _get_2d_sincos_pos_embed(embed_dim, grid_h, grid_w, device=None):
+    """Concatenate 1-D sin/cos embeddings of the height and width grids.
+
+    Both grids have shape ``[T, H, W]``. Each axis receives ``embed_dim // 2``
+    features, with the height part first, giving ``[T, H, W, 2 * (embed_dim // 2)]``.
+    """
+    T, H, W = grid_h.shape
+    per_axis = embed_dim // 2
+
+    def _embed_axis(grid):
+        flat = _get_1d_sincos_pos_embed(per_axis, grid.reshape(-1), device=device)
+        return flat.reshape(T, H, W, per_axis)
+
+    height_part = _embed_axis(grid_h)
+    width_part = _embed_axis(grid_w)
+    return torch.cat([height_part, width_part], dim=-1)
+
+
+def _get_1d_sincos_pos_embed(embed_dim, pos, device=None):
+    """Sin/cos embedding of flattened positions, shape ``[pos.numel(), embed_dim]``.
+
+    The cosine half comes first, then the sine half; an odd ``embed_dim`` gets
+    one trailing zero column.
+    """
+    half = embed_dim // 2
+    steps = torch.arange(start=0, end=half, dtype=torch.float32, device=device)
+    freqs = torch.exp(-math.log(10000.0) * steps / half)
+    args = pos.float().reshape(-1)[:, None] * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if embed_dim % 2:
+        zero_col = torch.zeros_like(embedding[:, :1])
+        embedding = torch.cat([embedding, zero_col], dim=-1)
+    return embedding
+
+
+
+class CogVideoXPatchEmbed(nn.Module):
+ """Embed text tokens and video latents into one joint token sequence.
+
+ Text embeddings go through a linear projection. Video latents are
+ patchified either per frame with a strided Conv2d (``patch_size_t`` is
+ None) or as flattened space-time patches with a Linear layer. The output
+ is ``[text tokens, image tokens]`` along dim 1, optionally with a sin/cos
+ positional embedding added to the image tokens only.
+ """
+ def __init__(self, patch_size=2, patch_size_t=None, in_channels=16, dim=1920,
+ text_dim=4096, bias=True, sample_width=90, sample_height=60,
+ sample_frames=49, temporal_compression_ratio=4,
+ max_text_seq_length=226, spatial_interpolation_scale=1.875,
+ temporal_interpolation_scale=1.0, use_positional_embeddings=True,
+ use_learned_positional_embeddings=True,
+ device=None, dtype=None, operations=None):
+ super().__init__()
+ self.patch_size = patch_size
+ self.patch_size_t = patch_size_t
+ self.dim = dim
+ self.sample_height = sample_height
+ self.sample_width = sample_width
+ self.sample_frames = sample_frames
+ self.temporal_compression_ratio = temporal_compression_ratio
+ self.max_text_seq_length = max_text_seq_length
+ self.spatial_interpolation_scale = spatial_interpolation_scale
+ self.temporal_interpolation_scale = temporal_interpolation_scale
+ self.use_positional_embeddings = use_positional_embeddings
+ self.use_learned_positional_embeddings = use_learned_positional_embeddings
+
+ # 2-D patches per frame (Conv2d) vs. flattened 3-D patches (Linear).
+ if patch_size_t is None:
+ self.proj = operations.Conv2d(in_channels, dim, kernel_size=patch_size, stride=patch_size, bias=bias, device=device, dtype=dtype)
+ else:
+ self.proj = operations.Linear(in_channels * patch_size * patch_size * patch_size_t, dim, device=device, dtype=dtype)
+
+ self.text_proj = operations.Linear(text_dim, dim, device=device, dtype=dtype)
+
+ if use_positional_embeddings or use_learned_positional_embeddings:
+ # The buffer is persistent (part of the state dict) only for learned embeddings.
+ persistent = use_learned_positional_embeddings
+ pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
+ self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)
+
+ def _get_positional_embeddings(self, sample_height, sample_width, sample_frames, device=None):
+ # Returns a (1, max_text_seq_length + num_patches, dim) tensor: zeros for the
+ # text slots, sin/cos values for the image patches.
+ post_patch_height = sample_height // self.patch_size
+ post_patch_width = sample_width // self.patch_size
+ post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
+ if self.patch_size_t is not None:
+ post_time_compression_frames = post_time_compression_frames // self.patch_size_t
+ num_patches = post_patch_height * post_patch_width * post_time_compression_frames
+
+ pos_embedding = get_3d_sincos_pos_embed(
+ self.dim,
+ (post_patch_width, post_patch_height),
+ post_time_compression_frames,
+ self.spatial_interpolation_scale,
+ self.temporal_interpolation_scale,
+ device=device,
+ )
+ pos_embedding = pos_embedding.reshape(-1, self.dim)
+ joint_pos_embedding = pos_embedding.new_zeros(
+ 1, self.max_text_seq_length + num_patches, self.dim, requires_grad=False
+ )
+ joint_pos_embedding.data[:, self.max_text_seq_length:].copy_(pos_embedding)
+ return joint_pos_embedding
+
+ def forward(self, text_embeds, image_embeds):
+ # image_embeds is unpacked below as (batch, frames, channels, height, width).
+ # Projections run in the weights' dtype and are cast back to the text dtype.
+ input_dtype = text_embeds.dtype
+ text_embeds = self.text_proj(text_embeds.to(self.text_proj.weight.dtype)).to(input_dtype)
+ batch_size, num_frames, channels, height, width = image_embeds.shape
+
+ proj_dtype = self.proj.weight.dtype
+ if self.patch_size_t is None:
+ # Fold frames into the batch, convolve, then flatten (frames, h, w) into tokens.
+ image_embeds = image_embeds.reshape(-1, channels, height, width)
+ image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
+ image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
+ image_embeds = image_embeds.flatten(3).transpose(2, 3)
+ image_embeds = image_embeds.flatten(1, 2)
+ else:
+ # Channels-last, split every axis into (blocks, patch), then flatten each
+ # (channels, p_t, p, p) patch into one feature vector.
+ p = self.patch_size
+ p_t = self.patch_size_t
+ image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
+ image_embeds = image_embeds.reshape(
+ batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
+ )
+ image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
+ image_embeds = self.proj(image_embeds.to(proj_dtype)).to(input_dtype)
+
+ embeds = torch.cat([text_embeds, image_embeds], dim=1).contiguous()
+
+ if self.use_positional_embeddings or self.use_learned_positional_embeddings:
+ text_seq_length = text_embeds.shape[1]
+ num_image_patches = image_embeds.shape[1]
+
+ if self.use_learned_positional_embeddings:
+ # Take the first num_image_patches image slots of the stored buffer.
+ image_pos = self.pos_embedding[
+ :, self.max_text_seq_length:self.max_text_seq_length + num_image_patches
+ ].to(device=embeds.device, dtype=embeds.dtype)
+ else:
+ # Recompute the sin/cos table for the actual input resolution.
+ image_pos = get_3d_sincos_pos_embed(
+ self.dim,
+ (width // self.patch_size, height // self.patch_size),
+ num_image_patches // ((height // self.patch_size) * (width // self.patch_size)),
+ self.spatial_interpolation_scale,
+ self.temporal_interpolation_scale,
+ device=embeds.device,
+ ).reshape(1, num_image_patches, self.dim).to(dtype=embeds.dtype)
+
+ # Build joint: zeros for text + sincos for image
+ joint_pos = torch.zeros(1, text_seq_length + num_image_patches, self.dim, device=embeds.device, dtype=embeds.dtype)
+ joint_pos[:, text_seq_length:] = image_pos
+ embeds = embeds + joint_pos
+
+ return embeds
+
+
+class CogVideoXLayerNormZero(nn.Module):
+    """Timestep-conditioned LayerNorm for the latent and text streams.
+
+    ``temb`` is passed through SiLU and one linear layer that yields six
+    ``dim``-wide vectors: shift, scale and gate for the hidden states, then the
+    same three for the encoder states. Both streams share one LayerNorm.
+    Returns the two modulated streams and the two gates, each gate shaped
+    ``[B, 1, dim]``.
+    """
+    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5, bias=True,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = operations.Linear(time_dim, 6 * dim, bias=bias, device=device, dtype=dtype)
+        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
+
+    def forward(self, hidden_states, encoder_hidden_states, temb):
+        modulation = self.linear(self.silu(temb))
+        shift, scale, gate, enc_shift, enc_scale, enc_gate = modulation.chunk(6, dim=1)
+        # unsqueeze(1) inserts the sequence axis so the vectors broadcast over tokens.
+        hidden_states = self.norm(hidden_states) * (1 + scale).unsqueeze(1) + shift.unsqueeze(1)
+        encoder_hidden_states = self.norm(encoder_hidden_states) * (1 + enc_scale).unsqueeze(1) + enc_shift.unsqueeze(1)
+        return hidden_states, encoder_hidden_states, gate.unsqueeze(1), enc_gate.unsqueeze(1)
+
+
+class CogVideoXAdaLayerNorm(nn.Module):
+    """LayerNorm whose output is shifted and scaled by a timestep embedding.
+
+    ``temb`` is passed through SiLU and a linear layer producing a shift and a
+    scale of width ``dim``; the result is ``norm(x) * (1 + scale) + shift``.
+    """
+    def __init__(self, time_dim, dim, elementwise_affine=True, eps=1e-5,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = operations.Linear(time_dim, 2 * dim, device=device, dtype=dtype)
+        self.norm = operations.LayerNorm(dim, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
+
+    def forward(self, x, temb):
+        modulation = self.linear(self.silu(temb))
+        shift, scale = modulation.chunk(2, dim=1)
+        # unsqueeze(1) inserts the sequence axis so the vectors broadcast over tokens.
+        return self.norm(x) * (1 + scale).unsqueeze(1) + shift.unsqueeze(1)
+
+
+class CogVideoXBlock(nn.Module):
+ """One CogVideoX transformer block operating on separate latent and text streams.
+
+ Both streams are modulated by the timestep embedding, concatenated
+ (text first) for joint self-attention and for the feed-forward, then split
+ again and added back to their own residual stream through per-stream gates.
+ """
+ def __init__(self, dim, num_heads, head_dim, time_dim,
+ eps=1e-5, ff_inner_dim=None, ff_bias=True,
+ device=None, dtype=None, operations=None):
+ super().__init__()
+ self.dim = dim
+ self.num_heads = num_heads
+ self.head_dim = head_dim
+
+ self.norm1 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
+
+ # Self-attention (joint text + latent)
+ self.q = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
+ self.k = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
+ self.v = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
+ # q/k are normalised per head (over head_dim), with a fixed eps of 1e-6.
+ self.norm_q = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
+ self.norm_k = operations.LayerNorm(head_dim, eps=1e-6, elementwise_affine=True, device=device, dtype=dtype)
+ self.attn_out = operations.Linear(dim, dim, bias=True, device=device, dtype=dtype)
+
+ self.norm2 = CogVideoXLayerNormZero(time_dim, dim, eps=eps, device=device, dtype=dtype, operations=operations)
+
+ # Feed-forward (GELU approximate)
+ # ff_inner_dim falls back to 4 * dim when it is None (or 0).
+ inner_dim = ff_inner_dim or dim * 4
+ self.ff_proj = operations.Linear(dim, inner_dim, bias=ff_bias, device=device, dtype=dtype)
+ self.ff_out = operations.Linear(inner_dim, dim, bias=ff_bias, device=device, dtype=dtype)
+
+ def forward(self, hidden_states, encoder_hidden_states, temb, image_rotary_emb=None, transformer_options=None):
+ # Returns the updated (hidden_states, encoder_hidden_states) pair.
+ if transformer_options is None:
+ transformer_options = {}
+ text_seq_length = encoder_hidden_states.size(1)
+
+ # Norm & modulate
+ norm_hidden, norm_encoder, gate_msa, enc_gate_msa = self.norm1(hidden_states, encoder_hidden_states, temb)
+
+ # Joint self-attention
+ # Text tokens come first in the joint sequence.
+ qkv_input = torch.cat([norm_encoder, norm_hidden], dim=1)
+ b, s, _ = qkv_input.shape
+ n, d = self.num_heads, self.head_dim
+
+ q = self.q(qkv_input).view(b, s, n, d)
+ k = self.k(qkv_input).view(b, s, n, d)
+ v = self.v(qkv_input)
+
+ # The .view after each norm repeats the shape q/k already have.
+ q = self.norm_q(q).view(b, s, n, d)
+ k = self.norm_k(k).view(b, s, n, d)
+
+ # Apply rotary embeddings to image tokens only (diffusers format: [B, heads, seq, head_dim])
+ if image_rotary_emb is not None:
+ q_img = q[:, text_seq_length:].transpose(1, 2) # [B, heads, img_seq, head_dim]
+ k_img = k[:, text_seq_length:].transpose(1, 2)
+ q_img = apply_rotary_emb(q_img, image_rotary_emb)
+ k_img = apply_rotary_emb(k_img, image_rotary_emb)
+ q = torch.cat([q[:, :text_seq_length], q_img.transpose(1, 2)], dim=1)
+ k = torch.cat([k[:, :text_seq_length], k_img.transpose(1, 2)], dim=1)
+
+ # Heads are merged back into the feature axis before the attention call.
+ attn_out = optimized_attention(
+ q.reshape(b, s, n * d),
+ k.reshape(b, s, n * d),
+ v,
+ heads=self.num_heads,
+ transformer_options=transformer_options,
+ )
+
+ attn_out = self.attn_out(attn_out)
+
+ # Split the joint output back into its text and latent parts.
+ attn_encoder, attn_hidden = attn_out.split([text_seq_length, s - text_seq_length], dim=1)
+
+ hidden_states = hidden_states + gate_msa * attn_hidden
+ encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder
+
+ # Norm & modulate for FF
+ norm_hidden, norm_encoder, gate_ff, enc_gate_ff = self.norm2(hidden_states, encoder_hidden_states, temb)
+
+ # Feed-forward (GELU on concatenated text + latent)
+ ff_input = torch.cat([norm_encoder, norm_hidden], dim=1)
+ ff_output = self.ff_out(F.gelu(self.ff_proj(ff_input), approximate="tanh"))
+
+ hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
+ encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
+
+ return hidden_states, encoder_hidden_states
+
+
+class CogVideoXTransformer3DModel(nn.Module):
+    """CogVideoX diffusion transformer.
+
+    Pipeline, as implemented in ``_forward``: pad and patch-embed the latent
+    video together with the text context, build a timestep (plus optional OFS)
+    embedding, run the stack of ``CogVideoXBlock`` layers, apply the final
+    norms and linear projection, then un-patchify and crop back to the input
+    size.
+    """
+    def __init__(self,
+                 num_attention_heads=30,
+                 attention_head_dim=64,
+                 in_channels=16,
+                 out_channels=16,
+                 flip_sin_to_cos=True,
+                 freq_shift=0,
+                 time_embed_dim=512,
+                 ofs_embed_dim=None,
+                 text_embed_dim=4096,
+                 num_layers=30,
+                 dropout=0.0,
+                 attention_bias=True,
+                 sample_width=90,
+                 sample_height=60,
+                 sample_frames=49,
+                 patch_size=2,
+                 patch_size_t=None,
+                 temporal_compression_ratio=4,
+                 max_text_seq_length=226,
+                 spatial_interpolation_scale=1.875,
+                 temporal_interpolation_scale=1.0,
+                 use_rotary_positional_embeddings=False,
+                 use_learned_positional_embeddings=False,
+                 patch_bias=True,
+                 image_model=None,
+                 device=None,
+                 dtype=None,
+                 operations=None,
+                 ):
+        """Build all sub-modules.
+
+        The model width is ``num_attention_heads * attention_head_dim``.
+        ``dropout``, ``attention_bias`` and ``image_model`` are accepted but not
+        referenced anywhere in this constructor. ``operations`` supplies the
+        ``Linear`` / ``LayerNorm`` classes used for every layer created here.
+        """
+        super().__init__()
+        self.dtype = dtype
+        dim = num_attention_heads * attention_head_dim
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.patch_size = patch_size
+        self.patch_size_t = patch_size_t
+        self.max_text_seq_length = max_text_seq_length
+        self.use_rotary_positional_embeddings = use_rotary_positional_embeddings
+
+        # 1. Patch embedding
+        # Note: built with dtype=torch.float32 regardless of the `dtype` argument,
+        # and positional embeddings are enabled only when rotary embeddings are off.
+        self.patch_embed = CogVideoXPatchEmbed(
+            patch_size=patch_size,
+            patch_size_t=patch_size_t,
+            in_channels=in_channels,
+            dim=dim,
+            text_dim=text_embed_dim,
+            bias=patch_bias,
+            sample_width=sample_width,
+            sample_height=sample_height,
+            sample_frames=sample_frames,
+            temporal_compression_ratio=temporal_compression_ratio,
+            max_text_seq_length=max_text_seq_length,
+            spatial_interpolation_scale=spatial_interpolation_scale,
+            temporal_interpolation_scale=temporal_interpolation_scale,
+            use_positional_embeddings=not use_rotary_positional_embeddings,
+            use_learned_positional_embeddings=use_learned_positional_embeddings,
+            device=device, dtype=torch.float32, operations=operations,
+        )
+
+        # 2. Time embedding
+        # The three time_proj_* values are the arguments later passed to
+        # get_timestep_embedding(); the two linears + SiLU form the embedding MLP.
+        self.time_proj_dim = dim
+        self.time_proj_flip = flip_sin_to_cos
+        self.time_proj_shift = freq_shift
+        self.time_embedding_linear_1 = operations.Linear(dim, time_embed_dim, device=device, dtype=dtype)
+        self.time_embedding_act = nn.SiLU()
+        self.time_embedding_linear_2 = operations.Linear(time_embed_dim, time_embed_dim, device=device, dtype=dtype)
+
+        # Optional OFS embedding (CogVideoX 1.5 I2V)
+        # When disabled only ofs_embedding_linear_1 is defined (as None); _forward
+        # checks that attribute before touching the other two.
+        self.ofs_proj_dim = ofs_embed_dim
+        if ofs_embed_dim:
+            self.ofs_embedding_linear_1 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
+            self.ofs_embedding_act = nn.SiLU()
+            self.ofs_embedding_linear_2 = operations.Linear(ofs_embed_dim, ofs_embed_dim, device=device, dtype=dtype)
+        else:
+            self.ofs_embedding_linear_1 = None
+
+        # 3. Transformer blocks
+        self.blocks = nn.ModuleList([
+            CogVideoXBlock(
+                dim=dim,
+                num_heads=num_attention_heads,
+                head_dim=attention_head_dim,
+                time_dim=time_embed_dim,
+                eps=1e-5,
+                device=device, dtype=dtype, operations=operations,
+            )
+            for _ in range(num_layers)
+        ])
+
+        self.norm_final = operations.LayerNorm(dim, eps=1e-5, elementwise_affine=True, device=device, dtype=dtype)
+
+        # 4. Output
+        self.norm_out = CogVideoXAdaLayerNorm(
+            time_dim=time_embed_dim, dim=dim, eps=1e-5,
+            device=device, dtype=dtype, operations=operations,
+        )
+
+        # One output vector per patch: p*p (and p_t when temporal patching is on)
+        # values for each output channel.
+        if patch_size_t is None:
+            output_dim = patch_size * patch_size * out_channels
+        else:
+            output_dim = patch_size * patch_size * patch_size_t * out_channels
+
+        self.proj_out = operations.Linear(dim, output_dim, device=device, dtype=dtype)
+
+        self.spatial_interpolation_scale = spatial_interpolation_scale
+        self.temporal_interpolation_scale = temporal_interpolation_scale
+        self.temporal_compression_ratio = temporal_compression_ratio
+
+    def forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
+        """Public entry point: runs ``_forward`` through the wrapper executor.
+
+        Wrappers are looked up in ``transformer_options`` under the
+        ``DIFFUSION_MODEL`` wrapper type; arguments are passed on unchanged.
+        """
+        if transformer_options is None:
+            transformer_options = {}
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward,
+            self,
+            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
+        ).execute(x, timestep, context, ofs, transformer_options, **kwargs)
+
+    def _forward(self, x, timestep, context, ofs=None, transformer_options=None, **kwargs):
+        """Denoiser forward pass.
+
+        Args:
+            x: latent video, unpacked as ``[B, C, T, H, W]``.
+            timestep: passed to ``get_timestep_embedding``.
+            context: text conditioning; ``context.shape[1]`` is the text length.
+            ofs: optional value embedded and added to the time embedding when
+                the OFS layers exist.
+            transformer_options: optional dict forwarded to every block.
+            **kwargs: accepted and ignored here.
+
+        Returns:
+            Tensor in ``[B, C, T, H, W]`` layout, cropped to the input's
+            original ``T``, ``H`` and ``W``.
+        """
+        if transformer_options is None:
+            transformer_options = {}
+        # ComfyUI passes [B, C, T, H, W]
+        # t, h, w keep the pre-padding sizes; they are used for the final crop.
+        batch_size, channels, t, h, w = x.shape
+
+        # Pad to patch size (temporal + spatial), same pattern as WAN
+        p_t = self.patch_size_t if self.patch_size_t is not None else 1
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (p_t, self.patch_size, self.patch_size))
+
+        # CogVideoX expects [B, T, C, H, W]
+        x = x.permute(0, 2, 1, 3, 4)
+        # Sizes re-read after padding; these padded values drive the un-patchify below.
+        batch_size, num_frames, channels, height, width = x.shape
+
+        # Time embedding
+        t_emb = get_timestep_embedding(timestep, self.time_proj_dim, self.time_proj_flip, self.time_proj_shift)
+        t_emb = t_emb.to(dtype=x.dtype)
+        emb = self.time_embedding_linear_2(self.time_embedding_act(self.time_embedding_linear_1(t_emb)))
+
+        if self.ofs_embedding_linear_1 is not None and ofs is not None:
+            ofs_emb = get_timestep_embedding(ofs, self.ofs_proj_dim, self.time_proj_flip, self.time_proj_shift)
+            ofs_emb = ofs_emb.to(dtype=x.dtype)
+            ofs_emb = self.ofs_embedding_linear_2(self.ofs_embedding_act(self.ofs_embedding_linear_1(ofs_emb)))
+            emb = emb + ofs_emb
+
+        # Patch embedding
+        # The result is sliced as text tokens first, then video tokens.
+        hidden_states = self.patch_embed(context, x)
+
+        text_seq_length = context.shape[1]
+        encoder_hidden_states = hidden_states[:, :text_seq_length]
+        hidden_states = hidden_states[:, text_seq_length:]
+
+        # Rotary embeddings (if used)
+        image_rotary_emb = None
+        if self.use_rotary_positional_embeddings:
+            post_patch_height = height // self.patch_size
+            post_patch_width = width // self.patch_size
+            if self.patch_size_t is None:
+                post_time = num_frames
+            else:
+                post_time = num_frames // self.patch_size_t
+            image_rotary_emb = self._get_rotary_emb(post_patch_height, post_patch_width, post_time, device=x.device)
+
+        # Transformer blocks
+        for i, block in enumerate(self.blocks):
+            hidden_states, encoder_hidden_states = block(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                temb=emb,
+                image_rotary_emb=image_rotary_emb,
+                transformer_options=transformer_options,
+            )
+
+        hidden_states = self.norm_final(hidden_states)
+
+        # Output projection
+        hidden_states = self.norm_out(hidden_states, temb=emb)
+        hidden_states = self.proj_out(hidden_states)
+
+        # Unpatchify
+        p = self.patch_size
+        p_t = self.patch_size_t
+
+        if p_t is None:
+            # Tokens -> [B, T, H/p, W/p, C, p, p] -> [B, T, C, H, W]
+            output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
+            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
+        else:
+            # Tokens -> [B, T/p_t, H/p, W/p, C, p_t, p, p] -> [B, T, C, H, W]
+            output = hidden_states.reshape(
+                batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
+            )
+            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
+
+        # Back to ComfyUI format [B, C, T, H, W] and crop padding
+        output = output.permute(0, 2, 1, 3, 4)[:, :, :t, :h, :w]
+        return output
+
+    def _get_rotary_emb(self, h, w, t, device):
+        """Compute CogVideoX 3D rotary positional embeddings.
+
+        For CogVideoX 1.5 (patch_size_t != None): uses "slice" mode — grid positions
+        are integer arange computed at max_size, then sliced to actual size.
+        For CogVideoX 1.0 (patch_size_t == None): uses "linspace" mode with crop coords
+        scaled by spatial_interpolation_scale.
+
+        Args:
+            h, w, t: post-patch height, width and frame count.
+            device: device on which the position grids are created.
+
+        Returns:
+            ``(cos, sin)``, each reshaped to ``t * h * w`` rows with the
+            temporal, height and width parts concatenated on the last axis.
+        """
+        # Head dim is split between the axes: 1/4 temporal, 3/8 height, 3/8 width
+        # (e.g. 16 + 24 + 24 for a head dim of 64).
+        d = self.attention_head_dim
+        dim_t = d // 4
+        dim_h = d // 8 * 3
+        dim_w = d // 8 * 3
+
+        if self.patch_size_t is not None:
+            # CogVideoX 1.5: "slice" mode — positions are simple integer indices
+            # Compute at max(sample_size, actual_size) then slice to actual
+            base_h = self.patch_embed.sample_height // self.patch_size
+            base_w = self.patch_embed.sample_width // self.patch_size
+            max_h = max(base_h, h)
+            max_w = max(base_w, w)
+
+            grid_h = torch.arange(max_h, device=device, dtype=torch.float32)
+            grid_w = torch.arange(max_w, device=device, dtype=torch.float32)
+            grid_t = torch.arange(t, device=device, dtype=torch.float32)
+        else:
+            # CogVideoX 1.0: "linspace" mode with interpolation scale
+            grid_h = torch.linspace(0, h - 1, h, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
+            grid_w = torch.linspace(0, w - 1, w, device=device, dtype=torch.float32) * self.spatial_interpolation_scale
+            grid_t = torch.arange(t, device=device, dtype=torch.float32)
+
+        # Each call returns a (cos, sin) pair, unpacked just below.
+        freqs_t = _get_1d_rotary_pos_embed(dim_t, grid_t)
+        freqs_h = _get_1d_rotary_pos_embed(dim_h, grid_h)
+        freqs_w = _get_1d_rotary_pos_embed(dim_w, grid_w)
+
+        t_cos, t_sin = freqs_t
+        h_cos, h_sin = freqs_h
+        w_cos, w_sin = freqs_w
+
+        # Slice to actual size (for "slice" mode where grids may be larger)
+        t_cos, t_sin = t_cos[:t], t_sin[:t]
+        h_cos, h_sin = h_cos[:h], h_sin[:h]
+        w_cos, w_sin = w_cos[:w], w_sin[:w]
+
+        # Broadcast and concatenate into [T*H*W, head_dim]
+        t_cos = t_cos[:, None, None, :].expand(-1, h, w, -1)
+        t_sin = t_sin[:, None, None, :].expand(-1, h, w, -1)
+        h_cos = h_cos[None, :, None, :].expand(t, -1, w, -1)
+        h_sin = h_sin[None, :, None, :].expand(t, -1, w, -1)
+        w_cos = w_cos[None, None, :, :].expand(t, h, -1, -1)
+        w_sin = w_sin[None, None, :, :].expand(t, h, -1, -1)
+
+        cos = torch.cat([t_cos, h_cos, w_cos], dim=-1).reshape(t * h * w, -1)
+        sin = torch.cat([t_sin, h_sin, w_sin], dim=-1).reshape(t * h * w, -1)
+        return (cos, sin)
diff --git a/comfy/ldm/cogvideo/vae.py b/comfy/ldm/cogvideo/vae.py
new file mode 100644
index 000000000..d4e6f321e
--- /dev/null
+++ b/comfy/ldm/cogvideo/vae.py
@@ -0,0 +1,566 @@
+# CogVideoX VAE - ported to ComfyUI native ops
+# Architecture reference: diffusers AutoencoderKLCogVideoX
+# Style reference: comfy/ldm/wan/vae.py
+
+import numpy as np
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ops
+ops = comfy.ops.disable_weight_init
+
+
+class CausalConv3d(nn.Module):
+    """Causal 3D convolution: temporal context is taken only from the past.
+
+    Spatial padding is symmetric (``(k - 1) // 2`` per side). Temporal context
+    comes from ``conv_cache`` (the trailing frames kept from the previous call)
+    or, on the first call, from copies of the first input frame.
+
+    Fast path: when the input is a single frame and there is no cache, every
+    temporal tap of the kernel would see the same frame, so the weight is
+    summed over its temporal axis and the conv runs with a temporal kernel of 1.
+
+    In ``pad_mode="replicate"`` all padding (temporal and spatial) is done with
+    ``F.pad`` in ``forward``; the inner conv is then built without padding so
+    the spatial border is not padded twice.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, pad_mode="constant"):
+        """Create the inner ``ops.Conv3d``.
+
+        Args:
+            in_channels / out_channels: channel counts of the inner conv.
+            kernel_size: int (used for all three axes) or ``(t, h, w)`` tuple.
+            stride: tuple used as-is, or an int applied to the temporal axis only.
+            dilation: applied to the temporal axis only.
+            pad_mode: ``"replicate"`` selects ``F.pad`` replicate padding with no
+                cache; any other value selects the cached causal path.
+        """
+        super().__init__()
+        if isinstance(kernel_size, int):
+            kernel_size = (kernel_size,) * 3
+
+        time_kernel, height_kernel, width_kernel = kernel_size
+        self.time_kernel_size = time_kernel
+        self.pad_mode = pad_mode
+
+        height_pad = (height_kernel - 1) // 2
+        width_pad = (width_kernel - 1) // 2
+        # F.pad order: (W left, W right, H top, H bottom, T front, T back)
+        self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_kernel - 1, 0)
+
+        stride = stride if isinstance(stride, tuple) else (stride, 1, 1)
+        dilation = (dilation, 1, 1)
+        # Replicate mode already pads H and W in forward(); padding again inside
+        # the conv would enlarge the output and blend in zero borders.
+        conv_padding = 0 if pad_mode == "replicate" else (0, height_pad, width_pad)
+        self.conv = ops.Conv3d(
+            in_channels, out_channels, kernel_size,
+            stride=stride, dilation=dilation,
+            padding=conv_padding,
+        )
+
+    def forward(self, x, conv_cache=None):
+        """Apply the causal convolution.
+
+        Args:
+            x: input with time on dim 2.
+            conv_cache: trailing frames returned by the previous call, or None.
+
+        Returns:
+            ``(out, conv_cache)``. The returned cache holds the last
+            ``time_kernel_size - 1`` frames of the padded input; it is None in
+            replicate mode, for temporal kernel size 1, and on the fast path.
+        """
+        if self.pad_mode == "replicate":
+            x = F.pad(x, self.time_causal_padding, mode="replicate")
+            return self.conv(x), None
+
+        kernel_t = self.time_kernel_size
+        if kernel_t <= 1:
+            # Nothing to pad or remember along time
+            return self.conv(x), None
+
+        if conv_cache is None and x.shape[2] == 1:
+            # Fast path: single frame, no cache. All temporal padding
+            # frames are copies of the input (replicate-style), so the
+            # 3D conv reduces to a 2D conv with summed temporal kernel.
+            w = comfy.ops.cast_to_input(self.conv.weight, x)
+            b = comfy.ops.cast_to_input(self.conv.bias, x) if self.conv.bias is not None else None
+            w2d = w.sum(dim=2, keepdim=True)
+            out = F.conv3d(x, w2d, b,
+                           self.conv.stride, self.conv.padding,
+                           self.conv.dilation, self.conv.groups)
+            return out, None
+
+        # Prepend the cached frames, or kernel_t - 1 copies of the first frame
+        cached = [conv_cache] if conv_cache is not None else [x[:, :, :1]] * (kernel_t - 1)
+        x = torch.cat(cached + [x], dim=2)
+        conv_cache = x[:, :, -kernel_t + 1:].clone()
+
+        out = self.conv(x)
+        return out, conv_cache
+
+
+def _interpolate_zq(zq, target_size):
+    """Resize latent ``zq`` to ``target_size`` = (T, H, W).
+
+    When the target frame count is odd and greater than one, the first frame is
+    resized on its own (to one frame) and the remaining frames are resized to
+    ``T - 1`` frames; otherwise the whole tensor is resized in one call.
+    """
+    frames, height, width = target_size[0], target_size[1], target_size[2]
+    if frames <= 1 or frames % 2 == 0:
+        return F.interpolate(zq, size=target_size)
+
+    head = F.interpolate(zq[:, :, :1], size=(1, height, width))
+    tail = F.interpolate(zq[:, :, 1:], size=(frames - 1, height, width))
+    return torch.cat([head, tail], dim=2)
+
+
+class SpatialNorm3D(nn.Module):
+    """Group norm whose scale and shift are predicted from a latent ``zq``.
+
+    ``forward`` returns ``norm(f) * conv_y(zq) + conv_b(zq)`` together with the
+    caches produced by the two 1x1x1 causal convs.
+    """
+    def __init__(self, f_channels, zq_channels, groups=32):
+        super().__init__()
+        self.norm_layer = ops.GroupNorm(num_channels=f_channels, num_groups=groups, eps=1e-6, affine=True)
+        self.conv_y = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+        self.conv_b = CausalConv3d(zq_channels, f_channels, kernel_size=1, stride=1)
+
+    def forward(self, f, zq, conv_cache=None):
+        previous = conv_cache or {}
+
+        # Bring zq to the (T, H, W) of the feature map if it differs
+        target = f.shape[-3:]
+        if zq.shape[-3:] != target:
+            zq = _interpolate_zq(zq, target)
+
+        scale, scale_cache = self.conv_y(zq, conv_cache=previous.get("conv_y"))
+        shift, shift_cache = self.conv_b(zq, conv_cache=previous.get("conv_b"))
+
+        normed = self.norm_layer(f)
+        return normed * scale + shift, {"conv_y": scale_cache, "conv_b": shift_cache}
+
+
+class ResnetBlock3D(nn.Module):
+    """3D ResNet block with optional spatial norm.
+
+    Layout: norm1 -> SiLU -> conv1 -> (+ time embedding) -> norm2 -> SiLU ->
+    conv2, added to the (optionally 1x1x1-projected) input. With
+    ``spatial_norm_dim`` set, both norms are ``SpatialNorm3D`` and need ``zq``.
+    """
+    def __init__(self, in_channels, out_channels=None, temb_channels=512, groups=32,
+                 eps=1e-6, act_fn="silu", spatial_norm_dim=None, pad_mode="first"):
+        """Create the block.
+
+        Args:
+            in_channels: input channel count.
+            out_channels: output channel count; falls back to ``in_channels``.
+            temb_channels: width of the time embedding; ``<= 0`` disables the
+                projection layer.
+            groups, eps: GroupNorm settings (``eps`` is only used by the plain
+                GroupNorm variant).
+            act_fn: kept for config compatibility. Every value has always
+                selected SiLU in this block, so it is not inspected.
+            spatial_norm_dim: channel count of ``zq``; None selects plain GroupNorm.
+            pad_mode: forwarded to both causal convs.
+        """
+        super().__init__()
+        out_channels = out_channels or in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.spatial_norm_dim = spatial_norm_dim
+
+        # "silu", "swish" and any unknown name all mapped to SiLU before; one
+        # assignment replaces the three identical branches.
+        self.nonlinearity = nn.SiLU()
+
+        if spatial_norm_dim is None:
+            self.norm1 = ops.GroupNorm(num_channels=in_channels, num_groups=groups, eps=eps)
+            self.norm2 = ops.GroupNorm(num_channels=out_channels, num_groups=groups, eps=eps)
+        else:
+            self.norm1 = SpatialNorm3D(in_channels, spatial_norm_dim, groups=groups)
+            self.norm2 = SpatialNorm3D(out_channels, spatial_norm_dim, groups=groups)
+
+        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
+
+        # Always define the attribute (None when unused), mirroring conv_shortcut,
+        # so forward() does not need hasattr().
+        self.temb_proj = ops.Linear(temb_channels, out_channels) if temb_channels > 0 else None
+
+        self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, pad_mode=pad_mode)
+
+        if in_channels != out_channels:
+            self.conv_shortcut = ops.Conv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+        else:
+            self.conv_shortcut = None
+
+    def forward(self, x, temb=None, zq=None, conv_cache=None):
+        """Run the block.
+
+        Args:
+            x: input feature map.
+            temb: optional time embedding; used only when the projection exists.
+            zq: latent for the spatial norms; None selects the plain-norm call.
+            conv_cache: dict of caches from the previous call, or None.
+
+        Returns:
+            ``(output, new_cache)`` where ``new_cache`` holds the caches of
+            conv1 / conv2 and, when ``zq`` is given, norm1 / norm2.
+        """
+        new_cache = {}
+        conv_cache = conv_cache or {}
+        residual = x
+
+        if zq is not None:
+            x, new_cache["norm1"] = self.norm1(x, zq, conv_cache=conv_cache.get("norm1"))
+        else:
+            x = self.norm1(x)
+
+        x = self.nonlinearity(x)
+        x, new_cache["conv1"] = self.conv1(x, conv_cache=conv_cache.get("conv1"))
+
+        if temb is not None and self.temb_proj is not None:
+            # Broadcast the per-channel projection over T, H and W
+            x = x + self.temb_proj(self.nonlinearity(temb))[:, :, None, None, None]
+
+        if zq is not None:
+            x, new_cache["norm2"] = self.norm2(x, zq, conv_cache=conv_cache.get("norm2"))
+        else:
+            x = self.norm2(x)
+
+        x = self.nonlinearity(x)
+        x, new_cache["conv2"] = self.conv2(x, conv_cache=conv_cache.get("conv2"))
+
+        if self.conv_shortcut is not None:
+            residual = self.conv_shortcut(residual)
+
+        return x + residual, new_cache
+
+
+class Downsample3D(nn.Module):
+    """Per-frame strided 2D conv downsampling, with optional temporal halving.
+
+    With ``compress_time`` the time axis is average-pooled by 2 first; for an
+    odd frame count the first frame is kept as-is and only the rest is pooled.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=3, stride=2, padding=0, compress_time=False):
+        super().__init__()
+        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, x):
+        if self.compress_time:
+            batch, chans, frames, height, width = x.shape
+            # Put time last so avg_pool1d pools over frames: [B*H*W, C, T]
+            seq = x.permute(0, 3, 4, 1, 2).reshape(batch * height * width, chans, frames)
+            if frames % 2 == 1:
+                first, rest = seq[..., 0], seq[..., 1:]
+                if rest.shape[-1] > 0:
+                    rest = F.avg_pool1d(rest, kernel_size=2, stride=2)
+                seq = torch.cat([first[..., None], rest], dim=-1)
+            else:
+                seq = F.avg_pool1d(seq, kernel_size=2, stride=2)
+            x = seq.reshape(batch, height, width, chans, seq.shape[-1]).permute(0, 3, 4, 1, 2)
+
+        # One extra zero column on the right and one zero row at the bottom
+        x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+
+        # Fold time into batch and run the 2D conv frame by frame
+        batch, chans, frames, height, width = x.shape
+        frames_2d = x.permute(0, 2, 1, 3, 4).reshape(batch * frames, chans, height, width)
+        frames_2d = self.conv(frames_2d)
+        out_c, out_h, out_w = frames_2d.shape[1], frames_2d.shape[2], frames_2d.shape[3]
+        return frames_2d.reshape(batch, frames, out_c, out_h, out_w).permute(0, 2, 1, 3, 4)
+
+
+class Upsample3D(nn.Module):
+    """2x upsampling followed by a per-frame 2D conv.
+
+    Without ``compress_time`` only H and W are doubled. With it, time is doubled
+    too, except that a single frame (or the first frame of an odd-length clip)
+    is only upsampled spatially.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1, compress_time=False):
+        super().__init__()
+        self.conv = ops.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
+        self.compress_time = compress_time
+
+    def forward(self, x):
+        frames = x.shape[2]
+        if not self.compress_time:
+            # Spatial-only: fold time into batch and upsample each frame
+            batch, chans, steps, height, width = x.shape
+            flat = x.permute(0, 2, 1, 3, 4).reshape(batch * steps, chans, height, width)
+            flat = F.interpolate(flat, scale_factor=2.0)
+            x = flat.reshape(batch, steps, chans, *flat.shape[2:]).permute(0, 2, 1, 3, 4)
+        elif frames <= 1:
+            # Single frame: spatial upsample only, then restore the time axis
+            single = F.interpolate(x.squeeze(2), scale_factor=2.0)
+            x = single[:, :, None, :, :]
+        elif frames % 2 == 1:
+            # Odd length: first frame spatial-only, the rest in time and space
+            head = F.interpolate(x[:, :, 0], scale_factor=2.0)
+            tail = F.interpolate(x[:, :, 1:], scale_factor=2.0)
+            x = torch.cat([head[:, :, None, :, :], tail], dim=2)
+        else:
+            x = F.interpolate(x, scale_factor=2.0)
+
+        # Per-frame 2D conv
+        batch, chans, steps, height, width = x.shape
+        flat = x.permute(0, 2, 1, 3, 4).reshape(batch * steps, chans, height, width)
+        flat = self.conv(flat)
+        return flat.reshape(batch, steps, *flat.shape[1:]).permute(0, 2, 1, 3, 4)
+
+
+class DownBlock3D(nn.Module):
+    """A run of ``ResnetBlock3D`` layers, optionally followed by one ``Downsample3D``."""
+    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
+                 eps=1e-6, act_fn="silu", groups=32, add_downsample=True,
+                 compress_time=False, pad_mode="first"):
+        super().__init__()
+        layers = []
+        for idx in range(num_layers):
+            # Only the first layer changes the channel count
+            layers.append(ResnetBlock3D(
+                in_channels=out_channels if idx else in_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels,
+                groups=groups, eps=eps, act_fn=act_fn, pad_mode=pad_mode,
+            ))
+        self.resnets = nn.ModuleList(layers)
+
+        if add_downsample:
+            self.downsamplers = nn.ModuleList([Downsample3D(out_channels, out_channels, compress_time=compress_time)])
+        else:
+            self.downsamplers = None
+
+    def forward(self, x, temb=None, zq=None, conv_cache=None):
+        """Return ``(x, new_cache)``; ``new_cache`` has one ``resnet_{i}`` entry per layer."""
+        previous = conv_cache or {}
+        new_cache = {}
+        for idx, resnet in enumerate(self.resnets):
+            name = f"resnet_{idx}"
+            x, new_cache[name] = resnet(x, temb, zq, conv_cache=previous.get(name))
+
+        if self.downsamplers is not None:
+            for sampler in self.downsamplers:
+                x = sampler(x)
+        return x, new_cache
+
+
+class MidBlock3D(nn.Module):
+    """A run of channel-preserving ``ResnetBlock3D`` layers."""
+    def __init__(self, in_channels, temb_channels=0, num_layers=1,
+                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=None, pad_mode="first"):
+        super().__init__()
+        layers = []
+        for _ in range(num_layers):
+            layers.append(ResnetBlock3D(
+                in_channels=in_channels, out_channels=in_channels,
+                temb_channels=temb_channels, groups=groups, eps=eps,
+                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
+            ))
+        self.resnets = nn.ModuleList(layers)
+
+    def forward(self, x, temb=None, zq=None, conv_cache=None):
+        """Return ``(x, new_cache)``; ``new_cache`` has one ``resnet_{i}`` entry per layer."""
+        previous = conv_cache or {}
+        new_cache = {}
+        for idx, resnet in enumerate(self.resnets):
+            name = f"resnet_{idx}"
+            x, new_cache[name] = resnet(x, temb, zq, conv_cache=previous.get(name))
+        return x, new_cache
+
+
+class UpBlock3D(nn.Module):
+    """A run of ``ResnetBlock3D`` layers, optionally followed by one ``Upsample3D``."""
+    def __init__(self, in_channels, out_channels, temb_channels=0, num_layers=1,
+                 eps=1e-6, act_fn="silu", groups=32, spatial_norm_dim=16,
+                 add_upsample=True, compress_time=False, pad_mode="first"):
+        super().__init__()
+        layers = []
+        for idx in range(num_layers):
+            # Only the first layer changes the channel count
+            layers.append(ResnetBlock3D(
+                in_channels=out_channels if idx else in_channels,
+                out_channels=out_channels,
+                temb_channels=temb_channels, groups=groups, eps=eps,
+                act_fn=act_fn, spatial_norm_dim=spatial_norm_dim, pad_mode=pad_mode,
+            ))
+        self.resnets = nn.ModuleList(layers)
+
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, out_channels, compress_time=compress_time)])
+        else:
+            self.upsamplers = None
+
+    def forward(self, x, temb=None, zq=None, conv_cache=None):
+        """Return ``(x, new_cache)``; ``new_cache`` has one ``resnet_{i}`` entry per layer."""
+        previous = conv_cache or {}
+        new_cache = {}
+        for idx, resnet in enumerate(self.resnets):
+            name = f"resnet_{idx}"
+            x, new_cache[name] = resnet(x, temb, zq, conv_cache=previous.get(name))
+
+        if self.upsamplers is not None:
+            for sampler in self.upsamplers:
+                x = sampler(x)
+        return x, new_cache
+
+
+class Encoder3D(nn.Module):
+    """Encoder: causal conv in, one ``DownBlock3D`` per entry of
+    ``block_out_channels``, a two-layer mid block, then GroupNorm + SiLU and a
+    causal conv producing ``2 * out_channels`` channels.
+    """
+    def __init__(self, in_channels=3, out_channels=16,
+                 block_out_channels=(128, 256, 256, 512),
+                 layers_per_block=3, act_fn="silu",
+                 eps=1e-6, groups=32, pad_mode="first",
+                 temporal_compression_ratio=4):
+        super().__init__()
+        temporal_levels = int(np.log2(temporal_compression_ratio))
+        first_width, last_width = block_out_channels[0], block_out_channels[-1]
+
+        self.conv_in = CausalConv3d(in_channels, first_width, kernel_size=3, pad_mode=pad_mode)
+
+        self.down_blocks = nn.ModuleList()
+        final_level = len(block_out_channels) - 1
+        previous_width = first_width
+        for level, width in enumerate(block_out_channels):
+            # Every level but the last downsamples; the first `temporal_levels`
+            # levels also halve the frame count.
+            self.down_blocks.append(DownBlock3D(
+                in_channels=previous_width, out_channels=width,
+                temb_channels=0, num_layers=layers_per_block,
+                eps=eps, act_fn=act_fn, groups=groups,
+                add_downsample=level != final_level,
+                compress_time=level < temporal_levels,
+            ))
+            previous_width = width
+
+        self.mid_block = MidBlock3D(
+            in_channels=last_width, temb_channels=0,
+            num_layers=2, eps=eps, act_fn=act_fn, groups=groups, pad_mode=pad_mode,
+        )
+
+        self.norm_out = ops.GroupNorm(groups, last_width, eps=1e-6)
+        self.conv_act = nn.SiLU()
+        self.conv_out = CausalConv3d(last_width, 2 * out_channels, kernel_size=3, pad_mode=pad_mode)
+
+    def forward(self, x, conv_cache=None):
+        """Return ``(encoded, new_cache)``; pass ``new_cache`` back in for the next chunk."""
+        previous = conv_cache or {}
+        new_cache = {}
+
+        x, new_cache["conv_in"] = self.conv_in(x, conv_cache=previous.get("conv_in"))
+
+        for level, block in enumerate(self.down_blocks):
+            name = f"down_block_{level}"
+            x, new_cache[name] = block(x, None, None, previous.get(name))
+
+        x, new_cache["mid_block"] = self.mid_block(x, None, None, conv_cache=previous.get("mid_block"))
+
+        x = self.conv_act(self.norm_out(x))
+        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=previous.get("conv_out"))
+
+        return x, new_cache
+
+
+class Decoder3D(nn.Module):
+    """Decoder: causal conv in, a two-layer mid block, one ``UpBlock3D`` per
+    entry of ``block_out_channels`` (walked in reverse), then a spatial norm,
+    SiLU and a causal conv out. The input latent also conditions every
+    spatial norm as ``zq``.
+    """
+    def __init__(self, in_channels=16, out_channels=3,
+                 block_out_channels=(128, 256, 256, 512),
+                 layers_per_block=3, act_fn="silu",
+                 eps=1e-6, groups=32, pad_mode="first",
+                 temporal_compression_ratio=4):
+        super().__init__()
+        widths = list(reversed(block_out_channels))
+        temporal_levels = int(np.log2(temporal_compression_ratio))
+        first_width, last_width = widths[0], widths[-1]
+
+        self.conv_in = CausalConv3d(in_channels, first_width, kernel_size=3, pad_mode=pad_mode)
+
+        self.mid_block = MidBlock3D(
+            in_channels=first_width, temb_channels=0,
+            num_layers=2, eps=eps, act_fn=act_fn, groups=groups,
+            spatial_norm_dim=in_channels, pad_mode=pad_mode,
+        )
+
+        self.up_blocks = nn.ModuleList()
+        final_level = len(block_out_channels) - 1
+        previous_width = first_width
+        for level, width in enumerate(widths):
+            # Every level but the last upsamples; the first `temporal_levels`
+            # levels also expand the frame count.
+            self.up_blocks.append(UpBlock3D(
+                in_channels=previous_width, out_channels=width,
+                temb_channels=0, num_layers=layers_per_block + 1,
+                eps=eps, act_fn=act_fn, groups=groups,
+                spatial_norm_dim=in_channels,
+                add_upsample=level != final_level,
+                compress_time=level < temporal_levels,
+            ))
+            previous_width = width
+
+        self.norm_out = SpatialNorm3D(last_width, in_channels, groups=groups)
+        self.conv_act = nn.SiLU()
+        self.conv_out = CausalConv3d(last_width, out_channels, kernel_size=3, pad_mode=pad_mode)
+
+    def forward(self, sample, conv_cache=None):
+        """Return ``(decoded, new_cache)``; pass ``new_cache`` back in for the next chunk."""
+        previous = conv_cache or {}
+        new_cache = {}
+
+        x, new_cache["conv_in"] = self.conv_in(sample, conv_cache=previous.get("conv_in"))
+        x, new_cache["mid_block"] = self.mid_block(x, None, sample, conv_cache=previous.get("mid_block"))
+
+        for level, block in enumerate(self.up_blocks):
+            name = f"up_block_{level}"
+            x, new_cache[name] = block(x, None, sample, conv_cache=previous.get(name))
+
+        x, new_cache["norm_out"] = self.norm_out(x, sample, conv_cache=previous.get("norm_out"))
+        x = self.conv_act(x)
+        x, new_cache["conv_out"] = self.conv_out(x, conv_cache=previous.get("conv_out"))
+
+        return x, new_cache
+
+
+
+class AutoencoderKLCogVideoX(nn.Module):
+    """CogVideoX VAE. Spatial tiling/slicing handled by ComfyUI's VAE wrapper.
+
+    Uses rolling temporal decode: conv_in + mid_block + temporal up_blocks run
+    on the full (low-res) tensor, then the expensive spatial-only up_blocks +
+    norm_out + conv_out are processed in small temporal chunks with conv_cache
+    carrying causal state between chunks. This keeps peak VRAM proportional to
+    chunk_size rather than total frame count.
+    """
+
+    def __init__(self,
+                 in_channels=3, out_channels=3,
+                 block_out_channels=(128, 256, 256, 512),
+                 latent_channels=16, layers_per_block=3,
+                 act_fn="silu", eps=1e-6, groups=32,
+                 temporal_compression_ratio=4,
+                 ):
+        """Build the encoder and decoder from the same channel/layer settings."""
+        super().__init__()
+        self.latent_channels = latent_channels
+        self.temporal_compression_ratio = temporal_compression_ratio
+
+        self.encoder = Encoder3D(
+            in_channels=in_channels, out_channels=latent_channels,
+            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
+            act_fn=act_fn, eps=eps, groups=groups,
+            temporal_compression_ratio=temporal_compression_ratio,
+        )
+        self.decoder = Decoder3D(
+            in_channels=latent_channels, out_channels=out_channels,
+            block_out_channels=block_out_channels, layers_per_block=layers_per_block,
+            act_fn=act_fn, eps=eps, groups=groups,
+            temporal_compression_ratio=temporal_compression_ratio,
+        )
+
+        # Chunk sizes along time: latent frames per step in _decode_batched,
+        # pixel frames per step in encode.
+        self.num_latent_frames_batch_size = 2
+        self.num_sample_frames_batch_size = 8
+
+    def encode(self, x):
+        """Encode a video in temporal chunks and return the latent mean.
+
+        ``x`` has time on dim 2. The encoder's conv cache is carried from one
+        chunk to the next. The encoder output is split in two along channels
+        and only the first half (the mean) is returned.
+        """
+        t = x.shape[2]
+        frame_batch = self.num_sample_frames_batch_size
+        remainder = t % frame_batch
+        conv_cache = None
+        enc = []
+
+        # Process remainder frames first so only the first chunk can have an
+        # odd temporal dimension — where Downsample3D's first-frame-special
+        # handling in temporal compression is actually correct.
+        if remainder > 0:
+            chunk, conv_cache = self.encoder(x[:, :, :remainder], conv_cache=conv_cache)
+            enc.append(chunk.to(x.device))
+
+        for start in range(remainder, t, frame_batch):
+            chunk, conv_cache = self.encoder(x[:, :, start:start + frame_batch], conv_cache=conv_cache)
+            enc.append(chunk.to(x.device))
+
+        enc = torch.cat(enc, dim=2)
+        mean, _ = enc.chunk(2, dim=1)
+        return mean
+
+    def decode(self, z):
+        """Decode latent ``z``; always delegates to the rolling decoder."""
+        return self._decode_rolling(z)
+
+    def _decode_batched(self, z):
+        """Original batched decode - processes 2 latent frames through full decoder."""
+        # Not called from within this class; decode() uses _decode_rolling.
+        t = z.shape[2]
+        frame_batch = self.num_latent_frames_batch_size
+        num_batches = max(t // frame_batch, 1)
+        conv_cache = None
+        dec = []
+        for i in range(num_batches):
+            # Leftover frames (t % frame_batch) are folded into the first chunk:
+            # chunk 0 starts at 0 and every chunk end is shifted by `remaining`.
+            remaining = t % frame_batch
+            start = frame_batch * i + (0 if i == 0 else remaining)
+            end = frame_batch * (i + 1) + remaining
+            chunk, conv_cache = self.decoder(z[:, :, start:end], conv_cache=conv_cache)
+            # Decoded chunks are parked on the CPU until the final concatenation
+            dec.append(chunk.cpu())
+        return torch.cat(dec, dim=2).to(z.device)
+
+    def _decode_rolling(self, z):
+        """Rolling decode - processes low-res layers on full tensor, then rolls
+        through expensive high-res layers in temporal chunks."""
+        decoder = self.decoder
+        device = z.device
+
+        # Determine which up_blocks have temporal upsample vs spatial-only.
+        # Temporal up_blocks are cheap (low res), spatial-only are expensive.
+        temporal_compress_level = int(np.log2(self.temporal_compression_ratio))
+        split_at = temporal_compress_level  # first N up_blocks do temporal upsample
+
+        # Phase 1: conv_in + mid_block + temporal up_blocks on full tensor (low/medium res)
+        # No conv_cache is passed here and the returned caches are discarded.
+        x, _ = decoder.conv_in(z)
+        x, _ = decoder.mid_block(x, None, z)
+
+        for i in range(split_at):
+            x, _ = decoder.up_blocks[i](x, None, z)
+
+        # Phase 2: remaining spatial-only up_blocks + norm_out + conv_out in temporal chunks
+        remaining_blocks = list(range(split_at, len(decoder.up_blocks)))
+        chunk_size = 4  # pixel frames per chunk through high-res layers
+        t_expanded = x.shape[2]
+
+        if t_expanded <= chunk_size or len(remaining_blocks) == 0:
+            # Small enough to process in one go
+            for i in remaining_blocks:
+                x, _ = decoder.up_blocks[i](x, None, z)
+            x, _ = decoder.norm_out(x, z)
+            x = decoder.conv_act(x)
+            x, _ = decoder.conv_out(x)
+            return x
+
+        # Expand z temporally once to match Phase 2's time dimension.
+        # z stays at latent spatial resolution so this is small (~16 MB vs ~1.3 GB
+        # for the old approach of pre-interpolating to every pixel resolution).
+        z_time_expanded = _interpolate_zq(z, (t_expanded, z.shape[3], z.shape[4]))
+
+        # Process in temporal chunks, interpolating spatially per-chunk to avoid
+        # allocating full [B, C, t_expanded, H, W] tensors at each resolution.
+        dec_out = []
+        # Per-layer conv caches, carried from one temporal chunk to the next
+        conv_caches = {}
+
+        for chunk_start in range(0, t_expanded, chunk_size):
+            chunk_end = min(chunk_start + chunk_size, t_expanded)
+            x_chunk = x[:, :, chunk_start:chunk_end]
+            z_t_chunk = z_time_expanded[:, :, chunk_start:chunk_end]
+            # Spatially resized copies of this chunk's z, keyed by (H, W)
+            z_spatial_cache = {}
+
+            for i in remaining_blocks:
+                block = decoder.up_blocks[i]
+                cache_key = f"up_block_{i}"
+                hw_key = (x_chunk.shape[3], x_chunk.shape[4])
+                if hw_key not in z_spatial_cache:
+                    if z_t_chunk.shape[3] == hw_key[0] and z_t_chunk.shape[4] == hw_key[1]:
+                        z_spatial_cache[hw_key] = z_t_chunk
+                    else:
+                        z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
+                x_chunk, new_cache = block(x_chunk, None, z_spatial_cache[hw_key], conv_cache=conv_caches.get(cache_key))
+                conv_caches[cache_key] = new_cache
+
+            # The last block may have changed (H, W); look up or build z at that size
+            hw_key = (x_chunk.shape[3], x_chunk.shape[4])
+            if hw_key not in z_spatial_cache:
+                z_spatial_cache[hw_key] = F.interpolate(z_t_chunk, size=(z_t_chunk.shape[2], hw_key[0], hw_key[1]))
+            x_chunk, new_cache = decoder.norm_out(x_chunk, z_spatial_cache[hw_key], conv_cache=conv_caches.get("norm_out"))
+            conv_caches["norm_out"] = new_cache
+            x_chunk = decoder.conv_act(x_chunk)
+            x_chunk, new_cache = decoder.conv_out(x_chunk, conv_cache=conv_caches.get("conv_out"))
+            conv_caches["conv_out"] = new_cache
+
+            # Finished chunks are parked on the CPU until the final concatenation
+            dec_out.append(x_chunk.cpu())
+            del z_spatial_cache
+
+        del x, z_time_expanded
+        return torch.cat(dec_out, dim=2).to(device)
diff --git a/comfy/ldm/ernie/model.py b/comfy/ldm/ernie/model.py
index 1f8f08376..eba661aec 100644
--- a/comfy/ldm/ernie/model.py
+++ b/comfy/ldm/ernie/model.py
@@ -15,7 +15,7 @@ def rope(pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=device) / dim
omega = 1.0 / (theta**scale)
- out = torch.einsum("...n,d->...nd", pos, omega)
+ out = torch.einsum("...n,d->...nd", pos.to(device), omega)
out = torch.stack([torch.cos(out), torch.sin(out)], dim=0)
return out.to(dtype=torch.float32, device=pos.device)
@@ -118,8 +118,6 @@ class ErnieImageAttention(nn.Module):
query = apply_rotary_emb(query, image_rotary_emb)
key = apply_rotary_emb(key, image_rotary_emb)
- query, key = query.to(x.dtype), key.to(x.dtype)
-
q_flat = query.reshape(B, S, -1)
k_flat = key.reshape(B, S, -1)
@@ -161,16 +159,16 @@ class ErnieImageSharedAdaLNBlock(nn.Module):
residual = x
x_norm = self.adaLN_sa_ln(x)
- x_norm = (x_norm.float() * (1 + scale_msa.float()) + shift_msa.float()).to(x.dtype)
+ x_norm = x_norm * (1 + scale_msa) + shift_msa
attn_out = self.self_attention(x_norm, attention_mask=attention_mask, image_rotary_emb=rotary_pos_emb)
- x = residual + (gate_msa.float() * attn_out.float()).to(x.dtype)
+ x = residual + gate_msa * attn_out
residual = x
x_norm = self.adaLN_mlp_ln(x)
- x_norm = (x_norm.float() * (1 + scale_mlp.float()) + shift_mlp.float()).to(x.dtype)
+ x_norm = x_norm * (1 + scale_mlp) + shift_mlp
- return residual + (gate_mlp.float() * self.mlp(x_norm).float()).to(x.dtype)
+ return residual + gate_mlp * self.mlp(x_norm)
class ErnieImageAdaLNContinuous(nn.Module):
def __init__(self, hidden_size: int, eps: float = 1e-6, operations=None, device=None, dtype=None):
@@ -183,7 +181,7 @@ class ErnieImageAdaLNContinuous(nn.Module):
def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
scale, shift = self.linear(conditioning).chunk(2, dim=-1)
x = self.norm(x)
- x = x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+ x = torch.addcmul(shift.unsqueeze(1), x, 1 + scale.unsqueeze(1))
return x
class ErnieImageModel(nn.Module):
@@ -279,7 +277,7 @@ class ErnieImageModel(nn.Module):
rotary_pos_emb = self.pos_embed(torch.cat([image_ids, text_ids], dim=1)).to(x.dtype)
del image_ids, text_ids
- sample = self.time_proj(timesteps.to(dtype)).to(self.time_embedding.linear_1.weight.dtype)
+ sample = self.time_proj(timesteps).to(dtype)
c = self.time_embedding(sample)
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = [
diff --git a/comfy/ldm/hidream_o1/attention.py b/comfy/ldm/hidream_o1/attention.py
new file mode 100644
index 000000000..1b68f1771
--- /dev/null
+++ b/comfy/ldm/hidream_o1/attention.py
@@ -0,0 +1,41 @@
+"""HiDream-O1 two-pass attention: tokens [0, ar_len) are causal, [ar_len, T)
+attend full K/V. Splitting Q at the boundary avoids the (B, 1, T, T) additive
+mask the general-purpose path would build (~500 MB at T~16K) and lets the
+gen half hit the user's preferred backend via optimized_attention.
+"""
+
+import torch
+
+import comfy.ops
+from comfy.ldm.modules.attention import optimized_attention
+
+
+def make_two_pass_attention(ar_len: int, transformer_options=None):
+    """Build a two-pass attention callable. AR pass uses SDPA-causal directly, gen pass routes through optimized_attention.
+    The AR pass goes through SDPA directly and bypasses wrappers; it is only ~1% of T at typical edit sizes.
+    """
+
+    def two_pass_attention(q, k, v, heads, **kwargs):
+        B, H, T, D = q.shape  # q is unpacked as (batch, heads, tokens, head_dim); extra kwargs are ignored
+
+        if T < k.shape[2]:  # KV-cache hot path: Q is shorter than K/V (cached AR prefix is in K/V only), all fresh Q positions are in the gen region, single full-attention call
+            out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
+        elif ar_len >= T:  # every token is in the AR region: one causal SDPA call
+            out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
+        elif ar_len <= 0:  # no AR region: unmasked attention over the whole sequence
+            out = optimized_attention(q, k, v, heads, mask=None, skip_reshape=True, skip_output_reshape=True, transformer_options=transformer_options)
+        else:  # mixed: causal over the first ar_len tokens, unmasked attention for the remaining queries
+            out_ar = comfy.ops.scaled_dot_product_attention(
+                q[:, :, :ar_len], k[:, :, :ar_len], v[:, :, :ar_len],
+                attn_mask=None, dropout_p=0.0, is_causal=True,
+            )
+            out_gen = optimized_attention(
+                q[:, :, ar_len:], k, v, heads,
+                mask=None, skip_reshape=True, skip_output_reshape=True,
+                transformer_options=transformer_options,
+            )
+            out = torch.cat([out_ar, out_gen], dim=2)  # re-join along the token axis in original order
+
+        return out.transpose(1, 2).reshape(B, T, H * D)  # (B, H, T, D) -> (B, T, H * D)
+
+    return two_pass_attention
diff --git a/comfy/ldm/hidream_o1/conditioning.py b/comfy/ldm/hidream_o1/conditioning.py
new file mode 100644
index 000000000..7496f0035
--- /dev/null
+++ b/comfy/ldm/hidream_o1/conditioning.py
@@ -0,0 +1,230 @@
+"""HiDream-O1 conditioning prep — ref-image dual path + extra_conds assembly.
+
+Each ref image goes through two paths: a 32x32 patchified stream concatenated
+to the noised target, and a Qwen3-VL ViT path producing tokens that scatter
+into input_ids at <|image_pad|> positions.
+"""
+
+from typing import List
+
+import torch
+
+import comfy.utils
+from comfy.text_encoders.qwen_vl import process_qwen2vl_images
+
+from .utils import (PATCH_SIZE, calculate_dimensions, cond_image_size, ref_max_size, resize_tensor)
+
+# Qwen3-VL ViT preprocessing constants (preprocessor_config.json).
+VIT_PATCH = 16
+VIT_MERGE = 2
+VIT_IMAGE_MEAN = [0.5, 0.5, 0.5]
+VIT_IMAGE_STD = [0.5, 0.5, 0.5]
+
+
+def prepare_ref_images(
+    ref_images: List[torch.Tensor],
+    target_h: int,
+    target_w: int,
+    device: torch.device,
+    dtype: torch.dtype,
+):
+    """Build the dual-path tensors for K reference images at (target_h, target_w).
+    Only the first batch item (img[0]) of each reference image is used.
+    Returns None for K=0, else a dict with ref_patches, ref_pixel_values,
+    ref_image_grid_thw, per_ref_vit_tokens, per_ref_patch_grids.
+    """
+    K = len(ref_images)
+    if K == 0:
+        return None
+    max_size = ref_max_size(max(target_h, target_w), K)  # size budget for the patchified path, depends on K
+    cis = cond_image_size(K)  # size budget for the ViT path, depends on K
+
+    refs_t = [img[0].clamp(0, 1).permute(2, 0, 1).unsqueeze(0).contiguous().float() for img in ref_images]  # assumes each img is (B, H, W, C) -> (1, C, H, W); TODO confirm
+    refs_t = [resize_tensor(t, max_size, PATCH_SIZE) for t in refs_t]
+
+    # 32-patch path.
+    ref_patches_per = []
+    per_ref_patch_grids = []
+    for t in refs_t:
+        t_norm = (t.squeeze(0) - 0.5) / 0.5  # (3, H, W) in [-1, 1]
+        h_p, w_p = t_norm.shape[-2] // PATCH_SIZE, t_norm.shape[-1] // PATCH_SIZE
+        per_ref_patch_grids.append((h_p, w_p))
+        patches = (
+            t_norm.reshape(3, h_p, PATCH_SIZE, w_p, PATCH_SIZE)
+            .permute(1, 3, 0, 2, 4)
+            .reshape(h_p * w_p, 3 * PATCH_SIZE * PATCH_SIZE)
+        )  # one row per patch, flattened as (channel, row, col)
+        ref_patches_per.append(patches)
+    ref_patches = torch.cat(ref_patches_per, dim=0).unsqueeze(0).to(device=device, dtype=dtype)  # (1, total patches over all refs, 3 * PATCH_SIZE**2)
+
+    # ViT path.
+    refs_vlm_t = []
+    for t in refs_t:
+        _, _, h, w = t.shape
+        cond_w, cond_h = calculate_dimensions(cis, w / h)
+        cond_w = max(cond_w, VIT_PATCH * VIT_MERGE)  # never smaller than one merged ViT cell
+        cond_h = max(cond_h, VIT_PATCH * VIT_MERGE)
+        refs_vlm_t.append(comfy.utils.common_upscale(t, cond_w, cond_h, "lanczos", "disabled"))
+
+    pv_list, grid_list, per_ref_vit_tokens = [], [], []
+    for t_v in refs_vlm_t:
+        pv, grid_thw = process_qwen2vl_images(
+            t_v.permute(0, 2, 3, 1),
+            min_pixels=0, max_pixels=10**12,
+            patch_size=VIT_PATCH, merge_size=VIT_MERGE,
+            image_mean=VIT_IMAGE_MEAN, image_std=VIT_IMAGE_STD,
+        )  # NOTE(review): the pixel bounds look chosen so the helper does not resize again; confirm
+        grid_thw = grid_thw[0]
+        pv_list.append(pv.to(device=device, dtype=dtype))
+        grid_list.append(grid_thw.to(device=device))
+        # Post-merge token count = number of <|image_pad|> tokens this image expands to in input_ids.
+        gh, gw = int(grid_thw[1].item()), int(grid_thw[2].item())
+        per_ref_vit_tokens.append((gh // VIT_MERGE) * (gw // VIT_MERGE))
+
+    return {
+        "ref_patches": ref_patches,
+        "ref_pixel_values": torch.cat(pv_list, dim=0),
+        "ref_image_grid_thw": torch.stack(grid_list, dim=0),
+        "per_ref_vit_tokens": per_ref_vit_tokens,
+        "per_ref_patch_grids": per_ref_patch_grids,
+    }
+
+
+def build_ref_input_ids(
+    text_input_ids: torch.Tensor,
+    per_ref_vit_tokens: List[int],
+    image_token_id: int,
+    vision_start_id: int,
+    vision_end_id: int,
+    prefix_len: int = 3,
+):
+    """Splice one [vision_start, image_pad*N, vision_end] block per ref into row 0 of text_input_ids.
+    Blocks go after the first prefix_len tokens (default 3 = [im_start, user, \\n], matching the original chat template).
+    Only the first batch row is read; returns a (1, L) tensor with the dtype and device of text_input_ids."""
+    ids = text_input_ids[0].tolist()
+    inserted = []
+    for n_pad in per_ref_vit_tokens:
+        inserted.extend([vision_start_id] + [image_token_id] * n_pad + [vision_end_id])
+    return torch.tensor([ids[:prefix_len] + inserted + ids[prefix_len:]], dtype=text_input_ids.dtype, device=text_input_ids.device)
+
+
+def build_extra_conds(
+    text_input_ids: torch.Tensor,
+    noise: torch.Tensor,
+    ref_images: List[torch.Tensor] = None,
+    target_patch_size: int = 32,
+):
+    """Assemble all conditioning tensors for HiDreamO1Transformer.forward:
+    input_ids (with ref-vision tokens spliced in for the edit/IP path),
+    position_ids (MRoPE), token_types, vinput_mask, ar_len (index of the last
+    text token, where the gen region starts), plus the ref dual-path tensors
+    when refs are provided. Returns a dict keyed by those names.
+    """
+    from .utils import get_rope_index_fix_point
+    from comfy.text_encoders.hidream_o1 import (
+        IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
+    )
+
+    if text_input_ids.dim() == 1:
+        text_input_ids = text_input_ids.unsqueeze(0)
+    text_input_ids = text_input_ids.long().to(noise.device)
+    B = noise.shape[0]
+    if text_input_ids.shape[0] == 1 and B > 1:
+        text_input_ids = text_input_ids.expand(B, -1)
+
+    H, W = noise.shape[-2], noise.shape[-1]
+    h_p, w_p = H // target_patch_size, W // target_patch_size
+    image_len = h_p * w_p
+    image_grid_thw_tgt = torch.tensor(
+        [[1, h_p, w_p]], dtype=torch.long, device=text_input_ids.device,
+    )
+
+    out = {}
+    if ref_images:
+        ref = prepare_ref_images(ref_images, H, W, device=noise.device, dtype=noise.dtype)
+        text_input_ids = build_ref_input_ids(
+            text_input_ids, ref["per_ref_vit_tokens"],
+            IMAGE_TOKEN_ID, VISION_START_ID, VISION_END_ID,
+        )
+        new_txt_len = text_input_ids.shape[1]
+
+        # Each ref's patchified stream gets a [vision_start, image_pad*N-1]
+        # block in the position-id stream after the noised target.
+        ref_grid_lengths = [hp * wp for (hp, wp) in ref["per_ref_patch_grids"]]
+        tgt_vision = torch.full((1, image_len), IMAGE_TOKEN_ID,
+                                dtype=text_input_ids.dtype, device=text_input_ids.device)
+        tgt_vision[:, 0] = VISION_START_ID
+        ref_vision_blocks = []
+        for rl in ref_grid_lengths:
+            blk = torch.full((1, rl), IMAGE_TOKEN_ID,
+                             dtype=text_input_ids.dtype, device=text_input_ids.device)
+            blk[:, 0] = VISION_START_ID
+            ref_vision_blocks.append(blk)
+        ref_vision_cat = torch.cat([tgt_vision] + ref_vision_blocks, dim=1)
+        input_ids_pad = torch.cat([text_input_ids, ref_vision_cat], dim=-1)
+        total_ref_patches_len = sum(ref_grid_lengths)
+        total_len = new_txt_len + image_len + total_ref_patches_len
+
+        # K (ViT, post-merge) + 1 (target) + K (ref-patches) image grids. ViT grids shrink by VIT_MERGE, matching per_ref_vit_tokens.
+        K = len(ref_images)
+        igthw_cond = ref["ref_image_grid_thw"].clone()
+        igthw_cond[:, 1] //= VIT_MERGE
+        igthw_cond[:, 2] //= VIT_MERGE
+        image_grid_thw_ref = torch.tensor(
+            [[1, hp, wp] for (hp, wp) in ref["per_ref_patch_grids"]],
+            dtype=torch.long, device=text_input_ids.device,
+        )
+        igthw_all = torch.cat([
+            igthw_cond.to(text_input_ids.device),
+            image_grid_thw_tgt,
+            image_grid_thw_ref,
+        ], dim=0)
+        position_ids, _ = get_rope_index_fix_point(
+            spatial_merge_size=1,
+            image_token_id=IMAGE_TOKEN_ID,
+            vision_start_token_id=VISION_START_ID,
+            input_ids=input_ids_pad, image_grid_thw=igthw_all,
+            attention_mask=None,
+            skip_vision_start_token=[0] * K + [1] + [1] * K,
+            fix_point=4096,
+        )
+
+        # tms + target_image + ref_patches are all gen.
+        tms_pos = new_txt_len - 1
+        ar_len = tms_pos
+        token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
+        token_types[:, tms_pos:] = 1
+        vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
+        vinput_mask[:, new_txt_len:] = True
+
+        # Leading batch dim sidesteps CONDRegular.process_cond's repeat_to_batch_size truncation
+        out["ref_pixel_values"] = ref["ref_pixel_values"].unsqueeze(0)
+        out["ref_image_grid_thw"] = ref["ref_image_grid_thw"].unsqueeze(0)
+        out["ref_patches"] = ref["ref_patches"]
+    else:
+        # T2I: text + noised target only, vision_start replaces the first image token
+        txt_len = text_input_ids.shape[1]
+        total_len = txt_len + image_len
+        vision_tokens = torch.full((B, image_len), IMAGE_TOKEN_ID,
+                                   dtype=text_input_ids.dtype, device=text_input_ids.device)
+        vision_tokens[:, 0] = VISION_START_ID
+        input_ids_pad = torch.cat([text_input_ids, vision_tokens], dim=-1)
+        position_ids, _ = get_rope_index_fix_point(
+            spatial_merge_size=1,
+            image_token_id=IMAGE_TOKEN_ID,
+            vision_start_token_id=VISION_START_ID,
+            input_ids=input_ids_pad, image_grid_thw=image_grid_thw_tgt,
+            attention_mask=None,
+            skip_vision_start_token=[1],
+        )
+        ar_len = txt_len - 1
+        token_types = torch.zeros(B, total_len, dtype=torch.long, device=noise.device)
+        token_types[:, ar_len:] = 1
+        vinput_mask = torch.zeros(B, total_len, dtype=torch.bool, device=noise.device)
+        vinput_mask[:, txt_len:] = True
+
+    out["input_ids"] = text_input_ids
+    out["position_ids"] = position_ids[:, 0].unsqueeze(0)  # Collapse position_ids batch and add a leading dim so CONDRegular's batch-resize doesn't truncate the 3-axis MRoPE dim
+    out["token_types"] = token_types
+    out["vinput_mask"] = vinput_mask
+    out["ar_len"] = ar_len
+    return out
diff --git a/comfy/ldm/hidream_o1/model.py b/comfy/ldm/hidream_o1/model.py
new file mode 100644
index 000000000..a223e706f
--- /dev/null
+++ b/comfy/ldm/hidream_o1/model.py
@@ -0,0 +1,306 @@
+"""HiDream-O1-Image transformer.
+
+Pixel-space DiT built on Qwen3-VL: the vision tower (Qwen35VisionModel)
+encodes ref images, the Qwen3-VL-8B decoder (Llama2_ with interleaved MRoPE)
+processes a unified text+image sequence, and 32x32 patch embed/unembed
+shims map raw RGB in and out of LLM hidden space. The Qwen3-VL deepstack
+mergers go unused — their weights are dropped at load.
+"""
+
+from dataclasses import dataclass, field
+from typing import List, Optional
+
+import einops
+import torch
+import torch.nn as nn
+
+import comfy.patcher_extension
+from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
+from comfy.text_encoders.llama import Llama2_
+from comfy.text_encoders.qwen35 import Qwen35VisionModel
+
+from .attention import make_two_pass_attention
+
+
+IMAGE_TOKEN_ID = 151655 # Qwen3-VL <|image_pad|>
+TMS_TOKEN_ID = 151673 # HiDream-O1 <|tms_token|>
+PATCH_SIZE = 32
+
+
+@dataclass
+class HiDreamO1TextConfig:
+    """Qwen3-VL-8B text-decoder dims (matches public Qwen3-VL-8B-Instruct)."""
+    vocab_size: int = 151936
+    hidden_size: int = 4096  # HiDreamO1Transformer also uses this as the vision tower's out_hidden_size and the patch-embed / final-layer width
+    intermediate_size: int = 12288
+    num_hidden_layers: int = 36
+    num_attention_heads: int = 32
+    num_key_value_heads: int = 8
+    head_dim: int = 128  # num_attention_heads * head_dim = 4096 = hidden_size
+    max_position_embeddings: int = 128000
+    rms_norm_eps: float = 1e-6
+    rope_theta: float = 5000000.0
+    rope_scale: Optional[float] = None
+    rope_dims: List[int] = field(default_factory=lambda: [24, 20, 20])  # sums to 64 = head_dim // 2; presumably the per-axis MRoPE split - confirm against Llama2_
+    interleaved_mrope: bool = True
+    transformer_type: str = "llama"
+    rms_norm_add: bool = False
+    mlp_activation: str = "silu"
+    qkv_bias: bool = False
+    q_norm: str = "gemma3"
+    k_norm: str = "gemma3"
+    final_norm: bool = True
+    lm_head: bool = False
+    stop_tokens: List[int] = field(default_factory=lambda: [151643, 151645])
+
+
+QWEN3VL_VISION_DEFAULTS = dict(  # base config copied for Qwen35VisionModel in HiDreamO1Transformer.__init__, then updated with vision_config_overrides
+    hidden_size=1152,
+    num_heads=16,
+    intermediate_size=4304,
+    depth=27,
+    patch_size=16,
+    temporal_patch_size=2,
+    in_channels=3,
+    spatial_merge_size=2,
+    num_position_embeddings=2304,
+    deepstack_visual_indexes=(8, 16, 24),
+    out_hidden_size=4096,  # final merger projects directly into LLM hidden; overwritten with the text hidden_size at construction
+)
+
+
+class BottleneckPatchEmbed(nn.Module):
+    """Two chained linear layers; with defaults 3072 -> 1024 -> 4096 (raw 32x32 RGB patch -> bottleneck -> LLM hidden)."""
+    def __init__(self, patch_size=32, in_chans=3, pca_dim=1024, embed_dim=4096, bias=True, device=None, dtype=None, ops=None):
+        super().__init__()
+        self.proj1 = ops.Linear(patch_size * patch_size * in_chans, pca_dim, bias=False, device=device, dtype=dtype)  # bottleneck projection never has a bias
+        self.proj2 = ops.Linear(pca_dim, embed_dim, bias=bias, device=device, dtype=dtype)
+
+    def forward(self, x):
+        return self.proj2(self.proj1(x))  # no activation between the two projections
+
+
+class FinalLayer(nn.Module):
+    """Single biased linear layer; with defaults 4096 -> 3072 (LLM hidden -> flat pixel patch)."""
+    def __init__(self, hidden_size, patch_size=32, out_channels=3, device=None, dtype=None, ops=None):
+        super().__init__()
+        self.linear = ops.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        return self.linear(x)  # output width is patch_size * patch_size * out_channels
+
+
+class HiDreamO1Transformer(nn.Module):
+    """HiDream-O1 unified pixel-level transformer."""
+
+    def __init__(self, image_model=None, dtype=None, device=None, operations=None,
+                 text_config_overrides=None, vision_config_overrides=None, **kwargs):
+        super().__init__()
+        self.dtype = dtype
+
+        text_cfg = HiDreamO1TextConfig(**(text_config_overrides or {}))
+        vision_cfg = dict(QWEN3VL_VISION_DEFAULTS)  # copy so overrides never touch the module-level defaults
+        if vision_config_overrides:
+            vision_cfg.update(vision_config_overrides)
+        vision_cfg["out_hidden_size"] = text_cfg.hidden_size  # always forced to the text width, even over an override
+
+        self.text_config = text_cfg
+        self.vision_config = vision_cfg
+        self.hidden_size = text_cfg.hidden_size
+        self.patch_size = PATCH_SIZE
+        self.in_channels = 3
+        self.tms_token_id = TMS_TOKEN_ID
+
+        self.visual = Qwen35VisionModel(vision_cfg, device=device, dtype=dtype, ops=operations)
+        self.language_model = Llama2_(text_cfg, device=device, dtype=dtype, ops=operations)
+        self.t_embedder1 = TimestepEmbedder(
+            text_cfg.hidden_size, device=device, dtype=dtype, operations=operations,
+        )
+        self.x_embedder = BottleneckPatchEmbed(
+            patch_size=self.patch_size, in_chans=self.in_channels,
+            pca_dim=text_cfg.hidden_size // 4, embed_dim=text_cfg.hidden_size,
+            bias=True, device=device, dtype=dtype, ops=operations,
+        )
+        self.final_layer2 = FinalLayer(
+            text_cfg.hidden_size, patch_size=self.patch_size,
+            out_channels=self.in_channels, device=device, dtype=dtype, ops=operations,
+        )
+
+        self._visual_cache = None  # (ref_pixel_values, image_embeds) from the last ViT run, or None
+        self._kv_cache_entries = []  # at most 2 dicts holding prefix K/V snapshots, see _forward
+
+    def clear_kv_cache(self):
+        self._kv_cache_entries = []  # drops the prefix K/V snapshots and the cached ViT output
+        self._visual_cache = None
+
+    def forward(self, x, timesteps, context=None, transformer_options={}, **kwargs):
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward,
+            self,
+            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options)
+        ).execute(x, timesteps, context, transformer_options, **kwargs)  # runs _forward through any registered DIFFUSION_MODEL wrappers
+
+    def _forward(self, x, timesteps, context=None, transformer_options={}, input_ids=None, attention_mask=None, position_ids=None,
+                 vinput_mask=None, ar_len=None, ref_pixel_values=None, ref_image_grid_thw=None, ref_patches=None, **kwargs):
+        """Returns flow-match velocity (x - x_pred) / sigma. Raises ValueError when input_ids, position_ids or ar_len is missing."""
+
+        if input_ids is None or position_ids is None or ar_len is None:  # ar_len is compared with ints below, so None cannot be tolerated
+            raise ValueError("HiDreamO1Transformer requires input_ids, position_ids and ar_len in conditioning")
+
+        B, _, H, W = x.shape
+        h_p, w_p = H // self.patch_size, W // self.patch_size
+        tgt_image_len = h_p * w_p
+
+        z = einops.rearrange(
+            x, 'B C (H p1) (W p2) -> B (H W) (C p1 p2)',
+            p1=self.patch_size, p2=self.patch_size,
+        )
+        vinputs = torch.cat([z, ref_patches.to(z.dtype)], dim=1) if ref_patches is not None else z
+
+        inputs_embeds = self.language_model.embed_tokens(input_ids).to(x.dtype)
+
+        if ref_pixel_values is not None and ref_image_grid_thw is not None:
+            # ViT output is constant across sampling steps within a generation;
+            # identity-key by the input tensor so refs don't recompute every step.
+            cached = self._visual_cache
+            if cached is not None and cached[0] is ref_pixel_values:
+                image_embeds = cached[1]
+            else:
+                ref_pv = ref_pixel_values.to(inputs_embeds.device)
+                ref_grid = ref_image_grid_thw.to(inputs_embeds.device).long()
+                # extra_conds wraps with a leading batch dim; refs are model-level so [0] always recovers them.
+                if ref_pv.dim() == 3:
+                    ref_pv = ref_pv[0]
+                if ref_grid.dim() == 3:
+                    ref_grid = ref_grid[0]
+                image_embeds = self.visual(ref_pv, ref_grid).to(inputs_embeds.dtype)
+                self._visual_cache = (ref_pixel_values, image_embeds)
+            # image_pad positions identical across batch (input_ids shared cond/uncond).
+            image_idx = (input_ids[0] == IMAGE_TOKEN_ID).nonzero(as_tuple=True)[0]
+            if image_idx.shape[0] != image_embeds.shape[0]:
+                raise ValueError(
+                    f"Image-token count {image_idx.shape[0]} != ViT output count "
+                    f"{image_embeds.shape[0]}; check tokenizer/processor alignment."
+                )
+            inputs_embeds[:, image_idx] = image_embeds.unsqueeze(0).expand(B, -1, -1)
+
+        sigma = timesteps.float() / 1000.0
+        t_pixeldit = 1.0 - sigma
+        t_emb = self.t_embedder1(t_pixeldit * 1000, inputs_embeds.dtype)
+        tms_mask_3d = (input_ids == self.tms_token_id).unsqueeze(-1).expand_as(inputs_embeds)
+        inputs_embeds = torch.where(tms_mask_3d, t_emb.unsqueeze(1).expand_as(inputs_embeds), inputs_embeds)  # timestep embedding replaces every <|tms_token|> slot
+
+        vinputs_embedded = self.x_embedder(vinputs.to(inputs_embeds.dtype))
+        inputs_embeds = torch.cat([inputs_embeds, vinputs_embedded], dim=1)
+
+        # extra_conds stores position_ids as (1, 3, T); process_cond repeats dim 0 to B. Take row 0.
+        freqs_cis = self.language_model.compute_freqs_cis(position_ids[0].to(x.device), x.device)
+        freqs_cis = tuple(t.to(x.dtype) for t in freqs_cis)
+
+        two_pass_attn = make_two_pass_attention(ar_len, transformer_options=transformer_options)
+        patches_replace = transformer_options.get("patches_replace", {})
+        blocks_replace = patches_replace.get("dit", {})
+        transformer_options["total_blocks"] = len(self.language_model.layers)
+        transformer_options["block_type"] = "double"
+
+        # Cache prefix K/V across steps. Key includes input_ids (prompt), ref_id
+        # (refs scatter into inputs_embeds), and position_ids (RoPE baked into cached K).
+        can_cache = not blocks_replace and ar_len > 0
+        cache_len = ar_len if can_cache else 0
+        ref_id = id(ref_pixel_values) if ref_pixel_values is not None else None  # NOTE(review): id() can be reused once a tensor is freed; confirm clear_kv_cache() runs between generations
+        pos_ids_key = position_ids[..., :cache_len] if can_cache else position_ids
+        cache_entries = self._kv_cache_entries
+        # Drop stale entries from a previous device (model was unloaded and reloaded).
+        if cache_entries and cache_entries[0]["input_ids"].device != input_ids.device:
+            cache_entries = []
+            self._kv_cache_entries = []
+        kv_cache = None
+        if can_cache:
+            for entry in cache_entries:
+                ck = entry["input_ids"]
+                ep = entry["position_ids"]
+                if (entry["cache_len"] == cache_len
+                        and ck.shape == input_ids.shape and torch.equal(ck, input_ids)
+                        and entry["ref_id"] == ref_id
+                        and ep.shape == pos_ids_key.shape and torch.equal(ep, pos_ids_key)):
+                    kv_cache = entry
+                    break
+
+        if kv_cache is not None:
+            # Hot path: project Q/K/V only for fresh positions; past_key_value prepends cached AR K/V.
+            hidden_states = inputs_embeds[:, cache_len:]
+            sliced_freqs = tuple(t[..., cache_len:, :] for t in freqs_cis)
+            for i, layer in enumerate(self.language_model.layers):
+                transformer_options["block_index"] = i
+                K_i, V_i = kv_cache["kv"][i]
+                hidden_states, _ = layer(
+                    x=hidden_states, attention_mask=None, freqs_cis=sliced_freqs, optimized_attention=two_pass_attn,
+                    past_key_value=(K_i, V_i, cache_len),
+                )
+        else:
+            # Cold path: run full sequence; if cacheable, snapshot K/V at AR positions.
+            snapshots = [] if can_cache else None
+            past_kv_cold = () if can_cache else None
+            hidden_states = inputs_embeds
+            for i, layer in enumerate(self.language_model.layers):
+                transformer_options["block_index"] = i
+                if ("double_block", i) in blocks_replace:
+                    def block_wrap(args, _layer=layer):
+                        out = {}
+                        out["x"], _ = _layer(
+                            x=args["x"], attention_mask=args.get("attention_mask"),
+                            freqs_cis=args["freqs_cis"], optimized_attention=args["optimized_attention"],
+                            past_key_value=None,
+                        )
+                        return out
+                    out = blocks_replace[("double_block", i)](
+                        {"x": hidden_states, "attention_mask": None,
+                         "freqs_cis": freqs_cis, "optimized_attention": two_pass_attn,
+                         "transformer_options": transformer_options},
+                        {"original_block": block_wrap},
+                    )
+                    hidden_states = out["x"]
+                else:
+                    hidden_states, present_kv = layer(
+                        x=hidden_states, attention_mask=None,
+                        freqs_cis=freqs_cis, optimized_attention=two_pass_attn,
+                        past_key_value=past_kv_cold,
+                    )
+                    if snapshots is not None:
+                        K, V, _ = present_kv
+                        snapshots.append((K[:, :, :cache_len].contiguous(),
+                                          V[:, :, :cache_len].contiguous()))
+            if snapshots is not None:
+                # Keep at most 2 entries (cond + uncond); with more conds the oldest entry is dropped first (FIFO, hits do not refresh).
+                new_entry = {
+                    "input_ids": input_ids.clone(),
+                    "cache_len": cache_len,
+                    "kv": snapshots,
+                    "ref_id": ref_id,
+                    "position_ids": pos_ids_key.clone(),
+                }
+                self._kv_cache_entries = (cache_entries + [new_entry])[-2:]
+
+        if self.language_model.norm is not None:
+            hidden_states = self.language_model.norm(hidden_states)
+
+        # Slice target-image positions before the final projection so the Linear only runs on tgt_image_len tokens.
+        # In the hot path hidden_states starts at original position cache_len, so masks/indices shift by cache_len.
+        sliced_offset = cache_len if kv_cache is not None else 0
+        if vinput_mask is not None:
+            vmask = vinput_mask.to(x.device).bool()
+            if sliced_offset > 0:
+                vmask = vmask[:, sliced_offset:]
+            target_hidden = hidden_states[vmask].view(B, -1, hidden_states.shape[-1])[:, :tgt_image_len]
+        else:
+            txt_seq_len = input_ids.shape[1]
+            start = txt_seq_len - sliced_offset
+            target_hidden = hidden_states[:, start:start + tgt_image_len]
+        x_pred_tgt = self.final_layer2(target_hidden)
+
+        # fp32 final subtraction, bf16 here noticeably degrades samples.
+        x_pred_img = einops.rearrange(
+            x_pred_tgt, 'B (H W) (C p1 p2) -> B C (H p1) (W p2)',
+            H=h_p, W=w_p, p1=self.patch_size, p2=self.patch_size,
+        )
+        return (x.float() - x_pred_img.float()) / sigma.view(B, 1, 1, 1).clamp_min(1e-3)  # sigma is floored at 1e-3 to keep the division finite
diff --git a/comfy/ldm/hidream_o1/utils.py b/comfy/ldm/hidream_o1/utils.py
new file mode 100644
index 000000000..5a1249c72
--- /dev/null
+++ b/comfy/ldm/hidream_o1/utils.py
@@ -0,0 +1,173 @@
+"""HiDream-O1 input-prep helpers: image/resolution math and unified-sequence
+RoPE position-id assembly. The fix_point offset in get_rope_index_fix_point
+lets the target image and patchified ref images share spatial RoPE positions
+despite living at different sequence indices — same 2D image plane.
+"""
+
+import math
+from typing import Optional
+
+import torch
+
+
+PATCH_SIZE = 32
+CONDITION_IMAGE_SIZE = 384 # ViT-side base size for ref images
+
+
+def resize_tensor(img_t, image_size, patch_size=16):
+    """img_t: (1, 3, H, W) float [0, 1]. Fit to image_size**2 area, patch-aligned, center-cropped.
+    Returns the cropped tensor; each side is at least patch_size, even if that exceeds the area budget (extreme aspect ratios)."""
+    while min(img_t.shape[-2], img_t.shape[-1]) >= 2 * image_size:  # Pre-halves with 2x2 box averaging while the image is still very large
+        img_t = torch.nn.functional.avg_pool2d(img_t, kernel_size=2, stride=2)
+
+    _, _, height, width = img_t.shape
+    m = patch_size
+    s_max = image_size * image_size
+    scale = math.sqrt(s_max / (width * height))
+
+    candidates = [
+        (round(width * scale) // m * m, round(height * scale) // m * m),
+        (round(width * scale) // m * m, math.floor(height * scale) // m * m),
+        (math.floor(width * scale) // m * m, round(height * scale) // m * m),
+        (math.floor(width * scale) // m * m, math.floor(height * scale) // m * m),
+    ]
+    candidates = sorted(candidates, key=lambda x: x[0] * x[1], reverse=True)
+    new_size = candidates[-1]
+    for c in candidates:
+        if c[0] * c[1] <= s_max:  # largest patch-aligned size that still fits the area budget
+            new_size = c
+            break
+
+    new_w, new_h = max(new_size[0], m), max(new_size[1], m)  # a side snapped down to 0 patches would divide by zero below
+    s1 = width / new_w
+    s2 = height / new_h
+    if s1 < s2:  # scale by the smaller factor so the resized image covers the crop window on both axes
+        resize_w, resize_h = new_w, round(height / s1)
+    else:
+        resize_w, resize_h = round(width / s2), new_h
+    img_t = torch.nn.functional.interpolate(img_t, size=(resize_h, resize_w), mode="bicubic")
+    top = (resize_h - new_h) // 2
+    left = (resize_w - new_w) // 2
+    return img_t[..., top:top + new_h, left:left + new_w]
+
+
+def calculate_dimensions(max_size, ratio):
+    """Return (width, height) for aspect ratio ``ratio`` within a max_size**2 area, each rounded down to a multiple of 32."""
+    raw_w = math.sqrt(max_size * max_size * ratio)
+    raw_h = raw_w / ratio
+    # Truncate each side onto the 32-pixel grid; a side shorter than 32 becomes 0.
+    snapped = [int(side / 32) * 32 for side in (raw_w, raw_h)]
+    return snapped[0], snapped[1]
+
+
+def ref_max_size(target_max_dim, k):
+    """Maximum side length allowed for each of ``k`` reference images before patchifying.
+    One ref keeps target_max_dim; larger k shrinks the budget in steps of 1/64 of it.
+    """
+    if k == 1:
+        return target_max_dim
+    if k == 2:
+        sixty_fourths = 48
+    else:
+        sixty_fourths = 32 if k <= 4 else (24 if k <= 8 else 16)
+    return target_max_dim * sixty_fourths // 64
+
+
+def cond_image_size(k):
+    """ViT-side image size for ``k`` reference images: full up to 4 refs, 48/64 up to 8, half beyond that."""
+    if k > 8:
+        return CONDITION_IMAGE_SIZE // 2
+    if k > 4:
+        return CONDITION_IMAGE_SIZE * 48 // 64
+    return CONDITION_IMAGE_SIZE
+
+
+def get_rope_index_fix_point(  # builds 3-axis position ids of shape (3, batch, seq); returns (position_ids, mrope_position_deltas)
+    spatial_merge_size: int,
+    image_token_id: int,
+    vision_start_token_id: int,
+    input_ids: Optional[torch.LongTensor] = None,
+    image_grid_thw: Optional[torch.LongTensor] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    skip_vision_start_token=None,  # NOTE(review): indexed once per image below, so None only works when no image is found
+    fix_point: int = 4096,
+):
+    mrope_position_deltas = []
+    if input_ids is not None and image_grid_thw is not None:
+        total_input_ids = input_ids
+        if attention_mask is None:
+            attention_mask = torch.ones_like(total_input_ids)
+        position_ids = torch.ones(
+            3, input_ids.shape[0], input_ids.shape[1],
+            dtype=input_ids.dtype, device=input_ids.device,
+        )
+        attention_mask = attention_mask.to(total_input_ids.device)
+        for i, input_ids_b in enumerate(total_input_ids):
+            fp = fix_point
+            image_index = 0
+            input_ids_b = input_ids_b[attention_mask[i] == 1]  # only unmasked tokens receive computed positions
+            vision_start_indices = torch.argwhere(input_ids_b == vision_start_token_id).squeeze(1)
+            vision_tokens = input_ids_b[vision_start_indices + 1]
+            image_nums = (vision_tokens == image_token_id).sum()  # images = vision_start tokens immediately followed by an image token
+            input_tokens = input_ids_b.tolist()
+            llm_pos_ids_list = []
+            st = 0
+            remain_images = image_nums
+            for _ in range(image_nums):
+                if image_token_id in input_tokens and remain_images > 0:
+                    ed = input_tokens.index(image_token_id, st)  # first image token at or after st
+                else:
+                    ed = len(input_tokens) + 1
+                t = image_grid_thw[image_index][0]
+                h = image_grid_thw[image_index][1]
+                w = image_grid_thw[image_index][2]
+                image_index += 1
+                remain_images -= 1
+                llm_grid_t = t.item()
+                llm_grid_h = h.item() // spatial_merge_size
+                llm_grid_w = w.item() // spatial_merge_size
+                text_len = ed - st
+                text_len -= skip_vision_start_token[image_index - 1]  # a set flag removes one token from the text run before this image
+                text_len = max(0, text_len)
+                st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)  # text run: same running index on all 3 axes
+
+                t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+
+                if skip_vision_start_token[image_index - 1]:
+                    if fp > 0:
+                        fp = fp - st_idx  # so that fp + st_idx below equals fix_point for the first flagged image
+                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + fp + st_idx)
+                    fp = 0  # later flagged images start at st_idx instead of fix_point
+                else:
+                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+                st = ed + llm_grid_t * llm_grid_h * llm_grid_w  # resume the scan after this image's grid tokens
+
+            if st < len(input_tokens):  # trailing text after the last image
+                st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                text_len = len(input_tokens) - st
+                llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+
+            llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+            position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+            mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+        mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+        return position_ids, mrope_position_deltas
+
+    if attention_mask is not None:  # fallback without image grids: positions come from the mask's running count
+        position_ids = attention_mask.long().cumsum(-1) - 1
+        position_ids.masked_fill_(attention_mask == 0, 1)
+        position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+        max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+        mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+    else:  # fallback without image grids or mask: 0..seq-1 on every axis, zero deltas
+        position_ids = (
+            torch.arange(input_ids.shape[1], device=input_ids.device)
+            .view(1, 1, -1).expand(3, input_ids.shape[0], -1)
+        )
+        mrope_position_deltas = torch.zeros(
+            [input_ids.shape[0], 1], device=input_ids.device, dtype=input_ids.dtype,
+        )
+    return position_ids, mrope_position_deltas
diff --git a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
index f67ba84e9..4e4819fe3 100644
--- a/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
+++ b/comfy/ldm/hunyuan3dv2_1/hunyuandit.py
@@ -328,7 +328,7 @@ class CrossAttention(nn.Module):
kv = torch.cat((k, v), dim=-1)
split_size = kv.shape[-1] // self.num_heads // 2
- kv = kv.view(1, -1, self.num_heads, split_size * 2)
+ kv = kv.view(b, -1, self.num_heads, split_size * 2)
k, v = torch.split(kv, split_size, dim=-1)
q = q.view(b, s1, self.num_heads, self.head_dim)
@@ -398,7 +398,7 @@ class Attention(nn.Module):
qkv_combined = torch.cat((query, key, value), dim=-1)
split_size = qkv_combined.shape[-1] // self.num_heads // 3
- qkv = qkv_combined.view(1, -1, self.num_heads, split_size * 3)
+ qkv = qkv_combined.view(B, -1, self.num_heads, split_size * 3)
query, key, value = torch.split(qkv, split_size, dim=-1)
query = query.reshape(B, N, self.num_heads, self.head_dim)
@@ -607,9 +607,13 @@ class HunYuanDiTPlain(nn.Module):
def forward(self, x, t, context, transformer_options = {}, **kwargs):
x = x.movedim(-1, -2)
- uncond_emb, cond_emb = context.chunk(2, dim = 0)
- context = torch.cat([cond_emb, uncond_emb], dim = 0)
+ swap_cfg_halves = context.shape[0] >= 2
+
+ if swap_cfg_halves:
+ first_half, second_half = context.chunk(2, dim = 0)
+ context = torch.cat([second_half, first_half], dim = 0)
+
main_condition = context
t = 1.0 - t
@@ -657,5 +661,8 @@ class HunYuanDiTPlain(nn.Module):
output = self.final_layer(combined)
output = output.movedim(-2, -1) * (-1.0)
- cond_emb, uncond_emb = output.chunk(2, dim = 0)
- return torch.cat([uncond_emb, cond_emb])
+ if swap_cfg_halves:
+ first_half, second_half = output.chunk(2, dim = 0)
+ output = torch.cat([second_half, first_half], dim = 0)
+
+ return output
diff --git a/comfy/ldm/lens/model.py b/comfy/ldm/lens/model.py
new file mode 100644
index 000000000..cd5015ddc
--- /dev/null
+++ b/comfy/ldm/lens/model.py
@@ -0,0 +1,510 @@
+"""Lens denoising transformer (DiT)"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ldm.flux.layers
+import comfy.patcher_extension
+from comfy.ldm.flux.layers import EmbedND
+from comfy.ldm.flux.math import apply_rope
+from comfy.ldm.modules.attention import optimized_attention
+
+
+def _lens_time_proj(t: torch.Tensor, dim: int = 256) -> torch.Tensor:  # timestep projection used by LensTimestepProjEmbeddings
+    return comfy.ldm.flux.layers.timestep_embedding(t, dim)  # forwards to the Flux helper; ``dim`` is passed through unchanged
+
+
+def _lens_position_ids(
+    frame: int, height: int, width: int, text_seq_len: int,
+    scale_rope: bool = True, device=None,
+) -> torch.Tensor:
+    """Build the ``[seq, 3]`` (frame, h, w) position table: image tokens first, then text.
+
+    Image rows walk the ``frame x height x width`` grid in row-major order. With
+    ``scale_rope=True`` the h/w coordinates are centred on 0 and text starts at
+    ``max(height // 2, width // 2)``; otherwise h/w count up from 0 and text starts
+    at ``max(height, width)``. The caller adds the batch dim expected by ``EmbedND``.
+    """
+    half_h, half_w = height // 2, width // 2
+    if scale_rope:
+        # One contiguous range spans the negative half and then the non-negative half.
+        rows = torch.arange(half_h - height, half_h, device=device)
+        cols = torch.arange(half_w - width, half_w, device=device)
+        first_text_pos = max(half_h, half_w)
+    else:
+        rows = torch.arange(height, device=device)
+        cols = torch.arange(width, device=device)
+        first_text_pos = max(height, width)
+
+    # Fill one coordinate channel at a time; broadcasting spreads each axis over the grid.
+    grid = torch.zeros(frame, height, width, 3, device=device)
+    grid[..., 0] = torch.arange(frame, device=device).view(-1, 1, 1)
+    grid[..., 1] = rows.view(1, -1, 1)
+    grid[..., 2] = cols.view(1, 1, -1)
+
+    # Every text token carries the same position on all three axes.
+    text_pos = torch.arange(first_text_pos, first_text_pos + text_seq_len, device=device).float()
+    text_ids = text_pos.unsqueeze(1).expand(-1, 3)
+
+    return torch.cat([grid.reshape(-1, 3), text_ids], dim=0)
+
+
+class _TimestepEmbedder(nn.Module):  # two-layer MLP: Linear -> SiLU -> Linear
+    def __init__(self, in_channels: int, time_embed_dim: int, dtype=None, device=None, operations=None) -> None:
+        super().__init__()
+        factory = {"dtype": dtype, "device": device}
+        self.linear_1 = operations.Linear(in_channels, time_embed_dim, **factory)
+        self.linear_2 = operations.Linear(time_embed_dim, time_embed_dim, **factory)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project ``x`` through ``linear_1``, apply SiLU, then ``linear_2``."""
+        return self.linear_2(F.silu(self.linear_1(x)))
+
+
+class LensTimestepProjEmbeddings(nn.Module):  # maps a timestep tensor to a conditioning vector of width ``embedding_dim``
+    def __init__(self, embedding_dim: int, dtype=None, device=None, operations=None) -> None:
+        super().__init__()
+        self.timestep_embedder = _TimestepEmbedder(256, embedding_dim, dtype=dtype, device=device, operations=operations)  # input width 256 matches the projection width used in forward()
+
+    def forward(self, timestep: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
+        proj = _lens_time_proj(timestep, 256)  # fixed 256-wide projection of the timestep
+        return self.timestep_embedder(proj.to(dtype=hidden_states.dtype))  # hidden_states is consulted only for its dtype
+
+
+class GateMLP(nn.Module):
+    """SwiGLU MLP computing ``w2(silu(w1(x)) * w3(x))`` with bias-free projections."""
+    def __init__(self, dim: int, hidden_dim: int, dtype=None, device=None, operations=None) -> None:
+        super().__init__()
+        self.w1 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
+        self.w2 = operations.Linear(hidden_dim, dim, bias=False, dtype=dtype, device=device)
+        self.w3 = operations.Linear(dim, hidden_dim, bias=False, dtype=dtype, device=device)
+
+    def forward(self, x):
+        gated = F.silu(self.w1(x), inplace=True)  # activation reuses the w1 output buffer
+        return self.w2(gated.mul_(self.w3(x)))  # in-place product, then project back to ``dim``
+
+
+class LensJointAttention(nn.Module):
+    """Joint image+text attention: each stream has its own fused QKV projection and q/k RMSNorm, then all tokens attend together."""
+
+    def __init__(
+        self,
+        query_dim: int,  # feature width of the image-stream input
+        added_kv_proj_dim: int,  # feature width of the text-stream input
+        dim_head: int = 64,
+        heads: int = 8,  # only used to size inner_dim when out_dim is None
+        out_dim: Optional[int] = None,  # when given, sets both inner_dim and the image output width
+        eps: float = 1e-5,  # epsilon shared by the four q/k RMSNorm layers
+        dtype=None,
+        device=None,
+        operations=None,  # namespace supplying the Linear and RMSNorm classes
+    ) -> None:
+        super().__init__()
+        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.heads = self.inner_dim // dim_head  # derived from inner_dim, so it may differ from the ``heads`` argument
+        self.dim_head = dim_head
+        self.out_dim = out_dim if out_dim is not None else query_dim
+
+        self.norm_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)  # image queries, normalised over the per-head axis
+        self.norm_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)  # image keys
+        self.norm_added_q = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)  # text queries
+        self.norm_added_k = operations.RMSNorm(dim_head, eps=eps, dtype=dtype, device=device)  # text keys
+
+        self.img_qkv = operations.Linear(query_dim, 3 * self.inner_dim, bias=True, dtype=dtype, device=device)  # fused q, k, v for the image stream
+        self.txt_qkv = operations.Linear(added_kv_proj_dim, 3 * self.inner_dim, bias=True, dtype=dtype, device=device)  # fused q, k, v for the text stream
+
+        # ModuleList([Linear, Identity]) for state-dict key compatibility.
+        self.to_out = nn.ModuleList([
+            operations.Linear(self.inner_dim, self.out_dim, bias=True, dtype=dtype, device=device),
+            nn.Identity(),
+        ])
+        self.to_add_out = operations.Linear(self.inner_dim, query_dim, bias=True, dtype=dtype, device=device)  # text output projection; note it targets query_dim, not out_dim
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,  # image tokens, unpacked below as (batch, seq_img, features)
+        encoder_hidden_states: torch.Tensor,  # text tokens; dim 1 is read as seq_txt
+        freqs_cis: torch.Tensor,  # rotary table handed to apply_rope
+        attention_mask: Optional[torch.Tensor] = None,  # if given, must be shaped (batch, 1, 1, seq_img + seq_txt)
+        transformer_options: Optional[Dict[str, Any]] = None,  # forwarded untouched to optimized_attention
+    ) -> Tuple[torch.Tensor, torch.Tensor]:  # (image output, text output)
+        bsz, seq_img, _ = hidden_states.shape
+        seq_txt = encoder_hidden_states.shape[1]
+
+        # image stream: split the fused projection into q/k/v of shape [B, S, H, D]
+        img_qkv = self.img_qkv(hidden_states).view(bsz, seq_img, 3, self.heads, self.dim_head)
+        img_q, img_k, img_v = img_qkv.unbind(dim=2)
+        img_q = self.norm_q(img_q)
+        img_k = self.norm_k(img_k)
+        del img_qkv
+
+        # text stream: same layout, with its own projection and norms
+        txt_qkv = self.txt_qkv(encoder_hidden_states).view(bsz, seq_txt, 3, self.heads, self.dim_head)
+        txt_q, txt_k, txt_v = txt_qkv.unbind(dim=2)
+        txt_q = self.norm_added_q(txt_q)
+        txt_k = self.norm_added_k(txt_k)
+
+        # Concatenate image then text along the sequence axis, then [B, S, H, D] -> [B, H, S, D]; dels avoid VRAM peaks.
+        q = torch.cat([img_q, txt_q], dim=1).transpose(1, 2)
+        del img_q, txt_q
+        k = torch.cat([img_k, txt_k], dim=1).transpose(1, 2)
+        del img_k, txt_k
+        v = torch.cat([img_v, txt_v], dim=1).transpose(1, 2)
+        del img_v, txt_v
+
+        q, k = apply_rope(q, k, freqs_cis)  # rotary embedding is applied to queries and keys only
+
+        if attention_mask is not None:
+            expected = (bsz, 1, 1, seq_img + seq_txt)
+            if attention_mask.shape != expected:
+                raise ValueError(
+                    f"attention_mask must be {expected}, got {tuple(attention_mask.shape)}"
+                )
+            attention_mask = attention_mask.to(q.dtype)  # cast the mask to the query dtype
+
+        out = optimized_attention(
+            q, k, v, self.heads, mask=attention_mask, skip_reshape=True,
+            transformer_options=transformer_options,
+        )
+
+        img_out = self.to_out[1](self.to_out[0](out[:, :seq_img, :]))  # first seq_img positions are image tokens
+        txt_out = self.to_add_out(out[:, seq_img:, :])  # remaining positions are text tokens
+        return img_out, txt_out
+
+
+class LensTransformerBlock(nn.Module):  # dual-stream block: modulated joint attention followed by a gated per-stream MLP
+    def __init__(
+        self,
+        dim: int,  # model width shared by the image and text streams
+        num_attention_heads: int,
+        attention_head_dim: int,
+        eps: float = 1e-6,  # epsilon of the four block norms; the attention q/k norms use a fixed 1e-5
+        rms_norm: bool = True,  # True -> RMSNorm, False -> LayerNorm without elementwise affine
+        dtype=None,
+        device=None,
+        operations=None,  # namespace supplying the Linear / RMSNorm / LayerNorm classes
+    ) -> None:
+        super().__init__()
+
+        self.attn = LensJointAttention(
+            query_dim=dim,
+            added_kv_proj_dim=dim,
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            eps=1e-5,
+            dtype=dtype,
+            device=device,
+            operations=operations,
+        )
+
+        if rms_norm:
+            NormCls = operations.RMSNorm
+            norm_kwargs = {}
+        else:
+            NormCls = operations.LayerNorm
+            norm_kwargs = {"elementwise_affine": False}
+
+        mlp_hidden = int(dim / 3 * 8)  # hidden width is 8/3 of dim, truncated to an int
+
+        # Sequential(SiLU, Linear) so state-dict lands at img_mod.1.{weight,bias}.
+        self.img_mod = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device),  # 6*dim = two (shift, scale, gate) triples
+        )
+        self.img_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
+        self.img_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
+        self.img_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations)
+
+        self.txt_mod = nn.Sequential(
+            nn.SiLU(),
+            operations.Linear(dim, 6 * dim, bias=True, dtype=dtype, device=device),  # same layout as img_mod, for the text stream
+        )
+        self.txt_norm1 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
+        self.txt_norm2 = NormCls(dim, eps=eps, dtype=dtype, device=device, **norm_kwargs)
+        self.txt_mlp = GateMLP(dim, mlp_hidden, dtype=dtype, device=device, operations=operations)
+
+    @staticmethod
+    def _modulate(x: torch.Tensor, mod_params: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:  # returns (modulated x, gate)
+        shift, scale, gate = mod_params.chunk(3, dim=-1)  # thirds of the last axis, in this order
+        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)  # unsqueeze(1) broadcasts over dim 1 of x
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,  # image-stream tokens
+        encoder_hidden_states: torch.Tensor,  # text-stream tokens
+        temb: torch.Tensor,  # conditioning vector driving both modulation projections
+        freqs_cis: torch.Tensor,  # rotary table, passed through to the attention layer
+        attention_mask: Optional[torch.Tensor] = None,  # passed through to the attention layer
+        transformer_options: Optional[Dict[str, Any]] = None,  # passed through to the attention layer
+    ) -> Tuple[torch.Tensor, torch.Tensor]:  # NOTE: returns (text stream, image stream), in that order
+        img_mod1, img_mod2 = self.img_mod(temb).chunk(2, dim=-1)  # first half modulates attention, second half the MLP
+        txt_mod1, txt_mod2 = self.txt_mod(temb).chunk(2, dim=-1)
+
+        img_modulated, img_gate1 = self._modulate(self.img_norm1(hidden_states), img_mod1)
+        txt_modulated, txt_gate1 = self._modulate(self.txt_norm1(encoder_hidden_states), txt_mod1)
+
+        img_attn, txt_attn = self.attn(
+            hidden_states=img_modulated,
+            encoder_hidden_states=txt_modulated,
+            freqs_cis=freqs_cis,
+            attention_mask=attention_mask,
+            transformer_options=transformer_options,
+        )
+
+        hidden_states = hidden_states + img_gate1 * img_attn  # gated residual, image stream
+        encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn  # gated residual, text stream
+
+        img_modulated2, img_gate2 = self._modulate(self.img_norm2(hidden_states), img_mod2)
+        hidden_states = hidden_states + img_gate2 * self.img_mlp(img_modulated2)
+
+        txt_modulated2, txt_gate2 = self._modulate(self.txt_norm2(encoder_hidden_states), txt_mod2)
+        encoder_hidden_states = encoder_hidden_states + txt_gate2 * self.txt_mlp(txt_modulated2)
+
+        return encoder_hidden_states, hidden_states  # text first, image second
+
+
+class _AdaLayerNormContinuousNoAffine(nn.Module):
+    """AdaLayerNormContinuous without learned affine parameters.
+
+    A SiLU + linear projection of the conditioning vector yields ``(scale, shift)``
+    in that order (scale first), the reverse of Flux's ``LastLayer``.
+    """
+
+    def __init__(self, embedding_dim: int, conditioning_embedding_dim: int, eps: float = 1e-6,
+                 dtype=None, device=None, operations=None) -> None:
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.eps = eps
+        self.linear = operations.Linear(
+            conditioning_embedding_dim, 2 * embedding_dim, bias=True, dtype=dtype, device=device
+        )
+
+    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
+        # Scale occupies the first half of the projection and shift the second.
+        scale, shift = self.linear(F.silu(conditioning)).chunk(2, dim=-1)
+        normed = F.layer_norm(x, (self.embedding_dim,), weight=None, bias=None, eps=self.eps)
+        return normed * (scale.unsqueeze(1) + 1) + shift.unsqueeze(1)
+
+
+class LensTransformer2DModel(nn.Module):
+    """Lens dual-stream MMDiT (defaults: 48 blocks, inner_dim = 24 * 64 = 1536, multi-layer text features)."""
+
+    def __init__(
+        self,
+        patch_size: int = 2,  # only used to size proj_out: patch_size**2 * out_channels
+        in_channels: int = 128,  # channel count of the latent fed to img_in
+        out_channels: Optional[int] = 32,  # None falls back to in_channels
+        num_layers: int = 48,  # number of LensTransformerBlock instances
+        attention_head_dim: int = 64,
+        num_attention_heads: int = 24,
+        enc_hidden_dim: int = 2880,  # width of one text-encoder layer feature
+        axes_dims_rope: Tuple[int, int, int] = (8, 28, 28),  # rotary dims for the (frame, h, w) axes
+        rms_norm: bool = True,
+        multi_layer_encoder_feature: bool = True,  # context carries several encoder layers side by side
+        selected_layer_index: Tuple[int, ...] = (5, 11, 17, 23),  # only its length is used by this class
+        image_model=None,  # unused; accepted for detection-side configs.
+        dtype=None,
+        device=None,
+        operations=None,  # namespace supplying the Linear / RMSNorm classes
+    ) -> None:
+        super().__init__()
+        self.patch_size = patch_size
+        self.in_channels = in_channels
+        self.out_channels = out_channels if out_channels is not None else in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+        self.multi_layer_encoder_feature = multi_layer_encoder_feature
+        self.selected_layer_index = list(selected_layer_index)
+        self.dtype = dtype
+
+        self.pos_embed = EmbedND(dim=attention_head_dim, theta=10000, axes_dim=list(axes_dims_rope))
+        self.time_text_embed = LensTimestepProjEmbeddings(
+            embedding_dim=self.inner_dim, dtype=dtype, device=device, operations=operations
+        )
+
+        if self.multi_layer_encoder_feature:
+            self.txt_norm = nn.ModuleList(  # one RMSNorm per selected encoder layer
+                [operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device)
+                 for _ in self.selected_layer_index]
+            )
+            self.txt_in = operations.Linear(  # consumes the concatenation of all normalised layers
+                enc_hidden_dim * len(self.selected_layer_index),
+                self.inner_dim, bias=True, dtype=dtype, device=device,
+            )
+        else:
+            self.txt_norm = operations.RMSNorm(enc_hidden_dim, eps=1e-5, dtype=dtype, device=device)
+            self.txt_in = operations.Linear(enc_hidden_dim, self.inner_dim, bias=True, dtype=dtype, device=device)
+
+        self.img_in = operations.Linear(in_channels, self.inner_dim, bias=True, dtype=dtype, device=device)
+
+        self.transformer_blocks = nn.ModuleList([
+            LensTransformerBlock(
+                dim=self.inner_dim,
+                num_attention_heads=num_attention_heads,
+                attention_head_dim=attention_head_dim,
+                eps=1e-6,
+                rms_norm=rms_norm,
+                dtype=dtype, device=device, operations=operations,
+            )
+            for _ in range(num_layers)
+        ])
+
+        self.norm_out = _AdaLayerNormContinuousNoAffine(
+            self.inner_dim, self.inner_dim, eps=1e-6,
+            dtype=dtype, device=device, operations=operations,
+        )
+        self.proj_out = operations.Linear(
+            self.inner_dim, patch_size * patch_size * self.out_channels, bias=True,
+            dtype=dtype, device=device,
+        )
+
+    def forward(self, x: torch.Tensor, timestep: torch.Tensor, context: torch.Tensor, attention_mask: Optional[torch.Tensor] = None,
+                transformer_options: Optional[Dict[str, Any]] = None, **kwargs) -> torch.Tensor:  # entry point: runs _forward through registered wrappers
+        if transformer_options is None:
+            transformer_options = {}
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(
+            self._forward, self,
+            comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
+        ).execute(x, timestep, context, attention_mask, transformer_options, **kwargs)
+
+    def _forward(
+        self,
+        x: torch.Tensor,  # latent, unpacked below as (B, C, h, w)
+        timestep: torch.Tensor,
+        context: torch.Tensor,  # text features; in multi-layer mode the last axis is split into L equal parts
+        attention_mask: Optional[torch.Tensor] = None,  # text mask (B, S); None means every text token is attended
+        transformer_options: Optional[Dict[str, Any]] = None,
+        control: Optional[Dict[str, Any]] = None,  # optional dict; control["input"][i] is added after block i
+        **kwargs,
+    ) -> torch.Tensor:
+        """ComfyUI bridge: ``(x[B,C,h,w], t[B], context[B,S,L*H], mask[B,S])`` -> ``[B, proj_out width, h, w]``."""
+        if transformer_options is None:
+            transformer_options = {}
+        transformer_options = transformer_options.copy()  # shallow copy: the bookkeeping keys set below stay local
+        patches = transformer_options.get("patches", {})
+        patches_replace = transformer_options.get("patches_replace", {})
+        blocks_replace = patches_replace.get("dit", {})
+
+        B, C, h, w = x.shape
+        hidden_states = x.permute(0, 2, 3, 1).reshape(B, h * w, C)  # one token per spatial location, channels last
+
+        if self.multi_layer_encoder_feature:
+            L = len(self.selected_layer_index)
+            enc_dim = context.shape[-1] // L
+            encoder_hidden_states = list(  # L tensors of shape (B, S, enc_dim)
+                context.reshape(B, -1, L, enc_dim).unbind(dim=2)
+            )
+            text_seq_len = encoder_hidden_states[0].shape[1]
+        else:
+            encoder_hidden_states = context
+            text_seq_len = context.shape[1]
+
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (B, text_seq_len), dtype=torch.bool, device=x.device
+            )
+
+        img_len = h * w
+        joint_mask = self._build_joint_attention_mask(attention_mask, img_len)  # additive mask over image + text keys
+
+        hidden_states = self.img_in(hidden_states)
+        timestep = timestep.to(hidden_states.dtype)
+
+        if self.multi_layer_encoder_feature:
+            normed = [self.txt_norm[i](encoder_hidden_states[i]) for i in range(L)]  # each layer has its own norm
+            encoder_hidden_states = torch.cat(normed, dim=-1)
+        else:
+            encoder_hidden_states = self.txt_norm(encoder_hidden_states)
+        encoder_hidden_states = self.txt_in(encoder_hidden_states)  # project text features to inner_dim
+
+        if "post_input" in patches:
+            for p in patches["post_input"]:  # each patch may replace both token streams
+                out = p({
+                    "img": hidden_states,
+                    "txt": encoder_hidden_states,
+                    "transformer_options": transformer_options,
+                })
+                hidden_states = out["img"]
+                encoder_hidden_states = out["txt"]
+
+        temb = self.time_text_embed(timestep, hidden_states)
+        ids = _lens_position_ids(1, h, w, text_seq_len, device=hidden_states.device).unsqueeze(0)  # single frame; add batch dim
+        freqs_cis = self.pos_embed(ids)
+
+        transformer_options["total_blocks"] = len(self.transformer_blocks)
+        transformer_options["block_type"] = "double"
+        for i, block in enumerate(self.transformer_blocks):
+            transformer_options["block_index"] = i
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):  # adapts the block to the dict-in / dict-out replacement interface
+                    out = {}
+                    out["txt"], out["img"] = block(  # the block returns (text, image)
+                        hidden_states=args["img"],
+                        encoder_hidden_states=args["txt"],
+                        temb=args["vec"],
+                        freqs_cis=args["pe"],
+                        attention_mask=args.get("attn_mask"),
+                        transformer_options=args.get("transformer_options"),
+                    )
+                    return out
+                out = blocks_replace[("double_block", i)](
+                    {
+                        "img": hidden_states,
+                        "txt": encoder_hidden_states,
+                        "vec": temb,
+                        "pe": freqs_cis,
+                        "attn_mask": joint_mask,
+                        "transformer_options": transformer_options,
+                    },
+                    {"original_block": block_wrap},
+                )
+                encoder_hidden_states = out["txt"]
+                hidden_states = out["img"]
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=temb,
+                    freqs_cis=freqs_cis,
+                    attention_mask=joint_mask,
+                    transformer_options=transformer_options,
+                )
+
+            if "double_block" in patches:
+                for p in patches["double_block"]:  # post-block patches run after every block
+                    out = p({
+                        "img": hidden_states,
+                        "txt": encoder_hidden_states,
+                        "x": x,
+                        "block_index": i,
+                        "transformer_options": transformer_options,
+                    })
+                    hidden_states = out["img"]
+                    encoder_hidden_states = out["txt"]
+
+            if control is not None:
+                control_i = control.get("input")
+                if control_i is not None and i < len(control_i):
+                    add = control_i[i]
+                    if add is not None:
+                        hidden_states[:, :add.shape[1]] += add  # in-place add on the leading image tokens
+
+        hidden_states = self.norm_out(hidden_states, temb)
+        out = self.proj_out(hidden_states)
+        return out.reshape(B, h, w, out.shape[-1]).permute(0, 3, 1, 2).contiguous()  # use proj_out's width, which need not equal the input C
+
+    @staticmethod
+    def _build_joint_attention_mask(text_mask: torch.Tensor, img_len: int) -> torch.Tensor:  # (B, S) mask -> additive (B, 1, 1, img_len + S)
+        if text_mask.dtype != torch.bool:
+            text_mask = text_mask.bool()
+        bsz = text_mask.shape[0]
+        img_ones = torch.ones((bsz, img_len), dtype=torch.bool, device=text_mask.device)  # image tokens are never masked
+        joint = torch.cat([img_ones, text_mask], dim=1)
+        additive = torch.zeros_like(joint, dtype=torch.float32)
+        additive.masked_fill_(~joint, torch.finfo(torch.float32).min)  # masked positions get the most negative float32
+        return additive[:, None, None, :]
diff --git a/comfy/ldm/lightricks/av_model.py b/comfy/ldm/lightricks/av_model.py
index 6f2ba41ef..ef9938465 100644
--- a/comfy/ldm/lightricks/av_model.py
+++ b/comfy/ldm/lightricks/av_model.py
@@ -16,31 +16,31 @@ from comfy.ldm.lightricks.model import (
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
from comfy.ldm.lightricks.embeddings_connector import Embeddings1DConnector
import comfy.ldm.common_dit
+import comfy.model_prefetch
class CompressedTimestep:
"""Store video timestep embeddings in compressed form using per-frame indexing."""
__slots__ = ('data', 'batch_size', 'num_frames', 'patches_per_frame', 'feature_dim')
- def __init__(self, tensor: torch.Tensor, patches_per_frame: int):
+ def __init__(self, tensor: torch.Tensor, patches_per_frame: int, per_frame: bool = False):
"""
- tensor: [batch_size, num_tokens, feature_dim] tensor where num_tokens = num_frames * patches_per_frame
- patches_per_frame: Number of spatial patches per frame (height * width in latent space), or None to disable compression
+ tensor: [batch, num_tokens, feature_dim] (per-token, default) or
+ [batch, num_frames, feature_dim] (per_frame=True, already compressed).
+ patches_per_frame: spatial patches per frame; pass None to disable compression.
"""
- self.batch_size, num_tokens, self.feature_dim = tensor.shape
-
- # Check if compression is valid (num_tokens must be divisible by patches_per_frame)
- if patches_per_frame is not None and num_tokens % patches_per_frame == 0 and num_tokens >= patches_per_frame:
+ self.batch_size, n, self.feature_dim = tensor.shape
+ if per_frame:
self.patches_per_frame = patches_per_frame
- self.num_frames = num_tokens // patches_per_frame
-
- # Reshape to [batch, frames, patches_per_frame, feature_dim] and store one value per frame
- # All patches in a frame are identical, so we only keep the first one
- reshaped = tensor.view(self.batch_size, self.num_frames, patches_per_frame, self.feature_dim)
- self.data = reshaped[:, :, 0, :].contiguous() # [batch, frames, feature_dim]
+ self.num_frames = n
+ self.data = tensor
+ elif patches_per_frame is not None and n >= patches_per_frame and n % patches_per_frame == 0:
+ self.patches_per_frame = patches_per_frame
+ self.num_frames = n // patches_per_frame
+ # All patches in a frame are identical — keep only the first.
+ self.data = tensor.view(self.batch_size, self.num_frames, patches_per_frame, self.feature_dim)[:, :, 0, :].contiguous()
else:
- # Not divisible or too small - store directly without compression
self.patches_per_frame = 1
- self.num_frames = num_tokens
+ self.num_frames = n
self.data = tensor
def expand(self):
@@ -715,32 +715,35 @@ class LTXAVModel(LTXVModel):
def _prepare_timestep(self, timestep, batch_size, hidden_dtype, **kwargs):
"""Prepare timestep embeddings."""
- # TODO: some code reuse is needed here.
grid_mask = kwargs.get("grid_mask", None)
- if grid_mask is not None:
- timestep = timestep[:, grid_mask]
-
- timestep_scaled = timestep * self.timestep_scale_multiplier
-
- v_timestep, v_embedded_timestep = self.adaln_single(
- timestep_scaled.flatten(),
- {"resolution": None, "aspect_ratio": None},
- batch_size=batch_size,
- hidden_dtype=hidden_dtype,
- )
-
- # Calculate patches_per_frame from orig_shape: [batch, channels, frames, height, width]
- # Video tokens are arranged as (frames * height * width), so patches_per_frame = height * width
orig_shape = kwargs.get("orig_shape")
has_spatial_mask = kwargs.get("has_spatial_mask", None)
v_patches_per_frame = None
if not has_spatial_mask and orig_shape is not None and len(orig_shape) == 5:
- # orig_shape[3] = height, orig_shape[4] = width (in latent space)
v_patches_per_frame = orig_shape[3] * orig_shape[4]
- # Reshape to [batch_size, num_tokens, dim] and compress for storage
- v_timestep = CompressedTimestep(v_timestep.view(batch_size, -1, v_timestep.shape[-1]), v_patches_per_frame)
- v_embedded_timestep = CompressedTimestep(v_embedded_timestep.view(batch_size, -1, v_embedded_timestep.shape[-1]), v_patches_per_frame)
+ # Used by compute_prompt_timestep and the audio cross-attention paths.
+ timestep_scaled = (timestep[:, grid_mask] if grid_mask is not None else timestep) * self.timestep_scale_multiplier
+
+ # When patches in a frame share a timestep (no spatial mask), project one row per frame instead of one per token
+ per_frame_path = v_patches_per_frame is not None and (timestep.numel() // batch_size) % v_patches_per_frame == 0
+ if per_frame_path:
+ per_frame = timestep.reshape(batch_size, -1, v_patches_per_frame)[:, :, 0]
+ if grid_mask is not None:
+ # All-or-nothing per frame when has_spatial_mask=False.
+ per_frame = per_frame[:, grid_mask[::v_patches_per_frame]]
+ ts_input = per_frame * self.timestep_scale_multiplier
+ else:
+ ts_input = timestep_scaled
+
+ v_timestep, v_embedded_timestep = self.adaln_single(
+ ts_input.flatten(),
+ {"resolution": None, "aspect_ratio": None},
+ batch_size=batch_size,
+ hidden_dtype=hidden_dtype,
+ )
+ v_timestep = CompressedTimestep(v_timestep.view(batch_size, -1, v_timestep.shape[-1]), v_patches_per_frame, per_frame=per_frame_path)
+ v_embedded_timestep = CompressedTimestep(v_embedded_timestep.view(batch_size, -1, v_embedded_timestep.shape[-1]), v_patches_per_frame, per_frame=per_frame_path)
v_prompt_timestep = compute_prompt_timestep(
self.prompt_adaln_single, timestep_scaled, batch_size, hidden_dtype
@@ -764,25 +767,25 @@ class LTXAVModel(LTXVModel):
# Cross-attention timesteps - compress these too
av_ca_audio_scale_shift_timestep, _ = self.av_ca_audio_scale_shift_adaln_single(
- timestep.max().expand_as(a_timestep_flat),
+ a_timestep_flat,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
av_ca_video_scale_shift_timestep, _ = self.av_ca_video_scale_shift_adaln_single(
- a_timestep.max().expand_as(timestep_flat),
+ timestep_flat,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
av_ca_a2v_gate_noise_timestep, _ = self.av_ca_a2v_gate_adaln_single(
- a_timestep.max().expand_as(timestep_flat) * av_ca_factor,
+ a_timestep_scaled.max().expand_as(timestep_flat) * av_ca_factor,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
)
av_ca_v2a_gate_noise_timestep, _ = self.av_ca_v2a_gate_adaln_single(
- timestep.max().expand_as(a_timestep_flat) * av_ca_factor,
+ timestep_scaled.max().expand_as(a_timestep_flat) * av_ca_factor,
{"resolution": None, "aspect_ratio": None},
batch_size=batch_size,
hidden_dtype=hidden_dtype,
@@ -907,9 +910,11 @@ class LTXAVModel(LTXVModel):
"""Process transformer blocks for LTXAV."""
patches_replace = transformer_options.get("patches_replace", {})
blocks_replace = patches_replace.get("dit", {})
+ prefetch_queue = comfy.model_prefetch.make_prefetch_queue(list(self.transformer_blocks), vx.device, transformer_options)
# Process transformer blocks
for i, block in enumerate(self.transformer_blocks):
+ comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, block)
if ("double_block", i) in blocks_replace:
def block_wrap(args):
@@ -982,6 +987,8 @@ class LTXAVModel(LTXVModel):
a_prompt_timestep=a_prompt_timestep,
)
+ comfy.model_prefetch.prefetch_queue_pop(prefetch_queue, vx.device, None)
+
return [vx, ax]
def _process_output(self, x, embedded_timestep, keyframe_idxs, **kwargs):
diff --git a/comfy/ldm/lightricks/model.py b/comfy/ldm/lightricks/model.py
index bfbc08357..e0a4a0f9b 100644
--- a/comfy/ldm/lightricks/model.py
+++ b/comfy/ldm/lightricks/model.py
@@ -358,6 +358,61 @@ def apply_split_rotary_emb(input_tensor, cos, sin):
return output.swapaxes(1, 2).reshape(B, T, -1) if needs_reshape else output
+class GuideAttentionMask:
+    """Pair of additive log-space masks for LTXV guide self-attention, one per query group.
+    ``noisy_mask`` is (1, 1, 1, T); ``tracked_mask`` is (1, 1, tracked_count, T), the largest
+    mask needed because _attention_with_guide_mask handles the query groups separately.
+    """
+    __slots__ = ("guide_start", "tracked_count", "noisy_mask", "tracked_mask")
+
+    def __init__(self, total_tokens, guide_start, tracked_count, tracked_weights):
+        self.guide_start = guide_start
+        self.tracked_count = tracked_count
+
+        # log(w) for positive weights; the dtype's most negative finite value otherwise.
+        limits = torch.finfo(tracked_weights.dtype)
+        bias = torch.where(
+            tracked_weights > 0,
+            tracked_weights.clamp(min=limits.tiny).log(),
+            torch.full_like(tracked_weights, limits.min),
+        )
+
+        self.noisy_mask = tracked_weights.new_zeros((1, 1, 1, total_tokens))
+        self.noisy_mask[..., guide_start:guide_start + tracked_count] = bias.view(1, 1, 1, -1)
+
+        self.tracked_mask = tracked_weights.new_zeros((1, 1, tracked_count, total_tokens))
+        self.tracked_mask[..., :guide_start] = bias.view(1, 1, -1, 1)
+
+
+def _attention_with_guide_mask(q, k, v, heads, guide_mask, attn_precision, transformer_options):
+    """Apply the guide mask by partitioning Q into noisy and tracked-guide
+    groups, so each group needs only its own sub-mask. Avoids materializing
+    the (1,1,T,T) dense mask. Returns a tensor allocated with the shape of ``q``.
+    """
+    guide_start = guide_mask.guide_start
+    tracked_end = guide_start + guide_mask.tracked_count
+
+    out = torch.empty_like(q)  # uninitialised; each slice along dim 1 is written by one of the calls below
+
+    if guide_start > 0:  # In practice currently guides are always after noise, guard for safety if this changes.
+        out[:, :guide_start, :] = comfy.ldm.modules.attention.optimized_attention(
+            q[:, :guide_start, :], k, v, heads, mask=guide_mask.noisy_mask,
+            attn_precision=attn_precision, transformer_options=transformer_options,
+            low_precision_attention=False,  # sageattn mask support is unreliable
+        )
+    out[:, guide_start:tracked_end, :] = comfy.ldm.modules.attention.optimized_attention(  # tracked-guide queries; this call is unconditional
+        q[:, guide_start:tracked_end, :], k, v, heads, mask=guide_mask.tracked_mask,
+        attn_precision=attn_precision, transformer_options=transformer_options,
+        low_precision_attention=False,  # masked call, same reason as above
+    )
+    if tracked_end < q.shape[1]:  # Every guide token is tracked, and nothing comes after them, guard for safety if this changes.
+        out[:, tracked_end:, :] = comfy.ldm.modules.attention.optimized_attention(  # trailing queries run without a mask
+            q[:, tracked_end:, :], k, v, heads,
+            attn_precision=attn_precision, transformer_options=transformer_options,
+        )
+    return out
+
+
class CrossAttention(nn.Module):
def __init__(
self,
@@ -412,8 +467,10 @@ class CrossAttention(nn.Module):
if mask is None:
out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, attn_precision=self.attn_precision, transformer_options=transformer_options)
+ elif isinstance(mask, GuideAttentionMask):
+ out = _attention_with_guide_mask(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
else:
- out = comfy.ldm.modules.attention.optimized_attention_masked(q, k, v, self.heads, mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
+ out = comfy.ldm.modules.attention.optimized_attention(q, k, v, self.heads, mask=mask, attn_precision=self.attn_precision, transformer_options=transformer_options)
# Apply per-head gating if enabled
if self.to_gate_logits is not None:
@@ -1063,7 +1120,9 @@ class LTXVModel(LTXBaseModel):
additional_args["resolved_guide_entries"] = resolved_entries
keyframe_idxs = keyframe_idxs[..., kf_grid_mask, :]
- pixel_coords[:, :, -keyframe_idxs.shape[2]:, :] = keyframe_idxs
+
+ if keyframe_idxs.shape[2] > 0: # Guard for the case of no keyframes surviving
+ pixel_coords[:, :, -keyframe_idxs.shape[2]:, :] = keyframe_idxs
# Total surviving guide tokens (all guides)
additional_args["num_guide_tokens"] = keyframe_idxs.shape[2]
@@ -1099,12 +1158,12 @@ class LTXVModel(LTXBaseModel):
if not resolved_entries:
return None
- # Check if any attenuation is actually needed
- needs_attenuation = any(
- e["strength"] < 1.0 or e.get("pixel_mask") is not None
+ # strength != 1.0 means we want to either attenuate (< 1) or amplify (> 1) guide attention.
+ needs_mask = any(
+ e["strength"] != 1.0 or e.get("pixel_mask") is not None
for e in resolved_entries
)
- if not needs_attenuation:
+ if not needs_mask:
return None
# Build per-guide-token weights for all tracked guide tokens.
@@ -1159,16 +1218,11 @@ class LTXVModel(LTXBaseModel):
# Concatenate per-token weights for all tracked guides
tracked_weights = torch.cat(all_weights, dim=1) # (1, total_tracked)
- # Check if any weight is actually < 1.0 (otherwise no attenuation needed)
- if (tracked_weights >= 1.0).all():
+ # Skip when every weight is exactly 1.0 (additive bias would be 0).
+ if (tracked_weights == 1.0).all():
return None
- # Build the mask: guide tokens are at the end of the sequence.
- # Tracked guides come first (in order), untracked follow.
- return self._build_self_attention_mask(
- total_tokens, num_guide_tokens, total_tracked,
- tracked_weights, guide_start, device, dtype,
- )
+ return GuideAttentionMask(total_tokens, guide_start, total_tracked, tracked_weights)
@staticmethod
def _downsample_mask_to_latent(mask, f_lat, h_lat, w_lat):
@@ -1234,45 +1288,6 @@ class LTXVModel(LTXBaseModel):
return rearrange(latent_mask, "b 1 f h w -> b (f h w)")
- @staticmethod
- def _build_self_attention_mask(total_tokens, num_guide_tokens, tracked_count,
- tracked_weights, guide_start, device, dtype):
- """Build a log-space additive self-attention bias mask.
-
- Attenuates attention between noisy tokens and tracked guide tokens.
- Untracked guide tokens (at the end of the guide portion) keep full attention.
-
- Args:
- total_tokens: Total sequence length.
- num_guide_tokens: Total guide tokens (all guides) at end of sequence.
- tracked_count: Number of tracked guide tokens (first in the guide portion).
- tracked_weights: (1, tracked_count) tensor, values in [0, 1].
- guide_start: Index where guide tokens begin in the sequence.
- device: Target device.
- dtype: Target dtype.
-
- Returns:
- (1, 1, total_tokens, total_tokens) additive bias mask.
- 0.0 = full attention, negative = attenuated, finfo.min = effectively fully masked.
- """
- finfo = torch.finfo(dtype)
- mask = torch.zeros((1, 1, total_tokens, total_tokens), device=device, dtype=dtype)
- tracked_end = guide_start + tracked_count
-
- # Convert weights to log-space bias
- w = tracked_weights.to(device=device, dtype=dtype) # (1, tracked_count)
- log_w = torch.full_like(w, finfo.min)
- positive_mask = w > 0
- if positive_mask.any():
- log_w[positive_mask] = torch.log(w[positive_mask].clamp(min=finfo.tiny))
-
- # noisy → tracked guides: each noisy row gets the same per-guide weight
- mask[:, :, :guide_start, guide_start:tracked_end] = log_w.view(1, 1, 1, -1)
- # tracked guides → noisy: each guide row broadcasts its weight across noisy cols
- mask[:, :, guide_start:tracked_end, :guide_start] = log_w.view(1, 1, -1, 1)
-
- return mask
-
def _process_transformer_blocks(self, x, context, attention_mask, timestep, pe, transformer_options={}, self_attention_mask=None, **kwargs):
"""Process transformer blocks for LTXV."""
patches_replace = transformer_options.get("patches_replace", {})
diff --git a/comfy/ldm/lightricks/vae/audio_vae.py b/comfy/ldm/lightricks/vae/audio_vae.py
index fa0a00748..dd5320c8f 100644
--- a/comfy/ldm/lightricks/vae/audio_vae.py
+++ b/comfy/ldm/lightricks/vae/audio_vae.py
@@ -4,9 +4,6 @@ import math
import torch
import torchaudio
-import comfy.model_management
-import comfy.model_patcher
-import comfy.utils as utils
from comfy.ldm.mmaudio.vae.distributions import DiagonalGaussianDistribution
from comfy.ldm.lightricks.symmetric_patchifier import AudioPatchifier
from comfy.ldm.lightricks.vae.causal_audio_autoencoder import (
@@ -43,30 +40,6 @@ class AudioVAEComponentConfig:
return cls(autoencoder=audio_config, vocoder=vocoder_config)
-
-class ModelDeviceManager:
- """Manages device placement and GPU residency for the composed model."""
-
- def __init__(self, module: torch.nn.Module):
- load_device = comfy.model_management.get_torch_device()
- offload_device = comfy.model_management.vae_offload_device()
- self.patcher = comfy.model_patcher.ModelPatcher(module, load_device, offload_device)
-
- def ensure_model_loaded(self) -> None:
- comfy.model_management.free_memory(
- self.patcher.model_size(),
- self.patcher.load_device,
- )
- comfy.model_management.load_model_gpu(self.patcher)
-
- def move_to_load_device(self, tensor: torch.Tensor) -> torch.Tensor:
- return tensor.to(self.patcher.load_device)
-
- @property
- def load_device(self):
- return self.patcher.load_device
-
-
class AudioLatentNormalizer:
"""Applies per-channel statistics in patch space and restores original layout."""
@@ -132,23 +105,17 @@ class AudioPreprocessor:
class AudioVAE(torch.nn.Module):
"""High-level Audio VAE wrapper exposing encode and decode entry points."""
- def __init__(self, state_dict: dict, metadata: dict):
+ def __init__(self, metadata: dict):
super().__init__()
component_config = AudioVAEComponentConfig.from_metadata(metadata)
- vae_sd = utils.state_dict_prefix_replace(state_dict, {"audio_vae.": ""}, filter_keys=True)
- vocoder_sd = utils.state_dict_prefix_replace(state_dict, {"vocoder.": ""}, filter_keys=True)
-
self.autoencoder = CausalAudioAutoencoder(config=component_config.autoencoder)
if "bwe" in component_config.vocoder:
self.vocoder = VocoderWithBWE(config=component_config.vocoder)
else:
self.vocoder = Vocoder(config=component_config.vocoder)
- self.autoencoder.load_state_dict(vae_sd, strict=False)
- self.vocoder.load_state_dict(vocoder_sd, strict=False)
-
autoencoder_config = self.autoencoder.get_config()
self.normalizer = AudioLatentNormalizer(
AudioPatchifier(
@@ -168,18 +135,12 @@ class AudioVAE(torch.nn.Module):
n_fft=autoencoder_config["n_fft"],
)
- self.device_manager = ModelDeviceManager(self)
-
- def encode(self, audio: dict) -> torch.Tensor:
+ def encode(self, audio, sample_rate=44100) -> torch.Tensor:
"""Encode a waveform dictionary into normalized latent tensors."""
- waveform = audio["waveform"]
- waveform_sample_rate = audio["sample_rate"]
+ waveform = audio
+ waveform_sample_rate = sample_rate
input_device = waveform.device
- # Ensure that Audio VAE is loaded on the correct device.
- self.device_manager.ensure_model_loaded()
-
- waveform = self.device_manager.move_to_load_device(waveform)
expected_channels = self.autoencoder.encoder.in_channels
if waveform.shape[1] != expected_channels:
if waveform.shape[1] == 1:
@@ -190,7 +151,7 @@ class AudioVAE(torch.nn.Module):
)
mel_spec = self.preprocessor.waveform_to_mel(
- waveform, waveform_sample_rate, device=self.device_manager.load_device
+ waveform, waveform_sample_rate, device=waveform.device
)
latents = self.autoencoder.encode(mel_spec)
@@ -204,17 +165,13 @@ class AudioVAE(torch.nn.Module):
"""Decode normalized latent tensors into an audio waveform."""
original_shape = latents.shape
- # Ensure that Audio VAE is loaded on the correct device.
- self.device_manager.ensure_model_loaded()
-
- latents = self.device_manager.move_to_load_device(latents)
latents = self.normalizer.denormalize(latents)
target_shape = self.target_shape_from_latents(original_shape)
mel_spec = self.autoencoder.decode(latents, target_shape=target_shape)
waveform = self.run_vocoder(mel_spec)
- return self.device_manager.move_to_load_device(waveform)
+ return waveform
def target_shape_from_latents(self, latents_shape):
batch, _, time, _ = latents_shape
diff --git a/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py b/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py
index b556b128f..58b67d45a 100644
--- a/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_audio_autoencoder.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
import torch
from torch import nn
from torch.nn import functional as F
diff --git a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
index 998122c85..5975015e2 100644
--- a/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
+++ b/comfy/ldm/lightricks/vae/causal_video_autoencoder.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
import threading
import torch
from torch import nn
diff --git a/comfy/ldm/lumina/model.py b/comfy/ldm/lumina/model.py
index 9e432d5c0..d0ee97d33 100644
--- a/comfy/ldm/lumina/model.py
+++ b/comfy/ldm/lumina/model.py
@@ -1,5 +1,4 @@
# Code from: https://github.com/Alpha-VLLM/Lumina-Image-2.0/blob/main/models/model.py
-from __future__ import annotations
from typing import List, Optional, Tuple
diff --git a/comfy/ldm/modules/attention.py b/comfy/ldm/modules/attention.py
index b193fe5e8..55360535a 100644
--- a/comfy/ldm/modules/attention.py
+++ b/comfy/ldm/modules/attention.py
@@ -14,6 +14,8 @@ from .sub_quadratic_attention import efficient_dot_product_attention
from comfy import model_management
+TORCH_HAS_GQA = model_management.torch_version_numeric >= (2, 5)
+
if model_management.xformers_enabled():
import xformers
import xformers.ops
@@ -150,7 +152,12 @@ def attention_basic(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
b, _, dim_head = q.shape
dim_head //= heads
- scale = dim_head ** -0.5
+ if kwargs.get("enable_gqa", False) and q.shape[-3] != k.shape[-3]:
+ n_rep = q.shape[-3] // k.shape[-3]
+ k = k.repeat_interleave(n_rep, dim=-3)
+ v = v.repeat_interleave(n_rep, dim=-3)
+
+ scale = kwargs.get("scale", dim_head ** -0.5)
h = heads
if skip_reshape:
@@ -219,6 +226,10 @@ def attention_sub_quad(query, key, value, heads, mask=None, attn_precision=None,
b, _, dim_head = query.shape
dim_head //= heads
+ if "scale" in kwargs:
+ # Pre-scale query to match requested scale (cancels internal 1/sqrt(dim_head))
+ query = query * (kwargs["scale"] * dim_head ** 0.5)
+
if skip_reshape:
query = query.reshape(b * heads, -1, dim_head)
value = value.reshape(b * heads, -1, dim_head)
@@ -290,7 +301,7 @@ def attention_split(q, k, v, heads, mask=None, attn_precision=None, skip_reshape
b, _, dim_head = q.shape
dim_head //= heads
- scale = dim_head ** -0.5
+ scale = kwargs.get("scale", dim_head ** -0.5)
if skip_reshape:
q, k, v = map(
@@ -500,8 +511,13 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
if mask.ndim == 3:
mask = mask.unsqueeze(1)
+ # Pass through extra SDPA kwargs (scale, enable_gqa) if provided
+ # enable_gqa requires PyTorch 2.5+; older versions use manual KV expansion above
+ sdpa_keys = ("scale", "enable_gqa") if TORCH_HAS_GQA else ("scale",)
+ sdpa_extra = {k: v for k, v in kwargs.items() if k in sdpa_keys}
+
if SDP_BATCH_LIMIT >= b:
- out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
+ out = comfy.ops.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False, **sdpa_extra)
if not skip_output_reshape:
out = (
out.transpose(1, 2).reshape(b, -1, heads * dim_head)
@@ -519,7 +535,7 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
k[i : i + SDP_BATCH_LIMIT],
v[i : i + SDP_BATCH_LIMIT],
attn_mask=m,
- dropout_p=0.0, is_causal=False
+ dropout_p=0.0, is_causal=False, **sdpa_extra
).transpose(1, 2).reshape(-1, q.shape[2], heads * dim_head)
return out
@@ -725,12 +741,12 @@ optimized_attention = attention_basic
if model_management.sage_attention_enabled():
logging.info("Using sage attention")
optimized_attention = attention_sage
-elif model_management.xformers_enabled():
- logging.info("Using xformers attention")
- optimized_attention = attention_xformers
elif model_management.flash_attention_enabled():
logging.info("Using Flash Attention")
optimized_attention = attention_flash
+elif model_management.xformers_enabled():
+ logging.info("Using xformers attention")
+ optimized_attention = attention_xformers
elif model_management.pytorch_attention_enabled():
logging.info("Using pytorch attention")
optimized_attention = attention_pytorch
diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py
index 0dc8fe789..9ab3c463c 100644
--- a/comfy/ldm/modules/diffusionmodules/mmdit.py
+++ b/comfy/ldm/modules/diffusionmodules/mmdit.py
@@ -211,7 +211,7 @@ class TimestepEmbedder(nn.Module):
Embeds scalar timesteps into vector representations.
"""
- def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None):
+ def __init__(self, hidden_size, frequency_embedding_size=256, output_size=None, dtype=None, device=None, operations=None, max_period=10000):
super().__init__()
if output_size is None:
output_size = hidden_size
@@ -221,9 +221,10 @@ class TimestepEmbedder(nn.Module):
operations.Linear(hidden_size, output_size, bias=True, dtype=dtype, device=device),
)
self.frequency_embedding_size = frequency_embedding_size
+ self.max_period = max_period
def forward(self, t, dtype, **kwargs):
- t_freq = timestep_embedding(t, self.frequency_embedding_size).to(dtype)
+ t_freq = timestep_embedding(t, self.frequency_embedding_size, max_period=self.max_period).to(dtype)
t_emb = self.mlp(t_freq)
return t_emb
diff --git a/comfy/ldm/modules/diffusionmodules/openaimodel.py b/comfy/ldm/modules/diffusionmodules/openaimodel.py
index 295310df6..4b92c44cf 100644
--- a/comfy/ldm/modules/diffusionmodules/openaimodel.py
+++ b/comfy/ldm/modules/diffusionmodules/openaimodel.py
@@ -34,6 +34,16 @@ class TimestepBlock(nn.Module):
#This is needed because accelerate makes a copy of transformer_options which breaks "transformer_index"
def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, output_shape=None, time_context=None, num_video_frames=None, image_only_indicator=None):
for layer in ts:
+ if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
+ found_patched = False
+ for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
+ if isinstance(layer, class_type):
+ x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
+ found_patched = True
+ break
+ if found_patched:
+ continue
+
if isinstance(layer, VideoResBlock):
x = layer(x, emb, num_video_frames, image_only_indicator)
elif isinstance(layer, TimestepBlock):
@@ -49,15 +59,6 @@ def forward_timestep_embed(ts, x, emb, context=None, transformer_options={}, out
elif isinstance(layer, Upsample):
x = layer(x, output_shape=output_shape)
else:
- if "patches" in transformer_options and "forward_timestep_embed_patch" in transformer_options["patches"]:
- found_patched = False
- for class_type, handler in transformer_options["patches"]["forward_timestep_embed_patch"]:
- if isinstance(layer, class_type):
- x = handler(layer, x, emb, context, transformer_options, output_shape, time_context, num_video_frames, image_only_indicator)
- found_patched = True
- break
- if found_patched:
- continue
x = layer(x)
return x
@@ -894,6 +895,12 @@ class UNetModel(nn.Module):
h = forward_timestep_embed(self.middle_block, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator)
h = apply_control(h, control, 'middle')
+ if "middle_block_after_patch" in transformer_patches:
+ patch = transformer_patches["middle_block_after_patch"]
+ for p in patch:
+ out = p({"h": h, "x": x, "emb": emb, "context": context, "y": y,
+ "timesteps": timesteps, "transformer_options": transformer_options})
+ h = out["h"]
for id, module in enumerate(self.output_blocks):
transformer_options["block"] = ("output", id)
@@ -905,8 +912,9 @@ class UNetModel(nn.Module):
for p in patch:
h, hsp = p(h, hsp, transformer_options)
- h = th.cat([h, hsp], dim=1)
- del hsp
+ if hsp is not None:
+ h = th.cat([h, hsp], dim=1)
+ del hsp
if len(hs) > 0:
output_shape = hs[-1].shape
else:
diff --git a/comfy/ldm/modules/diffusionmodules/util.py b/comfy/ldm/modules/diffusionmodules/util.py
index 233011dc9..aed5c149c 100644
--- a/comfy/ldm/modules/diffusionmodules/util.py
+++ b/comfy/ldm/modules/diffusionmodules/util.py
@@ -140,7 +140,7 @@ def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
alphas = alphacums[ddim_timesteps]
alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
- # according the the formula provided in https://arxiv.org/abs/2010.02502
+ # according to the formula provided in https://arxiv.org/abs/2010.02502
sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
if verbose:
logging.info(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
diff --git a/comfy/ldm/moge/geometry.py b/comfy/ldm/moge/geometry.py
new file mode 100644
index 000000000..d1a1e445f
--- /dev/null
+++ b/comfy/ldm/moge/geometry.py
@@ -0,0 +1,188 @@
+"""Pure-torch + scipy geometry helpers for MoGe inference and mesh export."""
+
+
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from scipy.optimize import least_squares
+
+def normalized_view_plane_uv(width: int, height: int, aspect_ratio: Optional[float] = None,
+ dtype: Optional[torch.dtype] = None, device: Optional[torch.device] = None) -> torch.Tensor:
+ """Return a (height, width, 2) grid of (u, v) view-plane coords sampled at pixel centres; the image corners sit at +/-(aspect_ratio, 1)/sqrt(1 + aspect_ratio**2), i.e. +/-(W, H)/diagonal when aspect_ratio defaults to width / height."""
+ if aspect_ratio is None:
+ aspect_ratio = width / height
+ span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5
+ span_y = 1.0 / (1 + aspect_ratio ** 2) ** 0.5
+ u = torch.linspace(-span_x * (width - 1) / width, span_x * (width - 1) / width, width, dtype=dtype, device=device)
+ v = torch.linspace(-span_y * (height - 1) / height, span_y * (height - 1) / height, height, dtype=dtype, device=device)
+ u, v = torch.meshgrid(u, v, indexing="xy")
+ return torch.stack([u, v], dim=-1)
+
+
+def intrinsics_from_focal_center(fx: torch.Tensor, fy: torch.Tensor, cx: torch.Tensor, cy: torch.Tensor) -> torch.Tensor:
+ """Assemble (..., 3, 3) intrinsics [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]; fx, fy, cx, cy are converted with torch.as_tensor and broadcast against each other first."""
+ fx, fy, cx, cy = [torch.as_tensor(v) for v in (fx, fy, cx, cy)]
+ fx, fy, cx, cy = torch.broadcast_tensors(fx, fy, cx, cy)
+ zero = torch.zeros_like(fx)
+ one = torch.ones_like(fx)
+ return torch.stack([
+ torch.stack([fx, zero, cx], dim=-1),
+ torch.stack([zero, fy, cy], dim=-1),
+ torch.stack([zero, zero, one], dim=-1),
+ ], dim=-2)
+
+
+def depth_map_to_point_map(depth: torch.Tensor, intrinsics: torch.Tensor) -> torch.Tensor:
+ """Back-project a (..., H, W) depth map through K^-1 to (..., H, W, 3) camera-space points.
+
+ Intrinsics use normalized image coords (x in [0, 1] left->right, y in [0, 1] top->bottom); each pixel is sampled at its centre ((col + 0.5) / W, (row + 0.5) / H) and the resulting K^-1 ray is scaled by that pixel's depth.
+ """
+ H, W = depth.shape[-2:]
+ device, dtype = depth.device, depth.dtype
+ u = (torch.arange(W, dtype=dtype, device=device) + 0.5) / W
+ v = (torch.arange(H, dtype=dtype, device=device) + 0.5) / H
+ grid_v, grid_u = torch.meshgrid(v, u, indexing="ij")
+ pix = torch.stack([grid_u, grid_v, torch.ones_like(grid_u)], dim=-1)
+ K_inv = torch.linalg.inv(intrinsics)
+ rays = torch.einsum("...ij,hwj->...hwi", K_inv, pix)
+ return rays * depth.unsqueeze(-1)
+
+
+def _solve_optimal_shift(uv: np.ndarray, xyz: np.ndarray,
+ focal: Optional[float] = None) -> Tuple[float, float]:
+ """LM-solve (scipy least_squares, started at shift=0) for the z-shift minimizing f * xy / (z + shift) - uv; when focal is None, f is the closed-form least-squares focal for the current shift and is also recovered. Returns (shift, focal)."""
+ uv = uv.reshape(-1, 2)
+ xy = xyz[..., :2].reshape(-1, 2)
+ z = xyz[..., 2].reshape(-1)
+
+ def fn(shift):
+ xy_proj = xy / (z + shift)[:, None]
+ f = focal if focal is not None else (xy_proj * uv).sum() / np.square(xy_proj).sum()
+ return (f * xy_proj - uv).ravel()
+
+ sol = least_squares(fn, x0=0.0, ftol=1e-3, method="lm")
+ shift = float(np.asarray(sol["x"]).squeeze())
+ if focal is None:
+ xy_proj = xy / (z + shift)[:, None]
+ focal = float((xy_proj * uv).sum() / np.square(xy_proj).sum())
+ return shift, focal
+
+
+def recover_focal_shift(points: torch.Tensor, mask: Optional[torch.Tensor] = None,
+ focal: Optional[torch.Tensor] = None, downsample_size: Tuple[int, int] = (64, 64)
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Recover the focal length and z-shift that turn points into a metric point map.
+
+ Optical center is at the image center; returned focal is relative to half the image diagonal. The solve runs per batch item on CPU (NumPy/SciPy) over a nearest-neighbour downsample of size downsample_size.
+ Returns (focal, shift), each shaped points.shape[:-3]. shift (and focal, when it is solved for) is built on points' device/dtype; a caller-supplied focal is returned reshaped only. Items whose downsampled mask keeps fewer than 2 pixels skip the solve and get shift=0.0 (and focal=1.0 when focal is None).
+ """
+ shape = points.shape
+ H, W = shape[-3], shape[-2]
+ points_b = points.reshape(-1, H, W, 3)
+ mask_b = None if mask is None else mask.reshape(-1, H, W)
+ focal_b = None if focal is None else focal.reshape(-1)
+
+ uv = normalized_view_plane_uv(W, H, dtype=points.dtype, device=points.device)
+
+ points_lr = F.interpolate(points_b.permute(0, 3, 1, 2), downsample_size, mode="nearest").permute(0, 2, 3, 1)
+ uv_lr = F.interpolate(uv.unsqueeze(0).permute(0, 3, 1, 2), downsample_size, mode="nearest").squeeze(0).permute(1, 2, 0)
+ mask_lr = None
+ if mask_b is not None:
+ mask_lr = F.interpolate(mask_b.to(torch.float32).unsqueeze(1), downsample_size, mode="nearest").squeeze(1) > 0
+
+ uv_np = uv_lr.detach().cpu().numpy()
+ points_np = points_lr.detach().cpu().numpy()
+ mask_np = None if mask_lr is None else mask_lr.detach().cpu().numpy()
+ focal_np = None if focal_b is None else focal_b.detach().cpu().numpy()
+
+ out_focal: list = []
+ out_shift: list = []
+ for i in range(points_b.shape[0]):
+ if mask_np is None:
+ xyz_i = points_np[i].reshape(-1, 3)
+ uv_i = uv_np.reshape(-1, 2)
+ else:
+ sel = mask_np[i]
+ if sel.sum() < 2:
+ out_focal.append(1.0)
+ out_shift.append(0.0)
+ continue
+ xyz_i = points_np[i][sel]
+ uv_i = uv_np[sel]
+ if focal_np is None:
+ shift_i, focal_i = _solve_optimal_shift(uv_i, xyz_i)
+ out_focal.append(focal_i)
+ else:
+ shift_i, _ = _solve_optimal_shift(uv_i, xyz_i, focal=float(focal_np[i]))
+ out_shift.append(shift_i)
+
+ shift_t = torch.tensor(out_shift, device=points.device, dtype=points.dtype).reshape(shape[:-3])
+ if focal is None:
+ focal_t = torch.tensor(out_focal, device=points.device, dtype=points.dtype).reshape(shape[:-3])
+ else:
+ focal_t = focal.reshape(shape[:-3])
+ return focal_t, shift_t
+
+
+def depth_map_edge(depth: torch.Tensor, atol: Optional[float] = None, rtol: Optional[float] = None, kernel_size: int = 3) -> torch.Tensor:
+ """Per-pixel boolean (same shape as depth): True where the local kernel_size window's max-min span exceeds atol, or exceeds rtol times the depth (depth clamped to >= 1e-6); all False when both atol and rtol are None. NOTE(review): padding is kernel_size // 2, so an odd kernel_size looks required to keep the output size - confirm."""
+ shape = depth.shape
+ d = depth.reshape(-1, 1, *shape[-2:])
+ pad = kernel_size // 2
+ diff = F.max_pool2d(d, kernel_size, stride=1, padding=pad) + F.max_pool2d(-d, kernel_size, stride=1, padding=pad)
+ edge = torch.zeros_like(d, dtype=torch.bool)
+ if atol is not None:
+ edge |= diff > atol
+ if rtol is not None:
+ edge |= (diff / d.clamp_min(1e-6)).nan_to_num_() > rtol
+ return edge.reshape(*shape)
+
+
+def triangulate_grid_mesh(points: torch.Tensor, mask: Optional[torch.Tensor] = None, decimation: int = 1, discontinuity_threshold: float = 0.04,
+ depth: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """Triangulate a (H, W, 3) point map into (vertices, faces, uvs) on CPU.
+
+ Vertices: pixels with finite coords (passing optional mask) that are not flagged as depth edges when discontinuity_threshold > 0. Quads with four valid corners
+ become two triangles (a, b, c) and (a, c, d). Edges are detected at full resolution; decimation > 1 then keeps every decimation-th row/column. uvs are x / (W - 1), y / (H - 1) on the decimated grid.
+ depth overrides the scalar used for the rtol edge check; pass radial depth for panoramas (the default points[..., 2] goes negative below the equator).
+ """
+ points = points.detach().cpu()
+ finite = torch.isfinite(points).all(dim=-1)
+ if mask is None:
+ mask = finite
+ else:
+ mask = mask.detach().cpu().to(torch.bool) & finite
+
+ if discontinuity_threshold > 0:
+ d = depth.detach().cpu() if depth is not None else points[..., 2]
+ # Zero the depth wherever the point is non-finite (inf or nan) so max-pool doesn't poison neighbourhoods (mask above already excludes those pixels).
+ d_finite = torch.where(finite, d, torch.zeros_like(d))
+ edge = depth_map_edge(d_finite, rtol=discontinuity_threshold)
+ mask = mask & ~edge
+
+ if decimation > 1:
+ points = points[::decimation, ::decimation].contiguous()
+ mask = mask[::decimation, ::decimation].contiguous()
+ H, W = points.shape[:2]
+
+ flat_mask = mask.reshape(-1)
+ idx = torch.full((H * W,), -1, dtype=torch.long)
+ n_valid = int(flat_mask.sum().item())
+ idx[flat_mask] = torch.arange(n_valid, dtype=torch.long)
+ idx = idx.reshape(H, W)
+
+ vertices = points.reshape(-1, 3)[flat_mask].contiguous()
+
+ yy, xx = torch.meshgrid(torch.arange(H), torch.arange(W), indexing="ij")
+ u = xx.float() / max(W - 1, 1)
+ v = yy.float() / max(H - 1, 1)
+ uvs = torch.stack([u, v], dim=-1).reshape(-1, 2)[flat_mask].contiguous()
+
+ a, b, c, d = idx[:-1, :-1], idx[:-1, 1:], idx[1:, 1:], idx[1:, :-1]
+ quad_ok = (a >= 0) & (b >= 0) & (c >= 0) & (d >= 0)
+ a, b, c, d = a[quad_ok], b[quad_ok], c[quad_ok], d[quad_ok]
+ faces = torch.cat([torch.stack([a, b, c], dim=-1), torch.stack([a, c, d], dim=-1)], dim=0).contiguous()
+ return vertices, faces, uvs
diff --git a/comfy/ldm/moge/model.py b/comfy/ldm/moge/model.py
new file mode 100644
index 000000000..1695626bc
--- /dev/null
+++ b/comfy/ldm/moge/model.py
@@ -0,0 +1,346 @@
+"""MoGe v1 / v2 inference modules and a state-dict-driven builder.
+
+V1: DINOv2 backbone + multi-output head (points, mask).
+V2: DINOv2 encoder + neck + per-output heads (points, mask, normal, optional metric-scale MLP).
+"""
+
+
+from numbers import Number
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ops
+import comfy.model_management
+import comfy.model_patcher
+
+from comfy.image_encoders.dino2 import Dinov2Model
+
+from .geometry import depth_map_to_point_map, intrinsics_from_focal_center, recover_focal_shift
+from .modules import ConvStack, DINOv2Encoder, HeadV1, MLP, _view_plane_uv_grid
+
+
+def _remap_points(points: torch.Tensor) -> torch.Tensor:
+    """Exp-remap raw head output into 3D points.
+
+    The last axis holds (x, y, z). z is replaced by exp(z), and x, y are multiplied by that
+    new z, so the result is (x * exp(z), y * exp(z), exp(z)).
+    """
+    # split() raises unless the last axis has exactly 3 entries.
+    lateral, log_depth = torch.split(points, [2, 1], dim=-1)
+    depth = log_depth.exp()
+    return torch.cat((lateral * depth, depth), dim=-1)
+
+
+def _detect_dinov2(sd: dict, prefix: str) -> Dict[str, Any]:
+    """Infer the DINOv2 config dict from the (already remapped) keys under prefix.
+
+    Width comes from the last axis of the class token, the layer count from the highest
+    "encoder.layer.<i>" index, and the head count is width // 64.
+    """
+    # All shipped MoGe checkpoints use plain DINOv2, hence the fixed eps and no SwiGLU FFN.
+    width = sd[f"{prefix}embeddings.cls_token"].shape[-1]
+    stem = f"{prefix}encoder.layer."
+    layer_ids = [int(key[len(stem):].split(".", 1)[0]) for key in sd if key.startswith(stem)]
+    return {
+        "hidden_size": width,
+        "num_attention_heads": width // 64,
+        "num_hidden_layers": max(layer_ids) + 1,  # indices are 0-based
+        "layer_norm_eps": 1e-6,
+        "use_swiglu_ffn": False,
+    }
+
+
+class MoGeModelV1(nn.Module):
+    """MoGe v1: a DINOv2 backbone feeding HeadV1, which predicts a point map and a mask.
+
+    Attribute names (backbone, head, image_mean, image_std) are state-dict keys and are loaded
+    with strict=True in from_state_dict, so they must not be renamed.
+    """
+
+    image_mean: torch.Tensor
+    image_std: torch.Tensor
+
+    intermediate_layers = 4  # how many trailing backbone layers feed the head
+    num_tokens_range: Tuple[Number, Number] = (1200, 2500)
+    mask_threshold = 0.5
+
+    def __init__(self, backbone: Dict[str, Any], dim_upsample: List[int] = (256, 128, 128),
+                 num_res_blocks: int = 1, dim_times_res_block_hidden: int = 1,
+                 dtype=None, device=None, operations=comfy.ops.manual_cast):
+        super().__init__()
+        self.backbone = Dinov2Model(backbone, dtype, device, operations)
+        self.head = HeadV1(dim_in=backbone["hidden_size"], dim_upsample=list(dim_upsample),
+                           num_res_blocks=num_res_blocks, dim_times_res_block_hidden=dim_times_res_block_hidden,
+                           dtype=dtype, device=device, operations=operations)
+        # Per-channel normalisation constants, shaped to broadcast over (B, 3, H, W).
+        for name, rgb in (("image_mean", [0.485, 0.456, 0.406]), ("image_std", [0.229, 0.224, 0.225])):
+            self.register_buffer(name, torch.tensor(rgb).view(1, 3, 1, 1))
+
+    def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]:
+        """Predict a point map and a mask at the input resolution.
+
+        image is indexed as (B, C, H, W). Returns {"points": (B, H, W, 3), "mask": (B, H, W)}.
+        The mask is the head output bilinearly resized; no activation is applied here.
+        """
+        height, width = image.shape[-2:]
+        # Scale so the working image covers roughly num_tokens patches of 14 x 14 pixels.
+        scale = ((num_tokens * 14 ** 2) / (height * width)) ** 0.5
+        work_h, work_w = int(height * scale), int(width * scale)
+        resized = F.interpolate(image, (work_h, work_w), mode="bicubic", align_corners=False, antialias=True)
+        resized = (resized - self.image_mean) / self.image_std
+        # The backbone input is snapped down to a whole number of 14-pixel patches.
+        patch_aligned = F.interpolate(resized, (work_h // 14 * 14, work_w // 14 * 14),
+                                      mode="bilinear", align_corners=False, antialias=True)
+
+        total_layers = len(self.backbone.encoder.layer)
+        wanted = list(range(total_layers - self.intermediate_layers, total_layers))
+        features = self.backbone.get_intermediate_layers(patch_aligned, wanted, apply_norm=True)
+
+        # The head receives the normalised, not patch-aligned, image to derive its output size.
+        raw_points, raw_mask = self.head(features, resized)
+        points = F.interpolate(raw_points.float(), (height, width), mode="bilinear", align_corners=False)
+        points = _remap_points(points.permute(0, 2, 3, 1))
+
+        mask = F.interpolate(raw_mask.float(), (height, width), mode="bilinear", align_corners=False).squeeze(1)
+
+        return {"points": points, "mask": mask}
+
+    @classmethod
+    def from_state_dict(cls, sd, dtype=None, device=None, operations=comfy.ops.manual_cast):
+        """Infer the v1 head layout from checkpoint keys, build the model, and strictly load sd into it."""
+        stage_root = "head.upsample_blocks."
+        n_stages = 1 + max(int(key.split(".")[2]) for key in sd if key.startswith(stage_root))
+        # Child 0.0 of every stage is the first layer of its upsampler; dim 1 of its weight is used as the stage width.
+        widths = [sd[f"{stage_root}{i}.0.0.weight"].shape[1] for i in range(n_stages)]
+        # A stage is Sequential[upsampler, *res_blocks], so the largest child index in stage 0 is the res-block count.
+        res_blocks = max({int(key.split(".")[3]) for key in sd if key.startswith(f"{stage_root}0.")})
+        hidden_width = sd[f"{stage_root}0.1.layers.2.weight"].shape[0]
+        hidden_multiplier = max(hidden_width // widths[0], 1)
+        model = cls(backbone=_detect_dinov2(sd, prefix="backbone."),
+                    dim_upsample=widths, num_res_blocks=res_blocks, dim_times_res_block_hidden=hidden_multiplier,
+                    dtype=dtype, device=device, operations=operations)
+        model.load_state_dict(sd, strict=True)
+        return model
+
+
+class MoGeModelV2(nn.Module):
+    """MoGe v2: DINOv2 encoder + neck + per-output heads (points / mask / metric scale, optional normal).
+
+    Attribute names are state-dict keys loaded with strict=True in from_state_dict. normal_head is
+    only created when a normal-head config is supplied; forward() checks for it with hasattr.
+    """
+
+    intermediate_layers = 4
+    num_tokens_range: Tuple[Number, Number] = (1200, 3600)
+
+    def __init__(self,
+                 encoder: Dict[str, Any],
+                 neck: Dict[str, Any],
+                 points_head: Dict[str, Any],
+                 mask_head: Dict[str, Any],
+                 scale_head: Dict[str, Any],
+                 normal_head: Optional[Dict[str, Any]] = None,
+                 dtype=None, device=None, operations=comfy.ops.manual_cast):
+        super().__init__()
+        factory = {"dtype": dtype, "device": device, "operations": operations}
+        self.encoder = DINOv2Encoder(**encoder, **factory)
+        self.neck = ConvStack(**neck, **factory)
+        self.points_head = ConvStack(**points_head, **factory)
+        self.mask_head = ConvStack(**mask_head, **factory)
+        self.scale_head = MLP(**scale_head, **factory)
+        if normal_head is not None:
+            self.normal_head = ConvStack(**normal_head, **factory)
+
+    def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]:
+        """Run the model on a (B, C, H, W) image.
+
+        Returns "points" (B, H, W, 3), "mask" (B, H, W) after a sigmoid, "metric_scale" (exp of the
+        scale-head output with dim 1 squeezed) and, when a normal head exists, unit-length "normal".
+        """
+        batch, _, height, width = image.shape
+        aspect_ratio = width / height
+        # Token grid of roughly num_tokens cells with the image's aspect ratio.
+        base_h = round((num_tokens / aspect_ratio) ** 0.5)
+        base_w = round((num_tokens * aspect_ratio) ** 0.5)
+
+        feat_top, cls_token = self.encoder(image, base_h, base_w, return_class_token=True)
+
+        # Five-level pyramid, doubling in size per level. Every level is a UV grid; level 0 also
+        # gets the encoder features concatenated in front of its UV channels.
+        pyramid = [
+            _view_plane_uv_grid(batch, base_h * (2 ** level), base_w * (2 ** level), aspect_ratio, image.dtype, image.device)
+            for level in range(5)
+        ]
+        pyramid[0] = torch.cat([feat_top, pyramid[0]], dim=1)
+
+        feats = self.neck(pyramid)
+
+        def _to_input_size(t):
+            return F.interpolate(t, (height, width), mode="bilinear", align_corners=False)
+
+        # Each head returns one tensor per level; only the last (finest) one is used.
+        points = _remap_points(_to_input_size(self.points_head(feats)[-1]).permute(0, 2, 3, 1))
+        mask = _to_input_size(self.mask_head(feats)[-1]).squeeze(1).sigmoid()
+        metric_scale = self.scale_head(cls_token).squeeze(1).exp()
+
+        result = {"points": points, "mask": mask, "metric_scale": metric_scale}
+        if hasattr(self, "normal_head"):
+            normal = _to_input_size(self.normal_head(feats)[-1]).permute(0, 2, 3, 1)
+            result["normal"] = F.normalize(normal, dim=-1)
+        return result
+
+    @classmethod
+    def from_state_dict(cls, sd, dtype=None, device=None, operations=comfy.ops.manual_cast):
+        """Infer the v2 encoder / neck / head configs from checkpoint keys, build, and strictly load sd."""
+        backbone = _detect_dinov2(sd, prefix="encoder.backbone.")
+        taps = cls.intermediate_layers
+        stride = backbone["num_hidden_layers"] // taps
+        encoder = {
+            "backbone": backbone,
+            # Evenly spaced tap layers: stride - 1, 2 * stride - 1, ...
+            "intermediate_layers": [stride * (i + 1) - 1 for i in range(taps)],
+            "dim_out": sd["encoder.output_projections.0.weight"].shape[0],
+        }
+        # scale_head is Sequential[Linear, ReLU, ..., Linear]; only Linear entries own keys, and
+        # a Linear weight is (out, in), so dims = [in of first] + [out of each].
+        linear_ids = sorted({int(key.split(".")[1]) for key in sd if key.startswith("scale_head.")})
+        first_weight = sd[f"scale_head.{linear_ids[0]}.weight"]
+        cfg: Dict[str, Any] = {
+            "encoder": encoder,
+            "neck": cls._detect_convstack(sd, "neck."),
+            "points_head": cls._detect_convstack(sd, "points_head."),
+            "mask_head": cls._detect_convstack(sd, "mask_head."),
+            "scale_head": {"dims": [first_weight.shape[1]] + [sd[f"scale_head.{i}.weight"].shape[0] for i in linear_ids]},
+        }
+        if any(key.startswith("normal_head.") for key in sd):
+            cfg["normal_head"] = cls._detect_convstack(sd, "normal_head.")
+        model = cls(**cfg, dtype=dtype, device=device, operations=operations)
+        model.load_state_dict(sd, strict=True)
+        return model
+
+    @staticmethod
+    def _detect_convstack(sd: dict, prefix: str) -> Dict[str, Any]:
+        """Reconstruct the ConvStack constructor kwargs from the checkpoint keys under prefix.
+
+        Levels are counted from input_blocks weights; a missing output_blocks weight yields
+        dim_out None; a resampler with a weight at child 0 is "conv_transpose", otherwise "bilinear";
+        norms are enabled when the first residual block has a weight at layers.0.
+        """
+        in_root = f"{prefix}input_blocks."
+        level_ids = [int(key[len(in_root):].split(".")[0])
+                     for key in sd if key.startswith(in_root) and key.endswith(".weight")]
+        n_levels = 1 + max(level_ids)
+
+        in_shapes = [sd[f"{in_root}{i}.weight"].shape for i in range(n_levels)]
+        normed = f"{prefix}res_blocks.0.0.layers.0.weight" in sd
+
+        def out_width(i):
+            key = f"{prefix}output_blocks.{i}.weight"
+            return sd[key].shape[0] if key in sd else None
+
+        def res_block_count(i):
+            root = f"{prefix}res_blocks.{i}."
+            return len({int(key[len(root):].split(".")[0]) for key in sd if key.startswith(root)})
+
+        return {
+            "dim_in": [shape[1] for shape in in_shapes],
+            "dim_res_blocks": [shape[0] for shape in in_shapes],
+            "dim_out": [out_width(i) for i in range(n_levels)],
+            "num_res_blocks": [res_block_count(i) for i in range(n_levels)],
+            "resamplers": ["conv_transpose" if f"{prefix}resamplers.{i}.0.weight" in sd else "bilinear"
+                           for i in range(n_levels - 1)],
+            "res_block_in_norm": "layer_norm" if normed else "none",
+            "res_block_hidden_norm": "group_norm" if normed else "none",
+        }
+
+
+# MoGe checkpoints store the DINOv2 backbone under Meta-style key names. These tables map them to
+# the names the ComfyUI DINOv2 port expects; the fused qkv tensors are split into Q/K/V separately
+# in _remap_state_dict.
+# Exact-match renames for keys directly under the backbone prefix.
+_DINOV2_TOPLEVEL_RENAMES = {
+    "patch_embed.proj.weight": "embeddings.patch_embeddings.projection.weight",
+    "patch_embed.proj.bias": "embeddings.patch_embeddings.projection.bias",
+    "cls_token": "embeddings.cls_token",
+    "pos_embed": "embeddings.position_embeddings",
+    "register_tokens": "embeddings.register_tokens",
+    "mask_token": "embeddings.mask_token",
+    "norm.weight": "layernorm.weight",
+    "norm.bias": "layernorm.bias",
+}
+# Substring replacements applied, in this order, to the part of a "blocks.<i>." key after the index.
+_DINOV2_BLOCK_RENAMES = [
+    ("ls1.gamma", "layer_scale1.lambda1"),
+    ("ls2.gamma", "layer_scale2.lambda1"),
+    ("attn.proj.", "attention.output.dense."),
+    ("mlp.w12.", "mlp.weights_in."),
+    ("mlp.w3.", "mlp.weights_out."),
+]
+
+
+def _remap_state_dict(sd: dict) -> dict:
+    """Rename Meta-style DINOv2 backbone keys to ComfyUI naming and split fused qkv tensors.
+
+    A dict holding both "model" and "model_config" is unwrapped to sd["model"] first. Keys outside
+    the backbone prefix, and backbone keys with no rename rule, are copied through unchanged.
+    """
+    if "model" in sd and "model_config" in sd:
+        sd = sd["model"]
+    v2_root = "encoder.backbone."
+    prefix = v2_root if any(name.startswith(v2_root) for name in sd) else "backbone."
+    remapped: dict = {}
+    for name, tensor in sd.items():
+        rel = name[len(prefix):] if name.startswith(prefix) else None
+        if rel is None:
+            remapped[name] = tensor  # not a backbone weight
+        elif rel in _DINOV2_TOPLEVEL_RENAMES:
+            remapped[prefix + _DINOV2_TOPLEVEL_RENAMES[rel]] = tensor
+        elif not rel.startswith("blocks."):
+            remapped[name] = tensor  # backbone key with no rename rule
+        else:
+            _, layer, sub = rel.split(".", 2)
+            target = f"{prefix}encoder.layer.{layer}"
+            if sub in ("attn.qkv.weight", "attn.qkv.bias"):
+                kind = sub.rsplit(".", 1)[1]
+                # The fused tensor is cut into three chunks along dim 0, assigned to Q, K, V in that order.
+                q_part, k_part, v_part = tensor.chunk(3, dim=0)
+                for part, piece in (("query", q_part), ("key", k_part), ("value", v_part)):
+                    remapped[f"{target}.attention.attention.{part}.{kind}"] = piece
+            else:
+                for old, new in _DINOV2_BLOCK_RENAMES:
+                    sub = sub.replace(old, new)
+                remapped[f"{target}.{sub}"] = tensor
+    return remapped
+
+
+def build_from_state_dict(sd: dict, dtype=None, device=None, operations=comfy.ops.manual_cast) -> nn.Module:
+    """Remap checkpoint keys, then build MoGe v2 if any key starts with "encoder.backbone.", otherwise v1."""
+    sd = _remap_state_dict(sd)
+    is_v2 = any(key.startswith("encoder.backbone.") for key in sd)
+    model_cls = MoGeModelV2 if is_v2 else MoGeModelV1
+    return model_cls.from_state_dict(sd, dtype=dtype, device=device, operations=operations)
+
+
+class MoGeModel:
+ """Loaded MoGe model + ComfyUI memory management.
+
+ Builds a v1 or v2 network from a state dict, wraps it in a CoreModelPatcher, and exposes infer().
+ """
+
+ def __init__(self, state_dict: dict):
+ # Reuse the text-encoder device / offload-device / dtype choices from model_management (closest match).
+ self.load_device = comfy.model_management.text_encoder_device()
+ offload_device = comfy.model_management.text_encoder_offload_device()
+ self.dtype = comfy.model_management.text_encoder_dtype(self.load_device)
+
+ # The network is created on the offload device and put in eval mode.
+ self.model = build_from_state_dict(state_dict, dtype=self.dtype, device=offload_device, operations=comfy.ops.manual_cast).eval()
+ self.patcher = comfy.model_patcher.CoreModelPatcher(self.model, load_device=self.load_device, offload_device=offload_device)
+ # MoGeModelV2 has an `encoder` attribute; MoGeModelV1 uses `backbone` instead.
+ self.version = "v2" if hasattr(self.model, "encoder") else "v1"
+ self.mask_threshold = float(getattr(self.model, "mask_threshold", 0.5))
+ nt = getattr(self.model, "num_tokens_range", (1200, 2500 if self.version == "v1" else 3600))
+ self.num_tokens_range = (int(nt[0]), int(nt[1]))
+
+ def infer(self, image: torch.Tensor, num_tokens: Optional[int] = None,
+ resolution_level: int = 9, fov_x: Optional[Union[Number, torch.Tensor]] = None,
+ force_projection: bool = True, apply_mask: bool = True,
+ apply_metric_scale: bool = True
+ ) -> Dict[str, torch.Tensor]:
+ """Run a single MoGe forward + post-process pass. image is (B, 3, H, W) in [0, 1].
+
+ num_tokens overrides resolution_level; otherwise level 0 maps to the low end of num_tokens_range
+ and level 9 to the high end. fov_x, when given, is a horizontal field of view in degrees and
+ replaces the focal recovery. Returns a dict with "points", "depth", "intrinsics", "mask", plus
+ "normal" when the model outputs one.
+ """
+ # NOTE(review): indentation is flattened in this text, so block nesting below is read from the
+ # statements themselves. In particular, whether `if normal is not None` near the end sits inside
+ # `if apply_mask` cannot be confirmed here; verify against the real file.
+ comfy.model_management.load_model_gpu(self.patcher)
+ image = image.to(device=self.load_device, dtype=self.dtype)
+ H, W = image.shape[-2:]
+ aspect_ratio = W / H
+
+ if num_tokens is None:
+ lo, hi = self.num_tokens_range
+ num_tokens = int(lo + (resolution_level / 9) * (hi - lo))
+
+ out = self.model.forward(image, num_tokens=num_tokens)
+ points = out["points"].float() # recover_focal_shift goes through scipy on CPU; needs fp32.
+ mask_binary = out["mask"] > self.mask_threshold
+ # Only the v2 forward() returns these two keys; they stay None for v1.
+ normal = out.get("normal")
+ metric_scale = out.get("metric_scale")
+
+ # Length of the diagonal of a (aspect_ratio x 1) rectangle.
+ diag = (1 + aspect_ratio ** 2) ** 0.5
+
+ def focal_from_fov_deg(deg):
+ # Focal for a horizontal FoV given in degrees, in the units the recovered focal uses below
+ # (presumably relative to half the image diagonal; confirm against recover_focal_shift).
+ fov = torch.as_tensor(deg, device=points.device, dtype=points.dtype)
+ return aspect_ratio / diag / torch.tan(torch.deg2rad(fov / 2))
+
+ if fov_x is None:
+ focal, shift = recover_focal_shift(points, mask_binary)
+ # Fall back to 60 deg FoV when the least-squares solver flips the focal sign.
+ bad = ~torch.isfinite(focal) | (focal <= 0)
+ if bool(bad.any()):
+ focal = torch.where(bad, focal_from_fov_deg(60.0), focal)
+ # Re-solve for the shift only, with the patched focal held fixed.
+ _, shift = recover_focal_shift(points, mask_binary, focal=focal)
+ else:
+ # Caller-supplied FoV: one focal per batch item, then solve for the shift only.
+ focal = focal_from_fov_deg(fov_x).expand(points.shape[0])
+ _, shift = recover_focal_shift(points, mask_binary, focal=focal)
+
+ f_diag = focal / 2 * diag
+ half = torch.tensor(0.5, device=points.device, dtype=points.dtype)
+ # Arguments are (f_diag / aspect_ratio, f_diag, 0.5, 0.5): the two focals and a centred principal point.
+ intrinsics = intrinsics_from_focal_center(f_diag / aspect_ratio, f_diag, half, half)
+ # Apply the recovered per-image shift to z, in place.
+ points[..., 2] = points[..., 2] + shift[..., None, None]
+ # v2 only: filter mask by depth>0 to drop metric-scale negative-depth artifacts.
+ if self.version == "v2":
+ mask_binary = mask_binary & (points[..., 2] > 0)
+ depth = points[..., 2].clone()
+
+ if force_projection:
+ # Rebuild the point map from depth and the intrinsics, replacing the predicted x/y.
+ points = depth_map_to_point_map(depth, intrinsics=intrinsics)
+
+ if apply_metric_scale and metric_scale is not None:
+ # One scale factor per batch item, broadcast over the spatial (and xyz) axes.
+ points = points * metric_scale[:, None, None, None]
+ depth = depth * metric_scale[:, None, None]
+
+ if apply_mask:
+ # Masked-out pixels become +inf in points/depth and zero in normal.
+ points = torch.where(mask_binary[..., None], points, torch.full_like(points, float("inf")))
+ depth = torch.where(mask_binary, depth, torch.full_like(depth, float("inf")))
+ if normal is not None:
+ normal = torch.where(mask_binary[..., None], normal, torch.zeros_like(normal))
+
+ result = {"points": points, "depth": depth, "intrinsics": intrinsics, "mask": mask_binary}
+ if normal is not None:
+ result["normal"] = normal
+ return result
diff --git a/comfy/ldm/moge/modules.py b/comfy/ldm/moge/modules.py
new file mode 100644
index 000000000..f6443d65a
--- /dev/null
+++ b/comfy/ldm/moge/modules.py
@@ -0,0 +1,203 @@
+"""Building blocks for MoGe: residual conv stack, resamplers, MLP, DINOv2 encoder, v1 head."""
+
+
+from typing import List, Optional, Sequence, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ops
+from comfy.image_encoders.dino2 import Dinov2Model
+
+from .geometry import normalized_view_plane_uv
+
+
+def _conv2d(operations, c_in: int, c_out: int, k: int = 3, *, dtype=None, device=None):
+    """Build operations.Conv2d(c_in, c_out) with a k x k kernel and k // 2 replicate padding."""
+    options = {"kernel_size": k, "padding": k // 2, "padding_mode": "replicate", "dtype": dtype, "device": device}
+    return operations.Conv2d(c_in, c_out, **options)
+
+
+def _view_plane_uv_grid(batch: int, height: int, width: int, aspect_ratio: float, dtype, device) -> torch.Tensor:
+    """Return the normalized view-plane UV grid as a (batch, 2, height, width) tensor.
+
+    The batch axis is an expand() view, so every sample shares one underlying grid.
+    """
+    grid = normalized_view_plane_uv(width, height, aspect_ratio=aspect_ratio, dtype=dtype, device=device)
+    return grid.permute(2, 0, 1)[None].expand(batch, -1, -1, -1)
+
+
+def _concat_view_plane_uv(x: torch.Tensor, aspect_ratio: float) -> torch.Tensor:
+    """Return x with two extra channels (dim 1) holding the view-plane UV grid.
+
+    The grid is built for x's own batch size, last two (spatial) dims, dtype and device.
+    """
+    rows, cols = x.shape[-2:]
+    grid = _view_plane_uv_grid(x.shape[0], rows, cols, aspect_ratio, x.dtype, x.device)
+    return torch.cat((x, grid), dim=1)
+
+
+class ResidualConvBlock(nn.Module):
+    """Pre-activation residual block: norm -> ReLU -> 3x3 conv -> norm -> ReLU -> 3x3 conv, plus the skip.
+
+    in_norm == "layer_norm" is realised as a single-group GroupNorm and hidden_norm == "group_norm"
+    as a GroupNorm with max(hidden // 32, 1) groups; any other value turns that norm into an Identity.
+    """
+
+    def __init__(self, channels: int, hidden_channels: Optional[int] = None, in_norm: str = "layer_norm", hidden_norm: str = "group_norm",
+                 dtype=None, device=None, operations=comfy.ops.manual_cast):
+        super().__init__()
+        width = channels if hidden_channels is None else hidden_channels
+        factory = {"dtype": dtype, "device": device}
+
+        if in_norm == "layer_norm":
+            first_norm = operations.GroupNorm(1, channels, **factory)
+        else:
+            first_norm = nn.Identity()
+        if hidden_norm == "group_norm":
+            second_norm = operations.GroupNorm(max(width // 32, 1), width, **factory)
+        else:
+            second_norm = nn.Identity()
+
+        # Positions 0..5 of this Sequential appear in checkpoint keys ("layers.<i>."); keep the order.
+        self.layers = nn.Sequential(
+            first_norm, nn.ReLU(), _conv2d(operations, channels, width, **factory),
+            second_norm, nn.ReLU(), _conv2d(operations, width, channels, **factory),
+        )
+
+    def forward(self, x):
+        residual = self.layers(x)
+        return residual + x
+
+
+class Resampler(nn.Sequential):
+    """2x upsampler: a stride-2 2x2 ConvTranspose2d or a bilinear nn.Upsample, followed by a 3x3 conv.
+
+    type_ == "conv_transpose" picks the transposed conv; every other value picks the bilinear path.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, type_: str, dtype=None, device=None, operations=comfy.ops.manual_cast):
+        learned = type_ == "conv_transpose"
+        if learned:
+            first = operations.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2, dtype=dtype, device=device)
+        else:
+            first = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
+        # Only the transposed conv changes the channel count ahead of the trailing conv.
+        conv_in = out_channels if learned else in_channels
+        super().__init__(first, _conv2d(operations, conv_in, out_channels, dtype=dtype, device=device))
+
+
+class MLP(nn.Sequential):
+    """Linear layers over consecutive dims pairs, with an in-place ReLU after each one except the last.
+
+    dims needs at least two entries (input width, output width).
+    """
+
+    def __init__(self, dims: Sequence[int], dtype=None, device=None, operations=comfy.ops.manual_cast):
+        stack = []
+        # Hidden layers: every pair except the final one is followed by a ReLU.
+        for i in range(len(dims) - 2):
+            stack += [operations.Linear(dims[i], dims[i + 1], dtype=dtype, device=device), nn.ReLU(inplace=True)]
+        stack.append(operations.Linear(dims[-2], dims[-1], dtype=dtype, device=device))
+        super().__init__(*stack)
+
+
+class ConvStack(nn.Module):
+    """Multi-level residual conv stack with 2x resampling between levels.
+
+    Level i: optional 1x1 input conv -> added to the running feature map -> residual blocks ->
+    optional 1x1 output conv. A None in dim_in / dim_out replaces that conv with an Identity.
+    The ModuleList names (input_blocks, resamplers, res_blocks, output_blocks) are the checkpoint
+    key prefixes that MoGeModelV2._detect_convstack parses.
+    """
+
+    def __init__(self, dim_in: List[Optional[int]], dim_res_blocks: List[int], dim_out: List[Optional[int]], resamplers: List[str],
+                 num_res_blocks: List[int], dim_times_res_block_hidden: int = 1, res_block_in_norm: str = "layer_norm", res_block_hidden_norm: str = "group_norm",
+                 dtype=None, device=None, operations=comfy.ops.manual_cast):
+        super().__init__()
+        factory = {"dtype": dtype, "device": device}
+
+        self.input_blocks = nn.ModuleList([
+            nn.Identity() if d_in is None else _conv2d(operations, d_in, d_res, k=1, **factory)
+            for d_in, d_res in zip(dim_in, dim_res_blocks)
+        ])
+
+        self.resamplers = nn.ModuleList([
+            Resampler(coarse, fine, type_=kind, operations=operations, **factory)
+            for coarse, fine, kind in zip(dim_res_blocks[:-1], dim_res_blocks[1:], resamplers)
+        ])
+
+        self.res_blocks = nn.ModuleList()
+        for level, width in enumerate(dim_res_blocks):
+            blocks = [
+                ResidualConvBlock(width, dim_times_res_block_hidden * width, in_norm=res_block_in_norm,
+                                  hidden_norm=res_block_hidden_norm, operations=operations, **factory)
+                for _ in range(num_res_blocks[level])
+            ]
+            self.res_blocks.append(nn.Sequential(*blocks))
+
+        self.output_blocks = nn.ModuleList([
+            nn.Identity() if d_out is None else _conv2d(operations, d_res, d_out, k=1, **factory)
+            for d_out, d_res in zip(dim_out, dim_res_blocks)
+        ])
+
+    def forward(self, in_features: List[Optional[torch.Tensor]]):
+        """Run every level and return one output tensor per level, in level order.
+
+        in_features[i] may be None for i > 0, in which case level i only sees the upsampled
+        running feature map. Level 0 must be given, since it seeds that map.
+        """
+        outputs = []
+        x = None
+        last = len(self.res_blocks) - 1
+        for i, blocks in enumerate(self.res_blocks):
+            feat = None if in_features[i] is None else self.input_blocks[i](in_features[i])
+            if i == 0:
+                x = feat
+            elif feat is not None:
+                x = x + feat
+            x = blocks(x)
+            outputs.append(self.output_blocks[i](x))
+            if i < last:
+                x = self.resamplers[i](x)
+        return outputs
+
+
+class DINOv2Encoder(nn.Module):
+    """Comfy DINOv2 backbone whose tapped layers are each projected by a 1x1 conv and summed."""
+
+    def __init__(self, backbone: dict, intermediate_layers: List[int], dim_out: int, dtype=None, device=None, operations=comfy.ops.manual_cast):
+        super().__init__()
+        self.intermediate_layers = list(intermediate_layers)
+        hidden = backbone["hidden_size"]
+        self.backbone = Dinov2Model(backbone, dtype, device, operations)
+        # One projection per tapped layer.
+        self.output_projections = nn.ModuleList([
+            _conv2d(operations, hidden, dim_out, k=1, dtype=dtype, device=device)
+            for _ in self.intermediate_layers
+        ])
+        # Per-channel normalisation constants, shaped to broadcast over (B, 3, H, W).
+        for name, rgb in (("image_mean", [0.485, 0.456, 0.406]), ("image_std", [0.229, 0.224, 0.225])):
+            self.register_buffer(name, torch.tensor(rgb).view(1, 3, 1, 1))
+
+    def forward(self, image: torch.Tensor, token_rows: int, token_cols: int,
+                return_class_token: bool = False) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """Encode image on a token_rows x token_cols patch grid.
+
+        The image is resized to (token_rows * 14, token_cols * 14) and normalised. Returns the summed
+        projections as (B, dim_out, token_rows, token_cols); with return_class_token=True also
+        returns the class token of the last tapped layer.
+        """
+        grid = (token_rows, token_cols)
+        pixels = F.interpolate(image, (token_rows * 14, token_cols * 14), mode="bilinear", align_corners=False, antialias=True)
+        pixels = (pixels - self.image_mean) / self.image_std
+        taps = self.backbone.get_intermediate_layers(pixels, self.intermediate_layers, apply_norm=True)
+        # Each tap is a (patch_tokens, class_token) pair; tokens (B, N, C) are folded to (B, C, rows, cols).
+        projected = [
+            proj(tokens.permute(0, 2, 1).unflatten(2, grid).contiguous())
+            for proj, (tokens, _cls) in zip(self.output_projections, taps)
+        ]
+        fused = torch.stack(projected, dim=1).sum(dim=1)
+        if return_class_token:
+            return fused, taps[-1][1]
+        return fused
+
+
+class HeadV1(nn.Module):
+    """v1 head: 4 backbone-feature projections -> shared upsample stack -> per-target output convs (points, mask).
+
+    A 2-channel view-plane UV grid is concatenated before every upsample stage and before the output
+    convs, which is why those layers take "+ 2" input channels.
+    """
+
+    NUM_FEATURES = 4
+    DIM_PROJ = 512
+    DIM_OUT = (3, 1)  # 3 channels for points, 1 for mask
+    LAST_CONV_CHANNELS = 32
+
+    def __init__(self, dim_in: int, dim_upsample: List[int] = (256, 128, 128), num_res_blocks: int = 1, dim_times_res_block_hidden: int = 1,
+                 dtype=None, device=None, operations=comfy.ops.manual_cast):
+        super().__init__()
+        factory = {"dtype": dtype, "device": device}
+        self.projects = nn.ModuleList([
+            _conv2d(operations, dim_in, self.DIM_PROJ, k=1, **factory)
+            for _ in range(self.NUM_FEATURES)
+        ])
+
+        def make_stage(c_in, c_out):
+            # Sequential[upsampler, *res_blocks]; MoGeModelV1.from_state_dict relies on this child layout.
+            upsampler = nn.Sequential(
+                operations.ConvTranspose2d(c_in + 2, c_out, kernel_size=2, stride=2, **factory),
+                _conv2d(operations, c_out, c_out, **factory),
+            )
+            residuals = [
+                ResidualConvBlock(c_out, dim_times_res_block_hidden * c_out, operations=operations, **factory)
+                for _ in range(num_res_blocks)
+            ]
+            return nn.Sequential(upsampler, *residuals)
+
+        # Stage i consumes the previous stage's width; the first consumes the projection width.
+        stage_inputs = [self.DIM_PROJ] + list(dim_upsample[:-1])
+        self.upsample_blocks = nn.ModuleList([
+            make_stage(c_in, c_out) for c_in, c_out in zip(stage_inputs, dim_upsample)
+        ])
+        self.output_block = nn.ModuleList([
+            nn.Sequential(
+                _conv2d(operations, dim_upsample[-1] + 2, self.LAST_CONV_CHANNELS, **factory),
+                nn.ReLU(inplace=True),
+                _conv2d(operations, self.LAST_CONV_CHANNELS, d_out, k=1, **factory),
+            )
+            for d_out in self.DIM_OUT
+        ])
+
+    def forward(self, hidden_states, image: torch.Tensor):
+        """Return [points, mask] maps at the spatial size of image.
+
+        hidden_states is a sequence of (patch_tokens, class_token) pairs, one per projection; the
+        tokens are folded onto an (H // 14, W // 14) grid taken from image's last two dims.
+        """
+        img_h, img_w = image.shape[-2:]
+        grid = (img_h // 14, img_w // 14)
+        aspect = img_w / img_h
+        projected = [
+            proj(tokens.permute(0, 2, 1).unflatten(2, grid).contiguous())
+            for proj, (tokens, _cls) in zip(self.projects, hidden_states)
+        ]
+        x = torch.stack(projected, dim=1).sum(dim=1)
+
+        for stage in self.upsample_blocks:
+            x = stage(_concat_view_plane_uv(x, aspect))
+
+        x = F.interpolate(x, (img_h, img_w), mode="bilinear", align_corners=False)
+        x = _concat_view_plane_uv(x, aspect)
+        return [branch(x) for branch in self.output_block]
diff --git a/comfy/ldm/moge/panorama.py b/comfy/ldm/moge/panorama.py
new file mode 100644
index 000000000..18d0cb665
--- /dev/null
+++ b/comfy/ldm/moge/panorama.py
@@ -0,0 +1,312 @@
+"""Panorama (equirectangular) inference helpers for MoGe.
+
+Splits an equirect into 12 perspective views via an icosahedron camera rig, runs
+the model per view, and stitches per-view distance maps back into a single
+equirect distance map via a multi-scale Poisson + gradient sparse solve.
+Image sampling uses F.grid_sample (GPU); the sparse solve uses lsmr (CPU).
+"""
+
+
+from typing import Callable, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+
+from scipy.ndimage import convolve, map_coordinates
+from scipy.sparse import vstack, csr_array
+from scipy.sparse.linalg import lsmr
+
+
+def _icosahedron_directions() -> np.ndarray:
+    """Return the 12 icosahedron-vertex directions as a (12, 3) float32 array.
+
+    Vectors are not normalised, and the row order matches upstream's vertex order.
+    """
+    phi = (1.0 + np.sqrt(5.0)) / 2.0  # golden ratio
+    rows = [
+        (0, 1, phi), (0, -1, phi), (0, 1, -phi), (0, -1, -phi),
+        (1, phi, 0), (-1, phi, 0), (1, -phi, 0), (-1, -phi, 0),
+        (phi, 0, 1), (phi, 0, -1), (-phi, 0, 1), (-phi, 0, -1),
+    ]
+    return np.array(rows, dtype=np.float32)
+
+
+def _intrinsics_from_fov(fov_x_rad: float, fov_y_rad: float) -> np.ndarray:
+    """Return the 3x3 float32 K matrix for a unit-square (normalised) image.
+
+    Each focal length is 0.5 / tan(fov / 2) and the principal point is (0.5, 0.5).
+    """
+    fx, fy = (0.5 / np.tan(angle / 2) for angle in (fov_x_rad, fov_y_rad))
+    rows = ((fx, 0, 0.5), (0, fy, 0.5), (0, 0, 1))
+    return np.array(rows, dtype=np.float32)
+
+
+def _extrinsics_look_at(eye: np.ndarray, target: np.ndarray, up: np.ndarray) -> np.ndarray:
+    """OpenCV-convention world->camera extrinsics for one or more look-at targets.
+
+    Args:
+        eye: camera position, (3,).
+        target: point(s) to look at, (3,) or (N, 3); a single target is treated as N = 1.
+        up: world up hint, (3,).
+
+    Returns:
+        (N, 4, 4) float32 matrices. Rotation rows are (right, new_up, forward) with
+        right = forward x up and new_up = forward x right; the translation is -R @ eye.
+
+    When forward is parallel to up, the X axis stands in for up. When forward is parallel to the
+    X axis as well (e.g. up itself is the X axis), the Y axis is used instead, so the rotation
+    stays orthonormal for every non-zero forward vector.
+    """
+    eye = np.asarray(eye, dtype=np.float32)
+    target = np.asarray(target, dtype=np.float32)
+    up = np.asarray(up, dtype=np.float32)
+    if target.ndim == 1:
+        target = target[None]
+
+    def _norm(vectors):
+        return np.linalg.norm(vectors, axis=-1, keepdims=True)
+
+    fwd = target - eye
+    fwd = fwd / _norm(fwd).clip(1e-12)
+    right = np.cross(fwd, up)
+    # Fall back to another perpendicular wherever forward is parallel to up.
+    parallel = _norm(right).squeeze(-1) < 1e-6
+    if parallel.any():
+        x_axis = np.array([1, 0, 0], dtype=np.float32)
+        y_axis = np.array([0, 1, 0], dtype=np.float32)
+        fallback = np.cross(fwd, x_axis)
+        # The X-axis fallback itself vanishes when forward lies along X; use Y for those rows only,
+        # which leaves every previously valid result untouched.
+        collapsed = _norm(fallback).squeeze(-1) < 1e-6
+        fallback = np.where(collapsed[:, None], np.cross(fwd, y_axis), fallback)
+        right = np.where(parallel[:, None], fallback, right)
+    right = right / _norm(right).clip(1e-12)
+    new_up = np.cross(fwd, right)
+
+    R = np.stack([right, new_up, fwd], axis=-2)
+    t = -np.einsum("nij,j->ni", R, eye)
+    E = np.zeros((R.shape[0], 4, 4), dtype=np.float32)
+    E[:, :3, :3] = R
+    E[:, :3, 3] = t
+    E[:, 3, 3] = 1.0
+    return E
+
+
+def get_panorama_cameras() -> Tuple[np.ndarray, List[np.ndarray]]:
+    """Camera rig for panorama splitting: one camera at the origin per icosahedron vertex, Z up, 90 deg FoV.
+
+    Returns (extrinsics of shape (12, 4, 4), list of 12 intrinsics). The list entries all reference
+    the same 3x3 array, so mutating one mutates all of them.
+    """
+    directions = _icosahedron_directions()
+    origin = np.zeros(3, dtype=np.float32)
+    z_up = np.array([0, 0, 1], dtype=np.float32)
+    extrinsics = _extrinsics_look_at(origin, directions, z_up)
+    quarter_turn = np.deg2rad(90.0)
+    shared_K = _intrinsics_from_fov(quarter_turn, quarter_turn)
+    return extrinsics, [shared_K] * len(directions)
+
+
+def spherical_uv_to_directions(uv: np.ndarray) -> np.ndarray:
+    """Map equirect UV in [0, 1] to float32 unit directions (Z up).
+
+    v is the polar angle as a fraction of pi (v = 0 is +Z); the azimuth is (1 - u) * 2 * pi,
+    so it decreases as u grows.
+    """
+    u, v = uv[..., 0], uv[..., 1]
+    azimuth = (1 - u) * (2 * np.pi)
+    polar = v * np.pi
+    ring = np.sin(polar)  # radius of the horizontal circle at this polar angle
+    xyz = [ring * np.cos(azimuth), ring * np.sin(azimuth), np.cos(polar)]
+    return np.stack(xyz, axis=-1).astype(np.float32)
+
+
+def directions_to_spherical_uv(directions: np.ndarray) -> np.ndarray:
+    """Map 3D directions (any non-zero length) to float32 equirect UV in [0, 1].
+
+    u is 1 minus the azimuth in turns wrapped into [0, 1); v is the polar angle from +Z as a
+    fraction of pi.
+    """
+    length = np.linalg.norm(directions, axis=-1, keepdims=True).clip(1e-12)
+    unit = directions / length
+    # `/` and `%` bind equally and left to right, so the wrap applies before the subtraction from 1.
+    turns = (np.arctan2(unit[..., 1], unit[..., 0]) / (2 * np.pi)) % 1.0
+    polar_fraction = np.arccos(unit[..., 2].clip(-1, 1)) / np.pi
+    return np.stack([1 - turns, polar_fraction], axis=-1).astype(np.float32)
+
+
+def _uv_grid(H: int, W: int) -> np.ndarray:
+    """Return the (H, W, 2) float32 grid of pixel-centre UVs.
+
+    [..., 0] is u = (column + 0.5) / W and [..., 1] is v = (row + 0.5) / H.
+    """
+    cols = (np.arange(W, dtype=np.float32) + 0.5) / W
+    rows = (np.arange(H, dtype=np.float32) + 0.5) / H
+    u, v = np.broadcast_arrays(cols[None, :], rows[:, None])
+    return np.stack([u, v], axis=-1)
+
+
+def _unproject_cv(uv: np.ndarray, depth: np.ndarray,
+                  extrinsics: np.ndarray, intrinsics: np.ndarray) -> np.ndarray:
+    """Back-project pixels into world coords (OpenCV convention).
+
+    Rays are [u, v, 1] @ inv(intrinsics).T, scaled by depth, then taken through inv(extrinsics).
+    uv is (..., 2), depth is (...), and the result is (..., 3).
+    """
+    def _append_one(a):
+        return np.concatenate([a, np.ones_like(a[..., :1])], axis=-1)
+
+    rays = _append_one(uv) @ np.linalg.inv(intrinsics).T
+    camera_xyz = rays * depth[..., None]
+    world_h = _append_one(camera_xyz) @ np.linalg.inv(extrinsics).T
+    return world_h[..., :3]
+
+
+def _project_cv(points: np.ndarray, extrinsics: np.ndarray, intrinsics: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    """World coords -> float32 (uv, depth) in the camera (OpenCV convention).
+
+    depth is the camera-space z and is not clamped, so it can be zero or negative. The perspective
+    divisor is clipped to at least 1e-12, so uv carries no meaning where depth is not positive.
+    """
+    homogeneous = np.concatenate([points, np.ones_like(points[..., :1])], axis=-1)
+    camera_xyz = (homogeneous @ extrinsics.T)[..., :3]
+    image_h = camera_xyz @ intrinsics.T
+    uv = image_h[..., :2] / image_h[..., 2:3].clip(1e-12)
+    return uv.astype(np.float32), camera_xyz[..., 2].astype(np.float32)
+
+
+def _grid_sample_uv(img_bchw: torch.Tensor, uv: torch.Tensor, mode: str = "bilinear") -> torch.Tensor:
+    """Sample img_bchw at uv, a (B, H, W, 2) tensor of coordinates in [0, 1].
+
+    Coordinates outside the image reuse border pixels (padding_mode="border").
+    """
+    # grid_sample expects coordinates in [-1, 1].
+    normalized = uv * 2.0 - 1.0
+    return F.grid_sample(img_bchw, normalized, mode=mode, padding_mode="border", align_corners=False)
+
+
+def split_panorama_image(image: torch.Tensor, extrinsics: np.ndarray, intrinsics: List[np.ndarray], resolution: int) -> torch.Tensor:
+    """Cut a (3, Hp, Wp) equirect image into one resolution x resolution perspective crop per camera.
+
+    For each camera, pixel-centre rays at unit depth are unprojected to world directions and
+    converted to equirect UVs with numpy; the image is then sampled bilinearly on its own device.
+    Returns an (N, 3, R, R) tensor on image's device.
+    """
+    n_views = len(extrinsics)
+    pixel_uv = _uv_grid(resolution, resolution)
+    unit_depth_shape = pixel_uv.shape[:-1]
+    lookup = np.stack([
+        directions_to_spherical_uv(
+            _unproject_cv(pixel_uv, np.ones(unit_depth_shape, dtype=np.float32), extrinsics[i], intrinsics[i]))
+        for i in range(n_views)
+    ], axis=0)
+
+    # Every view samples the same panorama, so it is repeated along the batch axis.
+    batch = image.unsqueeze(0).expand(n_views, -1, -1, -1).contiguous()
+    lookup_t = torch.from_numpy(lookup).to(device=image.device, dtype=image.dtype)
+    return _grid_sample_uv(batch, lookup_t, mode="bilinear")
+
+
+def _poisson_equation(W: int, H: int, wrap_x: bool = False, wrap_y: bool = False):
+    """Return the (H*W, H*W) CSR 5-point Laplacian over the H x W grid.
+
+    Each row stores -4 on the pixel and +1 on its up / down / left / right neighbours. Past a
+    non-wrapping border the neighbour resolves to the border pixel itself ("edge" padding), which
+    leaves a duplicate column entry in that row; a wrapping axis uses the opposite border instead.
+    """
+    n_pixels = H * W
+    padded = np.pad(np.arange(n_pixels).reshape(H, W), ((0, 0), (1, 1)), mode="wrap" if wrap_x else "edge")
+    padded = np.pad(padded, ((1, 1), (0, 0)), mode="wrap" if wrap_y else "edge")
+
+    centre = padded[1:-1, 1:-1]
+    up, down = padded[:-2, 1:-1], padded[2:, 1:-1]
+    left, right = padded[1:-1, :-2], padded[1:-1, 2:]
+    columns = np.stack([centre, up, down, left, right], axis=-1).reshape(-1)
+    values = np.tile(np.array([-4, 1, 1, 1, 1], dtype=np.float32), n_pixels)
+    # Five stored entries per row.
+    row_starts = np.arange(0, n_pixels * 5 + 1, 5)
+    return csr_array((values, columns, row_starts), shape=(n_pixels, n_pixels))
+
+
+def _grad_equation(W: int, H: int, wrap_x: bool = False, wrap_y: bool = False):
+    """Return the CSR finite-difference operator over the H x W grid, shape (n_rows, H*W).
+
+    Each row stores +1 on a pixel and -1 on its neighbour: first every (pixel, right neighbour)
+    pair in row-major order, then every (pixel, neighbour below) pair. A wrapping axis pads the
+    index grid by one wrapped column / row, which adds the pairs that cross the seam.
+    """
+    ids = np.arange(W * H).reshape(H, W)
+    if wrap_x:
+        ids = np.pad(ids, ((0, 0), (0, 1)), mode="wrap")
+    if wrap_y:
+        ids = np.pad(ids, ((0, 1), (0, 0)), mode="wrap")
+
+    def _pairs(first, second):
+        # Interleave as [first0, second0, first1, second1, ...] to line up with the (+1, -1) values.
+        return np.stack([first.reshape(-1), second.reshape(-1)], axis=1).reshape(-1)
+
+    horizontal = _pairs(ids[:, :-1], ids[:, 1:])
+    vertical = _pairs(ids[:-1, :], ids[1:, :])
+    columns = np.concatenate([horizontal, vertical])
+
+    # Two stored entries per row, so row r occupies slots [2r, 2r + 2).
+    n_rows = columns.size // 2
+    values = np.tile(np.array([1, -1], dtype=np.float32), n_rows)
+    row_starts = np.arange(0, n_rows * 2 + 1, 2)
+    return csr_array((values, columns, row_starts), shape=(n_rows, H * W))
+
+
+def _scipy_remap_bilinear(img: np.ndarray, sample_pixels: np.ndarray, mode: str = "bilinear") -> np.ndarray:
+    """Sample img, (H, W) or (H, W, C), at fractional pixel coords.
+
+    sample_pixels[..., 0] is x (column) and sample_pixels[..., 1] is y (row); both are clamped into
+    the image first. mode "bilinear" interpolates linearly and any other value picks the nearest
+    pixel. The result is cast back to img.dtype.
+    """
+    H, W = img.shape[:2]
+    rows = np.clip(sample_pixels[..., 1], 0, H - 1)
+    cols = np.clip(sample_pixels[..., 0], 0, W - 1)
+    spline_order = 1 if mode == "bilinear" else 0
+
+    def _sample(plane):
+        return map_coordinates(plane, [rows, cols], order=spline_order, mode="nearest")
+
+    if img.ndim == 2:
+        return _sample(img).astype(img.dtype)
+    channels = [_sample(img[..., c]) for c in range(img.shape[-1])]
+    return np.stack(channels, axis=-1).astype(img.dtype)
+
+
+def merge_panorama_depth(width: int, height: int,
+                         distance_maps: List[np.ndarray], pred_masks: List[np.ndarray],
+                         extrinsics: List[np.ndarray], intrinsics: List[np.ndarray],
+                         on_view: Optional[Callable[[], None]] = None,
+                         on_solve_start: Optional[Callable[[int, int], None]] = None,
+                         on_solve_end: Optional[Callable[[int, int], None]] = None,
+                         ) -> Tuple[np.ndarray, np.ndarray]:
+    """Stitch per-view distance maps into a single equirect distance map.
+
+    Recursive multi-scale solve in log-distance: above 256 px it first solves at half resolution and
+    uses the upsampled result as the lsmr init. Callbacks fire per view and around each lsmr solve,
+    at every scale. Returns (distance map as float32, any-view coverage mask), each (height, width).
+    """
+
+    if max(width, height) > 256:  # coarse-to-fine: recurse at half resolution first
+        coarse_depth, _ = merge_panorama_depth(width // 2, height // 2,
+                                               distance_maps, pred_masks, extrinsics, intrinsics,
+                                               on_view=on_view,
+                                               on_solve_start=on_solve_start,
+                                               on_solve_end=on_solve_end)
+        t = torch.from_numpy(coarse_depth).unsqueeze(0).unsqueeze(0)
+        t = F.interpolate(t, size=(height, width), mode="bilinear", align_corners=False)  # upsample to this level
+        depth_init = t.squeeze().numpy().astype(np.float32)
+    else:
+        depth_init = None
+
+    spherical_directions = spherical_uv_to_directions(_uv_grid(height, width))
+
+    pano_log_grad_maps, pano_grad_masks = [], []
+    pano_log_lap_maps, pano_lap_masks = [], []
+    pano_pred_masks: List[np.ndarray] = []
+
+    for i in range(len(distance_maps)):
+        proj_uv, proj_depth = _project_cv(spherical_directions, extrinsics[i], intrinsics[i])
+        proj_valid = (proj_depth > 0) & (proj_uv > 0).all(axis=-1) & (proj_uv < 1).all(axis=-1)  # positive depth, UV strictly inside (0, 1)
+
+        Hd, Wd = distance_maps[i].shape[:2]
+        proj_pixels = np.clip(proj_uv, 0, 1) * np.array([Wd - 1, Hd - 1], dtype=np.float32)  # UV -> pixel coords in this view's map
+
+        log_dist = np.log(np.clip(distance_maps[i], 1e-6, None))
+        sampled = _scipy_remap_bilinear(log_dist, proj_pixels, mode="bilinear")
+        pano_log = np.where(proj_valid, sampled, 0.0).astype(np.float32)  # zero where the view does not project
+
+        sampled_mask = _scipy_remap_bilinear(pred_masks[i].astype(np.uint8), proj_pixels, mode="nearest")
+        pano_pred = proj_valid & (sampled_mask > 0)
+
+        # Equirect wraps horizontally but not vertically: wrap pad along x, edge pad along y.
+        padded = np.pad(pano_log, ((0, 0), (0, 1)), mode="wrap")
+        gx, gy = padded[:, :-1] - padded[:, 1:], padded[:-1, :] - padded[1:, :]  # value minus its right / lower neighbour
+        padded_m = np.pad(pano_pred, ((0, 0), (0, 1)), mode="wrap")
+        mx, my = padded_m[:, :-1] & padded_m[:, 1:], padded_m[:-1, :] & padded_m[1:, :]  # both pixels of a pair must be covered
+        pano_log_grad_maps.append((gx, gy))
+        pano_grad_masks.append((mx, my))
+
+        padded = np.pad(pano_log, ((1, 1), (0, 0)), mode="edge")
+        padded = np.pad(padded, ((0, 0), (1, 1)), mode="wrap")
+        lap_kernel = np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]], dtype=np.float32)
+        lap = convolve(padded, lap_kernel)[1:-1, 1:-1]
+        padded_m = np.pad(pano_pred, ((1, 1), (0, 0)), mode="edge")
+        padded_m = np.pad(padded_m, ((0, 0), (1, 1)), mode="wrap")
+        m_kernel = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8)
+        lap_mask = convolve(padded_m.astype(np.uint8), m_kernel)[1:-1, 1:-1] == 5  # centre and all four neighbours covered
+        pano_log_lap_maps.append(lap)
+        pano_lap_masks.append(lap_mask)
+        pano_pred_masks.append(pano_pred)
+
+        if on_view is not None:
+            on_view()
+
+    gx = np.stack([m[0] for m in pano_log_grad_maps], axis=0)
+    gy = np.stack([m[1] for m in pano_log_grad_maps], axis=0)
+    mx = np.stack([m[0] for m in pano_grad_masks], axis=0)
+    my = np.stack([m[1] for m in pano_grad_masks], axis=0)
+    gx_avg = (gx * mx).sum(axis=0) / mx.sum(axis=0).clip(1e-3)  # mean over the views covering each pair; clip guards 0/0
+    gy_avg = (gy * my).sum(axis=0) / my.sum(axis=0).clip(1e-3)
+
+    laps = np.stack(pano_log_lap_maps, axis=0)
+    lap_masks = np.stack(pano_lap_masks, axis=0)
+    lap_avg = (laps * lap_masks).sum(axis=0) / lap_masks.sum(axis=0).clip(1e-3)
+
+    grad_x_mask = mx.any(axis=0).reshape(-1)
+    grad_y_mask = my.any(axis=0).reshape(-1)
+    grad_mask = np.concatenate([grad_x_mask, grad_y_mask])
+    lap_mask_flat = lap_masks.any(axis=0).reshape(-1)
+
+    A = vstack([  # keep only the equations observed by at least one view
+        _grad_equation(width, height, wrap_x=True, wrap_y=False)[grad_mask],
+        _poisson_equation(width, height, wrap_x=True, wrap_y=False)[lap_mask_flat],
+    ])
+    b = np.concatenate([
+        gx_avg.reshape(-1)[grad_x_mask],
+        gy_avg.reshape(-1)[grad_y_mask],
+        lap_avg.reshape(-1)[lap_mask_flat],
+    ])
+    x0 = np.log(np.clip(depth_init, 1e-6, None)).reshape(-1) if depth_init is not None else None  # coarse solution, in log domain, seeds lsmr
+
+    if on_solve_start is not None:
+        on_solve_start(width, height)
+    x, *_ = lsmr(A, b, atol=1e-5, btol=1e-5, x0=x0, show=False)
+    if on_solve_end is not None:
+        on_solve_end(width, height)
+
+    pano_depth = np.exp(x).reshape(height, width).astype(np.float32)  # the unknowns are log-distances
+    pano_mask = np.any(pano_pred_masks, axis=0)  # covered by at least one view
+    return pano_depth, pano_mask
diff --git a/comfy/ldm/pixeldit/model.py b/comfy/ldm/pixeldit/model.py
new file mode 100644
index 000000000..b044b9b29
--- /dev/null
+++ b/comfy/ldm/pixeldit/model.py
@@ -0,0 +1,239 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ldm.common_dit
+import comfy.patcher_extension
+from comfy.ldm.flux.math import apply_rope, rope
+from comfy.ldm.hidream.model import FeedForwardSwiGLU
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.modules.diffusionmodules.mmdit import TimestepEmbedder
+
+from .modules import (
+ FinalLayer,
+ PatchTokenEmbedder,
+ PiTBlock,
+ PixelTokenEmbedder,
+ apply_adaln_,
+ precompute_freqs_cis_2d,
+)
+
+
+class MMDiTJointAttention(nn.Module):
+    """Joint attention over an image stream (x) and a text stream (y) with per-stream weights.
+
+    Each stream has its own fused QKV projection, per-head Q/K RMSNorm and output projection.
+    RoPE is applied to each stream separately, then tokens are concatenated as [text, image]
+    (text first) for a single attention call and split back afterwards.
+    """
+    def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None):
+        super().__init__()
+        assert dim % num_heads == 0
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        opts = dict(dtype=dtype, device=device)
+
+        self.qkv_x = operations.Linear(dim, dim * 3, bias=qkv_bias, **opts)
+        self.qkv_y = operations.Linear(dim, dim * 3, bias=qkv_bias, **opts)
+
+        self.q_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, **opts)
+        self.k_norm_x = operations.RMSNorm(self.head_dim, eps=1e-6, **opts)
+        self.q_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, **opts)
+        self.k_norm_y = operations.RMSNorm(self.head_dim, eps=1e-6, **opts)
+
+        self.proj_x = operations.Linear(dim, dim, **opts)
+        self.proj_y = operations.Linear(dim, dim, **opts)
+
+    def _heads(self, tokens, batch, length, qkv, q_norm, k_norm):
+        """Fused projection -> q, k, v shaped [batch, heads, length, head_dim]; q and k are normed."""
+        fused = qkv(tokens).reshape(batch, length, 3, self.num_heads, self.head_dim)
+        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)
+        return q_norm(q), k_norm(k), v
+
+    def forward(self, x, y, pos_img, pos_txt=None, attn_mask=None, transformer_options={}):
+        batch, n_img, _ = x.shape
+        _, n_txt, _ = y.shape
+        qx, kx, vx = self._heads(x, batch, n_img, self.qkv_x, self.q_norm_x, self.k_norm_x)
+        qy, ky, vy = self._heads(y, batch, n_txt, self.qkv_y, self.q_norm_y, self.k_norm_y)
+
+        qx, kx = apply_rope(qx, kx, pos_img[None, None])
+        if pos_txt is not None:  # text RoPE is optional
+            qy, ky = apply_rope(qy, ky, pos_txt[None, None])
+
+        # Text tokens first, image tokens second.
+        q = torch.cat([qy, qx], dim=2)
+        k = torch.cat([ky, kx], dim=2)
+        v = torch.cat([vy, vx], dim=2)
+        joint = optimized_attention(
+            q, k, v, self.num_heads,
+            mask=attn_mask, skip_reshape=True, skip_output_reshape=True,
+            transformer_options=transformer_options,
+        )
+
+        # Undo the concatenation and merge the heads back into the channel axis.
+        width = self.num_heads * self.head_dim
+        txt_out = joint[:, :, :n_txt, :].transpose(1, 2).reshape(batch, n_txt, width)
+        img_out = joint[:, :, n_txt:, :].transpose(1, 2).reshape(batch, n_img, width)
+
+        return self.proj_x(img_out), self.proj_y(txt_out)
+
+
+class MMDiTBlockT2I(nn.Module):
+    """Dual-stream MMDiT block: joint attention, then a per-stream SwiGLU MLP; both steps are adaLN-modulated and gated from ``c``."""
+    def __init__(self, hidden_size, groups, mlp_ratio=4.0, dtype=None, device=None, operations=None):
+        super().__init__()
+        opts = dict(dtype=dtype, device=device)
+        self.norm_x1 = operations.RMSNorm(hidden_size, eps=1e-6, **opts)
+        self.norm_y1 = operations.RMSNorm(hidden_size, eps=1e-6, **opts)
+        self.attn = MMDiTJointAttention(hidden_size, num_heads=groups, qkv_bias=False, operations=operations, **opts)
+        self.norm_x2 = operations.RMSNorm(hidden_size, eps=1e-6, **opts)
+        self.norm_y2 = operations.RMSNorm(hidden_size, eps=1e-6, **opts)
+        ffn_width = int(hidden_size * mlp_ratio)
+        self.mlp_x = FeedForwardSwiGLU(hidden_size, ffn_width, multiple_of=1, operations=operations, **opts)
+        self.mlp_y = FeedForwardSwiGLU(hidden_size, ffn_width, multiple_of=1, operations=operations, **opts)
+        self.adaLN_modulation_img = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, **opts))
+        self.adaLN_modulation_txt = nn.Sequential(operations.Linear(hidden_size, 6 * hidden_size, bias=True, **opts))
+
+    def forward(self, x, y, c, pos_img, pos_txt=None, attn_mask=None, transformer_options={}):
+        # Chunk order per stream: shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp.
+        mod_x = self.adaLN_modulation_img(c).chunk(6, dim=-1)
+        mod_y = self.adaLN_modulation_txt(c).chunk(6, dim=-1)
+        x_in = apply_adaln_(self.norm_x1(x), mod_x[0], mod_x[1])
+        y_in = apply_adaln_(self.norm_y1(y), mod_y[0], mod_y[1])
+        attn_x, attn_y = self.attn(x_in, y_in, pos_img, pos_txt, attn_mask, transformer_options=transformer_options)
+        x, y = torch.addcmul(x, mod_x[2], attn_x), torch.addcmul(y, mod_y[2], attn_y)
+        x = torch.addcmul(x, mod_x[5], self.mlp_x(apply_adaln_(self.norm_x2(x), mod_x[3], mod_x[4])))
+        y = torch.addcmul(y, mod_y[5], self.mlp_y(apply_adaln_(self.norm_y2(y), mod_y[3], mod_y[4])))
+        return x, y
+
+
+class PixDiT_T2I(nn.Module):
+    """PixelDiT T2I model. Hardcoded for the released 1024px Stage-3 checkpoint
+    (also runs at 512px when fed the appropriate latent size and flow_shift).
+
+    Forward:
+      x:         [B, 3, H, W] pixel-space input (no VAE)
+      timesteps: [B] in [0, 1000] (ComfyUI flow sampling convention)
+      context:   [B, Ltxt, 2304] Gemma-2-2b-it hidden states (chi_prompt prepended)
+    Returns flow-matching velocity [B, 3, H, W].
+    """
+    def __init__(
+        self,
+        in_channels=3,
+        num_groups=24,
+        hidden_size=1536,
+        pixel_hidden_size=16,
+        pixel_attn_hidden_size=1152,
+        pixel_num_groups=16,
+        patch_depth=14,
+        pixel_depth=2,
+        patch_size=16,
+        txt_embed_dim=2304,
+        txt_max_length=300,
+        use_text_rope=True,
+        text_rope_theta=10000.0,
+        image_model=None,  # accepted but never read in this class
+        dtype=None,
+        device=None,
+        operations=None,
+        pixel_mlp_chunks=2,
+    ):
+        super().__init__()
+        self.dtype = dtype
+        self.in_channels = in_channels
+        self.out_channels = in_channels  # output has the same channel count as the input
+        self.hidden_size = hidden_size
+        self.num_groups = num_groups
+        self.patch_depth = patch_depth
+        self.pixel_depth = pixel_depth
+        self.patch_size = patch_size
+        self.pixel_hidden_size = pixel_hidden_size
+        self.pixel_attn_hidden_size = pixel_attn_hidden_size
+        self.pixel_num_groups = pixel_num_groups
+        self.txt_embed_dim = txt_embed_dim
+        self.txt_max_length = txt_max_length
+        self.use_text_rope = use_text_rope
+        self.text_rope_theta = text_rope_theta
+
+        self.pixel_embedder = PixelTokenEmbedder(self.in_channels, self.pixel_hidden_size, dtype=dtype, device=device, operations=operations)
+        self.s_embedder = PatchTokenEmbedder(self.in_channels * self.patch_size ** 2, self.hidden_size, bias=True, dtype=dtype, device=device, operations=operations)
+        self.t_embedder = TimestepEmbedder(self.hidden_size, dtype=dtype, device=device, operations=operations, max_period=10)
+        self.y_embedder = PatchTokenEmbedder(self.txt_embed_dim, self.hidden_size, bias=True, use_norm=True, dtype=dtype, device=device, operations=operations)
+        self.y_pos_embedding = nn.Parameter(torch.empty(1, self.txt_max_length, self.hidden_size, dtype=dtype, device=device))  # torch.empty: no meaningful values until weights are loaded
+
+        self.patch_blocks = nn.ModuleList([
+            MMDiTBlockT2I(self.hidden_size, self.num_groups,
+                          dtype=dtype, device=device, operations=operations)
+            for _ in range(self.patch_depth)
+        ])
+        self.pixel_blocks = nn.ModuleList([
+            PiTBlock(
+                self.pixel_hidden_size,
+                self.hidden_size,
+                patch_size=self.patch_size,
+                num_heads=self.num_groups,
+                attn_hidden_size=self.pixel_attn_hidden_size,
+                attn_num_heads=self.pixel_num_groups,
+                dtype=dtype, device=device, operations=operations,
+                mlp_chunks=pixel_mlp_chunks,
+            )
+            for _ in range(self.pixel_depth)
+        ])
+
+        self.final_layer = FinalLayer(self.pixel_hidden_size, self.out_channels, dtype=dtype, device=device, operations=operations)
+
+    def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts):  # 2D RoPE table for a height x width patch grid
+        return precompute_freqs_cis_2d(self.hidden_size // self.num_groups, height, width, device=device, dtype=dtype, **rope_opts)
+
+    def _fetch_text_pos(self, length, device, dtype):  # 1D RoPE table for text positions 0..length-1
+        return rope(torch.arange(length, dtype=torch.float32, device=device).reshape(1, -1), self.hidden_size // self.num_groups, self.text_rope_theta).squeeze(0).to(dtype=dtype)
+
+    def forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
+        return comfy.patcher_extension.WrapperExecutor.new_class_executor(  # run _forward through any DIFFUSION_MODEL wrappers
+            self._forward, self, comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, transformer_options),
+        ).execute(x, timesteps, context, attention_mask, transformer_options, **kwargs)
+
+    def _pre_patch_block(self, s, i, **kwargs):
+        """Hook for subclasses to inject per-block state into the patch stream (e.g. PiD's LQ gate)."""
+        return s
+
+    def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, **kwargs):
+        H_orig, W_orig = x.shape[2], x.shape[3]
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, (self.patch_size, self.patch_size))  # presumably pads H and W up to a multiple of patch_size
+        B, _, H, W = x.shape
+        Hs = H // self.patch_size
+        Ws = W // self.patch_size
+        L = Hs * Ws  # number of patches
+
+        pos_img = self._fetch_patch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {}))
+        x_patches = F.unfold(x, kernel_size=self.patch_size, stride=self.patch_size).transpose(1, 2)  # [B, L, C * P * P]
+
+        t_emb = self.t_embedder(timesteps.view(-1), x.dtype).view(B, -1, self.hidden_size)
+
+        if context is None or context.dim() != 3:
+            raise ValueError("PixDiT_T2I requires context (text embeddings) of shape [B, L, D]")
+        Ltxt = min(context.shape[1], self.txt_max_length)  # context beyond txt_max_length tokens is dropped
+        y = context[:, :Ltxt, :]
+        y_emb = self.y_embedder(y).view(B, Ltxt, self.hidden_size)
+        y_emb = y_emb + self.y_pos_embedding[:, :Ltxt, :].to(y_emb)  # y_pos_embedding is a raw nn.Parameter
+
+        condition = F.silu(t_emb)
+        pos_txt = self._fetch_text_pos(Ltxt, x.device, x.dtype) if self.use_text_rope else None
+
+        s = self.s_embedder(x_patches)
+        for i, blk in enumerate(self.patch_blocks):
+            s = self._pre_patch_block(s, i, **kwargs)
+            s, y_emb = blk(s, y_emb, condition, pos_img, pos_txt, None, transformer_options=transformer_options)  # attention_mask is not forwarded
+        s = F.silu(t_emb + s)  # patch features plus timestep become the conditioning for the pixel blocks
+
+        s_cond = s.view(B * L, self.hidden_size)
+        x_pixels = self.pixel_embedder(x, patch_size=self.patch_size)
+        for blk in self.pixel_blocks:
+            x_pixels = blk(x_pixels, s_cond, H, W, self.patch_size, mask=None, transformer_options=transformer_options)
+
+        x_pixels = self.final_layer(x_pixels)
+        C_out = self.out_channels
+        P2 = self.patch_size * self.patch_size
+        x_pixels = x_pixels.view(B, L, P2, C_out).permute(0, 3, 2, 1).reshape(B, C_out * P2, L)  # [B, C * P2, L], the layout F.fold takes
+        out = F.fold(x_pixels, (H, W), kernel_size=self.patch_size, stride=self.patch_size)
+        return out[:, :, :H_orig, :W_orig]  # crop back to the size x had on entry
diff --git a/comfy/ldm/pixeldit/modules.py b/comfy/ldm/pixeldit/modules.py
new file mode 100644
index 000000000..4b1e538c7
--- /dev/null
+++ b/comfy/ldm/pixeldit/modules.py
@@ -0,0 +1,187 @@
+import torch
+import torch.nn as nn
+
+from comfy.ldm.flux.math import apply_rope, rope
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.modules.diffusionmodules.mmdit import Mlp, get_1d_sincos_pos_embed_from_grid_torch
+
+
+def apply_adaln_(x, shift, scale):  # in-place adaLN modulation; mutates and returns x
+    return x.addcmul_(x, scale).add_(shift)  # x <- x + x * scale + shift
+
+
+def precompute_freqs_cis_2d(dim, height, width, theta=10000.0, scale=16.0,
+                            ref_grid_h=None, ref_grid_w=None,
+                            scale_x=1.0, scale_y=1.0, shift_x=0.0, shift_y=0.0,
+                            device=None, dtype=torch.float32, **kwargs):
+    """2D RoPE with x/y axis frequencies interleaved at stride 2 across head dim.
+
+    rope_options:
+      scale_x / scale_y multiply the position range (RoPE extrapolation).
+      shift_x / shift_y offset the position origin (tiled / regional inference).
+    An axis whose ref_grid_h / ref_grid_w is set also gets NTK-aware theta scaling
+    (rope_mode='ntk_aware'): theta_axis = theta * (current/ref)^(dim_axis/(dim_axis-2)).
+    Returns Flux-format rotation matrices of shape [H*W, dim/2, 2, 2].
+    Layout of head-dim pairs: [x_0, y_0, x_1, y_1, ..., x_{dim/4-1}, y_{dim/4-1}].
+    """
+    dim_axis = dim // 2
+    # Scale each axis from its own reference: a joint check on ref_grid_h alone would divide by ref_grid_w=None.
+    ntk_exp = dim_axis / (dim_axis - 2) if dim_axis > 2 else None
+    h_ntk = (height / ref_grid_h) ** ntk_exp if ntk_exp is not None and ref_grid_h is not None else 1.0
+    w_ntk = (width / ref_grid_w) ** ntk_exp if ntk_exp is not None and ref_grid_w is not None else 1.0
+
+    # Positions always span [shift, scale * scale_axis + shift], whatever the grid size.
+    x_lin = torch.linspace(shift_x, scale * scale_x + shift_x, width, device=device)
+    y_lin = torch.linspace(shift_y, scale * scale_y + shift_y, height, device=device)
+    y_grid, x_grid = torch.meshgrid(y_lin, x_lin, indexing="ij")
+    x_rope = rope(x_grid.reshape(1, -1), dim_axis, theta * w_ntk).squeeze(0)
+    y_rope = rope(y_grid.reshape(1, -1), dim_axis, theta * h_ntk).squeeze(0)
+    out = torch.stack([x_rope, y_rope], dim=2).reshape(height * width, dim // 2, 2, 2)
+    return out.to(dtype=dtype)
+
+
+def get_2d_sincos_pos_embed(embed_dim, height, width, device=None, dtype=torch.float32):
+    """Fixed 2D sin/cos absolute positional embedding, one entry per pixel in row-major order.
+
+    The first embed_dim / 2 channels encode the column (W) index, the rest the row (H) index.
+    """
+    assert embed_dim % 4 == 0
+    half = embed_dim // 2
+    cols = torch.arange(width, dtype=torch.float32, device=device).repeat(height)
+    rows = torch.arange(height, dtype=torch.float32, device=device).repeat_interleave(width)
+    col_emb = get_1d_sincos_pos_embed_from_grid_torch(half, cols, device=device)
+    row_emb = get_1d_sincos_pos_embed_from_grid_torch(half, rows, device=device)
+    return torch.cat([col_emb, row_emb], dim=1).to(dtype=dtype)
+
+
+class RotaryAttention(nn.Module):
+    """Self-attention over a single token stream with RoPE applied to q and k (the attention inside PiTBlock)."""
+    def __init__(self, dim, num_heads=8, qkv_bias=False, dtype=None, device=None, operations=None):
+        super().__init__()
+        assert dim % num_heads == 0
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        opts = dict(dtype=dtype, device=device)
+        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, **opts)
+        self.q_norm = operations.RMSNorm(self.head_dim, eps=1e-6, **opts)
+        self.k_norm = operations.RMSNorm(self.head_dim, eps=1e-6, **opts)
+        self.proj = operations.Linear(dim, dim, **opts)
+
+    def forward(self, x, pos, mask=None, transformer_options={}):
+        batch, tokens, _ = x.shape
+        # Fused projection -> (3, batch, heads, tokens, head_dim) -> separate q, k, v.
+        fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, self.head_dim)
+        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)
+        q, k = apply_rope(self.q_norm(q), self.k_norm(k), pos[None, None])
+        attended = optimized_attention(q, k, v, self.num_heads, mask=mask, skip_reshape=True, transformer_options=transformer_options)
+        return self.proj(attended)
+
+
+class FinalLayer(nn.Module):  # output head: RMSNorm, then a biased linear map to out_channels
+    def __init__(self, hidden_size, out_channels, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.norm = operations.RMSNorm(hidden_size, eps=1e-6, dtype=dtype, device=device)
+        self.linear = operations.Linear(hidden_size, out_channels, bias=True, dtype=dtype, device=device)
+
+    def forward(self, x):
+        return self.linear(self.norm(x))  # normalise over the last axis, then project
+
+
+class PatchTokenEmbedder(nn.Module):
+    """Linear projection (optionally followed by RMSNorm) used for both patchified-image and text-feature tokens."""
+    def __init__(self, in_chans, embed_dim, use_norm=False, bias=True, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.proj = operations.Linear(in_chans, embed_dim, bias=bias, dtype=dtype, device=device)
+        self.norm = operations.RMSNorm(embed_dim, eps=1e-6, dtype=dtype, device=device) if use_norm else nn.Identity()  # Identity keeps forward() branch-free
+
+    def forward(self, x):
+        return self.norm(self.proj(x))  # project the last axis from in_chans to embed_dim, then normalise if enabled
+
+
+class PixelTokenEmbedder(nn.Module):
+    """Per-pixel embedder: linear lift of each pixel's channels plus a 2D sin/cos position, packed into per-patch sequences."""
+    def __init__(self, in_channels, hidden_size_output, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_size_output = hidden_size_output
+        self.proj = operations.Linear(in_channels, hidden_size_output, bias=True, dtype=dtype, device=device)
+
+    def forward(self, inputs, patch_size):
+        batch, _, height, width = inputs.shape
+        dim = self.hidden_size_output
+        rows, cols = height // patch_size, width // patch_size
+        # Channels-last so the linear acts on each pixel; then add the absolute position table.
+        feats = self.proj(inputs.permute(0, 2, 3, 1).contiguous())
+        pos = get_2d_sincos_pos_embed(dim, height, width, device=feats.device, dtype=feats.dtype)
+        feats = feats + pos.view(height, width, dim).unsqueeze(0)
+        # [B, H, W, D] -> [B * rows * cols, P * P, D]: one pixel sequence per patch.
+        return feats.view(batch, rows, patch_size, cols, patch_size, dim).permute(0, 1, 3, 2, 4, 5).reshape(batch * rows * cols, patch_size * patch_size, dim)
+
+
+class PiTBlock(nn.Module):
+    """Pixel-level transformer block.
+
+    Compresses each patch's P^2 pixel tokens → 1 attention token via a linear,
+    runs global self-attention across patches with 2D RoPE, then expands back to P^2 tokens.
+    Conditioning is per-pixel adaLN from the patch-level features.
+    """
+    def __init__(self, pixel_hidden_size, patch_hidden_size, patch_size, num_heads, mlp_ratio=4.0,
+                 attn_hidden_size=None, attn_num_heads=None, dtype=None, device=None, operations=None, mlp_chunks=1):
+        super().__init__()
+        self.pixel_dim = pixel_hidden_size
+        self.context_dim = patch_hidden_size
+        self.attn_dim = attn_hidden_size if attn_hidden_size is not None else patch_hidden_size  # defaults to the patch width
+        self.num_heads = attn_num_heads if attn_num_heads is not None else num_heads
+        assert self.attn_dim % self.num_heads == 0
+
+        p2 = patch_size * patch_size
+        self.compress_to_attn = operations.Linear(p2 * self.pixel_dim, self.attn_dim, bias=True, dtype=dtype, device=device)
+        self.expand_from_attn = operations.Linear(self.attn_dim, p2 * self.pixel_dim, bias=True, dtype=dtype, device=device)
+
+        self.norm1 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device)
+        self.attn = RotaryAttention(self.attn_dim, num_heads=self.num_heads, qkv_bias=False, dtype=dtype, device=device, operations=operations)
+        self.norm2 = operations.RMSNorm(self.pixel_dim, eps=1e-6, dtype=dtype, device=device)
+        self.mlp = Mlp(self.pixel_dim, hidden_features=int(self.pixel_dim * mlp_ratio), dtype=dtype, device=device, operations=operations)
+
+        self.adaLN_modulation_msa = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device)
+        self.adaLN_modulation_mlp = operations.Linear(self.context_dim, 3 * self.pixel_dim * p2, bias=True, dtype=dtype, device=device)
+
+        self._rope_fn = precompute_freqs_cis_2d  # plain attribute so a different RoPE builder can be assigned per instance
+        self.mlp_chunks = max(1, int(mlp_chunks))  # at least one chunk
+
+    def _fetch_pos(self, height, width, device, dtype, **rope_opts):  # RoPE table for the patch grid, via the pluggable builder
+        return self._rope_fn(self.attn_dim // self.num_heads, height, width, device=device, dtype=dtype, **rope_opts)
+
+    def forward(self, x, s_cond, image_height, image_width, patch_size, mask=None, transformer_options={}):
+        BL, P2, _ = x.shape  # x: [B * L, P2, pixel_dim], one pixel sequence per patch
+        Hs, Ws = image_height // patch_size, image_width // patch_size
+        L = Hs * Ws
+        B = BL // L
+
+        # Attention path uses only msa params; compute, use, free before mlp params allocate.
+        msa_params = self.adaLN_modulation_msa(s_cond).view(BL, P2, 3 * self.pixel_dim)  # a (shift, scale, gate) triple per pixel
+        shift_msa, scale_msa, gate_msa = msa_params.chunk(3, dim=-1)
+
+        x_norm = apply_adaln_(self.norm1(x), shift_msa, scale_msa)
+        x_flat = x_norm.view(BL, P2 * self.pixel_dim)
+
+        x_comp = self.compress_to_attn(x_flat).view(B, L, self.attn_dim)  # each patch becomes a single attention token
+        pos_comp = self._fetch_pos(Hs, Ws, x.device, x.dtype, **(transformer_options.get("rope_options") or {}))
+        attn_out = self.attn(x_comp, pos_comp, mask=mask, transformer_options=transformer_options)
+        attn_flat = self.expand_from_attn(attn_out.view(B * L, self.attn_dim))
+        attn_exp = attn_flat.view(BL, P2, self.pixel_dim)
+        x = torch.addcmul(x, gate_msa, attn_exp)  # out-of-place: x is a new tensor from here on
+        del msa_params, shift_msa, scale_msa, gate_msa
+
+        mlp_params = self.adaLN_modulation_mlp(s_cond).view(BL, P2, 3 * self.pixel_dim)
+        shift_mlp, scale_mlp, gate_mlp = mlp_params.chunk(3, dim=-1)
+        gate_mlp = gate_mlp.contiguous()  # detach from mlp_params so the del below frees shift+scale storage before the MLP
+        mlp_input = apply_adaln_(self.norm2(x), shift_mlp, scale_mlp)
+        del mlp_params, shift_mlp, scale_mlp
+
+        # MLP in chunks since the peak memory usage is huge here
+        chunk_size = (BL + self.mlp_chunks - 1) // self.mlp_chunks  # ceil(BL / mlp_chunks) rows per chunk
+        for s in range(0, BL, chunk_size):
+            e = min(s + chunk_size, BL)
+            x[s:e].addcmul_(gate_mlp[s:e], self.mlp(mlp_input[s:e]))  # gated residual, written into x slice by slice
+        return x
diff --git a/comfy/ldm/pixeldit/pid.py b/comfy/ldm/pixeldit/pid.py
new file mode 100644
index 000000000..21b73907a
--- /dev/null
+++ b/comfy/ldm/pixeldit/pid.py
@@ -0,0 +1,227 @@
+"""PiD — Pixel Diffusion Decoder. Decodes a Flux/SD3/Flux2/Z-Image latent
+directly to a 4x-upscaled image in 4 distilled flow-matching steps. PixDiT_T2I
+body + LQ projection branch injected before each MMDiT patch block.
+"""
+
+from typing import List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .model import PixDiT_T2I
+from .modules import precompute_freqs_cis_2d
+
+
+class SigmaAwareGatePerTokenPerDim(nn.Module):
+    """out = x + gate * lq, with gate = sigmoid(content_proj(cat[x, lq]) - exp(log_alpha) * sigma).
+
+    Trained init gives ~0.88 gate at sigma=0, ~0.05 at sigma=1.
+    """
+
+    def __init__(self, dim: int, dtype=None, device=None, operations=None):
+        super().__init__()
+        self.content_proj = operations.Linear(dim * 2, dim, dtype=dtype, device=device)
+        self.log_alpha = nn.Parameter(torch.empty((), dtype=dtype, device=device))
+
+    def forward(self, x: torch.Tensor, lq: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
+        # log_alpha is a raw nn.Parameter -> doesn't auto-cast under dynamic VRAM, so move/cast it by hand.
+        alpha = self.log_alpha.to(device=x.device, dtype=torch.float32).exp()
+        penalty = alpha * sigma.float().view(-1, 1, 1)
+        logits = self.content_proj(torch.cat([x, lq], dim=-1))
+        gated_lq = torch.sigmoid(logits - penalty) * lq
+        return x + gated_lq.to(x.dtype)
+
+
+class ResBlock(nn.Module):
+    """Pre-activation residual block: (GroupNorm -> SiLU -> 3x3 Conv) twice, plus an identity skip."""
+
+    def __init__(self, channels: int, num_groups: int = 4, dtype=None, device=None, operations=None):
+        super().__init__()
+        stages = []
+        # Two identical stages; the Sequential indices stay 0..5.
+        for _ in range(2):
+            stages.append(operations.GroupNorm(num_groups, channels, dtype=dtype, device=device))
+            stages.append(nn.SiLU())
+            stages.append(operations.Conv2d(channels, channels, kernel_size=3, padding=1, dtype=dtype, device=device))
+        self.block = nn.Sequential(*stages)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = self.block(x)
+        return x + residual
+
+
+class LQProjection2D(nn.Module):
+    """LQ latent -> per-block patch-aligned features for controlnet-style injection."""
+
+    def __init__(
+        self,
+        latent_channels: int,
+        hidden_dim: int = 512,
+        out_dim: int = 1536,
+        patch_size: int = 16,
+        sr_scale: int = 4,
+        latent_spatial_down_factor: int = 8,
+        num_res_blocks: int = 4,
+        num_outputs: int = 7,
+        interval: int = 2,
+        dtype=None, device=None, operations=None,
+    ):
+        super().__init__()
+        self.latent_channels = latent_channels
+        self.hidden_dim = hidden_dim
+        self.out_dim = out_dim
+        self.patch_size = patch_size
+        self.sr_scale = sr_scale
+        self.latent_spatial_down_factor = latent_spatial_down_factor
+        self.num_outputs = num_outputs
+        self.interval = interval
+
+        z_to_patch_ratio = (sr_scale * latent_spatial_down_factor) / patch_size  # presumably: output patches covered by one latent cell, per axis
+        self.z_to_patch_ratio = z_to_patch_ratio
+        if z_to_patch_ratio >= 1:
+            self.latent_fold_factor = 0  # 0 marks "no folding"; the latent is resized to the patch grid instead
+            latent_proj_in_ch = latent_channels
+        else:
+            fold_factor = int(1 / z_to_patch_ratio)
+            assert fold_factor * z_to_patch_ratio == 1.0  # the ratio must be exactly 1 / integer
+            self.latent_fold_factor = fold_factor
+            latent_proj_in_ch = latent_channels * fold_factor * fold_factor  # f x f latent cells are stacked into channels
+
+        layers = [
+            operations.Conv2d(latent_proj_in_ch, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1, dtype=dtype, device=device),
+        ]
+        for _ in range(num_res_blocks):
+            layers.append(ResBlock(hidden_dim, dtype=dtype, device=device, operations=operations))
+        self.latent_proj = nn.Sequential(*layers)
+
+        self.output_heads = nn.ModuleList(
+            [operations.Linear(hidden_dim, out_dim, dtype=dtype, device=device) for _ in range(num_outputs)]
+        )
+        self.gate_modules = nn.ModuleList(
+            [SigmaAwareGatePerTokenPerDim(out_dim, dtype=dtype, device=device, operations=operations)
+             for _ in range(num_outputs)]
+        )
+
+    def is_gate_active(self, block_idx: int) -> bool:
+        return block_idx % self.interval == 0  # gated blocks are 0, interval, 2 * interval, ...
+
+    def output_index(self, block_idx: int) -> int:
+        return block_idx // self.interval  # index into output_heads / gate_modules for a gated block
+
+    def gate(self, x: torch.Tensor, lq_feature: torch.Tensor, sigma: torch.Tensor, out_idx: int) -> torch.Tensor:
+        return self.gate_modules[out_idx](x, lq_feature, sigma)  # delegate to the gate module owned by that output
+
+    def _align_latent_to_patch_grid(self, lq_latent: torch.Tensor, pH: int, pW: int) -> torch.Tensor:
+        B, z_dim = lq_latent.shape[:2]
+        if self.z_to_patch_ratio >= 1:
+            if lq_latent.shape[2] != pH or lq_latent.shape[3] != pW:
+                z_aligned = F.interpolate(lq_latent, size=(pH, pW), mode="nearest")  # resize straight to the patch grid
+            else:
+                z_aligned = lq_latent
+        else:
+            f = self.latent_fold_factor
+            zH_expected, zW_expected = pH * f, pW * f
+            if lq_latent.shape[2] != zH_expected or lq_latent.shape[3] != zW_expected:
+                lq_latent = F.interpolate(lq_latent, size=(zH_expected, zW_expected), mode="nearest")
+            z_aligned = lq_latent.reshape(B, z_dim, pH, f, pW, f).permute(0, 1, 3, 5, 2, 4)  # move each f x f neighbourhood next to the channel axis
+            z_aligned = z_aligned.reshape(B, z_dim * f * f, pH, pW)
+        return self.latent_proj(z_aligned)
+
+    def forward(self, lq_latent: torch.Tensor, target_pH: int, target_pW: int) -> List[torch.Tensor]:
+        feat = self._align_latent_to_patch_grid(lq_latent, target_pH, target_pW)
+        B, C, H, W = feat.shape
+        tokens = feat.permute(0, 2, 3, 1).contiguous().view(B, H * W, C)  # [B, C, H, W] -> [B, H * W, C] tokens
+        return [head(tokens) for head in self.output_heads]  # one projected feature per output head
+
+
+class PidNet(PixDiT_T2I):
+    """PixDiT_T2I with LQ-latent injection and NTK-aware RoPE.
+
+    A sigma-gated LQ feature is added to the patch stream before every ``lq_interval``-th patch
+    block. ``_forward`` needs ``lq_latent`` and ``degrade_sigma`` as extra keyword arguments.
+    """
+
+    def __init__(
+        self,
+        lq_latent_channels: int = 16,
+        lq_hidden_dim: int = 512,
+        lq_num_res_blocks: int = 4,
+        lq_interval: int = 2,
+        sr_scale: int = 4,
+        latent_spatial_down_factor: int = 8,
+        rope_ref_h: int = 1024,  # NTK ref resolution in PIXEL units: 1024px / patch=16 -> grid_ref=64.
+        rope_ref_w: int = 1024,
+        image_model=None,
+        dtype=None, device=None, operations=None,
+        **pixdit_kwargs,
+    ):
+        super().__init__(dtype=dtype, device=device, operations=operations, **pixdit_kwargs)
+
+        self.rope_ref_grid_h = rope_ref_h // self.patch_size
+        self.rope_ref_grid_w = rope_ref_w // self.patch_size
+
+        # Parent's PiTBlocks were built with plain RoPE — swap in NTK-aware.
+        def _pit_rope_fn(head_dim, h, w, device=None, dtype=torch.float32, **rope_opts):
+            return precompute_freqs_cis_2d(head_dim, h, w, ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w, device=device, dtype=dtype, **rope_opts)
+        for blk in self.pixel_blocks:
+            blk._rope_fn = _pit_rope_fn
+
+        # One LQ output head per gated block (blocks 0, lq_interval, 2 * lq_interval, ...).
+        num_lq_outputs = (self.patch_depth + lq_interval - 1) // lq_interval
+        self.lq_proj = LQProjection2D(
+            latent_channels=lq_latent_channels, hidden_dim=lq_hidden_dim, out_dim=self.hidden_size,
+            patch_size=self.patch_size, sr_scale=sr_scale, latent_spatial_down_factor=latent_spatial_down_factor,
+            num_res_blocks=lq_num_res_blocks, num_outputs=num_lq_outputs, interval=lq_interval,
+            dtype=dtype, device=device, operations=operations,
+        )
+
+    def _fetch_patch_pos(self, height, width, device, dtype, **rope_opts):
+        """Patch-stream RoPE table, NTK-scaled against the reference grid."""
+        return precompute_freqs_cis_2d(
+            self.hidden_size // self.num_groups,
+            height, width,
+            ref_grid_h=self.rope_ref_grid_h, ref_grid_w=self.rope_ref_grid_w,
+            device=device, dtype=dtype, **rope_opts,
+        )
+
+    def _pre_patch_block(self, s, i, pid_lq_features, pid_degrade_sigma, **kwargs):
+        """Gate the matching LQ feature into ``s`` on gated blocks; return ``s`` unchanged otherwise."""
+        if not self.lq_proj.is_gate_active(i):
+            return s
+        out_idx = self.lq_proj.output_index(i)
+        if out_idx >= len(pid_lq_features):
+            return s
+        return self.lq_proj.gate(s, pid_lq_features[out_idx], pid_degrade_sigma, out_idx)
+
+    def _forward(self, x, timesteps, context=None, attention_mask=None, transformer_options={}, lq_latent=None, degrade_sigma=None, **kwargs):
+        """Run the parent forward with LQ features; ValueError if lq_latent / degrade_sigma is missing or the latent has the wrong channel count."""
+        if lq_latent is None:
+            raise ValueError("PidNet requires lq_latent — attach via PiDConditioning")
+        if degrade_sigma is None:
+            # Fail with a clear message instead of an AttributeError on ``None.to(...)`` below.
+            raise ValueError("PidNet requires degrade_sigma alongside lq_latent")
+        expected_c = self.lq_proj.latent_channels
+        if lq_latent.shape[1] != expected_c:
+            raise ValueError(
+                f"Input latent has {lq_latent.shape[1]} channels, this model variant expects {expected_c}. "
+                f"Flux1/SD3 = 16 channels, Flux2 = 128 channels."
+            )
+        B = x.shape[0]
+        # Match the backbone's pad_to_patch_size (round up) so the LQ grid lines up with the patch stream.
+        Hs = -(-x.shape[2] // self.patch_size)
+        Ws = -(-x.shape[3] // self.patch_size)
+
+        # Flatten sigma to 1-D float32; a single value is expanded to one entry per sample.
+        degrade_sigma = degrade_sigma.to(device=x.device, dtype=torch.float32).reshape(-1)
+        if degrade_sigma.numel() == 1 and B > 1:
+            degrade_sigma = degrade_sigma.expand(B).contiguous()
+
+        lq_features = self.lq_proj(lq_latent=lq_latent.to(x), target_pH=Hs, target_pW=Ws)
+        return super()._forward(
+            x, timesteps, context=context, attention_mask=attention_mask,
+            transformer_options=transformer_options,
+            pid_lq_features=lq_features, pid_degrade_sigma=degrade_sigma, **kwargs,
+        )
diff --git a/comfy/ldm/sam3/detector.py b/comfy/ldm/sam3/detector.py
new file mode 100644
index 000000000..23a972ac7
--- /dev/null
+++ b/comfy/ldm/sam3/detector.py
@@ -0,0 +1,599 @@
+# SAM3 detector: transformer encoder-decoder, segmentation head, geometry encoder, scoring.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchvision.ops import roi_align
+
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.sam3.tracker import SAM3Tracker, SAM31Tracker
+from comfy.ldm.sam3.sam import SAM3VisionBackbone # noqa: used in __init__
+from comfy.ldm.sam3.sam import MLP, PositionEmbeddingSine
+
+# Maps the ``image_model`` config name to the tracker class that SAM3Model instantiates.
+TRACKER_CLASSES = {"SAM3": SAM3Tracker, "SAM31": SAM31Tracker}
+from comfy.ops import cast_to_input
+
+
+def box_cxcywh_to_xyxy(x):
+    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) along the last axis."""
+    center_x, center_y, width, height = x.unbind(-1)
+    half_w = 0.5 * width
+    half_h = 0.5 * height
+    corners = (center_x - half_w, center_y - half_h, center_x + half_w, center_y + half_h)
+    return torch.stack(corners, dim=-1)
+
+
+def gen_sineembed_for_position(pos_tensor, num_feats=256):
+    """Per-coordinate sinusoidal embedding: (..., N) -> (..., N * num_feats // 2).
+
+    Each of the N coordinates is scaled by 2*pi, divided by ``num_feats // 2``
+    frequencies (base 10000, every frequency shared by one sin/cos pair) and
+    encoded as interleaved [sin, cos, sin, cos, ...]. The per-coordinate
+    blocks are laid out one after another in coordinate order.
+
+    Args:
+        pos_tensor: tensor whose last axis holds the N coordinates.
+        num_feats: twice the number of features emitted per coordinate; must be
+            a multiple of 4 so the sin and cos halves have equal length.
+
+    Returns:
+        Tensor of shape (..., N * num_feats // 2) in ``pos_tensor``'s dtype
+        (the arithmetic itself runs in float32).
+    """
+    # hdim itself has to be even, otherwise the 0::2 and 1::2 slices differ in length.
+    assert num_feats % 4 == 0
+    hdim = num_feats // 2
+    freqs = 10000.0 ** (2 * (torch.arange(hdim, dtype=torch.float32, device=pos_tensor.device) // 2) / hdim)
+    # (..., N, hdim): all coordinates are handled at once instead of looping and concatenating.
+    raw = (pos_tensor.float() * 2 * math.pi).unsqueeze(-1) / freqs
+    interleaved = torch.stack([raw[..., 0::2].sin(), raw[..., 1::2].cos()], dim=-1).flatten(-2)
+    return interleaved.flatten(-2).to(pos_tensor.dtype)
+
+
+class SplitMHA(nn.Module):
+    """Multi-head attention with separate Q/K/V Linear projections (the fused in_proj_weight split in three)."""
+    def __init__(self, d_model, num_heads=8, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        factory = {"device": device, "dtype": dtype}
+        self.q_proj = operations.Linear(d_model, d_model, **factory)
+        self.k_proj = operations.Linear(d_model, d_model, **factory)
+        self.v_proj = operations.Linear(d_model, d_model, **factory)
+        self.out_proj = operations.Linear(d_model, d_model, **factory)
+
+    def forward(self, q_input, k_input=None, v_input=None, mask=None):
+        """Attend from ``q_input`` to a key/value source and project the result.
+
+        With ``k_input`` omitted this is self-attention and ``v_input`` is
+        ignored; otherwise values come from ``v_input``, falling back to
+        ``k_input``. A 2-D ``mask`` is reshaped to 4-D before use.
+        """
+        if k_input is None:
+            key_src, value_src = q_input, q_input
+        elif v_input is None:
+            key_src, value_src = k_input, k_input
+        else:
+            key_src, value_src = k_input, v_input
+        q = self.q_proj(q_input)
+        k = self.k_proj(key_src)
+        v = self.v_proj(value_src)
+        if mask is not None and mask.ndim == 2:
+            mask = mask[:, None, None, :]  # [B, T] -> [B, 1, 1, T] for SDPA broadcast
+        # manual_cast may hand back mixed dtypes; K and V are aligned to Q's dtype.
+        target_dtype = q.dtype
+        attended = optimized_attention(q, k.to(target_dtype), v.to(target_dtype), self.num_heads, mask=mask, low_precision_attention=False)
+        return self.out_proj(attended)
+
+
+class MLPWithNorm(nn.Module):
+    """Linear stack with ReLU between layers, an optional skip connection, and a final LayerNorm."""
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, residual=True, device=None, dtype=None, operations=None):
+        super().__init__()
+        widths = [input_dim, *([hidden_dim] * (num_layers - 1)), output_dim]
+        linears = [
+            operations.Linear(widths[k], widths[k + 1], device=device, dtype=dtype)
+            for k in range(num_layers)
+        ]
+        self.layers = nn.ModuleList(linears)
+        self.out_norm = operations.LayerNorm(output_dim, device=device, dtype=dtype)
+        # The skip connection is only possible when input and output widths agree.
+        self.residual = residual and (input_dim == output_dim)
+
+    def forward(self, x):
+        """Apply the layers to ``x``, add ``x`` back when the skip is enabled, then normalise."""
+        last_idx = len(self.layers) - 1
+        hidden = x
+        for idx, linear in enumerate(self.layers):
+            hidden = linear(hidden)
+            if idx < last_idx:
+                hidden = F.relu(hidden)
+        if self.residual:
+            hidden = hidden + x
+        return self.out_norm(hidden)
+
+
+class EncoderLayer(nn.Module):
+    """Pre-norm block: self-attention, optional cross-attention, then a ReLU feed-forward, each added residually."""
+    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.self_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
+        self.cross_attn_image = SplitMHA(d_model, num_heads, operations=operations, **factory)
+        self.linear1 = operations.Linear(d_model, dim_ff, **factory)
+        self.linear2 = operations.Linear(dim_ff, d_model, **factory)
+        self.norm1 = operations.LayerNorm(d_model, **factory)
+        self.norm2 = operations.LayerNorm(d_model, **factory)
+        self.norm3 = operations.LayerNorm(d_model, **factory)
+
+    def forward(self, x, pos, text_memory=None, text_mask=None):
+        """Update ``x``; ``pos`` is added to the self-attention queries and keys only.
+
+        ``text_memory`` is whatever memory the caller wants cross-attended
+        (keys and values alike); the cross-attention step is skipped when it is None.
+        """
+        sa_in = self.norm1(x)
+        sa_qk = sa_in + pos
+        x = x + self.self_attn(sa_qk, sa_qk, sa_in)
+        if text_memory is not None:
+            ca_in = self.norm2(x)
+            x = x + self.cross_attn_image(ca_in, text_memory, text_memory, mask=text_mask)
+        ff_hidden = F.relu(self.linear1(self.norm3(x)))
+        return x + self.linear2(ff_hidden)
+
+
+class TransformerEncoder(nn.Module):
+    """Sequential stack of EncoderLayer blocks. Checkpoint: transformer.encoder.layers.N.*"""
+    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, num_layers=6, device=None, dtype=None, operations=None):
+        super().__init__()
+        blocks = []
+        for _ in range(num_layers):
+            blocks.append(EncoderLayer(d_model, num_heads, dim_ff, device=device, dtype=dtype, operations=operations))
+        self.layers = nn.ModuleList(blocks)
+
+    def forward(self, x, pos, text_memory=None, text_mask=None):
+        """Feed ``x`` through every layer in order; ``pos``, ``text_memory`` and ``text_mask`` are reused by each."""
+        hidden = x
+        for block in self.layers:
+            hidden = block(hidden, pos, text_memory, text_mask)
+        return hidden
+
+
+class DecoderLayer(nn.Module):
+    """Post-norm decoder block: self-attention, optional text cross-attention, memory cross-attention, feed-forward."""
+    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.self_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
+        self.cross_attn = SplitMHA(d_model, num_heads, operations=operations, **factory)
+        self.ca_text = SplitMHA(d_model, num_heads, operations=operations, **factory)
+        self.norm1 = operations.LayerNorm(d_model, **factory)
+        self.norm2 = operations.LayerNorm(d_model, **factory)
+        self.norm3 = operations.LayerNorm(d_model, **factory)
+        self.catext_norm = operations.LayerNorm(d_model, **factory)
+        self.linear1 = operations.Linear(d_model, dim_ff, **factory)
+        self.linear2 = operations.Linear(dim_ff, d_model, **factory)
+
+    def forward(self, x, memory, x_pos, memory_pos, text_memory=None, text_mask=None, cross_attn_bias=None):
+        """Update the query tokens ``x``.
+
+        ``x_pos`` is added to the queries of every attention step (and to the
+        self-attention keys); ``memory_pos`` is added to the memory keys only.
+        ``cross_attn_bias`` is handed to the memory cross-attention as its mask.
+        Each sub-step is residual followed by its own LayerNorm.
+        """
+        sa_qk = x + x_pos
+        sa_out = self.self_attn(sa_qk, sa_qk, x)
+        x = self.norm2(x + sa_out)
+        if text_memory is not None:
+            txt_out = self.ca_text(x + x_pos, text_memory, text_memory, mask=text_mask)
+            x = self.catext_norm(x + txt_out)
+        mem_keys = memory + memory_pos
+        mem_out = self.cross_attn(x + x_pos, mem_keys, memory, mask=cross_attn_bias)
+        x = self.norm1(x + mem_out)
+        ff_out = self.linear2(F.relu(self.linear1(x)))
+        return self.norm3(x + ff_out)
+
+
+class TransformerDecoder(nn.Module):
+    """Query decoder that refines learned reference boxes layer by layer and carries one presence token."""
+    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, num_layers=6,
+                 num_queries=200, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.d_model = d_model
+        self.num_queries = num_queries
+
+        self.layers = nn.ModuleList([
+            DecoderLayer(d_model, num_heads, dim_ff, operations=operations, **factory)
+            for _ in range(num_layers)
+        ])
+        self.norm = operations.LayerNorm(d_model, **factory)
+        self.query_embed = operations.Embedding(num_queries, d_model, **factory)
+        # Reference points: Embedding(num_queries, 4) — learned anchor boxes
+        self.reference_points = operations.Embedding(num_queries, 4, **factory)
+        # ref_point_head input: 512 (4 coords * 128 sine features each)
+        self.ref_point_head = MLP(d_model * 2, d_model, d_model, 2, operations=operations, **factory)
+        self.bbox_embed = MLP(d_model, d_model, 4, 3, operations=operations, **factory)
+
+        # Per-axis MLPs that turn 2 box-edge offsets into one bias value per attention head.
+        self.boxRPB_embed_x = MLP(2, d_model, num_heads, 2, operations=operations, **factory)
+        self.boxRPB_embed_y = MLP(2, d_model, num_heads, 2, operations=operations, **factory)
+
+        self.presence_token = operations.Embedding(1, d_model, **factory)
+        self.presence_token_head = MLP(d_model, d_model, 1, 3, operations=operations, **factory)
+        self.presence_token_out_norm = operations.LayerNorm(d_model, **factory)
+
+    @staticmethod
+    def _inverse_sigmoid(x):
+        """Approximate logit of ``x`` with a 1e-6 guard in the denominator and inside the log."""
+        eps = 1e-6
+        odds = x / (1 - x + eps)
+        return torch.log(odds + eps)
+
+    def _compute_box_rpb(self, ref_points, H, W):
+        """Box relative-position bias: (B, Q, 4) cxcywh -> (B, n_heads, Q+1, H*W) bias.
+
+        The extra leading row along the query axis is all zeros and lines up
+        with the presence token that ``forward`` prepends to the queries.
+        """
+        corners = box_cxcywh_to_xyxy(ref_points)
+        batch, _, _ = corners.shape
+        dev = ref_points.device
+        grid_y = torch.arange(H, device=dev, dtype=torch.float32) / H
+        grid_x = torch.arange(W, device=dev, dtype=torch.float32) / W
+        # Offsets of each grid column / row from the box's (x0, x1) / (y0, y1) edges.
+        dx = grid_x.view(1, 1, -1, 1) - corners[:, :, None, 0:3:2]
+        dy = grid_y.view(1, 1, -1, 1) - corners[:, :, None, 1:4:2]
+
+        log2_8 = float(math.log2(8))
+
+        def squash(delta):
+            # Signed log compression of the offsets.
+            scaled = delta * 8
+            return torch.sign(scaled) * torch.log2(torch.abs(scaled) + 1.0) / log2_8
+
+        bias_x = self.boxRPB_embed_x(squash(dx).to(ref_points.dtype))
+        bias_y = self.boxRPB_embed_y(squash(dy).to(ref_points.dtype))
+
+        # (B, Q, H, W, heads) -> (B, Q, H*W, heads) -> (B, heads, Q, H*W)
+        grid_bias = bias_y.unsqueeze(3) + bias_x.unsqueeze(2)
+        grid_bias = grid_bias.flatten(2, 3).permute(0, 3, 1, 2)
+        presence_row = torch.zeros(batch, grid_bias.shape[1], 1, grid_bias.shape[3], device=grid_bias.device, dtype=grid_bias.dtype)
+        return torch.cat([presence_row, grid_bias], dim=2)
+
+    def forward(self, memory, memory_pos, text_memory=None, text_mask=None, H=72, W=72):
+        """Decode object queries against ``memory``.
+
+        ``H`` and ``W`` size the grid used for the box position bias. Returns a
+        dict with ``decoder_output`` (normalised query states), ``pred_boxes``
+        (sigmoid-activated boxes in the same layout as the reference points)
+        and ``presence`` (one value per batch item from the presence token).
+        """
+        batch = memory.shape[0]
+
+        def per_batch(weight):
+            # Cast the learned table like ``memory`` and broadcast it over the batch.
+            return cast_to_input(weight, memory).unsqueeze(0).expand(batch, -1, -1)
+
+        tgt = per_batch(self.query_embed.weight)
+        presence_out = per_batch(self.presence_token.weight)
+        ref_points = per_batch(self.reference_points.weight).sigmoid()
+
+        final_idx = len(self.layers) - 1
+        for layer_idx, layer in enumerate(self.layers):
+            query_pos = self.ref_point_head(gen_sineembed_for_position(ref_points, self.d_model))
+            attn_bias = self._compute_box_rpb(ref_points, H, W)
+            # The presence token rides in slot 0 with a zero positional embedding.
+            tokens = torch.cat([presence_out, tgt], dim=1)
+            token_pos = torch.cat([torch.zeros_like(presence_out), query_pos], dim=1)
+            tokens = layer(tokens, memory, token_pos, memory_pos, text_memory, text_mask, attn_bias)
+            presence_out, tgt = tokens[:, :1], tokens[:, 1:]
+            if layer_idx < final_idx:
+                # Refine the reference boxes in logit space; detached before the next layer.
+                box_delta = self.bbox_embed(self.norm(tgt))
+                ref_points = (self._inverse_sigmoid(ref_points) + box_delta).sigmoid().detach()
+
+        query_out = self.norm(tgt)
+        boxes = (self._inverse_sigmoid(ref_points) + self.bbox_embed(query_out)).sigmoid()
+        presence = self.presence_token_head(self.presence_token_out_norm(presence_out)).squeeze(-1)
+        return {"decoder_output": query_out, "pred_boxes": boxes, "presence": presence}
+
+
+class Transformer(nn.Module):
+    """Container holding a TransformerEncoder and a TransformerDecoder; it defines no forward of its own."""
+    def __init__(self, d_model=256, num_heads=8, dim_ff=2048, enc_layers=6, dec_layers=6,
+                 num_queries=200, device=None, dtype=None, operations=None):
+        super().__init__()
+        shared = {"device": device, "dtype": dtype, "operations": operations}
+        self.encoder = TransformerEncoder(d_model, num_heads, dim_ff, enc_layers, **shared)
+        self.decoder = TransformerDecoder(d_model, num_heads, dim_ff, dec_layers, num_queries, **shared)
+
+
+class GeometryEncoder(nn.Module):
+    """Encodes point and box prompts into d_model-wide tokens, optionally refined against image features."""
+    def __init__(self, d_model=256, num_heads=8, num_layers=3, roi_size=7, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.d_model = d_model
+        self.roi_size = roi_size
+        self.pos_enc = PositionEmbeddingSine(num_pos_feats=d_model, normalize=True)
+        # Point prompts: raw coordinates, sampled image feature, positional encoding.
+        self.points_direct_project = operations.Linear(2, d_model, **factory)
+        self.points_pool_project = operations.Linear(d_model, d_model, **factory)
+        self.points_pos_enc_project = operations.Linear(d_model, d_model, **factory)
+        # Box prompts: raw cxcywh, ROI-pooled image feature, positional encoding.
+        self.boxes_direct_project = operations.Linear(4, d_model, **factory)
+        self.boxes_pool_project = operations.Conv2d(d_model, d_model, kernel_size=roi_size, **factory)
+        self.boxes_pos_enc_project = operations.Linear(d_model + 2, d_model, **factory)
+        self.label_embed = operations.Embedding(2, d_model, **factory)
+        self.cls_embed = operations.Embedding(1, d_model, **factory)
+        self.norm = operations.LayerNorm(d_model, **factory)
+        self.img_pre_norm = operations.LayerNorm(d_model, **factory)
+        self.encode = nn.ModuleList([
+            EncoderLayer(d_model, num_heads, 2048, operations=operations, **factory)
+            for _ in range(num_layers)
+        ])
+        self.encode_norm = operations.LayerNorm(d_model, **factory)
+        self.final_proj = operations.Linear(d_model, d_model, **factory)
+
+    def _encode_points(self, coords, labels, img_feat_2d):
+        """Encode point prompts: direct + pool + pos_enc + label. coords: [B, N, 2] normalized."""
+        n_batch, n_pts, _ = coords.shape
+        tokens = self.points_direct_project(coords)
+        # Sample the feature map at each point; grid_sample expects coordinates in [-1, 1].
+        sample_grid = (coords * 2 - 1).unsqueeze(2)  # [B, N, 1, 2]
+        pooled = F.grid_sample(img_feat_2d, sample_grid, align_corners=False)  # [B, C, N, 1]
+        tokens = tokens + self.points_pool_project(pooled.squeeze(-1).permute(0, 2, 1))  # [B, N, C]
+        # Positional encoding of the coordinates themselves.
+        flat_x = coords[:, :, 0].flatten()
+        flat_y = coords[:, :, 1].flatten()
+        enc_x, enc_y = self.pos_enc._encode_xy(flat_x, flat_y)
+        pos_code = torch.cat([enc_x, enc_y], dim=-1).view(n_batch, n_pts, -1)
+        tokens = tokens + self.points_pos_enc_project(cast_to_input(pos_code, tokens))
+        tokens = tokens + cast_to_input(self.label_embed(labels.long()), tokens)
+        return tokens
+
+    def _encode_boxes(self, boxes, labels, img_feat_2d):
+        """Encode box prompts: direct + pool + pos_enc + label. boxes: [B, N, 4] normalized cxcywh."""
+        n_batch, n_box, _ = boxes.shape
+        tokens = self.boxes_direct_project(boxes)
+        # ROI-align over each box, with corners scaled from normalized units to feature-map cells.
+        feat_h, feat_w = img_feat_2d.shape[-2:]
+        corners = box_cxcywh_to_xyxy(boxes)
+        to_cells = torch.tensor([feat_w, feat_h, feat_w, feat_h], dtype=corners.dtype, device=corners.device)
+        corners_cells = corners * to_cells
+        per_image_rois = corners_cells.view(-1, 4).split(n_box)
+        pooled = roi_align(img_feat_2d, per_image_rois, self.roi_size)
+        # Conv2d(roi_size) -> [B*N, C, 1, 1] -> [B, N, C]
+        tokens = tokens + self.boxes_pool_project(pooled).view(n_batch, n_box, -1)
+        # Positional encoding of box center + size.
+        cx, cy, w, h = (boxes[:, :, k].flatten() for k in range(4))
+        pos_code = self.pos_enc.encode_boxes(cx, cy, w, h).view(n_batch, n_box, -1)
+        tokens = tokens + self.boxes_pos_enc_project(cast_to_input(pos_code, tokens))
+        tokens = tokens + cast_to_input(self.label_embed(labels.long()), tokens)
+        return tokens
+
+    def forward(self, points=None, boxes=None, image_features=None):
+        """Encode geometry prompts. image_features: [B, HW, C] flattened backbone features.
+
+        ``points`` is a ``(coords, labels)`` pair; ``boxes`` are all given label 1.
+        Returns None when neither prompt kind is supplied. The feature map is
+        reshaped to a square grid of side ``int(sqrt(HW))``.
+        """
+        feat_map = None
+        if image_features is not None:
+            n_batch = image_features.shape[0]
+            n_tokens, n_chan = image_features.shape[1], image_features.shape[2]
+            side = int(math.sqrt(n_tokens))
+            feat_map = self.img_pre_norm(image_features).permute(0, 2, 1).view(n_batch, n_chan, side, side)
+
+        pieces = []
+        if points is not None:
+            coords, labels = points
+            pieces.append(self._encode_points(coords, labels, feat_map))
+        if boxes is not None:
+            positive = torch.ones(boxes.shape[0], boxes.shape[1], dtype=torch.long, device=boxes.device)
+            pieces.append(self._encode_boxes(boxes, positive, feat_map))
+        if not pieces:
+            return None
+
+        geo = self.norm(torch.cat(pieces, dim=1))
+        if image_features is not None:
+            # Refine the prompt tokens against the un-normalised image features, with zero positions.
+            for block in self.encode:
+                geo = block(geo, torch.zeros_like(geo), image_features)
+            geo = self.encode_norm(geo)
+        return self.final_proj(geo)
+
+
+class PixelDecoder(nn.Module):
+    """Top-down FPN pixel decoder with GroupNorm + ReLU + nearest interpolation."""
+    def __init__(self, d_model=256, num_stages=3, device=None, dtype=None, operations=None):
+        super().__init__()
+        convs, norms = [], []
+        for _ in range(num_stages):
+            convs.append(operations.Conv2d(d_model, d_model, kernel_size=3, padding=1, device=device, dtype=dtype))
+        for _ in range(num_stages):
+            norms.append(operations.GroupNorm(8, d_model, device=device, dtype=dtype))
+        self.conv_layers = nn.ModuleList(convs)
+        self.norms = nn.ModuleList(norms)
+
+    def forward(self, backbone_features):
+        """Fuse the feature list starting from its last entry and walking back to its first.
+
+        At each step the running map is resized to the next level's spatial
+        size, added to it, then passed through that stage's conv, norm and ReLU.
+        """
+        running = backbone_features[-1]
+        for stage, skip in enumerate(backbone_features[:-1][::-1]):
+            resized = F.interpolate(running, size=skip.shape[-2:], mode="nearest")
+            fused = self.conv_layers[stage](skip + resized)
+            running = F.relu(self.norms[stage](fused))
+        return running
+
+
+class MaskPredictor(nn.Module):
+    """Produces one mask-logit map per query by contracting query embeddings with pixel features."""
+    def __init__(self, d_model=256, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.mask_embed = MLP(d_model, d_model, d_model, 3, device=device, dtype=dtype, operations=operations)
+
+    def forward(self, query_embeddings, pixel_features):
+        """(B, Q, C) queries x (B, C, H, W) pixel features -> (B, Q, H, W), contracting over C."""
+        per_query_kernel = self.mask_embed(query_embeddings)
+        mask_logits = torch.einsum("bqc,bchw->bqhw", per_query_kernel, pixel_features)
+        return mask_logits
+
+
+class SegmentationHead(nn.Module):
+    """Turns decoder queries plus multi-level features into per-query mask logits."""
+    def __init__(self, d_model=256, num_heads=8, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.d_model = d_model
+        self.pixel_decoder = PixelDecoder(d_model, 3, operations=operations, **factory)
+        self.mask_predictor = MaskPredictor(d_model, operations=operations, **factory)
+        self.cross_attend_prompt = SplitMHA(d_model, num_heads, operations=operations, **factory)
+        self.cross_attn_norm = operations.LayerNorm(d_model, **factory)
+        self.instance_seg_head = operations.Conv2d(d_model, d_model, kernel_size=1, **factory)
+        # Constructed here but not used by forward() below.
+        self.semantic_seg_head = operations.Conv2d(d_model, 1, kernel_size=1, **factory)
+
+    def forward(self, query_embeddings, backbone_features, encoder_hidden_states=None, prompt=None, prompt_mask=None):
+        """Predict masks for ``query_embeddings``.
+
+        When ``encoder_hidden_states`` is given, it is first cross-attended to
+        ``prompt`` (if any), then its first H*W tokens are reshaped to a
+        (B, d_model, H, W) map that replaces the last entry of
+        ``backbone_features`` (H, W are read from that last entry). The
+        caller's sequence is copied to a list rather than modified.
+        """
+        states = encoder_hidden_states
+        if states is not None and prompt is not None:
+            attended = self.cross_attend_prompt(self.cross_attn_norm(states), prompt, prompt, mask=prompt_mask)
+            states = attended + states
+
+        levels = backbone_features
+        if states is not None:
+            top = levels[-1]
+            n_batch, grid_h, grid_w = states.shape[0], top.shape[-2], top.shape[-1]
+            visual = states[:, :grid_h * grid_w].permute(0, 2, 1).view(n_batch, self.d_model, grid_h, grid_w)
+            levels = list(levels)
+            levels[-1] = visual
+
+        instance_features = self.instance_seg_head(self.pixel_decoder(levels))
+        return self.mask_predictor(query_embeddings, instance_features)
+
+
+class DotProductScoring(nn.Module):
+    """Scores each query by a scaled dot product with one pooled prompt vector."""
+    def __init__(self, d_model=256, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.hs_proj = operations.Linear(d_model, d_model, **factory)
+        self.prompt_proj = operations.Linear(d_model, d_model, **factory)
+        self.prompt_mlp = MLPWithNorm(d_model, 2048, d_model, 2, operations=operations, **factory)
+        self.scale = 1.0 / (d_model ** 0.5)
+
+    def forward(self, query_embeddings, prompt_embeddings, prompt_mask=None):
+        """Return one score per query, clamped to [-12, 12].
+
+        Prompt tokens are averaged over dim 1; with ``prompt_mask`` the average
+        is weighted by the mask and its denominator is floored at 1.
+        """
+        prompt_tokens = self.prompt_mlp(prompt_embeddings)
+        if prompt_mask is None:
+            pooled = prompt_tokens.mean(dim=1)
+        else:
+            keep = prompt_mask.unsqueeze(-1).to(dtype=prompt_tokens.dtype)
+            kept_sum = (prompt_tokens * keep).sum(dim=1)
+            pooled = kept_sum / keep.sum(dim=1).clamp(min=1)
+        queries = self.hs_proj(query_embeddings)
+        prompt_vec = self.prompt_proj(pooled).unsqueeze(-1).to(queries.dtype)
+        logits = torch.matmul(queries, prompt_vec) * self.scale
+        return logits.clamp(-12.0, 12.0).squeeze(-1)
+
+
+class SAM3Detector(nn.Module):
+    """Detector: vision/language backbone wrappers, encoder-decoder transformer,
+    segmentation head, geometry (point/box) prompt encoder and dot-product scoring."""
+    def __init__(self, d_model=256, embed_dim=1024, num_queries=200, device=None, dtype=None, operations=None, **kwargs):
+        super().__init__()
+        image_model = kwargs.pop("image_model", "SAM3")
+        # These keys are dropped so they are not forwarded to SAM3VisionBackbone via **kwargs.
+        for k in ("num_heads", "num_head_channels"):
+            kwargs.pop(k, None)
+        multiplex = image_model == "SAM31"
+        # SAM3: 4 FPN levels, drop last (scalp=1); SAM3.1: 3 levels, use all (scalp=0)
+        self.scalp = 0 if multiplex else 1
+        self.backbone = nn.ModuleDict({
+            "vision_backbone": SAM3VisionBackbone(embed_dim=embed_dim, d_model=d_model, multiplex=multiplex, device=device, dtype=dtype, operations=operations, **kwargs),
+            "language_backbone": nn.ModuleDict({"resizer": operations.Linear(embed_dim, d_model, device=device, dtype=dtype)}),
+        })
+        self.transformer = Transformer(d_model=d_model, num_queries=num_queries, device=device, dtype=dtype, operations=operations)
+        self.segmentation_head = SegmentationHead(d_model=d_model, device=device, dtype=dtype, operations=operations)
+        self.geometry_encoder = GeometryEncoder(d_model=d_model, device=device, dtype=dtype, operations=operations)
+        self.dot_prod_scoring = DotProductScoring(d_model=d_model, device=device, dtype=dtype, operations=operations)
+
+    def _get_backbone_features(self, images):
+        """Run backbone and return (detector_features, detector_positions, tracker_features, tracker_positions)."""
+        bb = self.backbone["vision_backbone"]
+        # The multiplex backbone is called with tracker_mode; the other variant with need_tracker.
+        if bb.multiplex:
+            all_f, all_p, tf, tp = bb(images, tracker_mode="propagation")
+        else:
+            all_f, all_p, tf, tp = bb(images, need_tracker=True)
+        return all_f, all_p, tf, tp
+
+    @staticmethod
+    def _run_geo_layer(layer, x, memory, memory_pos):
+        """Apply one EncoderLayer's sublayers to ``x`` without adding a positional term to ``x``.
+
+        Self-attention runs on norm1(x) alone; the cross-attention keys are
+        ``memory + memory_pos`` and its values are ``memory``.
+        """
+        x = x + layer.self_attn(layer.norm1(x))
+        x = x + layer.cross_attn_image(layer.norm2(x), memory + memory_pos, memory)
+        x = x + layer.linear2(F.relu(layer.linear1(layer.norm3(x))))
+        return x
+
+    def _detect(self, features, positions, text_embeddings=None, text_mask=None,
+                points=None, boxes=None):
+        """Shared detection: geometry encoding, transformer, scoring, segmentation.
+
+        Returns a 4-tuple: (boxes converted to xyxy, scores, masks, raw decoder output dict).
+        """
+        B = features[0].shape[0]
+        # Scalp for encoder (use top-level feature), but keep all levels for segmentation head
+        # NOTE(review): seg_features still contains the level(s) removed by scalp, while `memory`
+        # holds H*W tokens of the post-scalp last level; SegmentationHead reads H, W from
+        # seg_features[-1] — confirm these agree when scalp > 0.
+        seg_features = features
+        if self.scalp > 0:
+            features = features[:-self.scalp]
+            positions = positions[:-self.scalp]
+        enc_feat, enc_pos = features[-1], positions[-1]
+        _, _, H, W = enc_feat.shape
+        # (B, C, H, W) -> (B, H*W, C) token sequences for the transformer.
+        img_flat = enc_feat.flatten(2).permute(0, 2, 1)
+        pos_flat = enc_pos.flatten(2).permute(0, 2, 1)
+
+        has_prompts = text_embeddings is not None or points is not None or boxes is not None
+        if has_prompts:
+            geo_enc = self.geometry_encoder
+            # geo_prompts is None when neither points nor boxes were supplied.
+            geo_prompts = geo_enc(points=points, boxes=boxes, image_features=img_flat)
+            # A single learned CLS token per batch item, refined against the image tokens.
+            geo_cls = geo_enc.norm(geo_enc.final_proj(cast_to_input(geo_enc.cls_embed.weight, img_flat).view(1, 1, -1).expand(B, -1, -1)))
+            for layer in geo_enc.encode:
+                geo_cls = self._run_geo_layer(layer, geo_cls, img_flat, pos_flat)
+            geo_cls = geo_enc.encode_norm(geo_cls)
+            # Broadcast text prompts and their mask across the batch when the sizes differ.
+            if text_embeddings is not None and text_embeddings.shape[0] != B:
+                text_embeddings = text_embeddings.expand(B, -1, -1)
+            if text_mask is not None and text_mask.shape[0] != B:
+                text_mask = text_mask.expand(B, -1)
+            # Prompt sequence order: text tokens, geometry tokens, geometry CLS token.
+            parts = [t for t in [text_embeddings, geo_prompts, geo_cls] if t is not None]
+            text_embeddings = torch.cat(parts, dim=1)
+            # Appended tokens are marked True in the mask; with no incoming mask every token is True.
+            n_new = text_embeddings.shape[1] - (text_mask.shape[1] if text_mask is not None else 0)
+            if text_mask is not None:
+                text_mask = torch.cat([text_mask, torch.ones(B, n_new, dtype=torch.bool, device=text_mask.device)], dim=1)
+            else:
+                text_mask = torch.ones(B, text_embeddings.shape[1], dtype=torch.bool, device=text_embeddings.device)
+
+        memory = self.transformer.encoder(img_flat, pos_flat, text_embeddings, text_mask)
+        dec_out = self.transformer.decoder(memory, pos_flat, text_embeddings, text_mask, H, W)
+        query_out, pred_boxes = dec_out["decoder_output"], dec_out["pred_boxes"]
+
+        # Without any prompt there is nothing to score against, so scores are all zero.
+        if text_embeddings is not None:
+            scores = self.dot_prod_scoring(query_out, text_embeddings, text_mask)
+        else:
+            scores = torch.zeros(B, query_out.shape[1], device=query_out.device)
+
+        masks = self.segmentation_head(query_out, seg_features, encoder_hidden_states=memory, prompt=text_embeddings, prompt_mask=text_mask)
+        return box_cxcywh_to_xyxy(pred_boxes), scores, masks, dec_out
+
+    def forward(self, images, text_embeddings=None, text_mask=None, points=None, boxes=None, threshold=0.3, orig_size=None):
+        """Detect from raw images with optional text / point / box prompts.
+
+        Text embeddings are passed through the language resizer first. When
+        ``orig_size`` (height, width) is given, boxes are multiplied by
+        (w, h, w, h) and masks are bilinearly resized to that size.
+        ``threshold`` is accepted but not read in this method.
+
+        Returns a dict with ``boxes``, ``scores``, ``masks`` and ``presence``.
+        """
+        features, positions, _, _ = self._get_backbone_features(images)
+
+        if text_embeddings is not None:
+            text_embeddings = self.backbone["language_backbone"]["resizer"](text_embeddings)
+        if text_mask is not None:
+            text_mask = text_mask.bool()
+
+        boxes_xyxy, scores, masks, dec_out = self._detect(
+            features, positions, text_embeddings, text_mask, points, boxes)
+
+        if orig_size is not None:
+            oh, ow = orig_size
+            boxes_xyxy = boxes_xyxy * torch.tensor([ow, oh, ow, oh], device=boxes_xyxy.device, dtype=boxes_xyxy.dtype)
+            masks = F.interpolate(masks, size=orig_size, mode="bilinear", align_corners=False)
+
+        return {
+            "boxes": boxes_xyxy,
+            "scores": scores,
+            "masks": masks,
+            "presence": dec_out.get("presence"),
+        }
+
+    def forward_from_trunk(self, trunk_out, text_embeddings, text_mask):
+        """Run detection using a pre-computed ViTDet trunk output.
+
+        text_embeddings must already be resized through language_backbone.resizer.
+        Returns dict with boxes (normalized xyxy), scores, masks at detector resolution.
+        """
+        bb = self.backbone["vision_backbone"]
+        # Rebuild the feature levels and their position encodings directly from the trunk output.
+        features = [conv(trunk_out) for conv in bb.convs]
+        positions = [cast_to_input(bb.position_encoding(f), f) for f in features]
+
+        if text_mask is not None:
+            text_mask = text_mask.bool()
+
+        boxes_xyxy, scores, masks, _ = self._detect(features, positions, text_embeddings, text_mask)
+        return {"boxes": boxes_xyxy, "scores": scores, "masks": masks}
+
+
+class SAM3Model(nn.Module):
+    """Top-level model pairing a SAM3Detector with the tracker class selected by ``image_model``."""
+    def __init__(self, device=None, dtype=None, operations=None, **kwargs):
+        super().__init__()
+        self.dtype = dtype
+        # "image_model" is read (not popped) here, so it is still passed on to both sub-modules.
+        image_model = kwargs.get("image_model", "SAM3")
+        tracker_cls = TRACKER_CLASSES[image_model]
+        self.detector = SAM3Detector(device=device, dtype=dtype, operations=operations, **kwargs)
+        self.tracker = tracker_cls(device=device, dtype=dtype, operations=operations, **kwargs)
+
+    def forward(self, images, **kwargs):
+        """Delegate to the detector's forward with the same arguments."""
+        return self.detector(images, **kwargs)
+
+    def forward_segment(self, images, point_inputs=None, box_inputs=None, mask_inputs=None):
+        """Interactive segmentation using SAM decoder with point/box/mask prompts.
+
+        Args:
+            images: [B, 3, 1008, 1008] preprocessed images
+            point_inputs: {"point_coords": [B, N, 2], "point_labels": [B, N]} in 1008x1008 pixel space
+            box_inputs: [B, 2, 2] box corners (top-left, bottom-right) in 1008x1008 pixel space
+            mask_inputs: [B, 1, H, W] coarse mask logits to refine
+        Returns:
+            [B, 1, image_size, image_size] high-res mask logits
+        """
+        bb = self.detector.backbone["vision_backbone"]
+        # Only the tracker-side features and positions of the backbone output are used here.
+        if bb.multiplex:
+            _, _, tracker_features, tracker_positions = bb(images, tracker_mode="interactive")
+        else:
+            _, _, tracker_features, tracker_positions = bb(images, need_tracker=True)
+        # Drop the trailing level(s) the detector also drops (scalp).
+        if self.detector.scalp > 0:
+            tracker_features = tracker_features[:-self.detector.scalp]
+            tracker_positions = tracker_positions[:-self.detector.scalp]
+
+        # The last level is the main feature map; the earlier ones are passed as high-res features.
+        high_res = list(tracker_features[:-1])
+        backbone_feat = tracker_features[-1]
+        B, C, H, W = backbone_feat.shape
+        # Add no-memory embedding (init frame path)
+        # Prefer the tracker's interactivity_no_mem_embed, falling back to no_mem_embed if absent.
+        no_mem = getattr(self.tracker, 'interactivity_no_mem_embed', None)
+        if no_mem is None:
+            no_mem = getattr(self.tracker, 'no_mem_embed', None)
+        if no_mem is not None:
+            # Flatten to (B, H*W, C), add the embedding, then restore (B, C, H, W).
+            feat_flat = backbone_feat.flatten(2).permute(0, 2, 1)
+            feat_flat = feat_flat + cast_to_input(no_mem, feat_flat)
+            backbone_feat = feat_flat.view(B, H, W, C).permute(0, 3, 1, 2)
+
+        num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
+        _, high_res_masks, _, _ = self.tracker._forward_sam_heads(
+            backbone_features=backbone_feat,
+            point_inputs=point_inputs,
+            mask_inputs=mask_inputs,
+            box_inputs=box_inputs,
+            high_res_features=high_res,
+            # True only when exactly one point is supplied.
+            multimask_output=(0 < num_pts <= 1),
+        )
+        return high_res_masks
+
+    def forward_video(self, images, initial_masks, pbar=None, text_prompts=None,
+                      new_det_thresh=0.5, max_objects=0, detect_interval=1,
+                      target_device=None, target_dtype=None):
+        """Track video with optional per-frame text-prompted detection.
+
+        ``text_prompts`` is an iterable of (embedding, mask-or-None) pairs. If
+        the tracker exposes ``track_video_with_detection`` it is used (with or
+        without a detection callback); otherwise ``track_video`` is used, the
+        text prompts are ignored, and ``initial_masks`` is mandatory.
+
+        Raises:
+            ValueError: on the ``track_video`` path when ``initial_masks`` is None.
+        """
+        bb = self.detector.backbone["vision_backbone"]
+
+        def backbone_fn(frame, frame_idx=None):
+            # Runs the trunk once and returns it too, so the detection callback can reuse it.
+            # frame_idx is accepted but not used here.
+            trunk_out = bb.trunk(frame)
+            if bb.multiplex:
+                _, _, tf, tp = bb(frame, tracker_mode="propagation", cached_trunk=trunk_out, tracker_only=True)
+            else:
+                _, _, tf, tp = bb(frame, need_tracker=True, cached_trunk=trunk_out, tracker_only=True)
+            return tf, tp, trunk_out
+
+        detect_fn = None
+        if text_prompts:
+            resizer = self.detector.backbone["language_backbone"]["resizer"]
+            # Resize each prompt embedding once up front instead of on every frame.
+            resized = [(resizer(emb), m.bool() if m is not None else None) for emb, m in text_prompts]
+            def detect_fn(trunk_out):
+                # Run the detector once per prompt and concatenate scores and masks along dim 1.
+                all_scores, all_masks = [], []
+                for emb, mask in resized:
+                    det = self.detector.forward_from_trunk(trunk_out, emb, mask)
+                    all_scores.append(det["scores"])
+                    all_masks.append(det["masks"])
+                return {"scores": torch.cat(all_scores, dim=1), "masks": torch.cat(all_masks, dim=1)}
+
+        if hasattr(self.tracker, 'track_video_with_detection'):
+            return self.tracker.track_video_with_detection(
+                backbone_fn, images, initial_masks, detect_fn,
+                new_det_thresh=new_det_thresh, max_objects=max_objects,
+                detect_interval=detect_interval, backbone_obj=bb, pbar=pbar,
+                target_device=target_device, target_dtype=target_dtype)
+        # SAM3 (non-multiplex) — no detection support, requires initial masks
+        if initial_masks is None:
+            raise ValueError("SAM3 (non-multiplex) requires initial_mask for video tracking")
+        return self.tracker.track_video(backbone_fn, images, initial_masks, pbar=pbar, backbone_obj=bb,
+                                        target_device=target_device, target_dtype=target_dtype)
diff --git a/comfy/ldm/sam3/sam.py b/comfy/ldm/sam3/sam.py
new file mode 100644
index 000000000..75cb457cf
--- /dev/null
+++ b/comfy/ldm/sam3/sam.py
@@ -0,0 +1,425 @@
+# SAM3 shared components: primitives, ViTDet backbone, FPN neck, position encodings.
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.flux.math import apply_rope
+from comfy.ldm.flux.layers import EmbedND
+from comfy.ops import cast_to_input
+
+
+class MLP(nn.Module):
+    """Stack of ``num_layers`` linear layers with ReLU between them.
+
+    The last layer has no ReLU; a sigmoid is applied to the final output when
+    ``sigmoid_output`` is set. ``operations`` supplies the ``Linear`` class.
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, sigmoid_output=False, device=None, dtype=None, operations=None):
+        super().__init__()
+        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
+        stack = []
+        for idx in range(num_layers):
+            stack.append(operations.Linear(widths[idx], widths[idx + 1], device=device, dtype=dtype))
+        self.layers = nn.ModuleList(stack)
+        self.sigmoid_output = sigmoid_output
+
+    def forward(self, x):
+        last = len(self.layers) - 1
+        for idx, layer in enumerate(self.layers):
+            x = layer(x)
+            if idx < last:
+                x = F.relu(x)
+        if self.sigmoid_output:
+            return torch.sigmoid(x)
+        return x
+
+
+class SAMAttention(nn.Module):
+    """Multi-head attention with separate q/k/v projections.
+
+    The projections map into ``embedding_dim // downsample_rate`` channels and
+    ``out_proj`` maps back to ``embedding_dim``. Keys and values may come from a
+    different width via ``kv_in_dim``.
+    """
+
+    def __init__(self, embedding_dim, num_heads, downsample_rate=1, kv_in_dim=None, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        internal_dim = embedding_dim // downsample_rate
+        kv_dim = embedding_dim if kv_in_dim is None else kv_in_dim
+        factory = dict(device=device, dtype=dtype)
+        self.q_proj = operations.Linear(embedding_dim, internal_dim, **factory)
+        self.k_proj = operations.Linear(kv_dim, internal_dim, **factory)
+        self.v_proj = operations.Linear(kv_dim, internal_dim, **factory)
+        self.out_proj = operations.Linear(internal_dim, embedding_dim, **factory)
+
+    def forward(self, q, k, v):
+        q_p = self.q_proj(q)
+        k_p = self.k_proj(k)
+        v_p = self.v_proj(v)
+        attended = optimized_attention(q_p, k_p, v_p, self.num_heads, low_precision_attention=False)
+        return self.out_proj(attended)
+
+
+class TwoWayAttentionBlock(nn.Module):
+    """One two-way block: query self-attention, query->key cross-attention,
+    an MLP on the queries, then key->query cross-attention.
+
+    Every stage is followed by its own LayerNorm. With ``skip_first_layer_pe``
+    the self-attention stage uses no positional encoding and no residual.
+    """
+
+    def __init__(self, embedding_dim, num_heads, mlp_dim=2048, attention_downsample_rate=2, skip_first_layer_pe=False, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = dict(device=device, dtype=dtype)
+        self.skip_first_layer_pe = skip_first_layer_pe
+        self.self_attn = SAMAttention(embedding_dim, num_heads, operations=operations, **factory)
+        self.cross_attn_token_to_image = SAMAttention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate, operations=operations, **factory)
+        self.cross_attn_image_to_token = SAMAttention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate, operations=operations, **factory)
+        self.mlp = nn.Sequential(
+            operations.Linear(embedding_dim, mlp_dim, **factory),
+            nn.ReLU(),
+            operations.Linear(mlp_dim, embedding_dim, **factory),
+        )
+        self.norm1 = operations.LayerNorm(embedding_dim, **factory)
+        self.norm2 = operations.LayerNorm(embedding_dim, **factory)
+        self.norm3 = operations.LayerNorm(embedding_dim, **factory)
+        self.norm4 = operations.LayerNorm(embedding_dim, **factory)
+
+    def forward(self, queries, keys, query_pe, key_pe):
+        # Stage 1: self-attention over the queries.
+        if self.skip_first_layer_pe:
+            queries = self.norm1(self.self_attn(queries, queries, queries))
+        else:
+            q_with_pe = queries + query_pe
+            queries = self.norm1(queries + self.self_attn(q_with_pe, q_with_pe, queries))
+
+        # Stage 2: queries attend to keys.
+        token_to_image = self.cross_attn_token_to_image(queries + query_pe, keys + key_pe, keys)
+        queries = self.norm2(queries + token_to_image)
+
+        # Stage 3: MLP on the queries.
+        queries = self.norm3(queries + self.mlp(queries))
+
+        # Stage 4: keys attend to the updated queries.
+        q_with_pe = queries + query_pe
+        k_with_pe = keys + key_pe
+        keys = self.norm4(keys + self.cross_attn_image_to_token(k_with_pe, q_with_pe, queries))
+        return queries, keys
+
+
+class TwoWayTransformer(nn.Module):
+    """``depth`` two-way attention blocks plus a final query->key attention.
+
+    Only the first block is built with ``skip_first_layer_pe=True``.
+    """
+
+    def __init__(self, depth=2, embedding_dim=256, num_heads=8, mlp_dim=2048, attention_downsample_rate=2, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = dict(device=device, dtype=dtype, operations=operations)
+        blocks = []
+        for idx in range(depth):
+            blocks.append(TwoWayAttentionBlock(
+                embedding_dim, num_heads, mlp_dim, attention_downsample_rate,
+                skip_first_layer_pe=(idx == 0), **factory))
+        self.layers = nn.ModuleList(blocks)
+        self.final_attn_token_to_image = SAMAttention(
+            embedding_dim, num_heads, downsample_rate=attention_downsample_rate, **factory)
+        self.norm_final = operations.LayerNorm(embedding_dim, device=device, dtype=dtype)
+
+    def forward(self, image_embedding, image_pe, point_embedding):
+        queries = point_embedding
+        keys = image_embedding
+        # The original point embedding doubles as the query positional encoding.
+        for block in self.layers:
+            queries, keys = block(queries, keys, point_embedding, image_pe)
+        final_attn = self.final_attn_token_to_image(queries + point_embedding, keys + image_pe, keys)
+        queries = self.norm_final(queries + final_attn)
+        return queries, keys
+
+
+class PositionEmbeddingRandom(nn.Module):
+    """Positional encoding from sin/cos of a random Gaussian projection of (x, y)."""
+
+    def __init__(self, num_pos_feats=64, scale=None):
+        super().__init__()
+        gaussian = torch.randn(2, num_pos_feats)
+        # A missing (or falsy) scale falls back to 1.0.
+        self.register_buffer("positional_encoding_gaussian_matrix", (scale or 1.0) * gaussian)
+
+    def _encode(self, normalized_coords):
+        """Encode coordinates in [0, 1] (last dim = x, y). Math runs in fp32 and
+        the result is cast back to the input dtype."""
+        out_dtype = normalized_coords.dtype
+        gaussian = self.positional_encoding_gaussian_matrix.to(
+            device=normalized_coords.device, dtype=torch.float32)
+        centered = 2 * normalized_coords.float() - 1  # [0, 1] -> [-1, 1]
+        projected = 2 * math.pi * centered @ gaussian
+        features = torch.cat([projected.sin(), projected.cos()], dim=-1)
+        return features.to(out_dtype)
+
+    def forward(self, size, device=None):
+        """Return the encoding of an (h, w) grid of cell centres as [1, C, h, w]."""
+        h, w = size
+        if device is None:
+            device = self.positional_encoding_gaussian_matrix.device
+        xs = (torch.arange(w, device=device, dtype=torch.float32) + 0.5) / w
+        ys = (torch.arange(h, device=device, dtype=torch.float32) + 0.5) / h
+        grid_xy = torch.stack([xs.unsqueeze(0).expand(h, w), ys.unsqueeze(1).expand(h, w)], dim=-1)
+        return self._encode(grid_xy).permute(2, 0, 1).unsqueeze(0)
+
+    def forward_with_coords(self, pixel_coords, image_size):
+        """Encode pixel coordinates [B, N, 2] (x, y); ``image_size`` is (height, width)."""
+        normalized = pixel_coords.clone()  # keep the caller's tensor untouched
+        normalized[:, :, 0].div_(image_size[1])
+        normalized[:, :, 1].div_(image_size[0])
+        return self._encode(normalized)
+
+
+# ViTDet backbone + FPN neck
+
+def window_partition(x: torch.Tensor, window_size: int):
+    """Split a [B, H, W, C] map into non-overlapping square windows.
+
+    The map is zero-padded on the bottom/right up to a multiple of
+    ``window_size``. Returns ``(windows, (Hp, Wp))`` where ``windows`` is
+    [B * num_windows, window_size, window_size, C] and (Hp, Wp) is the padded size.
+    """
+    B, H, W, C = x.shape
+    extra_h = -H % window_size
+    extra_w = -W % window_size
+    if extra_h > 0 or extra_w > 0:
+        # F.pad takes pairs from the last dim backwards: C, then W, then H.
+        x = F.pad(x, (0, 0, 0, extra_w, 0, extra_h))
+    Hp = H + extra_h
+    Wp = W + extra_w
+    rows, cols = Hp // window_size, Wp // window_size
+    tiled = x.view(B, rows, window_size, cols, window_size, C)
+    windows = tiled.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
+    return windows, (Hp, Wp)
+
+
+def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw, hw):
+    """Inverse of ``window_partition``.
+
+    Reassembles [B * num_windows, window_size, window_size, C] windows into a
+    [B, H, W, C] map. ``pad_hw`` is the padded (Hp, Wp) size and ``hw`` the
+    original (H, W); any padding is cropped away.
+    """
+    Hp, Wp = pad_hw
+    H, W = hw
+    windows_per_image = Hp * Wp // window_size // window_size
+    B = windows.shape[0] // windows_per_image
+    rows, cols = Hp // window_size, Wp // window_size
+    tiled = windows.view(B, rows, cols, window_size, window_size, -1)
+    full = tiled.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
+    if Hp > H or Wp > W:
+        full = full[:, :H, :W, :].contiguous()
+    return full
+
+
+def rope_2d(end_x: int, end_y: int, dim: int, theta: float = 10000.0, scale_pos: float = 1.0):
+    """Build 2D axial RoPE tables with flux ``EmbedND``. Returns [1, 1, HW, dim//2, 2, 2].
+
+    Positions are enumerated row-major over an ``end_x`` x ``end_y`` grid; both
+    coordinates are multiplied by ``scale_pos`` and each axis gets ``dim // 2``
+    rotary dimensions.
+    """
+    flat_idx = torch.arange(end_x * end_y, dtype=torch.float32)
+    x_pos = (flat_idx % end_x) * scale_pos
+    y_pos = torch.div(flat_idx, end_x, rounding_mode="floor") * scale_pos
+    ids = torch.stack([x_pos, y_pos], dim=-1).unsqueeze(0)
+    embedder = EmbedND(dim=dim, theta=theta, axes_dim=[dim // 2, dim // 2])
+    return embedder(ids)
+
+
+class _ViTMLP(nn.Module):
+    """Two-layer GELU feed-forward block with hidden width ``int(dim * mlp_ratio)``."""
+
+    def __init__(self, dim, mlp_ratio=4.0, device=None, dtype=None, operations=None):
+        super().__init__()
+        hidden_dim = int(dim * mlp_ratio)
+        factory = dict(device=device, dtype=dtype)
+        self.fc1 = operations.Linear(dim, hidden_dim, **factory)
+        self.act = nn.GELU()
+        self.fc2 = operations.Linear(hidden_dim, dim, **factory)
+
+    def forward(self, x):
+        hidden = self.fc1(x)
+        hidden = self.act(hidden)
+        return self.fc2(hidden)
+
+
+class Attention(nn.Module):
+    """ViTDet multi-head self-attention using one fused QKV projection.
+
+    RoPE is applied to q and k only when the module was built with
+    ``use_rope=True`` and ``freqs_cis`` is passed to ``forward``.
+    """
+
+    def __init__(self, dim, num_heads=8, qkv_bias=True, use_rope=False, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.use_rope = use_rope
+        factory = dict(device=device, dtype=dtype)
+        self.qkv = operations.Linear(dim, dim * 3, bias=qkv_bias, **factory)
+        self.proj = operations.Linear(dim, dim, **factory)
+
+    def forward(self, x, freqs_cis=None):
+        B, N, _ = x.shape
+        fused = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim)
+        # Split into q/k/v, each laid out as [B, heads, N, head_dim].
+        q, k, v = (part.transpose(1, 2) for part in fused.unbind(dim=2))
+        rope_enabled = self.use_rope and freqs_cis is not None
+        if rope_enabled:
+            q, k = apply_rope(q, k, freqs_cis)
+        attended = optimized_attention(q, k, v, self.num_heads, skip_reshape=True, low_precision_attention=False)
+        return self.proj(attended)
+
+
+class Block(nn.Module):
+    """Pre-norm transformer block operating on [B, H, W, C] maps.
+
+    With ``window_size > 0`` attention runs inside non-overlapping windows;
+    with ``window_size == 0`` it runs over all H*W tokens.
+    """
+
+    def __init__(self, dim, num_heads, mlp_ratio=4.0, qkv_bias=True, window_size=0, use_rope=False, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = dict(device=device, dtype=dtype)
+        self.window_size = window_size
+        self.norm1 = operations.LayerNorm(dim, **factory)
+        self.attn = Attention(dim, num_heads, qkv_bias, use_rope, operations=operations, **factory)
+        self.norm2 = operations.LayerNorm(dim, **factory)
+        self.mlp = _ViTMLP(dim, mlp_ratio, operations=operations, **factory)
+
+    def forward(self, x, freqs_cis=None):
+        residual = x
+        normed = self.norm1(x)
+        ws = self.window_size
+        if ws > 0:
+            H, W = normed.shape[1], normed.shape[2]
+            windows, pad_hw = window_partition(normed, ws)
+            tokens = windows.view(windows.shape[0], ws * ws, -1)
+            tokens = self.attn(tokens, freqs_cis=freqs_cis)
+            windows = tokens.view(-1, ws, ws, tokens.shape[-1])
+            attended = window_unpartition(windows, ws, pad_hw, (H, W))
+        else:
+            B, H, W, C = normed.shape
+            tokens = self.attn(normed.view(B, H * W, C), freqs_cis=freqs_cis)
+            attended = tokens.view(B, H, W, C)
+        x = residual + attended
+        return x + self.mlp(self.norm2(x))
+
+
+class PatchEmbed(nn.Module):
+    """Patchify an image with a bias-free convolution whose kernel size and
+    stride both equal ``patch_size``."""
+
+    def __init__(self, patch_size=14, in_chans=3, embed_dim=1024, device=None, dtype=None, operations=None):
+        super().__init__()
+        conv_cfg = dict(kernel_size=patch_size, stride=patch_size, bias=False)
+        self.proj = operations.Conv2d(in_chans, embed_dim, device=device, dtype=dtype, **conv_cfg)
+
+    def forward(self, x):
+        patches = self.proj(x)
+        return patches
+
+
+class ViTDet(nn.Module):
+    """ViT trunk that mixes windowed and global attention blocks.
+
+    ``forward`` runs: patch embedding -> additive position embedding ->
+    ``ln_pre`` -> ``depth`` transformer blocks. Blocks whose index is listed in
+    ``global_att_blocks`` are built with ``window_size=0`` (attention over all
+    tokens); every other block attends inside ``window_size`` x ``window_size``
+    windows. Extra ``**kwargs`` are accepted but not used by this constructor.
+    """
+
+    def __init__(self, img_size=1008, patch_size=14, embed_dim=1024, depth=32, num_heads=16, mlp_ratio=4.625, qkv_bias=True, window_size=24,
+                 global_att_blocks=(7, 15, 23, 31), use_rope=True, pretrain_img_size=336, device=None, dtype=None, operations=None, **kwargs):
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.global_att_blocks = set(global_att_blocks)
+
+        self.patch_embed = PatchEmbed(patch_size, 3, embed_dim, device=device, dtype=dtype, operations=operations)
+
+        # The position table is sized for the pretraining grid; _get_pos_embed
+        # tiles it when the runtime grid is a different size.
+        num_patches = (pretrain_img_size // patch_size) ** 2 + 1  # +1 for cls token
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim, device=device, dtype=dtype))
+
+        self.ln_pre = operations.LayerNorm(embed_dim, device=device, dtype=dtype)
+
+        grid_size = img_size // patch_size
+        pretrain_grid = pretrain_img_size // patch_size
+
+        self.blocks = nn.ModuleList()
+        for i in range(depth):
+            is_global = i in self.global_att_blocks
+            self.blocks.append(Block(
+                embed_dim, num_heads, mlp_ratio, qkv_bias,
+                window_size=0 if is_global else window_size,
+                use_rope=use_rope,
+                device=device, dtype=dtype, operations=operations,
+            ))
+
+        if use_rope:
+            # Global blocks: positions of the full grid are rescaled by
+            # pretrain_grid / grid_size. Window blocks: unscaled positions of a
+            # single window. Both tables are non-persistent buffers, so they
+            # are not written to the state dict.
+            rope_scale = pretrain_grid / grid_size
+            self.register_buffer("freqs_cis", rope_2d(grid_size, grid_size, embed_dim // num_heads, scale_pos=rope_scale), persistent=False)
+            self.register_buffer("freqs_cis_window", rope_2d(window_size, window_size, embed_dim // num_heads), persistent=False)
+        else:
+            self.freqs_cis = None
+            self.freqs_cis_window = None
+
+    def _get_pos_embed(self, num_tokens):
+        """Return a position table with ``num_tokens`` entries (cls slot first).
+
+        When the stored table already has ``num_tokens`` entries it is returned
+        unchanged. Otherwise the spatial part is viewed as a square grid, tiled
+        (repeated, not interpolated) and cropped to ``new_size`` x ``new_size``,
+        where ``new_size = int(sqrt(num_tokens - 1))``; the result therefore
+        describes a square grid.
+        """
+        pos = self.pos_embed
+        if pos.shape[1] == num_tokens:
+            return pos
+        cls_pos = pos[:, :1]
+        spatial_pos = pos[:, 1:]
+        old_size = int(math.sqrt(spatial_pos.shape[1]))
+        new_size = int(math.sqrt(num_tokens - 1)) if num_tokens > 1 else old_size
+        spatial_2d = spatial_pos.reshape(1, old_size, old_size, -1).permute(0, 3, 1, 2)
+        # One extra tile per axis guarantees the crop below has enough rows/cols.
+        tiles_h = new_size // old_size + 1
+        tiles_w = new_size // old_size + 1
+        tiled = spatial_2d.tile([1, 1, tiles_h, tiles_w])[:, :, :new_size, :new_size]
+        tiled = tiled.permute(0, 2, 3, 1).reshape(1, new_size * new_size, -1)
+        return torch.cat([cls_pos, tiled], dim=1)
+
+    def forward(self, x):
+        """Run the trunk on ``x`` and return a channels-first [B, C, Hp, Wp] map,
+        where (Hp, Wp) is the patch grid produced by ``patch_embed``."""
+        x = self.patch_embed(x)
+        B, C, Hp, Wp = x.shape
+        x = x.permute(0, 2, 3, 1).reshape(B, Hp * Wp, C)
+
+        pos = cast_to_input(self._get_pos_embed(Hp * Wp + 1), x)
+        # Index 0 is the cls slot, which has no matching patch token; skip it.
+        x = x + pos[:, 1:Hp * Wp + 1]
+
+        x = x.view(B, Hp, Wp, C)
+        x = self.ln_pre(x)
+
+        freqs_cis_global = self.freqs_cis
+        freqs_cis_win = self.freqs_cis_window
+        if freqs_cis_global is not None:
+            freqs_cis_global = cast_to_input(freqs_cis_global, x)
+        if freqs_cis_win is not None:
+            freqs_cis_win = cast_to_input(freqs_cis_win, x)
+
+        for block in self.blocks:
+            # Windowed blocks get the per-window table, global blocks the full-grid one.
+            fc = freqs_cis_win if block.window_size > 0 else freqs_cis_global
+            x = block(x, freqs_cis=fc)
+
+        return x.permute(0, 3, 1, 2)
+
+
+class FPNScaleConv(nn.Module):
+    """One FPN branch: rescale the input map by ``scale``, then 1x1 and 3x3 convs.
+
+    Supported scales:
+      * 4.0 - two stride-2 transposed convs with a GELU in between
+      * 2.0 - one stride-2 transposed conv
+      * 1.0 - no resampling
+      * 0.5 - 2x2 max pooling
+
+    Raises:
+        ValueError: if ``scale`` is not one of the supported values. Without
+            this check ``proj_in`` would be left unbound and construction
+            would fail with an unrelated-looking UnboundLocalError.
+    """
+
+    def __init__(self, in_dim, out_dim, scale, device=None, dtype=None, operations=None):
+        super().__init__()
+        if scale == 4.0:
+            self.dconv_2x2_0 = operations.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2, device=device, dtype=dtype)
+            self.dconv_2x2_1 = operations.ConvTranspose2d(in_dim // 2, in_dim // 4, kernel_size=2, stride=2, device=device, dtype=dtype)
+            proj_in = in_dim // 4
+        elif scale == 2.0:
+            self.dconv_2x2 = operations.ConvTranspose2d(in_dim, in_dim // 2, kernel_size=2, stride=2, device=device, dtype=dtype)
+            proj_in = in_dim // 2
+        elif scale == 1.0:
+            proj_in = in_dim
+        elif scale == 0.5:
+            self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
+            proj_in = in_dim
+        else:
+            raise ValueError(f"Unsupported FPN scale {scale!r}; expected one of 4.0, 2.0, 1.0 or 0.5")
+        self.scale = scale
+        self.conv_1x1 = operations.Conv2d(proj_in, out_dim, kernel_size=1, device=device, dtype=dtype)
+        self.conv_3x3 = operations.Conv2d(out_dim, out_dim, kernel_size=3, padding=1, device=device, dtype=dtype)
+
+    def forward(self, x):
+        if self.scale == 4.0:
+            x = F.gelu(self.dconv_2x2_0(x))
+            x = self.dconv_2x2_1(x)
+        elif self.scale == 2.0:
+            x = self.dconv_2x2(x)
+        elif self.scale == 0.5:
+            x = self.pool(x)
+        # scale == 1.0 needs no resampling before the projections.
+        x = self.conv_1x1(x)
+        x = self.conv_3x3(x)
+        return x
+
+
+class PositionEmbeddingSine(nn.Module):
+    """2D sinusoidal position encoding (DETR-style); grid encodings are cached
+    per (height, width, device)."""
+
+    def __init__(self, num_pos_feats=256, temperature=10000.0, normalize=True, scale=None):
+        super().__init__()
+        assert num_pos_feats % 2 == 0
+        self.half_dim = num_pos_feats // 2
+        self.temperature = temperature
+        self.normalize = normalize
+        self.scale = 2 * math.pi if scale is None else scale
+        self._cache = {}
+
+    def _freqs(self, device):
+        """Per-channel divisors; channels 2k and 2k+1 share one frequency."""
+        channel = torch.arange(self.half_dim, dtype=torch.float32, device=device)
+        return self.temperature ** (2 * (channel // 2) / self.half_dim)
+
+    def _sincos(self, vals):
+        """Map values of any shape to interleaved sin/cos features along a new
+        trailing dim of size ``half_dim``."""
+        angles = vals[..., None] * self.scale / self._freqs(vals.device)
+        pairs = torch.stack((angles[..., 0::2].sin(), angles[..., 1::2].cos()), dim=-1)
+        return pairs.flatten(-2)
+
+    def _encode_xy(self, x, y):
+        """Encode 1D coordinate tensors x and y. Returns (pos_x, pos_y), each [N, half_dim]."""
+        freqs = self._freqs(x.device)
+        angle_x = x[:, None] * self.scale / freqs
+        angle_y = y[:, None] * self.scale / freqs
+        pos_x = torch.stack((angle_x[:, 0::2].sin(), angle_x[:, 1::2].cos()), dim=2).flatten(1)
+        pos_y = torch.stack((angle_y[:, 0::2].sin(), angle_y[:, 1::2].cos()), dim=2).flatten(1)
+        return pos_x, pos_y
+
+    def encode_boxes(self, cx, cy, w, h):
+        """Encode box centres and sizes as [N, num_pos_feats + 2], laid out as
+        (pos_y, pos_x, h, w)."""
+        pos_x, pos_y = self._encode_xy(cx, cy)
+        extents = (h[:, None], w[:, None])
+        return torch.cat((pos_y, pos_x) + extents, dim=1)
+
+    def forward(self, x):
+        """Return the [B, num_pos_feats, H, W] encoding for a [B, C, H, W] input;
+        only the shape and device of ``x`` are used."""
+        B, _, H, W = x.shape
+        key = (H, W, x.device)
+        cached = self._cache.get(key)
+        if cached is None:
+            rows = torch.arange(H, dtype=torch.float32, device=x.device)
+            cols = torch.arange(W, dtype=torch.float32, device=x.device)
+            if self.normalize:
+                rows = rows / (H - 1 + 1e-6)
+                cols = cols / (W - 1 + 1e-6)
+            yy, xx = torch.meshgrid(rows, cols, indexing="ij")
+            encoded = torch.cat((self._sincos(yy), self._sincos(xx)), dim=-1)
+            cached = encoded.permute(2, 0, 1).unsqueeze(0)
+            self._cache[key] = cached
+        return cached.expand(B, -1, -1, -1)
+
+
+class SAM3VisionBackbone(nn.Module):
+    """ViTDet trunk plus FPN necks for the detector and for the tracker.
+
+    ``convs`` is the detector neck. The tracker necks depend on ``multiplex``:
+      * multiplex: ``propagation_convs`` and ``interactive_convs``
+        (scales 4.0, 2.0, 1.0)
+      * otherwise: ``sam2_convs`` (scales 4.0, 2.0, 1.0, 0.5)
+    Extra ``**kwargs`` are forwarded to ``ViTDet``.
+    """
+
+    def __init__(self, embed_dim=1024, d_model=256, multiplex=False, device=None, dtype=None, operations=None, **kwargs):
+        super().__init__()
+        self.trunk = ViTDet(embed_dim=embed_dim, device=device, dtype=dtype, operations=operations, **kwargs)
+        self.position_encoding = PositionEmbeddingSine(num_pos_feats=d_model, normalize=True)
+        self.multiplex = multiplex
+
+        fpn_args = dict(device=device, dtype=dtype, operations=operations)
+        if multiplex:
+            scales = [4.0, 2.0, 1.0]
+            self.convs = nn.ModuleList([FPNScaleConv(embed_dim, d_model, s, **fpn_args) for s in scales])
+            self.propagation_convs = nn.ModuleList([FPNScaleConv(embed_dim, d_model, s, **fpn_args) for s in scales])
+            self.interactive_convs = nn.ModuleList([FPNScaleConv(embed_dim, d_model, s, **fpn_args) for s in scales])
+        else:
+            scales = [4.0, 2.0, 1.0, 0.5]
+            self.convs = nn.ModuleList([FPNScaleConv(embed_dim, d_model, s, **fpn_args) for s in scales])
+            self.sam2_convs = nn.ModuleList([FPNScaleConv(embed_dim, d_model, s, **fpn_args) for s in scales])
+
+    def forward(self, images, need_tracker=False, tracker_mode=None, cached_trunk=None, tracker_only=False):
+        """Compute FPN features and their position encodings.
+
+        Returns ``(features, positions, tracker_features, tracker_positions)``;
+        each entry is a list with one tensor per FPN scale, or None when that
+        half was not computed.
+
+        Args:
+            images: input passed to the trunk; ignored when ``cached_trunk`` is given.
+            need_tracker: non-multiplex only - also compute the ``sam2_convs`` features.
+            tracker_mode: multiplex only - "propagation" or "interactive".
+            cached_trunk: precomputed trunk output to reuse instead of running the trunk.
+            tracker_only: skip the detector neck; the first two return values are None.
+
+        Note: the two paths treat ``tracker_mode`` differently in multiplex
+        mode. With ``tracker_only`` any value other than "propagation"
+        (including None) selects ``interactive_convs``; without it, any value
+        other than "propagation"/"interactive" yields no tracker features.
+        """
+        backbone_out = cached_trunk if cached_trunk is not None else self.trunk(images)
+
+        if tracker_only:
+            # Skip detector FPN when only tracker features are needed (video tracking)
+            if self.multiplex:
+                tracker_convs = self.propagation_convs if tracker_mode == "propagation" else self.interactive_convs
+            else:
+                tracker_convs = self.sam2_convs
+            tracker_features = [conv(backbone_out) for conv in tracker_convs]
+            tracker_positions = [cast_to_input(self.position_encoding(f), f) for f in tracker_features]
+            return None, None, tracker_features, tracker_positions
+
+        features = [conv(backbone_out) for conv in self.convs]
+        positions = [cast_to_input(self.position_encoding(f), f) for f in features]
+
+        if self.multiplex:
+            if tracker_mode == "propagation":
+                tracker_convs = self.propagation_convs
+            elif tracker_mode == "interactive":
+                tracker_convs = self.interactive_convs
+            else:
+                return features, positions, None, None
+        elif need_tracker:
+            tracker_convs = self.sam2_convs
+        else:
+            return features, positions, None, None
+
+        tracker_features = [conv(backbone_out) for conv in tracker_convs]
+        tracker_positions = [cast_to_input(self.position_encoding(f), f) for f in tracker_features]
+        return features, positions, tracker_features, tracker_positions
diff --git a/comfy/ldm/sam3/tracker.py b/comfy/ldm/sam3/tracker.py
new file mode 100644
index 000000000..8456e90a6
--- /dev/null
+++ b/comfy/ldm/sam3/tracker.py
@@ -0,0 +1,1802 @@
+# SAM3 video tracker: memory encoder, memory attention, SAM mask decoder/prompt encoder.
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from tqdm import tqdm
+
+try:
+    import cv2
+    _HAS_CV2 = True
+except ImportError:
+    from scipy import ndimage
+    _HAS_CV2 = False
+
+import comfy.model_management
+import comfy.utils
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.sam3.sam import rope_2d, PositionEmbeddingSine
+from comfy.ops import cast_to_input
+from comfy.ldm.flux.math import apply_rope1
+from comfy.ldm.cascade.common import LayerNorm2d_op
+from comfy.ldm.sam3.sam import MLP, PositionEmbeddingRandom
+from comfy.ldm.sam3.sam import TwoWayTransformer as SAMTwoWayTransformer
+
+NO_OBJ_SCORE = -1024.0
+
+
+def to_spatial(x, H, W):
+    """Turn a token sequence (B, H*W, C) into a channels-first map (B, C, H, W)."""
+    batch = x.shape[0]
+    grid = x.view(batch, H, W, -1)
+    return grid.movedim(-1, 1)
+
+class MultiplexState:
+    """Bookkeeping that packs objects into fixed-size buckets of slots.
+
+    Objects fill ``ceil(num_objects / multiplex_count)`` buckets of
+    ``multiplex_count`` slots each. ``mux`` / ``demux`` convert between the
+    per-object layout and the bucketed layout with 0/1 assignment matrices.
+    """
+
+    def __init__(self, num_objects, multiplex_count, device, dtype):
+        self.multiplex_count = multiplex_count
+        self.device = device
+        self.dtype = dtype
+        self._build(num_objects)
+
+    def mux(self, x):
+        """[N_obj, ...] -> [num_buckets, multiplex_count, ...]; unused slots are zero."""
+        bucketed_shape = (self.num_buckets, self.multiplex_count) + x.shape[1:]
+        assign = self.mux_matrix.to(device=x.device, dtype=x.dtype)
+        per_object = x.reshape(self.total_valid_entries, -1)
+        return (assign @ per_object).view(bucketed_shape)
+
+    def demux(self, x):
+        """[num_buckets, multiplex_count, ...] -> [N_obj, ...]"""
+        object_shape = (self.total_valid_entries,) + x.shape[2:]
+        select = self.demux_matrix.to(device=x.device, dtype=x.dtype)
+        per_slot = x.reshape(self.num_buckets * self.multiplex_count, -1)
+        return (select @ per_slot).view(object_shape)
+
+    def get_valid_object_mask(self):
+        """Bool tensor [num_buckets, multiplex_count]; True where a slot holds an object."""
+        occupied = self.mux_matrix.sum(dim=1) > 0
+        return occupied.reshape(self.num_buckets, self.multiplex_count)
+
+    def _build(self, num_objects):
+        """(Re)create bucket counts and both assignment matrices for ``num_objects``."""
+        per_bucket = self.multiplex_count
+        self.num_buckets = (num_objects + per_bucket - 1) // per_bucket
+        self.total_valid_entries = num_objects
+        total_slots = self.num_buckets * per_bucket
+        self.mux_matrix = torch.zeros(total_slots, num_objects, device=self.device, dtype=self.dtype)
+        self.demux_matrix = torch.zeros(num_objects, total_slots, device=self.device, dtype=self.dtype)
+        object_ids = torch.arange(num_objects, device=self.device)
+        bucket = object_ids // per_bucket
+        lane = object_ids % per_bucket
+        slot_ids = bucket * per_bucket + lane
+        self.mux_matrix[slot_ids, object_ids] = 1.0
+        self.demux_matrix[object_ids, slot_ids] = 1.0
+
+    def add_objects(self, n_new):
+        """Rebuild the state with ``n_new`` additional objects."""
+        self._build(self.total_valid_entries + n_new)
+
+def _compute_mask_overlap(masks_a, masks_b):
+    """Pairwise overlap between two mask sets: the larger of IoU and IoM
+    (intersection over the smaller area). Values > 0 count as foreground.
+
+    Returns a [len(masks_a), len(masks_b)] tensor.
+    """
+    fg_a = (masks_a > 0).float().flatten(1)
+    fg_b = (masks_b > 0).float().flatten(1)
+    inter = fg_a @ fg_b.T
+    size_a = fg_a.sum(1, keepdim=True)
+    size_b = fg_b.sum(1, keepdim=True).T
+    # Clamp denominators to at least 1 so empty masks give 0 rather than NaN.
+    union = (size_a + size_b - inter).clamp(min=1)
+    smaller = torch.minimum(size_a, size_b).clamp(min=1)
+    return torch.maximum(inter / union, inter / smaller)
+
+
+def _nms_masks(masks, scores, thresh=0.5):
+    """Greedy mask NMS using the IoU/IoM overlap from ``_compute_mask_overlap``.
+
+    Masks are visited from highest to lowest score; a mask is dropped when its
+    overlap with any already-kept mask is >= ``thresh``.
+    Returns (kept_masks, kept_scores), both in descending score order.
+    """
+    ranking = torch.argsort(scores, descending=True)
+    ranked_masks = masks[ranking]
+    ranked_scores = scores[ranking]
+    kept = []
+    for idx in range(ranked_masks.shape[0]):
+        if kept:
+            kept_idx = torch.tensor(kept, device=ranked_masks.device)
+            overlap = _compute_mask_overlap(ranked_masks[idx:idx + 1], ranked_masks[kept_idx])
+            if overlap.max() >= thresh:
+                continue
+        kept.append(idx)
+    return ranked_masks[kept], ranked_scores[kept]
+
+
+def _get_connected_components(mask_bin):
+    """Label 8-connected components and report per-pixel component areas.
+
+    Args:
+        mask_bin: [B, 1, H, W] uint8 tensor; non-zero pixels are foreground.
+
+    Returns:
+        (labels, areas), both [B, 1, H, W] on ``mask_bin``'s device. ``areas``
+        is int32 and holds, for every foreground pixel, the pixel count of the
+        component it belongs to.
+
+    Both backends use 8-connectivity. ``scipy.ndimage.label`` defaults to
+    4-connectivity, so a full 3x3 structuring element is passed explicitly;
+    otherwise results would depend on whether OpenCV is installed.
+
+    Values at background pixels (label 0) still differ by backend: OpenCV
+    reports the background area there, the scipy path reports 0.
+    """
+    eight_connected = np.ones((3, 3), dtype=np.uint8)
+    labels_list, areas_list = [], []
+    for i in range(mask_bin.shape[0]):
+        m = mask_bin[i, 0].cpu().numpy()
+        if _HAS_CV2:
+            _, labeled, stats, _ = cv2.connectedComponentsWithStats(m, connectivity=8)
+            areas = stats[labeled, cv2.CC_STAT_AREA].astype('int32')
+        else:
+            labeled, _ = ndimage.label(m, structure=eight_connected)
+            # One histogram pass instead of a full-image scan per component.
+            areas = np.bincount(labeled.ravel())[labeled].astype(np.int32)
+            areas[labeled == 0] = 0
+        labels_list.append(torch.from_numpy(labeled).to(mask_bin.device))
+        areas_list.append(torch.from_numpy(areas).to(device=mask_bin.device, dtype=torch.int32))
+    return torch.stack(labels_list).unsqueeze(1), torch.stack(areas_list).unsqueeze(1)
+
+
+def fill_holes_in_mask_scores(mask, max_area=0):
+    """Clean mask scores using connected components.
+
+    Background components (score <= 0) no larger than ``max_area`` are filled
+    with a score of 0.1. Foreground components no larger than
+    min(max_area, half the total foreground area) are then set to -0.1.
+    With ``max_area <= 0`` the input is returned unchanged.
+    """
+    if max_area <= 0:
+        return mask
+
+    # Pass 1: fill small holes in the background.
+    background = (mask <= 0).to(torch.uint8)
+    bg_areas = _get_connected_components(background)[1]
+    holes = background.bool() & (bg_areas <= max_area)
+    mask = torch.where(holes, 0.1, mask)
+
+    # Pass 2: remove small foreground specks, computed on the hole-filled mask.
+    foreground = (mask > 0).to(torch.uint8)
+    fg_total = foreground.sum(dim=(2, 3), keepdim=True, dtype=torch.int32)
+    speck_limit = torch.div(fg_total, 2, rounding_mode="floor").clamp(max=max_area)
+    fg_areas = _get_connected_components(foreground)[1]
+    specks = foreground.bool() & (fg_areas <= speck_limit)
+    return torch.where(specks, -0.1, mask)
+
+
+def apply_rope_memory(q, k, freqs, num_heads, num_k_exclude_rope=0):
+    """Apply 2D axial RoPE to memory attention using flux rope format.
+
+    Args:
+        q: [B, Nq, C] projected queries (current frame features)
+        k: [B, Nk, C] projected keys (memory tokens)
+        freqs: [1, 1, Nq, dim//2, 2, 2] flux-format rotation matrices for one
+            frame (dim 1 broadcasts over heads, dim 2 indexes spatial positions)
+        num_heads: number of attention heads
+        num_k_exclude_rope: number of trailing k tokens to skip RoPE (object pointers)
+
+    Returns:
+        (q, k) with the same shapes as the inputs. ``k`` is cloned before being
+        written, so the caller's tensor is not modified; when no key token
+        receives RoPE the original ``k`` is returned as is.
+    """
+    B, Nq, C = q.shape
+    head_dim = C // num_heads
+
+    # freqs shape: [1, 1, Nq, dim//2, 2, 2] (heads broadcast dim already included)
+    q_h = q.view(B, Nq, num_heads, head_dim).transpose(1, 2)
+    q_h = apply_rope1(q_h, freqs)
+    q = q_h.transpose(1, 2).reshape(B, Nq, C)
+
+    # Apply RoPE to k (excluding last num_k_exclude_rope tokens)
+    Nk = k.shape[1]
+    num_k_rope = Nk - num_k_exclude_rope
+    if num_k_rope > 0:
+        # Repeat freqs for multiple frames of spatial memory
+        Nf = freqs.shape[2]  # spatial positions in one frame
+        if num_k_rope > Nf:
+            # Tile the per-frame table (ceil division), then trim to the key count.
+            r = (num_k_rope + Nf - 1) // Nf
+            pe_k = freqs.repeat(1, 1, r, 1, 1, 1)[:, :, :num_k_rope]
+        else:
+            pe_k = freqs[:, :, :num_k_rope]
+
+        k_h = k[:, :num_k_rope].view(B, num_k_rope, num_heads, head_dim).transpose(1, 2)
+        k_h = apply_rope1(k_h, pe_k)
+        # Clone so the in-place slice assignment does not touch the caller's tensor.
+        k = k.clone()
+        k[:, :num_k_rope] = k_h.transpose(1, 2).reshape(B, num_k_rope, C)
+
+    return q, k
+
+
+def get_1d_sine_pe(pos_inds, dim, temperature=10000):
+    """1D sinusoidal encoding of ``pos_inds``.
+
+    Returns a tensor with a new trailing dim of size ``2 * (dim // 2)``:
+    all sine terms first, then all cosine terms.
+    """
+    half = dim // 2
+    channel = torch.arange(half, dtype=torch.float32, device=pos_inds.device)
+    exponent = 2 * torch.div(channel, 2, rounding_mode="floor") / half
+    divisors = temperature ** exponent
+    angles = pos_inds.unsqueeze(-1) / divisors
+    return torch.cat([angles.sin(), angles.cos()], dim=-1)
+
+
+def _pad_to_buckets(tensor, target_buckets):
+    """Zero-pad ``tensor`` along dim 0 up to ``target_buckets`` rows.
+
+    Tensors that already have at least ``target_buckets`` rows are returned
+    as is (same object, never truncated).
+    """
+    missing = target_buckets - tensor.shape[0]
+    if missing <= 0:
+        return tensor
+    filler = tensor.new_zeros((missing,) + tensor.shape[1:])
+    return torch.cat([tensor, filler], dim=0)
+
+
+def pack_masks(masks):
+    """Bit-pack masks [*, H, W] into uint8 [*, H, W // 8]; W must be divisible by 8.
+
+    A value > 0 sets the bit. Within each group of 8 pixels, the first pixel
+    maps to the least-significant bit.
+    """
+    bit_values = torch.tensor([1, 2, 4, 8, 16, 32, 64, 128], device=masks.device)
+    foreground = masks > 0
+    groups = foreground.view(*masks.shape[:-1], -1, 8)
+    return (groups * bit_values).sum(-1).byte()
+
+
+def unpack_masks(packed):
+    """Expand bit-packed uint8 [*, H, Wp] into bool [*, H, Wp * 8].
+
+    Bit i of each byte (least-significant first) becomes pixel i of its group
+    of 8, which is the inverse of ``pack_masks``.
+    """
+    bit_values = (2 ** torch.arange(8, device=packed.device)).to(torch.uint8)
+    per_bit = (packed.unsqueeze(-1) & bit_values) != 0
+    return per_bit.view(*packed.shape[:-1], -1)
+
+
+def _prep_frame(images, idx, device, dt, size):
+    """Select frame(s) ``images[idx]``, move them to ``device`` in dtype ``dt``,
+    and resize to ``size`` x ``size``.
+
+    Resizing uses ``comfy.utils.common_upscale`` with bicubic interpolation and
+    ``crop="disabled"``, so the aspect ratio is not preserved. This function
+    needs ``comfy.utils`` to be imported explicitly at module level instead of
+    relying on another module having loaded it.
+    """
+    frame = images[idx].to(device=device, dtype=dt)
+    return comfy.utils.common_upscale(frame, size, size, "bicubic", crop="disabled")
+
+
+def _compute_backbone(backbone_fn, frame, frame_idx=None):
+    """Run ``backbone_fn`` on one frame and flatten its outputs to token form.
+
+    ``backbone_fn(frame, frame_idx=...)`` must return (features, positions,
+    trunk_out), where the first two are lists of [B, C, H, W] tensors.
+
+    Returns (vision_feats, vision_pos, feat_sizes, features, trunk_out):
+    the [B, H*W, C] token views of features and positions, the (H, W) size of
+    each feature level, and the untouched ``features`` list and ``trunk_out``.
+    """
+    features, positions, trunk_out = backbone_fn(frame, frame_idx=frame_idx)
+
+    def as_tokens(maps):
+        return [m.flatten(2).transpose(1, 2) for m in maps]
+
+    feat_sizes = [tuple(level.shape[-2:]) for level in features]
+    return as_tokens(features), as_tokens(positions), feat_sizes, features, trunk_out
+
+
+def collect_memory_tokens(output_dict, frame_idx, num_maskmem, maskmem_tpos_enc, device,
+                          collect_image_feats=False, tpos_v2=False, num_buckets=None):
+    """Collect spatial memory, position encodings, and optionally image features from past frames.
+
+    Args:
+        output_dict: holds "cond_frame_outputs" and "non_cond_frame_outputs",
+            each mapping a frame index to that frame's stored outputs.
+        frame_idx: index of the frame currently being processed.
+        num_maskmem: number of temporal memory slots; also bounds the look-back window.
+        maskmem_tpos_enc: temporal position encodings, indexed by slot.
+        device: device the stored tensors are moved to.
+        collect_image_feats: also gather "image_features" / "image_pos_enc"
+            from entries that contain them.
+        tpos_v2: pick the slot of conditioning frames from their distance to
+            ``frame_idx`` instead of always using the last slot.
+        num_buckets: when given, memory features and encodings are zero-padded
+            along dim 0 up to this many buckets.
+
+    Returns:
+        (to_cat_memory, to_cat_memory_pos, to_cat_image_feat, to_cat_image_pos,
+        cond_outputs) - four lists of per-frame tensors, conditioning frames
+        first, plus the conditioning-frame dict itself.
+    """
+    to_cat_memory, to_cat_memory_pos = [], []
+    to_cat_image_feat, to_cat_image_pos = [], []
+
+    def _append(out, tpos_idx):
+        # Flatten [N, C, H, W]-style maps to token layout [N, H*W, C].
+        feats = out["maskmem_features"].to(device)
+        if num_buckets is not None:
+            feats = _pad_to_buckets(feats, num_buckets)
+        to_cat_memory.append(feats.flatten(2).permute(0, 2, 1))
+        # Only the last entry of the stored position encodings is used.
+        enc = out["maskmem_pos_enc"][-1].to(device).flatten(2).permute(0, 2, 1)
+        if num_buckets is not None:
+            enc = _pad_to_buckets(enc, num_buckets)
+        tpos = cast_to_input(maskmem_tpos_enc[tpos_idx], enc)
+        to_cat_memory_pos.append(enc + tpos)
+        if collect_image_feats and "image_features" in out:
+            to_cat_image_feat.append(out["image_features"].to(device))
+            to_cat_image_pos.append(out["image_pos_enc"].to(device) + tpos)
+
+    cond_outputs = output_dict["cond_frame_outputs"]
+    for t, out in cond_outputs.items():
+        if tpos_v2:
+            # Frames 1..num_maskmem-1 steps in the past get a distance-based
+            # slot; everything else (including future frames) uses the last slot.
+            t_pos = frame_idx - t
+            tpos_idx = num_maskmem - t_pos - 1 if 0 < t_pos < num_maskmem else num_maskmem - 1
+        else:
+            tpos_idx = num_maskmem - 1
+        _append(out, tpos_idx)
+
+    # Walk the previous num_maskmem - 1 frames from oldest to newest.
+    for t_pos in range(1, num_maskmem):
+        out = output_dict["non_cond_frame_outputs"].get(frame_idx - (num_maskmem - t_pos), None)
+        if out is None or out.get("maskmem_features") is None:
+            continue
+        _append(out, num_maskmem - t_pos - 1)
+
+    return to_cat_memory, to_cat_memory_pos, to_cat_image_feat, to_cat_image_pos, cond_outputs
+
+
+def compute_tpos_enc(rel_pos_list, device, d_model, proj_layer, dtype=None, max_abs_pos=None):
+    """Temporal position encoding for object pointers.
+
+    Relative positions are divided by ``max(max_abs_pos - 1, 1)`` (with
+    ``max_abs_pos`` treated as 2 when missing or falsy), encoded with
+    ``get_1d_sine_pe``, optionally cast to ``dtype`` and passed through
+    ``proj_layer``.
+    """
+    denom = max((max_abs_pos or 2) - 1, 1)
+    rel_pos = torch.tensor(rel_pos_list, dtype=torch.float32, device=device) / denom
+    encoded = get_1d_sine_pe(rel_pos, dim=d_model)
+    if dtype is not None:
+        encoded = encoded.to(dtype)
+    return proj_layer(encoded)
+
+
+def forward_sam_heads(backbone_features, prompt_encoder, mask_decoder, obj_ptr_proj, no_obj_fn,
+                      image_size, point_inputs=None, mask_inputs=None, box_inputs=None,
+                      high_res_features=None, multimask_output=False):
+    """Shared SAM prompt encoder + mask decoder forward for both SAM3 and SAM3.1 trackers.
+
+    Args:
+        backbone_features: image embeddings handed to the mask decoder.
+        prompt_encoder, mask_decoder: the SAM heads to run.
+        obj_ptr_proj: projection applied to the selected output token.
+        no_obj_fn: callable ``(obj_ptr, is_obj_appearing) -> obj_ptr`` applied last.
+        image_size: side length that the high-resolution masks are resized to.
+        point_inputs: optional dict with "point_coords" and "point_labels".
+        mask_inputs, box_inputs: optional mask / box prompts.
+        high_res_features: forwarded to the mask decoder.
+        multimask_output: when True, keep only the candidate with the best predicted IoU.
+
+    Returns:
+        (low_res_masks, high_res_masks, obj_ptr, object_score_logits)
+    """
+    device = backbone_features.device
+    # Batch size from inputs (mask_inputs may have N_obj > 1 while backbone is batch 1)
+    if mask_inputs is not None:
+        B = mask_inputs.shape[0]
+    elif box_inputs is not None:
+        B = box_inputs.shape[0]
+    elif point_inputs is not None:
+        B = point_inputs["point_coords"].shape[0]
+    else:
+        B = backbone_features.shape[0]
+
+    if point_inputs is not None:
+        sam_point_coords = point_inputs["point_coords"]
+        sam_point_labels = point_inputs["point_labels"]
+    else:
+        # No point prompt: feed a single placeholder point with label -1.
+        sam_point_coords = torch.zeros(B, 1, 2, device=device)
+        sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device)
+
+    if mask_inputs is not None:
+        # Mask prompts are resized to 4x the prompt encoder's embedding grid.
+        prompt_size = (prompt_encoder.image_embedding_size[0] * 4, prompt_encoder.image_embedding_size[1] * 4)
+        if mask_inputs.shape[-2:] != prompt_size:
+            sam_mask_prompt = F.interpolate(mask_inputs, size=prompt_size, mode="bilinear", align_corners=False, antialias=True)
+        else:
+            sam_mask_prompt = mask_inputs
+    else:
+        sam_mask_prompt = None
+
+    sparse, dense = prompt_encoder(points=(sam_point_coords, sam_point_labels), boxes=box_inputs, masks=sam_mask_prompt)
+    sparse = cast_to_input(sparse, backbone_features)
+    dense = cast_to_input(dense, backbone_features)
+    image_pe = cast_to_input(prompt_encoder.get_dense_pe(), backbone_features)
+
+    low_res_multimasks, ious, sam_output_tokens, object_score_logits = mask_decoder(
+        image_embeddings=backbone_features, image_pe=image_pe,
+        sparse_prompt_embeddings=sparse, dense_prompt_embeddings=dense,
+        high_res_features=high_res_features, multimask_output=multimask_output, return_all=True,
+    )
+
+    # Where the object score is not positive, overwrite the masks with NO_OBJ_SCORE.
+    is_obj_appearing = object_score_logits > 0
+    low_res_multimasks = torch.where(is_obj_appearing[:, None, None], low_res_multimasks,
+                                     torch.tensor(NO_OBJ_SCORE, device=device, dtype=low_res_multimasks.dtype))
+    high_res_multimasks = F.interpolate(low_res_multimasks, size=(image_size, image_size), mode="bilinear", align_corners=False)
+
+    sam_output_token = sam_output_tokens[:, 0]
+    if multimask_output:
+        # Keep, per batch element, the candidate with the highest predicted IoU.
+        best_iou_inds = torch.argmax(ious, dim=-1)
+        batch_inds = torch.arange(B, device=device)
+        low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+        high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1)
+        if sam_output_tokens.size(1) > 1:
+            sam_output_token = sam_output_tokens[batch_inds, best_iou_inds]
+    else:
+        low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks
+
+    obj_ptr = obj_ptr_proj(sam_output_token)
+    obj_ptr = no_obj_fn(obj_ptr, is_obj_appearing)
+
+    return low_res_masks, high_res_masks, obj_ptr, object_score_logits
+
+
+def use_mask_as_output(backbone_features, high_res_features, mask_inputs, mask_downsample,
+                       prompt_encoder, mask_decoder, obj_ptr_proj, no_obj_fn, image_size, backbone_stride):
+    """Treat a supplied mask as the prediction: fixed-scale logits plus an object pointer from the SAM heads."""
+    logit_scale, logit_bias = 20.0, -10.0
+    mask_cast = cast_to_input(mask_inputs, backbone_features)
+    # Affine map of mask values onto logits: 0 -> -10 and 1 -> +10.
+    masks_hi = mask_cast * logit_scale + logit_bias
+    side = image_size // backbone_stride * 4
+    masks_lo = F.interpolate(masks_hi, size=(side, side), mode="bilinear", align_corners=False, antialias=True)
+    # Only the object pointer is kept from the SAM heads; the other three outputs are discarded.
+    _, _, obj_ptr, _ = forward_sam_heads(backbone_features, prompt_encoder, mask_decoder, obj_ptr_proj, no_obj_fn, image_size,
+                                         mask_inputs=mask_downsample(mask_cast), high_res_features=high_res_features)
+    # Object score is +10 for a mask with any positive entry and -10 for an empty one.
+    has_object = torch.any(mask_inputs.flatten(1) > 0.0, dim=1)[..., None]
+    object_score_logits = logit_scale * has_object.to(obj_ptr.dtype) + logit_bias
+    return masks_lo, masks_hi, obj_ptr, object_score_logits
+
+
+# Split attention with configurable input dims (for asymmetric cross-attention)
+class SplitAttn(nn.Module):
+    """Attention with separate q/k/v/out projections; key/value inputs may use a different width than queries."""
+    def __init__(self, embed_dim, num_heads=1, kv_dim=None, internal_dim=None, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        # Falsy kv_dim / internal_dim (None or 0) fall back to embed_dim.
+        key_width = kv_dim if kv_dim else embed_dim
+        inner = internal_dim if internal_dim else embed_dim
+        factory = dict(device=device, dtype=dtype)
+        self.q_proj = operations.Linear(embed_dim, inner, **factory)
+        self.k_proj = operations.Linear(key_width, inner, **factory)
+        self.v_proj = operations.Linear(key_width, inner, **factory)
+        self.out_proj = operations.Linear(inner, embed_dim, **factory)
+
+    def forward(self, q, k=None, v=None, rope=None, num_k_exclude_rope=0):
+        """Project q/k/v, optionally apply `rope` to q and k, attend, and project out. k defaults to q, v to k."""
+        keys = q if k is None else k
+        values = keys if v is None else v
+        q_p, k_p, v_p = self.q_proj(q), self.k_proj(keys), self.v_proj(values)
+        if rope is not None:
+            q_p, k_p = apply_rope_memory(q_p, k_p, rope, self.num_heads, num_k_exclude_rope)
+        attended = optimized_attention(q_p, k_p, v_p, self.num_heads, low_precision_attention=False)
+        return self.out_proj(attended)
+
+
+class MemoryAttnLayer(nn.Module):
+    """Pre-norm block: self-attention, cross-attention onto memory tokens, then a ReLU feed-forward."""
+    def __init__(self, d_model=256, num_heads=1, kv_dim=64, dim_ff=2048, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        factory = dict(device=device, dtype=dtype)
+        self.self_attn = SplitAttn(d_model, num_heads, operations=operations, **factory)
+        self.cross_attn_image = SplitAttn(d_model, num_heads, kv_dim=kv_dim, operations=operations, **factory)
+        self.linear1 = operations.Linear(d_model, dim_ff, **factory)
+        self.linear2 = operations.Linear(dim_ff, d_model, **factory)
+        self.norm1, self.norm2, self.norm3 = (operations.LayerNorm(d_model, **factory) for _ in range(3))
+
+    def forward(self, x, memory, memory_pos=None, rope=None, num_k_exclude_rope=0):
+        x = x + self.self_attn(self.norm1(x), rope=rope)
+        # Keys carry the memory positional encoding when given; values are the raw memory tokens.
+        keys = memory if memory_pos is None else memory + memory_pos
+        attended = self.cross_attn_image(self.norm2(x), keys, memory, rope=rope, num_k_exclude_rope=num_k_exclude_rope)
+        x = x + attended
+        return x + self.linear2(F.relu(self.linear1(self.norm3(x))))
+
+
+class MemoryAttnEncoder(nn.Module):
+    """Stack of MemoryAttnLayer blocks sharing one 2D RoPE table, followed by a final LayerNorm."""
+    def __init__(self, d_model=256, num_heads=1, kv_dim=64, dim_ff=2048, num_layers=4, image_size=1008, patch_size=14,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        blocks = []
+        for _ in range(num_layers):
+            blocks.append(MemoryAttnLayer(d_model, num_heads, kv_dim, dim_ff, device=device, dtype=dtype, operations=operations))
+        self.layers = nn.ModuleList(blocks)
+        self.norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
+        grid = image_size // patch_size  # same value is passed for both rope_2d grid arguments
+        self.register_buffer("_rope", rope_2d(grid, grid, d_model // num_heads), persistent=False)
+
+    def forward(self, x, memory, src_pos=None, memory_pos=None, num_k_exclude_rope=0):
+        """Attend x over memory through every layer; src_pos, when given, is added to x scaled by 0.1."""
+        hidden = x if src_pos is None else x + 0.1 * src_pos
+        rope = self._rope.to(device=x.device)
+        for block in self.layers:
+            hidden = block(hidden, memory, memory_pos=memory_pos, rope=rope, num_k_exclude_rope=num_k_exclude_rope)
+        return self.norm(hidden)
+
+
+class MemoryTransformer(nn.Module):
+    def __init__(self, d_model=256, num_heads=1, kv_dim=64, dim_ff=2048, num_layers=4, device=None, dtype=None, operations=None):
+        super().__init__()  # thin container: the encoder is exposed under the `encoder` attribute
+        self.encoder = MemoryAttnEncoder(d_model=d_model, num_heads=num_heads, kv_dim=kv_dim, dim_ff=dim_ff, num_layers=num_layers, device=device, dtype=dtype, operations=operations)
+
+
+def _upscale_masks(output_upscaling, conv_s0, conv_s1, src_out, high_res_features):
+    """Run the five-stage upscaling stack on src_out, adding projected high-res features after each upsample if given."""
+    up_a, norm_a, act_a, up_b, act_b = output_upscaling
+    if high_res_features is None:
+        # Plain path: the five stages applied strictly in sequence.
+        return act_b(up_b(act_a(norm_a(up_a(src_out)))))
+    # high_res_features[1] joins after the first upsample, high_res_features[0] after the second.
+    mid = act_a(norm_a(up_a(src_out) + conv_s1(high_res_features[1])))
+    return act_b(up_b(mid) + conv_s0(high_res_features[0]))
+
+
+class SAMMaskDecoder(nn.Module):  # predicts masks, IoU scores and an object-score logit from image embeddings plus prompt tokens
+    def __init__(self, d_model=256, num_multimask_outputs=3, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.num_mask_tokens = num_multimask_outputs + 1  # forward() returns token 0 alone, or tokens 1.. when multimask_output is set
+
+        self.transformer = SAMTwoWayTransformer(depth=2, embedding_dim=d_model, num_heads=8, mlp_dim=2048, device=device, dtype=dtype, operations=operations)
+
+        self.iou_token = operations.Embedding(1, d_model, device=device, dtype=dtype)
+        self.mask_tokens = operations.Embedding(self.num_mask_tokens, d_model, device=device, dtype=dtype)
+        self.obj_score_token = operations.Embedding(1, d_model, device=device, dtype=dtype)
+
+        # Output upscaling: d_model -> d_model//4 -> d_model//8 channels, two stride-2 transposed convs (4x resolution)
+        LN2d = LayerNorm2d_op(operations)
+        self.output_upscaling = nn.Sequential(
+            operations.ConvTranspose2d(d_model, d_model // 4, kernel_size=2, stride=2, device=device, dtype=dtype), LN2d(d_model // 4, device=device, dtype=dtype), nn.GELU(),
+            operations.ConvTranspose2d(d_model // 4, d_model // 8, kernel_size=2, stride=2, device=device, dtype=dtype), nn.GELU(),
+        )
+
+        # High-res feature integration: 1x1 convs whose output widths match the two upscaling stages
+        self.conv_s0 = operations.Conv2d(d_model, d_model // 8, kernel_size=1, device=device, dtype=dtype)
+        self.conv_s1 = operations.Conv2d(d_model, d_model // 4, kernel_size=1, device=device, dtype=dtype)
+
+        # Per-mask hypernetwork MLPs: one MLP per mask token, each producing a d_model//8 vector
+        self.output_hypernetworks_mlps = nn.ModuleList([
+            MLP(d_model, d_model, d_model // 8, 3, device=device, dtype=dtype, operations=operations)
+            for _ in range(self.num_mask_tokens)
+        ])
+
+        self.iou_prediction_head = MLP(d_model, d_model, self.num_mask_tokens, 3, device=device, dtype=dtype, operations=operations)
+        self.pred_obj_score_head = MLP(d_model, d_model, 1, 3, device=device, dtype=dtype, operations=operations)
+
+    def forward(self, image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings,
+                high_res_features=None, multimask_output=False, return_all=False):
+        B = sparse_prompt_embeddings.shape[0]  # batch size is dictated by the sparse prompt embeddings
+        ref = sparse_prompt_embeddings  # reference tensor handed to cast_to_input for the learned tokens
+        # Token order: [obj_score(1), iou(1), mask(num_mask_tokens)], followed by the sparse prompt tokens
+        tokens = torch.cat([cast_to_input(self.obj_score_token.weight, ref),
+                            cast_to_input(self.iou_token.weight, ref),
+                            cast_to_input(self.mask_tokens.weight, ref)], dim=0)
+        tokens = torch.cat([tokens.unsqueeze(0).expand(B, -1, -1), sparse_prompt_embeddings], dim=1)
+
+        src = image_embeddings
+        if src.shape[0] != B:
+            src = src.expand(B, -1, -1, -1)  # expand() can only grow a size-1 batch dimension
+        src = src + dense_prompt_embeddings
+        pos_src = image_pe.expand(B, -1, -1, -1)
+
+        b, c, h, w = src.shape
+        src_flat = src.flatten(2).permute(0, 2, 1)  # (b, c, h, w) -> (b, h*w, c)
+        pos_flat = pos_src.flatten(2).permute(0, 2, 1)
+
+        hs, src_out = self.transformer(src_flat, pos_flat, tokens)
+
+        obj_score_token_out = hs[:, 0, :]  # slices below mirror the token order built above
+        iou_token_out = hs[:, 1, :]
+        mask_tokens_out = hs[:, 2:2 + self.num_mask_tokens, :]
+
+        src_out = src_out.permute(0, 2, 1).view(b, c, h, w)  # back to the spatial layout of `src`
+        upscaled = _upscale_masks(self.output_upscaling, self.conv_s0, self.conv_s1, src_out, high_res_features)
+
+        hyper_in = torch.stack([
+            mlp(mask_tokens_out[:, i, :]) for i, mlp in enumerate(self.output_hypernetworks_mlps)
+        ], dim=1)
+
+        masks = (hyper_in @ upscaled.flatten(2)).view(B, self.num_mask_tokens, upscaled.shape[2], upscaled.shape[3])  # one mask per mask token
+        iou_pred = self.iou_prediction_head(iou_token_out)
+        object_score_logits = self.pred_obj_score_head(obj_score_token_out)
+
+        if multimask_output:
+            out_masks = masks[:, 1:]  # drop token 0, keep the multimask candidates
+            out_iou = iou_pred[:, 1:]
+            out_tokens = mask_tokens_out[:, 1:]
+        else:
+            out_masks = masks[:, 0:1]  # only token 0, kept as a length-1 axis
+            out_iou = iou_pred[:, 0:1]
+            out_tokens = mask_tokens_out[:, 0:1]
+
+        if return_all:
+            return out_masks, out_iou, out_tokens, object_score_logits  # 4-tuple variant
+        return out_masks, out_iou  # default: 2-tuple
+
+
+class SAMPromptEncoder(nn.Module):
+    """Embeds point, box and mask prompts into sparse tokens plus a dense map for the SAM mask decoder."""
+    def __init__(self, d_model=256, image_embedding_size=(72, 72), input_image_size=(1008, 1008), device=None, dtype=None, operations=None):
+        super().__init__()
+        self.embed_dim = d_model
+        self.image_embedding_size = image_embedding_size
+        self.input_image_size = input_image_size
+        self.pe_layer = PositionEmbeddingRandom(d_model // 2)
+        self.point_embeddings = nn.ModuleList([operations.Embedding(1, d_model, device=device, dtype=dtype) for _ in range(4)])
+        self.not_a_point_embed = operations.Embedding(1, d_model, device=device, dtype=dtype)
+
+        LN2d = LayerNorm2d_op(operations)
+        self.mask_downscaling = nn.Sequential(
+            operations.Conv2d(1, 4, kernel_size=2, stride=2, device=device, dtype=dtype), LN2d(4, device=device, dtype=dtype), nn.GELU(),
+            operations.Conv2d(4, 16, kernel_size=2, stride=2, device=device, dtype=dtype), LN2d(16, device=device, dtype=dtype), nn.GELU(),
+            operations.Conv2d(16, d_model, kernel_size=1, device=device, dtype=dtype),
+        )
+        self.no_mask_embed = operations.Embedding(1, d_model, device=device, dtype=dtype)
+
+    def get_dense_pe(self):
+        return self.pe_layer(self.image_embedding_size)  # positional encoding evaluated at image_embedding_size
+
+    def forward(self, points=None, boxes=None, masks=None):
+        """Return (sparse, dense) prompt embeddings; raises ValueError when no prompt is supplied at all."""
+        ref = points[0] if points is not None else boxes if boxes is not None else masks
+        if ref is None:
+            raise ValueError("SAMPromptEncoder.forward requires at least one of points, boxes or masks")
+        B = 1
+        sparse = torch.empty((B, 0, self.embed_dim), device=ref.device, dtype=ref.dtype)
+
+        if points is not None:
+            coords, labels = points
+            B = coords.shape[0]
+            # Pad with an extra point (label=-1) when no boxes are provided (matching reference)
+            if boxes is None:
+                coords = torch.cat([coords, torch.zeros(B, 1, 2, device=coords.device, dtype=coords.dtype)], dim=1)
+                labels = torch.cat([labels, -torch.ones(B, 1, device=labels.device, dtype=labels.dtype)], dim=1)
+            pe = self.pe_layer.forward_with_coords(coords + 0.5, self.input_image_size)
+            for i in range(4):
+                pe[labels == i] += cast_to_input(self.point_embeddings[i].weight, ref)
+            invalid = (labels == -1)
+            pe[invalid] = 0.0
+            pe[invalid] += cast_to_input(self.not_a_point_embed.weight, ref)
+            sparse = torch.cat([sparse.expand(B, -1, -1), pe], dim=1)
+
+        if boxes is not None:
+            B = boxes.shape[0]
+            corners = self.pe_layer.forward_with_coords((boxes.reshape(-1, 2, 2) + 0.5), self.input_image_size)
+            corners[:, 0] += cast_to_input(self.point_embeddings[2].weight, ref)
+            corners[:, 1] += cast_to_input(self.point_embeddings[3].weight, ref)
+            sparse = torch.cat([sparse.expand(B, -1, -1), corners], dim=1)
+
+        if masks is not None:
+            dense = self.mask_downscaling(masks)
+            if points is None and boxes is None:
+                # Mask-only prompt: the empty sparse set must follow the mask batch instead of staying at batch 1.
+                sparse = sparse.expand(masks.shape[0], -1, -1)
+        else:
+            dense = cast_to_input(self.no_mask_embed.weight, ref).reshape(1, -1, 1, 1).expand(B, -1, self.image_embedding_size[0], self.image_embedding_size[1])
+        return sparse, dense
+
+
+class CXBlock(nn.Module):
+    """Residual block: depthwise conv, then channels-last LayerNorm and a 4x-wide GELU MLP scaled by `gamma`."""
+    def __init__(self, dim=256, kernel_size=7, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = dict(device=device, dtype=dtype)
+        self.dwconv = operations.Conv2d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=dim, **factory)
+        self.norm = operations.LayerNorm(dim, **factory)
+        self.pwconv1 = operations.Linear(dim, 4 * dim, **factory)
+        self.pwconv2 = operations.Linear(4 * dim, dim, **factory)
+        self.gamma = nn.Parameter(torch.ones(dim, **factory))
+
+    def forward(self, x):
+        hidden = self.pwconv2(F.gelu(self.pwconv1(self.norm(self.dwconv(x).permute(0, 2, 3, 1)))))
+        hidden.mul_(cast_to_input(self.gamma, hidden))  # in-place per-channel scale on the branch output
+        return x + hidden.permute(0, 3, 1, 2)
+
+
+class MaskDownSampler(nn.Module):
+    """Stride-2 conv / LayerNorm2d / GELU stages followed by a 1x1 projection to out_dim, with optional input resize."""
+    def __init__(self, out_dim=256, in_chans=1, channels=None, interpol_size=(1152, 1152), device=None, dtype=None, operations=None):
+        super().__init__()
+        self.interpol_size = list(interpol_size) if interpol_size else None
+        widths = [4, 16, 64, out_dim] if channels is None else channels  # SAM3 default
+        LN2d = LayerNorm2d_op(operations)
+        stages, c_in = [], in_chans
+        for c_out in widths:
+            stages.extend((operations.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1, device=device, dtype=dtype), LN2d(c_out, device=device, dtype=dtype), nn.GELU()))
+            c_in = c_out
+        stages.append(operations.Conv2d(c_in, out_dim, kernel_size=1, device=device, dtype=dtype))
+        self.encoder = nn.Sequential(*stages)
+
+    def forward(self, x):
+        """Resize x to interpol_size when its spatial size differs, then run the encoder."""
+        needs_resize = self.interpol_size is not None and list(x.shape[-2:]) != self.interpol_size
+        if needs_resize:
+            x = F.interpolate(x, size=self.interpol_size, mode="bilinear", align_corners=False, antialias=True)
+        return self.encoder(x)
+
+
+class Fuser(nn.Module):  # applies `num_layers` CXBlock modules one after another
+    def __init__(self, dim=256, num_layers=2, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.layers = nn.Sequential(*(CXBlock(dim=dim, device=device, dtype=dtype, operations=operations) for _ in range(num_layers)))
+
+    def forward(self, x):
+        return self.layers(x)  # output of the last CXBlock
+
+
+# --- SAM3.1 Multiplex components ---
+
+class DecoupledMemoryAttnLayer(nn.Module):
+    """SAM3.1 attention layer whose cross-attention sums image-derived and memory-derived query/key projections."""
+
+    def __init__(self, d_model=256, num_heads=1, dim_ff=2048, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.num_heads = num_heads
+        factory = dict(device=device, dtype=dtype)
+
+        def square_proj():
+            return operations.Linear(d_model, d_model, **factory)
+
+        # Self-attention projections, stored as flat attributes rather than a nested attention module.
+        self.self_attn_q_proj, self.self_attn_k_proj = square_proj(), square_proj()
+        self.self_attn_v_proj, self.self_attn_out_proj = square_proj(), square_proj()
+        # Memory cross-attention projections.
+        self.cross_attn_q_proj, self.cross_attn_k_proj = square_proj(), square_proj()
+        self.cross_attn_v_proj, self.cross_attn_out_proj = square_proj(), square_proj()
+        # The image branch only contributes queries and keys; values come from cross_attn_v_proj.
+        self.image_cross_attn_q_proj, self.image_cross_attn_k_proj = square_proj(), square_proj()
+        # Feed-forward layers and the three pre-norms.
+        self.linear1 = operations.Linear(d_model, dim_ff, **factory)
+        self.linear2 = operations.Linear(dim_ff, d_model, **factory)
+        self.norm1 = operations.LayerNorm(d_model, **factory)
+        self.norm2 = operations.LayerNorm(d_model, **factory)
+        self.norm3 = operations.LayerNorm(d_model, **factory)
+
+    def forward(self, image, x, memory_image, memory, memory_image_pos=None,
+                rope=None, num_k_exclude_rope=0):
+        """Self-attention, fused image+memory cross-attention, then a GELU FFN; returns (image, x) with image untouched."""
+        # 1) Self-attention; the rope call passes 0 as the key-exclusion count.
+        h = self.norm1(x)
+        q, k, v = self.self_attn_q_proj(h), self.self_attn_k_proj(h), self.self_attn_v_proj(h)
+        if rope is not None:
+            q, k = apply_rope_memory(q, k, rope, self.num_heads, 0)
+        sa_out = optimized_attention(q, k, v, self.num_heads, low_precision_attention=False)
+        x = x + self.self_attn_out_proj(sa_out)
+
+        # 2) Cross-attention: queries mix `image` with the normed stream, keys mix `memory_image` with `memory`.
+        h = self.norm2(x)
+        q = self.image_cross_attn_q_proj(image) + self.cross_attn_q_proj(h)
+        k = self.image_cross_attn_k_proj(memory_image) + self.cross_attn_k_proj(memory)
+        k = k if memory_image_pos is None else k + memory_image_pos
+        v = self.cross_attn_v_proj(memory)
+        if rope is not None:
+            q, k = apply_rope_memory(q, k, rope, self.num_heads, num_k_exclude_rope)
+        ca_out = optimized_attention(q, k, v, self.num_heads, low_precision_attention=False)
+        x = x + self.cross_attn_out_proj(ca_out)
+
+        # 3) Feed-forward with GELU.
+        ffn_out = self.linear2(F.gelu(self.linear1(self.norm3(x))))
+        return image, x + ffn_out
+
+
+class DecoupledMemoryEncoder(nn.Module):
+    """Memory attention encoder for SAM3.1: a stack of DecoupledMemoryAttnLayer blocks, a RoPE buffer and a final LayerNorm."""
+
+    def __init__(self, d_model=256, num_heads=1, dim_ff=2048, num_layers=4, image_size=1008, patch_size=14,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        self.layers = nn.ModuleList([
+            DecoupledMemoryAttnLayer(d_model, num_heads, dim_ff, device=device, dtype=dtype, operations=operations)
+            for _ in range(num_layers)
+        ])
+        self.norm = operations.LayerNorm(d_model, device=device, dtype=dtype)
+        hw = image_size // patch_size  # same value is passed for both rope_2d grid arguments
+        self.register_buffer("_rope", rope_2d(hw, hw, d_model // num_heads), persistent=False)  # non-persistent buffer
+
+    def forward(self, x, memory, memory_pos=None, src_pos=None, num_k_exclude_rope=0,
+                memory_image=None, memory_image_pos=None):
+        image = x  # the unmodified input (before src_pos is added); every layer receives it and returns it unchanged
+        output = x
+        if src_pos is not None:
+            output = output + 0.1 * src_pos  # src_pos is added at a fixed 0.1 weight
+
+        B, _, C = x.shape  # x must be 3-D; B and C size the zero padding built below
+        rope = self._rope.to(device=x.device)
+
+        # memory_image: raw backbone features from past frames for decoupled cross-attention
+        if memory_image is None:
+            # Fallback: use spatial portion of memory (all tokens except the last num_k_exclude_rope)
+            num_spatial = memory.shape[1] - num_k_exclude_rope
+            memory_image = memory[:, :num_spatial]
+            memory_image_pos = memory_pos[:, :num_spatial] if memory_pos is not None else None
+        # Pad memory_image with zeros along dim 1 so its token count matches memory
+        if memory_image.shape[1] < memory.shape[1]:
+            pad_len = memory.shape[1] - memory_image.shape[1]
+            pad = torch.zeros(B, pad_len, C, device=memory.device, dtype=memory.dtype)
+            memory_image = torch.cat([memory_image, pad], dim=1)
+            if memory_image_pos is not None:
+                ptr_pos = memory_pos[:, -pad_len:] if memory_pos is not None else torch.zeros_like(pad)  # tail of memory_pos, else zeros
+                memory_image_pos = torch.cat([memory_image_pos, ptr_pos], dim=1)
+
+        for layer in self.layers:
+            image, output = layer(image, output, memory_image, memory,
+                                  memory_image_pos=memory_image_pos, rope=rope,
+                                  num_k_exclude_rope=num_k_exclude_rope)
+
+        return self.norm(output)  # only the attended stream is returned; `image` is dropped
+
+
+class DecoupledMemoryTransformer(nn.Module):
+    def __init__(self, d_model=256, num_heads=1, dim_ff=2048, num_layers=4, device=None, dtype=None, operations=None):
+        super().__init__()
+        # Thin container: the decoupled encoder is exposed under the `encoder` attribute.
+        self.encoder = DecoupledMemoryEncoder(d_model=d_model, num_heads=num_heads, dim_ff=dim_ff, num_layers=num_layers, device=device, dtype=dtype, operations=operations)
+
+
+class MemoryBackbone(nn.Module):
+    """Builds memory features: downsampled mask + projected pixel features -> fuser -> optional 1x1 channel projection."""
+
+    def __init__(self, d_model=256, out_dim=None, in_chans=1, channels=None, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = dict(device=device, dtype=dtype)
+        self.mask_downsampler = MaskDownSampler(d_model, in_chans=in_chans, channels=channels, operations=operations, **factory)
+        self.pix_feat_proj = operations.Conv2d(d_model, d_model, kernel_size=1, **factory)
+        self.fuser = Fuser(d_model, num_layers=2, operations=operations, **factory)
+        # The output projection only exists when out_dim is given and differs from d_model.
+        self.has_out_proj = out_dim is not None and out_dim != d_model
+        if self.has_out_proj:
+            self.out_proj = operations.Conv2d(d_model, out_dim, kernel_size=1, **factory)
+        self.position_encoding = PositionEmbeddingSine(num_pos_feats=out_dim if self.has_out_proj else d_model, normalize=True)
+
+    def forward(self, image_features, mask_for_mem, skip_mask_sigmoid=False):
+        """Return {"vision_features", "vision_pos_enc"}; the mask goes through sigmoid unless skip_mask_sigmoid is set."""
+        mask = mask_for_mem if skip_mask_sigmoid else mask_for_mem.sigmoid()
+        mask_feats = self.mask_downsampler(cast_to_input(mask, image_features))
+        target_hw = image_features.shape[-2:]
+        if mask_feats.shape[-2:] != target_hw:
+            mask_feats = F.interpolate(mask_feats, size=target_hw, mode="bilinear", align_corners=False)
+        fused = self.fuser(self.pix_feat_proj(image_features) + mask_feats)
+        if self.has_out_proj:
+            fused = self.out_proj(fused)
+        # The positional encoding is computed from the final features and cast to match them.
+        pos = cast_to_input(self.position_encoding(fused), fused)
+        return {"vision_features": fused, "vision_pos_enc": [pos]}
+
+
+class MultiplexMaskDecoder(nn.Module):
+    """SAM mask decoder for SAM3.1 multiplex: predicts masks for num_multiplex objects simultaneously.
+
+    Uses multimask_outputs_only=True: num_mask_output_per_object = num_multimask_outputs (no +1).
+    Hypernetwork MLPs are shared across multiplex objects.
+    Token order: [obj_score_token(M), iou_token(M), mask_tokens(M*T)], followed by the sparse prompt tokens.
+    """
+
+    def __init__(self, d_model=256, num_multiplex=16, num_multimask_outputs=3, device=None, dtype=None, operations=None):
+        super().__init__()
+        self.num_multiplex = num_multiplex
+        self.num_mask_output_per_object = num_multimask_outputs  # 3 with the defaults (multimask_outputs_only)
+        total_mask_tokens = num_multiplex * self.num_mask_output_per_object  # 48 with the defaults
+
+        self.transformer = SAMTwoWayTransformer(depth=2, embedding_dim=d_model, num_heads=8, mlp_dim=2048, device=device, dtype=dtype, operations=operations)
+
+        self.obj_score_token = operations.Embedding(num_multiplex, d_model, device=device, dtype=dtype)
+        self.iou_token = operations.Embedding(num_multiplex, d_model, device=device, dtype=dtype)
+        self.mask_tokens = operations.Embedding(total_mask_tokens, d_model, device=device, dtype=dtype)
+
+        LN2d = LayerNorm2d_op(operations)
+        self.output_upscaling = nn.Sequential(
+            operations.ConvTranspose2d(d_model, d_model // 4, kernel_size=2, stride=2, device=device, dtype=dtype),
+            LN2d(d_model // 4, device=device, dtype=dtype), nn.GELU(),
+            operations.ConvTranspose2d(d_model // 4, d_model // 8, kernel_size=2, stride=2, device=device, dtype=dtype), nn.GELU(),
+        )
+        self.conv_s0 = operations.Conv2d(d_model, d_model // 8, kernel_size=1, device=device, dtype=dtype)
+        self.conv_s1 = operations.Conv2d(d_model, d_model // 4, kernel_size=1, device=device, dtype=dtype)
+
+        # Shared across all multiplex objects (one MLP per mask output index)
+        self.output_hypernetworks_mlps = nn.ModuleList([
+            MLP(d_model, d_model, d_model // 8, 3, device=device, dtype=dtype, operations=operations)
+            for _ in range(self.num_mask_output_per_object)
+        ])
+        self.iou_prediction_head = MLP(d_model, d_model, self.num_mask_output_per_object, 3, device=device, dtype=dtype, operations=operations)
+        self.pred_obj_score_head = MLP(d_model, d_model, 1, 3, device=device, dtype=dtype, operations=operations)
+
+    def forward(self, image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings,
+                high_res_features=None, multimask_output=False, return_all=False, extra_per_object_embeddings=None):
+        B = sparse_prompt_embeddings.shape[0]  # NOTE: multimask_output is accepted but never read in this method
+        M = self.num_multiplex
+        T = self.num_mask_output_per_object
+
+        # Token order: [obj_score(M), iou(M), mask(M*T)]
+        ref = sparse_prompt_embeddings  # reference tensor handed to cast_to_input for the learned tokens
+        mask_tokens = cast_to_input(self.mask_tokens.weight, ref)
+        if extra_per_object_embeddings is not None:
+            mask_tokens = mask_tokens.view(1, M, T, -1).expand(B, -1, -1, -1) + extra_per_object_embeddings.unsqueeze(2)  # same offset for all T tokens of an object
+            mask_tokens = mask_tokens.flatten(1, 2)  # [B, M*T, C]
+            other_tokens = torch.cat([cast_to_input(self.obj_score_token.weight, ref),
+                                      cast_to_input(self.iou_token.weight, ref)], dim=0).unsqueeze(0).expand(B, -1, -1)
+            tokens = torch.cat([other_tokens, mask_tokens, sparse_prompt_embeddings], dim=1)
+        else:
+            tokens = torch.cat([cast_to_input(self.obj_score_token.weight, ref),
+                                cast_to_input(self.iou_token.weight, ref), mask_tokens], dim=0)
+            tokens = torch.cat([tokens.unsqueeze(0).expand(B, -1, -1), sparse_prompt_embeddings], dim=1)
+
+        src = image_embeddings
+        if src.shape[0] != B:
+            src = src.expand(B, -1, -1, -1)  # expand() can only grow a size-1 batch dimension
+        src = src + dense_prompt_embeddings
+        pos_src = image_pe.expand(B, -1, -1, -1)
+
+        b, c, h, w = src.shape
+        hs, src_out = self.transformer(src.flatten(2).permute(0, 2, 1), pos_src.flatten(2).permute(0, 2, 1), tokens)
+
+        # Parse output tokens (slices mirror the token order built above)
+        obj_score_token_out = hs[:, :M]
+        iou_token_out = hs[:, M:2 * M]
+        mask_tokens_out = hs[:, 2 * M:2 * M + M * T]
+
+        src_out = src_out.permute(0, 2, 1).view(b, c, h, w)  # back to the spatial layout of `src`
+        upscaled = _upscale_masks(self.output_upscaling, self.conv_s0, self.conv_s1, src_out, high_res_features)
+
+        # Reshape mask tokens to [B, M, T, C] and apply shared hypernetwork MLPs per mask output index
+        mask_tokens_2d = mask_tokens_out.view(B, M, T, -1)
+        hyper_in = torch.stack([
+            self.output_hypernetworks_mlps[i](mask_tokens_2d[:, :, i, :])  # [B, M, C//8]
+            for i in range(T)
+        ], dim=2)  # [B, M, T, C//8]
+
+        # Generate masks: [B, M*T, H*W] -> [B, M, T, H, W]
+        masks = torch.bmm(hyper_in.flatten(1, 2), upscaled.flatten(2)).view(b, M, T, upscaled.shape[2], upscaled.shape[3])
+
+        # IoU and object scores
+        iou_pred = self.iou_prediction_head(iou_token_out).view(b, M, T)
+        object_score_logits = self.pred_obj_score_head(obj_score_token_out)  # [B, M, 1]
+
+        # multimask_outputs_only: always output all T masks (no singlemask token); only each object's first mask token is returned
+        sam_tokens_out = mask_tokens_2d[:, :, 0:1]  # [B, M, 1, C]
+
+        if return_all:
+            return masks, iou_pred, sam_tokens_out, object_score_logits  # 4-tuple variant
+        return masks, iou_pred  # default: 2-tuple
+
+
+class SAM3Tracker(nn.Module):
+    def __init__(self, d_model=256, mem_dim=64, num_maskmem=7, device=None, dtype=None, operations=None, **kwargs):  # extra kwargs are accepted but not read here
+        super().__init__()
+
+        # Memory attention transformer (keys/values use mem_dim-wide memory tokens)
+        self.transformer = MemoryTransformer(d_model, num_heads=1, kv_dim=mem_dim, dim_ff=2048, num_layers=4,
+                                             device=device, dtype=dtype, operations=operations)
+        # SAM components
+        self.sam_mask_decoder = SAMMaskDecoder(d_model, device=device, dtype=dtype, operations=operations)
+        self.sam_prompt_encoder = SAMPromptEncoder(d_model, device=device, dtype=dtype, operations=operations)
+
+        # Memory backbone (projects memory features down to mem_dim channels when mem_dim != d_model)
+        self.maskmem_backbone = MemoryBackbone(d_model, out_dim=mem_dim, device=device, dtype=dtype, operations=operations)
+
+        # Standalone parameters, all created as zeros here
+        self.maskmem_tpos_enc = nn.Parameter(torch.zeros(num_maskmem, 1, 1, mem_dim, device=device, dtype=dtype))
+        self.no_mem_embed = nn.Parameter(torch.zeros(1, 1, d_model, device=device, dtype=dtype))
+        self.register_buffer("no_mem_pos_enc", torch.zeros(1, 1, d_model, device=device, dtype=dtype))  # checkpoint key, unused in forward
+        self.no_obj_embed_spatial = nn.Parameter(torch.zeros(1, mem_dim, device=device, dtype=dtype))
+        self.no_obj_ptr = nn.Parameter(torch.zeros(1, d_model, device=device, dtype=dtype))
+
+        # Object pointer projection (3-layer MLP) and the linear map for pointer temporal positions
+        self.obj_ptr_proj = MLP(d_model, d_model, d_model, 3, device=device, dtype=dtype, operations=operations)
+        self.obj_ptr_tpos_proj = operations.Linear(d_model, mem_dim, device=device, dtype=dtype)
+
+        # Mask downsample: Conv2d stride 4 to reduce GT mask to SAM logit scale
+        self.mask_downsample = operations.Conv2d(1, 1, kernel_size=4, stride=4, device=device, dtype=dtype)
+
+        # Config (the values below are hard-coded rather than taken from arguments)
+        self.d_model = d_model
+        self.mem_dim = mem_dim
+        self.num_maskmem = num_maskmem
+        self.image_size = 1008
+        self.backbone_stride = 14
+        self.max_obj_ptrs_in_encoder = 16
+        self.sigmoid_scale_for_mem_enc = 20.0  # scale/bias applied to the mask in _encode_new_memory
+        self.sigmoid_bias_for_mem_enc = -10.0
+
+    def _no_obj_blend(self, obj_ptr, is_obj):
+        fallback = cast_to_input(self.no_obj_ptr, obj_ptr)  # no_obj_ptr parameter, selected where is_obj is 0
+        return torch.lerp(fallback, obj_ptr, is_obj.to(obj_ptr.dtype))
+
+    def _forward_sam_heads(self, backbone_features, point_inputs=None, mask_inputs=None, box_inputs=None,
+                           high_res_features=None, multimask_output=False):
+        # Bind this tracker's modules to the shared helper; the prompt arguments are forwarded unchanged.
+        return forward_sam_heads(backbone_features, self.sam_prompt_encoder, self.sam_mask_decoder, self.obj_ptr_proj, self._no_obj_blend,
+                                 self.image_size, point_inputs, mask_inputs, box_inputs, high_res_features, multimask_output)
+
+    def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
+        return use_mask_as_output(
+            backbone_features, high_res_features, mask_inputs, mask_downsample=self.mask_downsample, prompt_encoder=self.sam_prompt_encoder, mask_decoder=self.sam_mask_decoder,
+            obj_ptr_proj=self.obj_ptr_proj, no_obj_fn=self._no_obj_blend, image_size=self.image_size, backbone_stride=self.backbone_stride)
+
+    def _prepare_memory_conditioned_features(self, frame_idx, is_init_cond_frame, current_vision_feats, current_vision_pos_embeds, feat_sizes, output_dict, num_frames):
+        """Fuse the last-level current-frame features with memory tokens and object pointers gathered from output_dict."""
+        B = current_vision_feats[-1].shape[0]  # only the last feature level is used in this method
+        C = self.d_model
+        H, W = feat_sizes[-1]
+        device = current_vision_feats[-1].device
+
+        if self.num_maskmem == 0:
+            return current_vision_feats[-1].permute(0, 2, 1).view(B, C, H, W)  # memory disabled: features are only reshaped
+
+        if is_init_cond_frame:
+            # First conditioning frame: no memory yet, add no_mem_embed
+            pix_feat = current_vision_feats[-1] + cast_to_input(self.no_mem_embed, current_vision_feats[-1])
+            return to_spatial(pix_feat, H, W)
+
+        to_cat_memory, to_cat_memory_pos, _, _, cond_outputs = collect_memory_tokens(
+            output_dict, frame_idx, self.num_maskmem, self.maskmem_tpos_enc, device)
+
+        max_obj_ptrs = min(num_frames, self.max_obj_ptrs_in_encoder)
+        pos_and_ptrs = []  # (frame distance, obj_ptr) pairs
+        for t, out in cond_outputs.items():
+            if t <= frame_idx:  # conditioning frames after the current frame are skipped
+                pos_and_ptrs.append(((frame_idx - t), out["obj_ptr"].to(device)))
+        for t_diff in range(1, max_obj_ptrs):  # walk backwards over up to max_obj_ptrs - 1 earlier frames
+            t = frame_idx - t_diff
+            if t < 0:
+                break
+            out = output_dict["non_cond_frame_outputs"].get(t, None)
+            if out is not None:
+                pos_and_ptrs.append((t_diff, out["obj_ptr"].to(device)))
+
+        num_obj_ptr_tokens = 0
+        if len(pos_and_ptrs) > 0:
+            pos_list, ptrs_list = zip(*pos_and_ptrs)
+            obj_ptrs = torch.stack(ptrs_list, dim=1)  # [B, N, C=256]
+
+            # Temporal position encoding for pointers
+            obj_pos = compute_tpos_enc(
+                list(pos_list), device, self.d_model, self.obj_ptr_tpos_proj,
+                max_abs_pos=max_obj_ptrs, dtype=current_vision_feats[-1].dtype
+            )  # [N, mem_dim=64]
+            obj_pos = obj_pos.unsqueeze(0).expand(B, -1, -1)  # [B, N, 64]
+
+            # Split each pointer into C // mem_dim tokens of width mem_dim (4 x 64 with the defaults)
+            if self.mem_dim < C:
+                N = obj_ptrs.shape[1]
+                obj_ptrs = obj_ptrs.view(B, N, C // self.mem_dim, self.mem_dim)  # [B, N, 4, 64]
+                obj_ptrs = obj_ptrs.reshape(B, N * (C // self.mem_dim), self.mem_dim)  # [B, N*4, 64]
+                obj_pos = obj_pos.unsqueeze(2).expand(-1, -1, C // self.mem_dim, -1)  # every split token reuses its pointer's position
+                obj_pos = obj_pos.reshape(B, N * (C // self.mem_dim), self.mem_dim)  # [B, N*4, 64]
+
+            to_cat_memory.append(obj_ptrs)
+            to_cat_memory_pos.append(obj_pos)
+            num_obj_ptr_tokens = obj_ptrs.shape[1]  # pointer tokens sit last in memory; passed on as num_k_exclude_rope
+
+        if len(to_cat_memory) == 0:
+            # No memory available yet, add no_mem_embed
+            pix_feat = current_vision_feats[-1] + cast_to_input(self.no_mem_embed, current_vision_feats[-1])
+            return to_spatial(pix_feat, H, W)
+
+        # Concatenate all memory and position encodings [B, total_mem, mem_dim=64]
+        memory = torch.cat(to_cat_memory, dim=1)
+        memory_pos = torch.cat(to_cat_memory_pos, dim=1)
+
+        # Run memory attention encoder
+        pix_feat = current_vision_feats[-1]  # [B, HW, C]
+        src_pos = current_vision_pos_embeds[-1]  # [B, HW, C]
+
+        pix_feat_with_mem = self.transformer.encoder(
+            x=pix_feat,
+            memory=memory,
+            src_pos=src_pos,
+            memory_pos=memory_pos,
+            num_k_exclude_rope=num_obj_ptr_tokens,
+        )
+        return to_spatial(pix_feat_with_mem, H, W)
+
+    def _encode_new_memory(self, pix_feat, pred_masks_high_res, object_score_logits, is_mask_from_pts=False):
+        """Run the memory backbone on a rescaled mask and add no_obj_embed_spatial where the object score is <= 0."""
+        # Point-prompted masks are binarised at logit 0; all others go through a sigmoid.
+        mask_for_mem = ((pred_masks_high_res > 0).to(pix_feat.dtype) if is_mask_from_pts
+                        else torch.sigmoid(pred_masks_high_res))
+        # In-place affine rescale; mask_for_mem is a freshly created tensor in both branches.
+        mask_for_mem.mul_(self.sigmoid_scale_for_mem_enc).add_(self.sigmoid_bias_for_mem_enc)
+
+        encoded = self.maskmem_backbone(pix_feat, mask_for_mem, skip_mask_sigmoid=True)
+        feats = encoded["vision_features"]
+        pos_enc = encoded["vision_pos_enc"]
+
+        # Weight of the no-object embedding: 1 where the score is not positive, 0 otherwise.
+        present = (object_score_logits > 0).to(feats.dtype)[..., None, None]
+        absent_weight = 1 - present
+        no_obj_map = cast_to_input(self.no_obj_embed_spatial, feats)[..., None, None].expand_as(feats)
+        return feats + absent_weight * no_obj_map, pos_enc
+
+ def track_step(self, frame_idx, is_init_cond_frame, current_vision_feats, current_vision_pos_embeds, feat_sizes, mask_inputs, output_dict,
+ num_frames, point_inputs=None):  # returns a dict with keys pred_masks, pred_masks_high_res, obj_ptr, object_score_logits, maskmem_features, maskmem_pos_enc
+ """Track one frame: fuse with memory, predict mask, encode memory."""
+ current_out = {}
+
+ # High-res features for SAM head [stride-8, stride-4]
+ if len(current_vision_feats) > 1:
+ high_res_features = [
+ x.view(x.shape[0], feat_sizes[i][0], feat_sizes[i][1], -1).permute(0, 3, 1, 2)  # view as (B, H_i, W_i, C), then move channels to dim 1
+ for i, x in enumerate(current_vision_feats[:-1])  # every level except the last one
+ ]
+ else:
+ high_res_features = None
+
+ # Top-level feature for memory
+ H, W = feat_sizes[-1]
+
+ if mask_inputs is not None:
+ # Conditioning frame: use mask directly
+ pix_feat = to_spatial(current_vision_feats[-1], H, W)
+ sam_outputs = self._use_mask_as_output(pix_feat, high_res_features, mask_inputs)  # no memory fusion on this path
+ else:
+ # Track frame: fuse with memory, then SAM decoder
+ pix_feat_with_mem = self._prepare_memory_conditioned_features(
+ frame_idx=frame_idx,
+ is_init_cond_frame=is_init_cond_frame,
+ current_vision_feats=current_vision_feats,
+ current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes,
+ output_dict=output_dict,
+ num_frames=num_frames,
+ )
+ # Use multimask for point prompts on init frames (picks best of 3 candidates)
+ num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
+ multimask_output = is_init_cond_frame and 0 < num_pts <= 1  # true only on an init frame with exactly one point
+ sam_outputs = self._forward_sam_heads(
+ backbone_features=pix_feat_with_mem,
+ point_inputs=point_inputs,
+ high_res_features=high_res_features,
+ multimask_output=multimask_output,
+ )
+
+ (low_res_masks, high_res_masks, obj_ptr, object_score_logits) = sam_outputs  # both paths are unpacked as the same 4-tuple
+
+ # Clean low-res masks: remove sprinkles and fill holes
+ low_res_masks = fill_holes_in_mask_scores(low_res_masks, max_area=200)
+ high_res_masks = F.interpolate(low_res_masks, size=(self.image_size, self.image_size), mode="bilinear", align_corners=False)  # replaces the decoder's high-res masks with an upsample of the cleaned low-res ones
+
+ current_out["pred_masks"] = low_res_masks
+ current_out["pred_masks_high_res"] = high_res_masks
+ current_out["obj_ptr"] = obj_ptr
+ current_out["object_score_logits"] = object_score_logits
+
+ # Encode memory
+ if self.num_maskmem > 0:
+ pix_feat = to_spatial(current_vision_feats[-1], H, W)  # memory is encoded from the raw top-level features, not from pix_feat_with_mem
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+ pix_feat=pix_feat,
+ pred_masks_high_res=high_res_masks,
+ object_score_logits=object_score_logits,
+ is_mask_from_pts=(point_inputs is not None),  # point prompts select the hard-threshold branch of _encode_new_memory
+ )
+ current_out["maskmem_features"] = maskmem_features
+ current_out["maskmem_pos_enc"] = maskmem_pos_enc
+ else:
+ current_out["maskmem_features"] = None  # keys are always present, even when no memory is encoded
+ current_out["maskmem_pos_enc"] = None
+
+ return current_out
+
+ def _compute_backbone_frame(self, backbone_fn, frame, frame_idx=None):  # thin wrapper over the module-level _compute_backbone helper
+ vision_feats, vision_pos, feat_sizes, _, _ = _compute_backbone(backbone_fn, frame, frame_idx)  # the last two returned values are discarded
+ # SAM3: drop last FPN level
+ return vision_feats[:-1], vision_pos[:-1], feat_sizes[:-1]  # all three sequences lose their final entry
+
+ def _track_single_object(self, backbone_fn, images, initial_mask, pbar=None,
+ target_device=None, target_dtype=None):  # device/dtype fall back to those of `images` when not given
+ """Track one object, computing backbone per frame to save VRAM."""
+ N = images.shape[0]
+ device = target_device if target_device is not None else images.device
+ dt = target_dtype if target_dtype is not None else images.dtype
+ size = self.image_size
+ output_dict = {"cond_frame_outputs": {}, "non_cond_frame_outputs": {}}  # both sub-dicts are keyed by frame index
+ all_masks = []
+
+ for frame_idx in tqdm(range(N), desc="tracking"):
+ vision_feats, vision_pos, feat_sizes = self._compute_backbone_frame(
+ backbone_fn, _prep_frame(images, slice(frame_idx, frame_idx + 1), device, dt, size), frame_idx=frame_idx)  # backbone runs on a single-frame slice
+ mask_input = None
+ if frame_idx == 0:
+ mask_input = F.interpolate(initial_mask.to(device=device, dtype=dt),
+ size=(self.image_size, self.image_size), mode="bilinear", align_corners=False)
+ mask_input = (mask_input > 0.5).to(dt)  # re-binarize at 0.5 after the bilinear resize
+
+ current_out = self.track_step(
+ frame_idx=frame_idx, is_init_cond_frame=(frame_idx == 0),
+ current_vision_feats=vision_feats, current_vision_pos_embeds=vision_pos,
+ feat_sizes=feat_sizes, mask_inputs=mask_input, output_dict=output_dict, num_frames=N)  # mask_input is None for every frame after the first
+
+ if frame_idx == 0:
+ output_dict["cond_frame_outputs"][frame_idx] = current_out  # frame 0 is the only conditioning frame in this method
+ else:
+ output_dict["non_cond_frame_outputs"][frame_idx] = current_out
+ lookback = max(self.num_maskmem, self.max_obj_ptrs_in_encoder)
+ for old_idx in list(output_dict["non_cond_frame_outputs"]):  # iterate over a copy of the keys so entries can be deleted
+ if old_idx < frame_idx - lookback:
+ del output_dict["non_cond_frame_outputs"][old_idx]  # drop stored outputs more than `lookback` frames behind the current one
+ # Move masks to the intermediate device right away (intended to free VRAM)
+ all_masks.append(current_out["pred_masks_high_res"].to(comfy.model_management.intermediate_device()))
+ if pbar is not None:
+ pbar.update(1)  # external progress bar, advanced in addition to the tqdm loop above
+
+ return torch.cat(all_masks, dim=0) # [N, 1, H, W]
+
+ def track_video(self, backbone_fn, images, initial_masks, pbar=None,
+ target_device=None, target_dtype=None, **kwargs):  # **kwargs is accepted but not used in this method
+ """Track one or more objects across video frames.
+
+ Args:
+ backbone_fn: callable that returns (sam2_features, sam2_positions, trunk_out) for a frame
+ images: [N, 3, H, W] CPU full-res video frames (resized per-frame to self.image_size)
+ initial_masks: [N_obj, 1, H, W] binary masks for first frame (one per object)
+ pbar: optional progress bar
+
+ Returns:
+ [N, N_obj, image_size, image_size] predicted mask logits per frame per object
+ """
+ N_obj = initial_masks.shape[0]
+ per_object = []
+ for obj_idx in range(N_obj):  # objects are tracked one after another, each over the whole video
+ obj_masks = self._track_single_object(
+ backbone_fn, images, initial_masks[obj_idx:obj_idx + 1], pbar=pbar,  # slicing keeps the leading object dim at size 1
+ target_device=target_device, target_dtype=target_dtype)
+ per_object.append(obj_masks)
+
+ return torch.cat(per_object, dim=1) # [N, N_obj, H, W]
+
+
+class SAM31Tracker(nn.Module):
+ """SAM3.1 multiplex tracker: decoupled memory attention, dual decoder, 16-object multiplex."""
+
+ def __init__(self, d_model=256, mem_dim=256, num_maskmem=7, num_multiplex=16, device=None, dtype=None, operations=None, **kwargs):  # **kwargs is accepted and ignored
+ super().__init__()
+ self.d_model = d_model
+ self.mem_dim = mem_dim
+ self.num_maskmem = num_maskmem
+ self.num_multiplex = num_multiplex
+ self.image_size = 1008  # fixed here; not configurable through the constructor
+ self.backbone_stride = 14
+ self.max_obj_ptrs_in_encoder = 16
+ self.sigmoid_scale_for_mem_enc = 2.0  # scale and bias applied to masks in _encode_new_memory
+ self.sigmoid_bias_for_mem_enc = -1.0
+
+ # Memory attention (decoupled cross-attention, 8 heads matching reference)
+ self.transformer = DecoupledMemoryTransformer(d_model, num_heads=8, dim_ff=2048, num_layers=4,
+ device=device, dtype=dtype, operations=operations)
+
+ # Propagation decoder (multiplex: 16 objects, multimask_outputs_only)
+ self.sam_mask_decoder = MultiplexMaskDecoder(d_model, num_multiplex, num_multimask_outputs=3,
+ device=device, dtype=dtype, operations=operations)
+ # Interactive decoder (single object, same as SAM3)
+ self.interactive_sam_mask_decoder = SAMMaskDecoder(d_model, num_multimask_outputs=3,
+ device=device, dtype=dtype, operations=operations)
+ self.interactive_sam_prompt_encoder = SAMPromptEncoder(d_model, device=device, dtype=dtype, operations=operations)
+
+ # Memory backbone (mem_dim=256, no out_proj compression)
+ self.maskmem_backbone = MemoryBackbone(d_model, in_chans=num_multiplex * 2, channels=[16, 64, 256, 1024],  # 2 * num_multiplex input channels: _encode_new_memory concatenates mask and conditioning channels
+ device=device, dtype=dtype, operations=operations)
+
+ # Standalone parameters
+ self.maskmem_tpos_enc = nn.Parameter(torch.zeros(num_maskmem, 1, 1, mem_dim, device=device, dtype=dtype))  # zero-initialized; presumably overwritten when weights are loaded
+ self.no_obj_embed_spatial = nn.Parameter(torch.zeros(num_multiplex, mem_dim, device=device, dtype=dtype))
+ self.interactivity_no_mem_embed = nn.Parameter(torch.zeros(1, 1, d_model, device=device, dtype=dtype))
+
+ # Object pointer projection
+ self.obj_ptr_proj = MLP(d_model, d_model, d_model, 3, device=device, dtype=dtype, operations=operations)  # used by _forward_propagation
+ self.obj_ptr_tpos_proj = operations.Linear(d_model, mem_dim, device=device, dtype=dtype)
+ self.no_obj_ptr_linear = operations.Linear(d_model, d_model, device=device, dtype=dtype)
+ self.interactive_obj_ptr_proj = MLP(d_model, d_model, d_model, 3, device=device, dtype=dtype, operations=operations)  # passed to the interactive helpers in _forward_sam_heads / _use_mask_as_output
+
+ # Interactive mask downsample
+ self.interactive_mask_downsample = operations.Conv2d(1, 1, kernel_size=4, stride=4, device=device, dtype=dtype)
+
+ # Multiplex validity embeddings
+ self.output_valid_embed = nn.Parameter(torch.zeros(num_multiplex, d_model, device=device, dtype=dtype))  # blended with output_invalid_embed in _forward_propagation
+ self.output_invalid_embed = nn.Parameter(torch.zeros(num_multiplex, d_model, device=device, dtype=dtype))
+
+ # Position encoding for image (used by multiplex decoder)
+ self.image_pe_layer = PositionEmbeddingRandom(d_model // 2)  # constructed without device/dtype/operations, unlike the modules above
+
+ def _no_obj_blend(self, obj_ptr, is_obj):  # blends the pointer with its no_obj_ptr_linear projection, weighted by is_obj
+ alpha = is_obj.to(obj_ptr.dtype)
+ return torch.lerp(self.no_obj_ptr_linear(obj_ptr), obj_ptr, alpha)  # lerp(a, b, w) = a + w * (b - a): alpha == 1 keeps obj_ptr, alpha == 0 yields the projection
+
+ def _forward_sam_heads(self, backbone_features, point_inputs=None, mask_inputs=None, box_inputs=None,
+ high_res_features=None, multimask_output=False):  # delegates to the module-level forward_sam_heads helper
+ return forward_sam_heads(backbone_features, self.interactive_sam_prompt_encoder, self.interactive_sam_mask_decoder,  # always the interactive prompt encoder / decoder, never the multiplex decoder
+ self.interactive_obj_ptr_proj, self._no_obj_blend, self.image_size,
+ point_inputs, mask_inputs, box_inputs, high_res_features, multimask_output)
+
+ def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):  # delegates to the module-level use_mask_as_output helper
+ return use_mask_as_output(backbone_features, high_res_features, mask_inputs,
+ self.interactive_mask_downsample, self.interactive_sam_prompt_encoder,  # wired to the interactive-path modules only
+ self.interactive_sam_mask_decoder, self.interactive_obj_ptr_proj,
+ self._no_obj_blend, self.image_size, self.backbone_stride)
+
+ def _prepare_memory_conditioned_features(self, frame_idx, is_init_cond_frame, current_vision_feats,
+ current_vision_pos_embeds, feat_sizes, output_dict, num_frames,
+ multiplex_state=None):  # fuses the current frame's top-level features with stored memory; every return path yields a spatial feature map
+ B = current_vision_feats[-1].shape[0]
+ C = self.d_model
+ H, W = feat_sizes[-1]
+ device = current_vision_feats[-1].device
+ num_buc = multiplex_state.num_buckets if multiplex_state is not None else None  # None disables the _pad_to_buckets calls below
+
+ if self.num_maskmem == 0:
+ return current_vision_feats[-1].permute(0, 2, 1).view(B, C, H, W)  # memory disabled: features returned unchanged apart from the reshape to (B, C, H, W)
+
+ if is_init_cond_frame:
+ pix_feat = current_vision_feats[-1] + cast_to_input(self.interactivity_no_mem_embed, current_vision_feats[-1])  # init frame: add the learned no-memory embedding instead of attending to memory
+ return to_spatial(pix_feat, H, W)
+
+ to_cat_memory, to_cat_memory_pos, to_cat_image_feat, to_cat_image_pos, cond_outputs = collect_memory_tokens(
+ output_dict, frame_idx, self.num_maskmem, self.maskmem_tpos_enc, device,
+ collect_image_feats=True, tpos_v2=True, num_buckets=num_buc)
+
+ max_obj_ptrs = min(num_frames, self.max_obj_ptrs_in_encoder)
+ pos_and_ptrs = []  # list of (temporal distance to the current frame, object pointer)
+ for t, out in cond_outputs.items():
+ if t <= frame_idx and "obj_ptr" in out:  # conditioning frames at or before the current frame only
+ ptr = out["obj_ptr"].to(device)
+ if num_buc is not None:
+ ptr = _pad_to_buckets(ptr, num_buc)
+ pos_and_ptrs.append(((frame_idx - t), ptr))
+ for t_diff in range(1, max_obj_ptrs):  # walk backwards over up to max_obj_ptrs - 1 previous frames
+ t = frame_idx - t_diff
+ if t < 0:
+ break
+ out = output_dict["non_cond_frame_outputs"].get(t, None)  # missing frames are skipped
+ if out is not None and "obj_ptr" in out:
+ ptr = out["obj_ptr"].to(device)
+ if num_buc is not None:
+ ptr = _pad_to_buckets(ptr, num_buc)
+ pos_and_ptrs.append((t_diff, ptr))
+
+ num_obj_ptr_tokens = 0
+ if len(pos_and_ptrs) > 0:
+ pos_list, ptrs_list = zip(*pos_and_ptrs)
+ obj_ptrs = torch.stack(ptrs_list, dim=1) # [num_buckets, N, M, C]
+ B_ptr = obj_ptrs.shape[0]
+ N_ptrs = obj_ptrs.shape[1]
+ M = obj_ptrs.shape[2]
+ obj_ptrs = obj_ptrs.reshape(B_ptr, N_ptrs * M, -1)  # merge the frame and slot dims into one token dim
+ obj_pos = compute_tpos_enc(list(pos_list), device, self.d_model, self.obj_ptr_tpos_proj,
+ max_abs_pos=max_obj_ptrs, dtype=current_vision_feats[-1].dtype)
+ obj_pos = obj_pos.unsqueeze(0).expand(B_ptr, -1, -1)
+ obj_pos = obj_pos.unsqueeze(2).expand(-1, -1, M, -1).reshape(B_ptr, N_ptrs * M, -1)  # repeat each frame's encoding for all M slots so it lines up with obj_ptrs
+ to_cat_memory.append(obj_ptrs)
+ to_cat_memory_pos.append(obj_pos)
+ num_obj_ptr_tokens = obj_ptrs.shape[1]  # forwarded to the encoder as num_k_exclude_rope
+
+ if len(to_cat_memory) == 0:
+ pix_feat = current_vision_feats[-1] + cast_to_input(self.interactivity_no_mem_embed, current_vision_feats[-1])  # nothing to attend to: same fallback as the init-frame case
+ return to_spatial(pix_feat, H, W)
+
+ memory = torch.cat(to_cat_memory, dim=1)
+ memory_pos = torch.cat(to_cat_memory_pos, dim=1)
+
+ # Expand vision features to num_buckets if memory has more buckets than B
+ mem_B = memory.shape[0]
+ x = current_vision_feats[-1]
+ x_pos = current_vision_pos_embeds[-1]
+ if x.shape[0] < mem_B:
+ x = x.expand(mem_B, -1, -1)  # expand() only works when the existing batch dim is 1 - presumably always the case here, confirm
+ x_pos = x_pos.expand(mem_B, -1, -1)
+
+ if len(to_cat_image_feat) > 0:
+ # Decoupled cross-attention: separate image features from memory
+ memory_image = cast_to_input(torch.cat(to_cat_image_feat, dim=1), x)
+ memory_image_pos = cast_to_input(torch.cat(to_cat_image_pos, dim=1), x)
+ if memory_image.shape[0] < mem_B:
+ memory_image = memory_image.expand(mem_B, -1, -1)
+ memory_image_pos = memory_image_pos.expand(mem_B, -1, -1)
+ pix_feat_with_mem = self.transformer.encoder(
+ x=x,
+ memory=cast_to_input(memory, x),
+ memory_pos=cast_to_input(memory_pos, x),
+ src_pos=cast_to_input(x_pos, x),
+ num_k_exclude_rope=num_obj_ptr_tokens,
+ memory_image=memory_image,
+ memory_image_pos=memory_image_pos,
+ )
+ else:
+ pix_feat_with_mem = self.transformer.encoder(  # no stored image features: encoder is called without memory_image and without cast_to_input on its arguments
+ x=x,
+ memory=memory,
+ memory_pos=memory_pos,
+ src_pos=x_pos,
+ num_k_exclude_rope=num_obj_ptr_tokens,
+ )
+ return to_spatial(pix_feat_with_mem, H, W)
+
+ def _encode_new_memory(self, pix_feat, pred_masks_high_res, object_score_logits, is_mask_from_pts=False,
+ multiplex_state=None, is_conditioning=False, cond_obj_mask=None):  # returns (maskmem_features, maskmem_pos_enc) for the multiplexed objects
+ if is_mask_from_pts:
+ mask_for_mem = (pred_masks_high_res > 0).to(pix_feat.dtype)  # hard 0/1 mask: logits thresholded at 0
+ else:
+ mask_for_mem = torch.sigmoid(pred_masks_high_res)  # soft mask in (0, 1)
+ mask_for_mem.mul_(self.sigmoid_scale_for_mem_enc).add_(self.sigmoid_bias_for_mem_enc)  # in-place scale + bias on the freshly created mask tensor
+
+ # Mux masks: [N_obj, 1, H, W] -> [num_buckets, M, H, W]
+ mux_masks = multiplex_state.mux(mask_for_mem[:, 0])  # multiplex_state is dereferenced unconditionally: the default None raises AttributeError here
+
+ # Conditioning channel: 1.0 = clean mask (trust it), 0.0 = propagation (noisy)
+ N_obj = mask_for_mem.shape[0]
+ cond_values = torch.full((N_obj,), 0.0, device=mask_for_mem.device, dtype=mask_for_mem.dtype)
+ if is_conditioning:
+ cond_values[:] = 1.0  # every object is flagged as conditioning
+ elif cond_obj_mask is not None:
+ cond_values[cond_obj_mask] = 1.0  # only the objects selected by cond_obj_mask are flagged
+ cond_spatial = cond_values.view(-1, 1, 1, 1).expand_as(mask_for_mem[:, 0:1, :, :]).squeeze(1)  # broadcast each per-object flag over the mask's spatial size
+ mux_cond = multiplex_state.mux(cond_spatial) # [num_buckets, M, H, W]
+ mux_input = torch.cat([mux_masks, mux_cond], dim=1) # [num_buckets, 2*M, H, W]
+
+ maskmem_out = self.maskmem_backbone(pix_feat, mux_input, skip_mask_sigmoid=True)
+ maskmem_features = maskmem_out["vision_features"]
+ maskmem_pos_enc = maskmem_out["vision_pos_enc"]
+
+ # Add no_obj_embed_spatial for occluded objects
+ is_obj = (object_score_logits > 0).float() # [N_obj, 1]
+ mux_is_obj = multiplex_state.mux(is_obj) # [num_buckets, M, 1]
+ no_obj_embed = cast_to_input(self.no_obj_embed_spatial, maskmem_features) # [M, C]
+ no_obj_spatial = no_obj_embed.unsqueeze(0)[..., None, None] # [1, M, C, 1, 1]
+ # Expand and sum across multiplex slots weighted by (1 - is_obj)
+ alpha = mux_is_obj[..., None, None] # [num_buckets, M, 1, 1, 1]
+ per_slot_no_obj = ((1 - alpha) * no_obj_spatial).sum(dim=1) # [num_buckets, C, 1, 1]
+ maskmem_features = maskmem_features + per_slot_no_obj.expand_as(maskmem_features)  # NOTE(review): alpha comes from .float(), so this sum may promote lower-precision features - confirm intended dtype
+
+ return maskmem_features, maskmem_pos_enc
+
+ def _forward_propagation(self, backbone_features, high_res_features=None, multiplex_state=None):  # returns (low_res_masks, high_res_masks, obj_ptr_muxed, score_obj); multiplex_state is required despite its None default
+ """Propagation path using the multiplex SAM decoder (no prompts)."""
+ B = backbone_features.shape[0]
+ device = backbone_features.device
+
+ # Suppression embeddings from valid object mask
+ valid_mask = cast_to_input(multiplex_state.get_valid_object_mask().unsqueeze(-1).float(), backbone_features)
+ output_valid = cast_to_input(self.output_valid_embed, backbone_features).unsqueeze(0)
+ output_invalid = cast_to_input(self.output_invalid_embed, backbone_features).unsqueeze(0)
+ extra_embed = valid_mask * output_valid + (1 - valid_mask) * output_invalid  # per slot: the valid embedding where the mask is 1, the invalid embedding where it is 0
+
+ image_pe = self.image_pe_layer((backbone_features.shape[-2], backbone_features.shape[-1]), device=backbone_features.device)
+ image_pe = cast_to_input(image_pe, backbone_features)
+
+ masks, iou_pred, sam_tokens_out, object_score_logits = self.sam_mask_decoder(
+ image_embeddings=backbone_features, image_pe=image_pe,
+ sparse_prompt_embeddings=torch.empty(B, 0, self.d_model, device=device, dtype=backbone_features.dtype),  # zero prompt tokens
+ dense_prompt_embeddings=torch.zeros(B, self.d_model, *backbone_features.shape[-2:], device=device, dtype=backbone_features.dtype),  # all-zero dense prompt
+ high_res_features=high_res_features, multimask_output=True, return_all=True,
+ extra_per_object_embeddings=extra_embed.expand(B, -1, -1),
+ )
+ # masks: [B=num_buckets, M, T, H, W]
+ # Demux to per-object: [N_obj, T, H, W]
+ masks_obj = multiplex_state.demux(masks)
+ iou_obj = multiplex_state.demux(iou_pred)
+ score_obj = multiplex_state.demux(object_score_logits)
+ tokens_obj = multiplex_state.demux(sam_tokens_out)
+
+ # Select best mask by IoU for each object
+ best_idx = torch.argmax(iou_obj, dim=-1) # [N_obj]
+ N_obj = masks_obj.shape[0]
+ obj_range = torch.arange(N_obj, device=device)
+ low_res_masks = masks_obj[obj_range, best_idx].unsqueeze(1) # [N_obj, 1, H, W]
+ # Suppress masks for objects with low confidence
+ is_obj = score_obj > 0
+ low_res_masks = torch.where(is_obj[:, :, None, None], low_res_masks,
+ torch.tensor(NO_OBJ_SCORE, device=device, dtype=low_res_masks.dtype))  # objects with a non-positive score get a constant NO_OBJ_SCORE mask
+ high_res_masks = F.interpolate(low_res_masks.float(), size=(self.image_size, self.image_size), mode="bilinear", align_corners=False)  # upsampled in float32 regardless of the input dtype
+
+ # Object pointer: compute per-object, mux for storage as [num_buckets, M, C]
+ sam_token = tokens_obj[:, 0] # [N_obj, C]
+ obj_ptr = self.obj_ptr_proj(sam_token)
+ is_obj = (score_obj > 0).float()  # same test as above, rebound as a float weight for the blend below
+ no_obj = self.no_obj_ptr_linear(obj_ptr)
+ obj_ptr = is_obj * obj_ptr + (1 - is_obj) * no_obj  # explicit form of the blend that _no_obj_blend does with torch.lerp
+ obj_ptr_muxed = multiplex_state.mux(obj_ptr) # [num_buckets, M, C]
+
+ return low_res_masks, high_res_masks, obj_ptr_muxed, score_obj
+
+ def track_step(self, frame_idx, is_init_cond_frame, current_vision_feats, current_vision_pos_embeds,
+ feat_sizes, mask_inputs, output_dict, num_frames, point_inputs=None,
+ interactive_high_res=None, interactive_backbone=None, propagation_high_res=None,
+ multiplex_state=None, run_mem_encoder=True):  # returns a dict of per-frame outputs; path chosen by mask_inputs, then point_inputs, else propagation
+ current_out = {}
+ H, W = feat_sizes[-1]
+
+ if mask_inputs is not None:
+ # Conditioning frame: use interactive features if available, else propagation
+ if interactive_backbone is not None:
+ pix_feat = interactive_backbone
+ # Add no_mem_embed for interactive path
+ pix_flat = pix_feat.flatten(2)  # merge the spatial dims into one
+ bf = pix_flat.permute(0, 2, 1) + cast_to_input(self.interactivity_no_mem_embed, pix_flat)  # move channels last so the (1, 1, d_model) embedding broadcasts
+ pix_feat = to_spatial(bf, H, W)
+ hi_res = interactive_high_res
+ else:
+ # Fallback: interactive backbone not available (e.g. called outside track_video).
+ # Propagation features work but may produce lower-quality conditioning.
+ pix_feat = to_spatial(current_vision_feats[-1], H, W)  # unlike the branch above, no no-memory embedding is added here
+ hi_res = propagation_high_res
+ sam_outputs = self._use_mask_as_output(pix_feat, hi_res, mask_inputs)
+ elif point_inputs is not None:
+ # Interactive path: use interactive SAM decoder
+ pix_feat_with_mem = self._prepare_memory_conditioned_features(
+ frame_idx=frame_idx, is_init_cond_frame=is_init_cond_frame,
+ current_vision_feats=current_vision_feats, current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes, output_dict=output_dict, num_frames=num_frames,
+ multiplex_state=multiplex_state,
+ )
+ hi_res = interactive_high_res if interactive_high_res is not None else propagation_high_res
+ num_pts = point_inputs["point_labels"].size(1)
+ multimask_output = is_init_cond_frame and 0 < num_pts <= 1  # true only on an init frame with exactly one point
+ sam_outputs = self._forward_sam_heads(
+ backbone_features=pix_feat_with_mem, point_inputs=point_inputs,
+ high_res_features=hi_res, multimask_output=multimask_output,
+ )
+ else:
+ # Propagation path: use multiplex SAM decoder with propagation features
+ pix_feat_with_mem = self._prepare_memory_conditioned_features(
+ frame_idx=frame_idx, is_init_cond_frame=is_init_cond_frame,
+ current_vision_feats=current_vision_feats, current_vision_pos_embeds=current_vision_pos_embeds,
+ feat_sizes=feat_sizes, output_dict=output_dict, num_frames=num_frames,
+ multiplex_state=multiplex_state,
+ )
+ sam_outputs = self._forward_propagation(pix_feat_with_mem, propagation_high_res,
+ multiplex_state=multiplex_state)  # _forward_propagation dereferences multiplex_state, so it must not be None on this path
+
+ (low_res_masks, high_res_masks, obj_ptr, object_score_logits) = sam_outputs  # all three paths are unpacked as the same 4-tuple
+
+ # Mux obj_ptr if it came from interactive path (shape [B, C]) vs propagation ([num_buckets, M, C])
+ if multiplex_state is not None and obj_ptr.dim() == 2:
+ obj_ptr = multiplex_state.mux(obj_ptr) # [N_obj, C] -> [num_buckets, M, C]
+
+ # Encode memory (can be deferred with run_mem_encoder=False)
+ if run_mem_encoder and self.num_maskmem > 0:
+ pix_feat = to_spatial(current_vision_feats[-1], H, W)  # memory is always encoded from current_vision_feats, even when the interactive backbone produced the masks
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+ pix_feat=pix_feat, pred_masks_high_res=high_res_masks,
+ object_score_logits=object_score_logits,
+ is_mask_from_pts=(point_inputs is not None),
+ multiplex_state=multiplex_state,
+ is_conditioning=(mask_inputs is not None),  # mask-conditioned frames flag every object as conditioning
+ )
+ current_out["maskmem_features"] = maskmem_features
+ current_out["maskmem_pos_enc"] = maskmem_pos_enc
+ else:
+ current_out["maskmem_features"] = None  # left as None when encoding is skipped; _deferred_memory_encode can fill these in later
+ current_out["maskmem_pos_enc"] = None
+
+ # Store propagation image features for decoupled memory attention
+ current_out["image_features"] = current_vision_feats[-1] # [B, HW, C]
+ current_out["image_pos_enc"] = current_vision_pos_embeds[-1] # [B, HW, C]
+
+ current_out["pred_masks"] = low_res_masks
+ current_out["pred_masks_high_res"] = high_res_masks
+ current_out["obj_ptr"] = obj_ptr
+ current_out["object_score_logits"] = object_score_logits
+
+ return current_out
+
+ def _compute_backbone_frame(self, backbone_fn, frame, frame_idx=None):  # thin wrapper over the module-level _compute_backbone helper
+ vision_feats, vision_pos, feat_sizes, features, trunk_out = _compute_backbone(backbone_fn, frame, frame_idx)
+ return vision_feats, vision_pos, feat_sizes, list(features[:-1]), trunk_out  # the first three pass through unchanged; `features` loses its last entry
+
+ @staticmethod
+ def _suppress_recently_occluded(low_res_masks, last_occluded, frame_idx, threshold=0.3):  # mutates both low_res_masks and last_occluded in place
+ """Suppress overlapping masks for objects that were most recently occluded.
+ Prevents corrupted masks from occluded objects from contaminating other objects."""
+ N_obj = low_res_masks.shape[0]
+ if N_obj <= 1:
+ return low_res_masks  # nothing to compare against; last_occluded is left untouched
+ binary = low_res_masks[:, 0] > 0 # [N_obj, H, W]
+ iou = _compute_mask_overlap(low_res_masks[:, 0], low_res_masks[:, 0])  # pairwise overlap of the masks with themselves; exact metric is defined in _compute_mask_overlap
+ overlapping = torch.triu(iou >= threshold, diagonal=1) # [N, N] upper triangle
+ last_occ_i = last_occluded.unsqueeze(1) # [N, 1]
+ last_occ_j = last_occluded.unsqueeze(0) # [1, N]
+ # Suppress the more recently occluded object in each overlapping pair
+ suppress_i = overlapping & (last_occ_i > last_occ_j) & (last_occ_j > -1)  # row object i loses; requires the other object's value to be > -1
+ suppress_j = overlapping & (last_occ_j > last_occ_i) & (last_occ_i > -1)  # column object j loses, under the mirrored condition
+ to_suppress = suppress_i.any(dim=1) | suppress_j.any(dim=0)
+ # Update last_occluded for occluded/suppressed objects
+ is_empty = ~binary.any(dim=(-1, -2))  # objects with no positive pixel, measured before suppression
+ newly_occluded = is_empty | to_suppress
+ last_occluded[newly_occluded] = frame_idx
+ # Suppress masks
+ low_res_masks[to_suppress] = -10.0  # whole mask overwritten with a constant negative logit
+ return low_res_masks
+
+ def _deferred_memory_encode(self, current_out, N_obj, vision_feats, feat_sizes, mux_state, device,
+ cond_obj_mask=None):  # returns None; writes maskmem_features / maskmem_pos_enc into current_out
+ """Deferred memory encoding for propagation frames. cond_obj_mask: per-object bool for conditioning."""
+ low_res_masks = current_out["pred_masks"] # [N_obj, 1, H_low, W_low]
+
+ if N_obj > 1:
+ lr = low_res_masks.squeeze(1) # [N_obj, H, W]
+ max_obj = torch.argmax(lr, dim=0, keepdim=True)  # per pixel, index of the object with the highest logit
+ batch_inds = torch.arange(N_obj, device=device)[:, None, None]
+ pixel_nol = torch.where(max_obj == batch_inds, lr, torch.clamp(lr, max=-10.0))  # non-winning objects are clamped to <= -10 at that pixel
+ area_before = (lr > 0).sum(dim=(-1, -2)).float().clamp(min=1)  # clamp avoids division by zero for empty masks
+ area_after = (pixel_nol > 0).sum(dim=(-1, -2)).float()
+ shrink_ok = (area_after / area_before) >= 0.3  # object retains at least 30% of its positive area once overlaps are resolved
+ low_res_masks = torch.where(
+ shrink_ok[:, None, None, None].expand_as(low_res_masks),
+ low_res_masks, torch.clamp(low_res_masks, max=-10.0))  # failing objects are clamped everywhere; pixel_nol itself is only used for the area test
+
+ interpol_size = self.maskmem_backbone.mask_downsampler.interpol_size
+ mem_masks = F.interpolate(low_res_masks, size=interpol_size,
+ mode="bilinear", align_corners=False)
+
+ obj_scores = torch.where(
+ (mem_masks > 0).any(dim=(-1, -2)), 10.0, -10.0)  # synthetic score logits: +10 when the mask has any positive pixel, -10 otherwise
+
+ pix_feat = to_spatial(vision_feats[-1], feat_sizes[-1][0], feat_sizes[-1][1])
+ maskmem_features, maskmem_pos_enc = self._encode_new_memory(
+ pix_feat=pix_feat, pred_masks_high_res=mem_masks,
+ object_score_logits=obj_scores,
+ multiplex_state=mux_state, cond_obj_mask=cond_obj_mask)  # is_mask_from_pts keeps its default False, so the sigmoid branch is used
+ current_out["maskmem_features"] = maskmem_features
+ current_out["maskmem_pos_enc"] = maskmem_pos_enc
+
+ def _add_detected_objects(self, new_masks, mux_state, vision_feats, feat_sizes, current_out):  # returns None; mutates mux_state and current_out
+ """Grow MultiplexState with new detections, merge masks, re-encode memory. Modifies current_out."""
+ n_old = mux_state.total_valid_entries  # object count before growing
+ mux_state.add_objects(new_masks.shape[0])
+ N_obj = mux_state.total_valid_entries  # object count after growing
+ # Stored memory with old bucket counts is padded at read time by _pad_to_buckets
+ for k in ("pred_masks", "pred_masks_high_res"):
+ det = F.interpolate(new_masks.unsqueeze(1), size=current_out[k].shape[-2:],  # resize the detections to each stored mask's own resolution
+ mode="bilinear", align_corners=False)
+ current_out[k] = torch.cat([current_out[k], det], dim=0)  # new objects are appended after the existing ones
+ if self.num_maskmem > 0:
+ # Mark new objects as conditioning (clean detection masks) so model trusts them
+ cond_mask = torch.zeros(N_obj, dtype=torch.bool, device=new_masks.device)
+ cond_mask[n_old:] = True
+ self._deferred_memory_encode(current_out, N_obj, vision_feats, feat_sizes,
+ mux_state, new_masks.device, cond_obj_mask=cond_mask)  # memory is re-encoded for all objects, old and new
+
+ def _condition_with_masks(self, masks, frame_idx, vision_feats, vision_pos, feat_sizes,
+ high_res_prop, output_dict, N, mux_state, backbone_obj, frame,
+ trunk_out, threshold=0.5):  # N is forwarded to track_step as num_frames; returns the frame's output dict
+ """Condition tracker with masks on a frame."""
+ mask_input = F.interpolate(masks if masks.dim() == 4 else masks.unsqueeze(1),  # masks that are not 4-D get a singleton dim inserted at position 1
+ size=(self.image_size, self.image_size), mode="bilinear", align_corners=False)
+ mask_input = (mask_input > threshold).to(masks.dtype)  # binarize after resizing
+ hi_res = lo_feat = None
+ if backbone_obj is not None and backbone_obj.multiplex:
+ _, _, itf, _ = backbone_obj(frame, tracker_mode="interactive", cached_trunk=trunk_out, tracker_only=True)  # passes the cached trunk output; only the third returned value is used
+ hi_res, lo_feat = itf[:-1], itf[-1]  # last entry goes to track_step as interactive_backbone, the rest as interactive_high_res
+ current_out = self.track_step(
+ frame_idx=frame_idx, is_init_cond_frame=True, current_vision_feats=vision_feats,
+ current_vision_pos_embeds=vision_pos, feat_sizes=feat_sizes, mask_inputs=mask_input,
+ output_dict=output_dict, num_frames=N, interactive_high_res=hi_res,
+ interactive_backbone=lo_feat, propagation_high_res=high_res_prop,
+ multiplex_state=mux_state, run_mem_encoder=True)
+ output_dict["cond_frame_outputs"][frame_idx] = current_out  # registered as a conditioning frame
+ return current_out
+
+ def _match_and_add_detections(self, det_masks, det_scores, current_out, mux_state,
+ vision_feats, feat_sizes, device, max_objects=0,
+ keep_alive=None):  # returns the list of scores for newly added objects ([] when none are added)
+ """Match detections against tracked masks, add new objects, recondition degraded tracks.
+ Updates keep_alive counters: +1 for matched tracks, -1 for unmatched."""
+ N_obj = mux_state.total_valid_entries
+ if det_masks.shape[0] == 0:
+ if keep_alive is not None:
+ for i in range(N_obj):
+ keep_alive[i] = max(-4, keep_alive.get(i, 0) - 1)  # no detections: every track decays, floored at -4
+ return []
+
+ # Match at low-res (like reference)
+ trk_masks = current_out["pred_masks"][:, 0] # [N_obj, H_low, W_low]
+ det_resized = F.interpolate(det_masks.unsqueeze(1), size=trk_masks.shape[-2:],
+ mode="bilinear", align_corners=False)[:, 0]
+ overlap = _compute_mask_overlap(det_resized, trk_masks)  # indexed below as [detection, track]
+
+ # Update keep_alive and find matched tracks
+ matched = set()
+ if overlap.shape[1] > 0:
+ matched = set((overlap >= 0.5).any(dim=0).nonzero(as_tuple=True)[0].tolist())  # tracks overlapped >= 0.5 by at least one detection
+ if keep_alive is not None:
+ for i in range(N_obj):
+ if i in matched:
+ keep_alive[i] = min(8, keep_alive.get(i, 0) + 1)  # counter is capped at 8
+ else:
+ keep_alive[i] = max(-4, keep_alive.get(i, 0) - 1)  # counter is floored at -4
+
+ # Recondition: high-confidence detections (>=0.8) with high overlap refresh tracked masks
+ reconditioned = False
+ if det_scores is not None and overlap.shape[1] > 0:
+ HIGH_CONF = 0.8
+ for det_idx in range(overlap.shape[0]):
+ if det_scores[det_idx] < HIGH_CONF:
+ continue
+ best_trk = overlap[det_idx].argmax().item()  # the single best-overlapping track for this detection
+ if overlap[det_idx, best_trk] >= 0.5:
+ # Replace tracked mask with fresh detection mask
+ current_out["pred_masks"][best_trk] = det_resized[det_idx].unsqueeze(0)  # in-place write into the stored tensor
+ det_hr = F.interpolate(det_masks[det_idx:det_idx+1].unsqueeze(1),
+ size=current_out["pred_masks_high_res"].shape[-2:],
+ mode="bilinear", align_corners=False)
+ current_out["pred_masks_high_res"][best_trk] = det_hr[0]
+ reconditioned = True
+
+ # Re-encode memory if any tracks were reconditioned
+ if reconditioned and self.num_maskmem > 0:
+ self._deferred_memory_encode(current_out, N_obj, vision_feats, feat_sizes, mux_state, device)  # no cond_obj_mask is passed, so refreshed tracks are not flagged as conditioning
+
+ # Add new detections (not matching any track)
+ if max_objects > 0 and N_obj >= max_objects:
+ return []  # already at capacity
+ max_overlap = overlap.max(dim=1)[0] if overlap.shape[1] > 0 else torch.zeros(overlap.shape[0], device=device)  # with no tracks, every detection counts as unmatched
+ new_dets = max_overlap < 0.5
+ if new_dets.any():
+ if max_objects > 0:
+ slots = max_objects - N_obj
+ new_dets = new_dets & (torch.cumsum(new_dets.int(), 0) <= slots)  # keep only the first `slots` new detections, in index order
+ self._add_detected_objects(det_masks[new_dets], mux_state,
+ vision_feats, feat_sizes, current_out)
+ if keep_alive is not None:
+ for i in range(N_obj, mux_state.total_valid_entries):
+ keep_alive[i] = 1  # newly added objects start with a counter of 1
+ return det_scores[new_dets].tolist() if det_scores is not None else [0.0] * new_dets.sum().item()  # placeholder 0.0 scores when no scores were supplied
+ return []
+
+ INTERNAL_MAX_OBJECTS = 64 # Hard ceiling on accumulated tracks; max_objects=0 or any value above this is clamped here.
+
+    def track_video_with_detection(self, backbone_fn, images, initial_masks, detect_fn=None,
+                                   new_det_thresh=0.5, max_objects=0, detect_interval=1,
+                                   backbone_obj=None, pbar=None, target_device=None, target_dtype=None):
+        """Track across frames with optional per-frame detection. Returns a dict with 'packed_masks' (per-frame packed masks stacked on dim 0, or None when no frame produced masks), 'n_frames' and 'scores'."""
+        if max_objects <= 0 or max_objects > self.INTERNAL_MAX_OBJECTS:  # non-positive or oversized limits fall back to the hard ceiling
+            max_objects = self.INTERNAL_MAX_OBJECTS
+        N = images.shape[0]
+        device = target_device if target_device is not None else images.device
+        dt = target_dtype if target_dtype is not None else images.dtype
+        size = self.image_size
+        output_dict = {"cond_frame_outputs": {}, "non_cond_frame_outputs": {}}
+        all_masks = []  # one entry per frame: packed masks moved to idev, or None when nothing is tracked
+        idev = comfy.model_management.intermediate_device()
+        mux_state = None
+        if initial_masks is not None:
+            mux_state = MultiplexState(initial_masks.shape[0], self.num_multiplex, device, dt)
+        obj_scores = []  # per-object detection score (1.0 for initial masks)
+        keep_alive = {} if detect_fn is not None else None  # per-object counters, only maintained when detection is enabled
+        last_occluded = torch.empty(0, device=device, dtype=torch.long)  # per-object last occluded frame
+
+        # Prefetch next frame's backbone on a separate CUDA stream
+        prefetch = False
+        backbone_stream = None
+        if comfy.model_management.is_device_cuda(device):
+            try:
+                backbone_stream = torch.cuda.Stream(device=device)
+                prefetch = True
+            except RuntimeError:  # stream creation failed: keep prefetch off and compute each backbone inline
+                pass
+        cur_bb = self._compute_backbone_frame(backbone_fn, _prep_frame(images, slice(0, 1), device, dt, size), frame_idx=0)
+
+        for frame_idx in tqdm(range(N), desc="tracking"):
+            vision_feats, vision_pos, feat_sizes, high_res_prop, trunk_out = cur_bb
+
+            # Start next frame's backbone on separate stream (overlaps with current frame's work)
+            if prefetch and frame_idx + 1 < N:
+                backbone_stream.wait_stream(torch.cuda.current_stream(device))
+                with torch.cuda.stream(backbone_stream):
+                    next_bb = self._compute_backbone_frame(
+                        backbone_fn, _prep_frame(images, slice(frame_idx + 1, frame_idx + 2), device, dt, size), frame_idx=frame_idx + 1)
+
+            # Per-frame detection with NMS (skip if no detect_fn, or interval/max not met)
+            det_masks = torch.empty(0, device=device)
+            det_scores = None
+            run_det = (detect_fn is not None
+                       and frame_idx % max(detect_interval, 1) == 0
+                       and not (max_objects > 0 and mux_state is not None
+                                and mux_state.total_valid_entries >= max_objects))
+            if run_det:
+                det_out = detect_fn(trunk_out)
+                scores = det_out["scores"][0].sigmoid()
+                keep = scores > new_det_thresh
+                det_masks, det_scores = det_out["masks"][0][keep], scores[keep]
+                if det_masks.shape[0] > 1:
+                    det_masks, det_scores = _nms_masks(det_masks, det_scores)
+
+            if frame_idx == 0 and initial_masks is not None:  # first frame is conditioned on the caller-supplied masks; detections from this frame are not used
+                current_out = self._condition_with_masks(
+                    initial_masks.to(device=device, dtype=dt), frame_idx, vision_feats, vision_pos,
+                    feat_sizes, high_res_prop, output_dict, N, mux_state, backbone_obj,
+                    _prep_frame(images, slice(frame_idx, frame_idx + 1), device, dt, size), trunk_out)
+                last_occluded = torch.full((mux_state.total_valid_entries,), -1, device=device, dtype=torch.long)
+                obj_scores = [1.0] * mux_state.total_valid_entries
+                if keep_alive is not None:
+                    for i in range(mux_state.total_valid_entries):
+                        keep_alive[i] = 8
+            elif mux_state is None or mux_state.total_valid_entries == 0:  # nothing tracked yet: bootstrap from detections if any
+                if det_masks.shape[0] > 0:
+                    if max_objects > 0:
+                        det_scores = det_scores[:max_objects]
+                        det_masks = det_masks[:max_objects]
+                    mux_state = MultiplexState(det_masks.shape[0], self.num_multiplex, device, dt)
+                    current_out = self._condition_with_masks(
+                        det_masks, frame_idx, vision_feats, vision_pos, feat_sizes, high_res_prop,
+                        output_dict, N, mux_state, backbone_obj,
+                        _prep_frame(images, slice(frame_idx, frame_idx + 1), device, dt, size), trunk_out, threshold=0.0)
+                    last_occluded = torch.full((mux_state.total_valid_entries,), -1, device=device, dtype=torch.long)
+                    obj_scores = det_scores[:mux_state.total_valid_entries].tolist()
+                    if keep_alive is not None:
+                        for i in range(mux_state.total_valid_entries):
+                            keep_alive[i] = 1
+                else:
+                    all_masks.append(None)  # no objects on this frame; replaced by an all-zero entry after the loop
+                    if pbar is not None:
+                        pbar.update(1)
+                    # Skip to backbone advance at end of loop
+                    if frame_idx + 1 < N:
+                        if prefetch:
+                            torch.cuda.current_stream(device).wait_stream(backbone_stream)
+                            cur_bb = next_bb
+                        else:
+                            cur_bb = self._compute_backbone_frame(backbone_fn, _prep_frame(images, slice(frame_idx + 1, frame_idx + 2), device, dt, size), frame_idx=frame_idx + 1)
+                    continue
+            else:  # regular propagation step for already-tracked objects
+                N_obj = mux_state.total_valid_entries
+                current_out = self.track_step(
+                    frame_idx=frame_idx, is_init_cond_frame=False, current_vision_feats=vision_feats,
+                    current_vision_pos_embeds=vision_pos, feat_sizes=feat_sizes, mask_inputs=None,
+                    output_dict=output_dict, num_frames=N, propagation_high_res=high_res_prop,
+                    multiplex_state=mux_state, run_mem_encoder=False)
+                current_out["pred_masks"] = fill_holes_in_mask_scores(
+                    current_out["pred_masks"], max_area=16)
+                if last_occluded.shape[0] == N_obj and N_obj > 1:
+                    self._suppress_recently_occluded(
+                        current_out["pred_masks"], last_occluded, frame_idx)
+                if self.num_maskmem > 0:
+                    self._deferred_memory_encode(current_out, N_obj, vision_feats, feat_sizes, mux_state, device)
+                output_dict["non_cond_frame_outputs"][frame_idx] = current_out
+                lookback = max(self.num_maskmem, self.max_obj_ptrs_in_encoder)  # non-conditioning outputs older than this many frames are dropped below
+                for old_idx in list(output_dict["non_cond_frame_outputs"]):
+                    if old_idx < frame_idx - lookback:
+                        del output_dict["non_cond_frame_outputs"][old_idx]
+                n_before = mux_state.total_valid_entries
+                new_obj_scores = self._match_and_add_detections(det_masks, det_scores, current_out, mux_state,
+                                                               vision_feats, feat_sizes, device, max_objects,
+                                                               keep_alive if run_det else None)
+                n_added = mux_state.total_valid_entries - n_before
+                if n_added > 0:
+                    last_occluded = torch.cat([last_occluded,
+                                               torch.full((n_added,), -1, device=device, dtype=torch.long)])
+                    obj_scores.extend(new_obj_scores)
+
+            masks_out = current_out["pred_masks_high_res"][:, 0]
+            if keep_alive is not None:
+                for i in range(masks_out.shape[0]):
+                    if keep_alive.get(i, 0) <= 0:  # objects with a missing or non-positive counter are blanked
+                        masks_out[i] = NO_OBJ_SCORE
+            N_obj_now = mux_state.total_valid_entries if mux_state is not None else 0
+            if N_obj_now > 0:
+                all_masks.append(pack_masks(masks_out).to(idev))
+            else:
+                all_masks.append(None)
+            if pbar is not None:
+                pbar.update(1)
+
+            # Next frame's backbone
+            if frame_idx + 1 < N:
+                if prefetch:
+                    torch.cuda.current_stream(device).wait_stream(backbone_stream)  # make the prefetched result visible to the current stream
+                    cur_bb = next_bb
+                else:
+                    cur_bb = self._compute_backbone_frame(backbone_fn, _prep_frame(images, slice(frame_idx + 1, frame_idx + 2), device, dt, size), frame_idx=frame_idx + 1)
+
+        if not all_masks or all(m is None for m in all_masks):
+            return {"packed_masks": None, "n_frames": N, "scores": []}
+
+        max_obj = max(m.shape[0] for m in all_masks if m is not None)  # frames are zero-padded up to this object count
+        sample = next(m for m in all_masks if m is not None)
+        empty_packed = torch.zeros(max_obj, *sample.shape[1:], dtype=torch.uint8, device=sample.device)  # stand-in for frames without masks
+        for i, m in enumerate(all_masks):
+            if m is None:
+                all_masks[i] = empty_packed
+            elif m.shape[0] < max_obj:
+                pad = torch.zeros(max_obj - m.shape[0], *m.shape[1:], dtype=torch.uint8, device=m.device)
+                all_masks[i] = torch.cat([m, pad], dim=0)
+        return {"packed_masks": torch.stack(all_masks, dim=0), "n_frames": N, "scores": obj_scores}
diff --git a/comfy/ldm/supir/__init__.py b/comfy/ldm/supir/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/comfy/ldm/supir/supir_modules.py b/comfy/ldm/supir/supir_modules.py
new file mode 100644
index 000000000..7389b01d2
--- /dev/null
+++ b/comfy/ldm/supir/supir_modules.py
@@ -0,0 +1,226 @@
+import torch
+import torch.nn as nn
+
+from comfy.ldm.modules.diffusionmodules.util import timestep_embedding
+from comfy.ldm.modules.diffusionmodules.openaimodel import Downsample, TimestepEmbedSequential, ResBlock, SpatialTransformer
+from comfy.ldm.modules.attention import optimized_attention
+
+
+class ZeroSFT(nn.Module):  # adapter that modulates features h with a scale/shift predicted from control features c
+    def __init__(self, label_nc, norm_nc, concat_channels=0, dtype=None, device=None, operations=None):
+        super().__init__()
+
+        ks = 3
+        pw = ks // 2  # padding that keeps spatial size for a ks x ks conv
+
+        self.param_free_norm = operations.GroupNorm(32, norm_nc + concat_channels, dtype=dtype, device=device)
+
+        nhidden = 128
+
+        self.mlp_shared = nn.Sequential(
+            operations.Conv2d(label_nc, nhidden, kernel_size=ks, padding=pw, dtype=dtype, device=device),
+            nn.SiLU()
+        )
+        self.zero_mul = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device)
+        self.zero_add = operations.Conv2d(nhidden, norm_nc + concat_channels, kernel_size=ks, padding=pw, dtype=dtype, device=device)
+
+        self.zero_conv = operations.Conv2d(label_nc, norm_nc, 1, 1, 0, dtype=dtype, device=device)  # 1x1 conv, stride 1, no padding
+        self.pre_concat = bool(concat_channels != 0)  # True: h_ori is concatenated before normalisation; False: after modulation
+
+    def forward(self, c, h, h_ori=None, control_scale=1):
+        if h_ori is not None and self.pre_concat:
+            h_raw = torch.cat([h_ori, h], dim=1)  # un-modulated reference used for the final blend
+        else:
+            h_raw = h
+
+        h = h + self.zero_conv(c)
+        if h_ori is not None and self.pre_concat:
+            h = torch.cat([h_ori, h], dim=1)
+        actv = self.mlp_shared(c)
+        gamma = self.zero_mul(actv)
+        beta = self.zero_add(actv)
+        h = self.param_free_norm(h)
+        h = torch.addcmul(h + beta, h, gamma)  # h + beta + h * gamma
+        if h_ori is not None and not self.pre_concat:
+            h = torch.cat([h_ori, h], dim=1)
+        return torch.lerp(h_raw, h, control_scale)  # h_raw + control_scale * (h - h_raw)
+
+
+class _CrossAttnInner(nn.Module):
+    """Inner cross-attention module matching the state_dict layout of the original CrossAttention (to_q / to_k / to_v / to_out.0)."""
+    def __init__(self, query_dim, context_dim, heads, dim_head, dtype=None, device=None, operations=None):
+        super().__init__()
+        inner_dim = dim_head * heads
+        self.heads = heads
+        self.to_q = operations.Linear(query_dim, inner_dim, bias=False, dtype=dtype, device=device)
+        self.to_k = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
+        self.to_v = operations.Linear(context_dim, inner_dim, bias=False, dtype=dtype, device=device)
+        self.to_out = nn.Sequential(  # single-element Sequential so the projection is stored under index 0
+            operations.Linear(inner_dim, query_dim, dtype=dtype, device=device),
+        )
+
+    def forward(self, x, context):
+        q = self.to_q(x)  # queries come from x, keys/values from context
+        k = self.to_k(context)
+        v = self.to_v(context)
+        return self.to_out(optimized_attention(q, k, v, self.heads))
+
+
+class ZeroCrossAttn(nn.Module):  # residual cross-attention adapter: feature map x attends to feature map context
+    def __init__(self, context_dim, query_dim, dtype=None, device=None, operations=None):
+        super().__init__()
+        heads = query_dim // 64  # fixed 64-channel heads
+        dim_head = 64
+        self.attn = _CrossAttnInner(query_dim, context_dim, heads, dim_head, dtype=dtype, device=device, operations=operations)
+        self.norm1 = operations.GroupNorm(32, query_dim, dtype=dtype, device=device)
+        self.norm2 = operations.GroupNorm(32, context_dim, dtype=dtype, device=device)
+
+    def forward(self, context, x, control_scale=1):
+        b, c, h, w = x.shape  # only h and w are used below, to restore the spatial layout
+        x_in = x
+
+        x = self.attn(
+            self.norm1(x).flatten(2).transpose(1, 2),  # (b, c, h, w) -> (b, h*w, c)
+            self.norm2(context).flatten(2).transpose(1, 2),
+        ).transpose(1, 2).unflatten(2, (h, w))
+
+        return x_in + x * control_scale  # residual add, attention output weighted by control_scale
+
+
+class GLVControl(nn.Module):
+    """SUPIR's Guided Latent Vector control encoder. Truncated UNet (input + middle blocks only); forward returns the list of per-block features."""
+    def __init__(
+        self,
+        in_channels=4,
+        model_channels=320,
+        num_res_blocks=2,
+        attention_resolutions=(4, 2),
+        channel_mult=(1, 2, 4),
+        num_head_channels=64,
+        transformer_depth=(1, 2, 10),
+        context_dim=2048,
+        adm_in_channels=2816,
+        use_linear_in_transformer=True,
+        use_checkpoint=False,
+        dtype=None,
+        device=None,
+        operations=None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.model_channels = model_channels
+        time_embed_dim = model_channels * 4
+
+        self.time_embed = nn.Sequential(
+            operations.Linear(model_channels, time_embed_dim, dtype=dtype, device=device),
+            nn.SiLU(),
+            operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
+        )
+
+        self.label_emb = nn.Sequential(  # nested Sequential: parameters live under label_emb.0.*
+            nn.Sequential(
+                operations.Linear(adm_in_channels, time_embed_dim, dtype=dtype, device=device),
+                nn.SiLU(),
+                operations.Linear(time_embed_dim, time_embed_dim, dtype=dtype, device=device),
+            )
+        )
+
+        self.input_blocks = nn.ModuleList([
+            TimestepEmbedSequential(
+                operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
+            )
+        ])
+        ch = model_channels  # running channel count of the feature map
+        ds = 1  # running downsample factor, compared against attention_resolutions
+        for level, mult in enumerate(channel_mult):
+            for nr in range(num_res_blocks):
+                layers = [
+                    ResBlock(ch, time_embed_dim, 0, out_channels=mult * model_channels,
+                             dtype=dtype, device=device, operations=operations)
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    num_heads = ch // num_head_channels
+                    layers.append(
+                        SpatialTransformer(ch, num_heads, num_head_channels,
+                                           depth=transformer_depth[level], context_dim=context_dim,
+                                           use_linear=use_linear_in_transformer,
+                                           use_checkpoint=use_checkpoint,
+                                           dtype=dtype, device=device, operations=operations)
+                    )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+            if level != len(channel_mult) - 1:  # every level except the last ends with a downsample
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        Downsample(ch, True, out_channels=ch, dtype=dtype, device=device, operations=operations)
+                    )
+                )
+                ds *= 2
+
+        num_heads = ch // num_head_channels
+        self.middle_block = TimestepEmbedSequential(
+            ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
+            SpatialTransformer(ch, num_heads, num_head_channels,
+                               depth=transformer_depth[-1], context_dim=context_dim,
+                               use_linear=use_linear_in_transformer,
+                               use_checkpoint=use_checkpoint,
+                               dtype=dtype, device=device, operations=operations),
+            ResBlock(ch, time_embed_dim, 0, dtype=dtype, device=device, operations=operations),
+        )
+
+        self.input_hint_block = TimestepEmbedSequential(
+            operations.Conv2d(in_channels, model_channels, 3, padding=1, dtype=dtype, device=device)
+        )
+
+    def forward(self, x, timesteps, xt, context=None, y=None, **kwargs):  # x feeds the hint block, xt feeds the input blocks
+        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype)
+        emb = self.time_embed(t_emb) + self.label_emb(y)  # NOTE(review): y defaults to None but is passed straight to label_emb -- confirm callers always supply it
+
+        guided_hint = self.input_hint_block(x, emb, context)
+
+        hs = []  # outputs of every input block, followed by the middle block output
+        h = xt
+        for module in self.input_blocks:
+            if guided_hint is not None:  # the hint is added once, to the first block's output
+                h = module(h, emb, context)
+                h += guided_hint
+                guided_hint = None
+            else:
+                h = module(h, emb, context)
+            hs.append(h)
+        h = self.middle_block(h, emb, context)
+        hs.append(h)
+        return hs
+
+
+class SUPIR(nn.Module):
+    """
+    SUPIR model containing GLVControl (control encoder) and project_modules (adapters).
+    State dict keys match the original SUPIR checkpoint layout:
+        control_model.* -> GLVControl
+        project_modules.* -> nn.ModuleList of ZeroSFT/ZeroCrossAttn
+    """
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+
+        self.control_model = GLVControl(dtype=dtype, device=device, operations=operations)
+
+        project_channel_scale = 2
+        cond_output_channels = [320] * 4 + [640] * 3 + [1280] * 3  # passed as norm_nc of each ZeroSFT
+        project_channels = [int(c * project_channel_scale) for c in [160] * 4 + [320] * 3 + [640] * 3]  # passed as label_nc of each ZeroSFT
+        concat_channels = [320] * 2 + [640] * 3 + [1280] * 4 + [0]  # the last ZeroSFT gets 0, i.e. pre_concat is False
+        cross_attn_insert_idx = [6, 3]  # index 6 is inserted first, so the ZeroCrossAttn modules end up at list positions 3 and 7
+
+        self.project_modules = nn.ModuleList()
+        for i in range(len(cond_output_channels)):
+            self.project_modules.append(ZeroSFT(
+                project_channels[i], cond_output_channels[i],
+                concat_channels=concat_channels[i],
+                dtype=dtype, device=device, operations=operations,
+            ))
+
+        for i in cross_attn_insert_idx:
+            self.project_modules.insert(i, ZeroCrossAttn(  # positional args are (context_dim, query_dim)
+                cond_output_channels[i], concat_channels[i],
+                dtype=dtype, device=device, operations=operations,
+            ))
diff --git a/comfy/ldm/supir/supir_patch.py b/comfy/ldm/supir/supir_patch.py
new file mode 100644
index 000000000..b67ab4cd8
--- /dev/null
+++ b/comfy/ldm/supir/supir_patch.py
@@ -0,0 +1,103 @@
+import torch
+from comfy.ldm.modules.diffusionmodules.openaimodel import Upsample
+
+
+class SUPIRPatch:
+    """
+    Holds GLVControl (control encoder) + project_modules (ZeroSFT/ZeroCrossAttn adapters).
+    Runs GLVControl lazily on first patch invocation per step, applies adapters through
+    middle_block_after_patch, the output block patch, and forward_timestep_embed_patch (see register()).
+    """
+    SIGMA_MAX = 14.6146  # sigma at or above which the control scale equals strength_start
+
+    def __init__(self, model_patch, project_modules, hint_latent, strength_start, strength_end):
+        self.model_patch = model_patch  # CoreModelPatcher wrapping GLVControl
+        self.project_modules = project_modules  # nn.ModuleList of ZeroSFT/ZeroCrossAttn
+        self.hint_latent = hint_latent  # encoded LQ image latent
+        self.strength_start = strength_start
+        self.strength_end = strength_end
+        self.cached_features = None  # GLVControl outputs for the current step, or None
+        self.adapter_idx = 0  # next project_modules entry to apply; counts down during a step
+        self.control_idx = 0  # next cached feature to consume; counts down during a step
+        self.current_control_idx = 0  # control index used by the latest output_block call, reused by pre_upsample
+        self.active = True  # NOTE: current_scale is only set once middle_after has run
+
+    def _ensure_features(self, kwargs):
+        """Run GLVControl on first call per step, cache results, and point both indices at the last entry."""
+        if self.cached_features is not None:
+            return
+        x = kwargs["x"]
+        b = x.shape[0]
+        hint = self.hint_latent.to(device=x.device, dtype=x.dtype)
+        if hint.shape[0] != b:  # match the batch: expand a single hint, otherwise tile and truncate to b
+            hint = hint.expand(b, -1, -1, -1) if hint.shape[0] == 1 else hint.repeat((b + hint.shape[0] - 1) // hint.shape[0], 1, 1, 1)[:b]
+        self.cached_features = self.model_patch.model.control_model(
+            hint, kwargs["timesteps"], x,
+            kwargs["context"], kwargs["y"]
+        )
+        self.adapter_idx = len(self.project_modules) - 1  # adapters are consumed from the end backwards
+        self.control_idx = len(self.cached_features) - 1
+
+    def _get_control_scale(self, kwargs):  # linear blend between strength_end (sigma 0) and strength_start (sigma >= SIGMA_MAX)
+        if self.strength_start == self.strength_end:
+            return self.strength_end
+        sigma = kwargs["transformer_options"].get("sigmas")
+        if sigma is None:
+            return self.strength_end
+        s = sigma[0].item() if sigma.dim() > 0 else sigma.item()  # only the first sigma is used
+        t = min(s / self.SIGMA_MAX, 1.0)
+        return t * (self.strength_start - self.strength_end) + self.strength_end
+
+    def middle_after(self, kwargs):
+        """middle_block_after_patch: run GLVControl lazily, apply last adapter after middle block."""
+        self.cached_features = None  # reset from previous step
+        self.current_scale = self._get_control_scale(kwargs)
+        self.active = self.current_scale > 0  # a non-positive scale disables all adapters for this step
+        if not self.active:
+            return {"h": kwargs["h"]}
+        self._ensure_features(kwargs)
+        h = kwargs["h"]
+        h = self.project_modules[self.adapter_idx](
+            self.cached_features[self.control_idx], h, control_scale=self.current_scale
+        )
+        self.adapter_idx -= 1
+        self.control_idx -= 1
+        return {"h": h}
+
+    def output_block(self, h, hsp, transformer_options):
+        """output_block_patch: ZeroSFT adapter fusion replaces cat([h, hsp]). Returns (h, None) to skip cat; returns (h, hsp) unchanged when inactive."""
+        if not self.active:
+            return h, hsp
+        self.current_control_idx = self.control_idx  # remembered for a following pre_upsample call
+        h = self.project_modules[self.adapter_idx](
+            self.cached_features[self.control_idx], hsp, h, control_scale=self.current_scale
+        )
+        self.adapter_idx -= 1
+        self.control_idx -= 1
+        return h, None
+
+    def pre_upsample(self, layer, x, emb, context, transformer_options, output_shape, *args, **kw):
+        """forward_timestep_embed_patch for Upsample: extra cross-attn adapter before upsample."""
+        block_type, _ = transformer_options["block"]
+        if block_type == "output" and self.active and self.cached_features is not None:
+            x = self.project_modules[self.adapter_idx](
+                self.cached_features[self.current_control_idx], x, control_scale=self.current_scale
+            )
+            self.adapter_idx -= 1  # control_idx is left alone: the same feature as the preceding output_block is reused
+        return layer(x, output_shape=output_shape)
+
+    def to(self, device_or_dtype):
+        if isinstance(device_or_dtype, torch.device):  # only device moves are handled; any other argument is a no-op
+            self.cached_features = None
+            if self.hint_latent is not None:
+                self.hint_latent = self.hint_latent.to(device_or_dtype)
+        return self
+
+    def models(self):
+        return [self.model_patch]
+
+    def register(self, model_patcher):
+        """Register all patches on a cloned model patcher."""
+        model_patcher.set_model_patch(self.middle_after, "middle_block_after_patch")
+        model_patcher.set_model_output_block_patch(self.output_block)
+        model_patcher.set_model_patch((Upsample, self.pre_upsample), "forward_timestep_embed_patch")
diff --git a/comfy/ldm/wan/ar_model.py b/comfy/ldm/wan/ar_model.py
new file mode 100644
index 000000000..d72f53602
--- /dev/null
+++ b/comfy/ldm/wan/ar_model.py
@@ -0,0 +1,276 @@
+"""
+CausalWanModel: Wan 2.1 backbone with KV-cached causal self-attention for
+autoregressive (frame-by-frame) video generation via Causal Forcing.
+
+Weight-compatible with the standard WanModel -- same layer names, same shapes.
+The difference is purely in the forward pass: this model processes one temporal
+block at a time and maintains a KV cache across blocks.
+
+Reference: https://github.com/thu-ml/Causal-Forcing
+"""
+
+import torch
+import torch.nn as nn
+
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.flux.math import apply_rope1
+from comfy.ldm.wan.model import (
+ sinusoidal_embedding_1d,
+ repeat_e,
+ WanModel,
+ WanAttentionBlock,
+)
+import comfy.ldm.common_dit
+import comfy.model_management
+
+
+class CausalWanSelfAttention(nn.Module):
+    """Self-attention with KV cache support for autoregressive inference."""
+
+    def __init__(self, dim, num_heads, window_size=(-1, -1), qk_norm=True,
+                 eps=1e-6, operation_settings={}):  # window_size is accepted but not used by this class
+        assert dim % num_heads == 0
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.qk_norm = qk_norm
+        self.eps = eps
+
+        ops = operation_settings.get("operations")
+        device = operation_settings.get("device")
+        dtype = operation_settings.get("dtype")
+
+        self.q = ops.Linear(dim, dim, device=device, dtype=dtype)
+        self.k = ops.Linear(dim, dim, device=device, dtype=dtype)
+        self.v = ops.Linear(dim, dim, device=device, dtype=dtype)
+        self.o = ops.Linear(dim, dim, device=device, dtype=dtype)
+        self.norm_q = ops.RMSNorm(dim, eps=eps, elementwise_affine=True, device=device, dtype=dtype) if qk_norm else nn.Identity()
+        self.norm_k = ops.RMSNorm(dim, eps=eps, elementwise_affine=True, device=device, dtype=dtype) if qk_norm else nn.Identity()
+
+    def forward(self, x, freqs, kv_cache=None, transformer_options={}):
+        b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim  # batch, sequence length, heads, per-head dim
+
+        q = apply_rope1(self.norm_q(self.q(x)).view(b, s, n, d), freqs)
+        k = apply_rope1(self.norm_k(self.k(x)).view(b, s, n, d), freqs)
+        v = self.v(x).view(b, s, n, d)
+
+        if kv_cache is None:  # plain self-attention over the current tokens only
+            x = optimized_attention(
+                q.view(b, s, n * d),
+                k.view(b, s, n * d),
+                v.view(b, s, n * d),
+                heads=self.num_heads,
+                transformer_options=transformer_options,
+            )
+        else:
+            end = kv_cache["end"]  # number of cached positions already filled
+            new_end = end + s  # NOTE(review): nothing here checks new_end against the cache's preallocated length -- confirm callers size it
+
+            # Roped K and plain V go into cache
+            kv_cache["k"][:, end:new_end] = k
+            kv_cache["v"][:, end:new_end] = v
+            kv_cache["end"] = new_end
+
+            x = optimized_attention(  # current queries attend to all cached keys/values, including this block's
+                q.view(b, s, n * d),
+                kv_cache["k"][:, :new_end].view(b, new_end, n * d),
+                kv_cache["v"][:, :new_end].view(b, new_end, n * d),
+                heads=self.num_heads,
+                transformer_options=transformer_options,
+            )
+
+        x = self.o(x)
+        return x
+
+
+class CausalWanAttentionBlock(WanAttentionBlock):
+    """Transformer block with KV-cached self-attention and cross-attention caching."""
+
+    def __init__(self, cross_attn_type, dim, ffn_dim, num_heads,
+                 window_size=(-1, -1), qk_norm=True, cross_attn_norm=False,
+                 eps=1e-6, operation_settings={}):
+        super().__init__(cross_attn_type, dim, ffn_dim, num_heads,
+                         window_size, qk_norm, cross_attn_norm, eps,
+                         operation_settings=operation_settings)
+        self.self_attn = CausalWanSelfAttention(  # replaces the self-attention module created by the parent constructor
+            dim, num_heads, window_size, qk_norm, eps,
+            operation_settings=operation_settings)
+
+    def forward(self, x, e, freqs, context, context_img_len=257,
+                kv_cache=None, crossattn_cache=None, transformer_options={}):
+        if e.ndim < 4:  # either way e becomes six modulation tensors, indexed e[0]..e[5] below
+            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device) + e).chunk(6, dim=1)
+        else:
+            e = (comfy.model_management.cast_to(self.modulation, dtype=x.dtype, device=x.device).unsqueeze(0) + e).unbind(2)
+
+        # Self-attention with optional KV cache
+        x = x.contiguous()
+        y = self.self_attn(
+            torch.addcmul(repeat_e(e[0], x), self.norm1(x), 1 + repeat_e(e[1], x)),
+            freqs, kv_cache=kv_cache, transformer_options=transformer_options)
+        x = torch.addcmul(x, y, repeat_e(e[2], x))
+        del y
+
+        # Cross-attention with optional caching
+        if crossattn_cache is not None and crossattn_cache.get("is_init"):  # reuse cached K/V; context is not re-projected
+            q = self.cross_attn.norm_q(self.cross_attn.q(self.norm3(x)))
+            x_ca = optimized_attention(
+                q, crossattn_cache["k"], crossattn_cache["v"],
+                heads=self.num_heads, transformer_options=transformer_options)
+            x = x + self.cross_attn.o(x_ca)
+        else:
+            x = x + self.cross_attn(self.norm3(x), context, context_img_len=context_img_len, transformer_options=transformer_options)
+            if crossattn_cache is not None:  # first use: store projected K/V of the whole context for later calls
+                crossattn_cache["k"] = self.cross_attn.norm_k(self.cross_attn.k(context))  # NOTE(review): cached path ignores context_img_len -- confirm this is fine for image-conditioned cross-attention
+                crossattn_cache["v"] = self.cross_attn.v(context)
+                crossattn_cache["is_init"] = True
+
+        # FFN
+        y = self.ffn(torch.addcmul(repeat_e(e[3], x), self.norm2(x), 1 + repeat_e(e[4], x)))
+        x = torch.addcmul(x, y, repeat_e(e[5], x))
+        return x
+
+
+class CausalWanModel(WanModel):
+    """
+    Wan 2.1 diffusion backbone with causal KV-cache support.
+
+    Same weight structure as WanModel -- loads identical state dicts.
+    Adds forward_block() for frame-by-frame autoregressive inference.
+    """
+
+    def __init__(self,
+                 model_type='t2v',
+                 patch_size=(1, 2, 2),
+                 text_len=512,
+                 in_dim=16,
+                 dim=2048,
+                 ffn_dim=8192,
+                 freq_dim=256,
+                 text_dim=4096,
+                 out_dim=16,
+                 num_heads=16,
+                 num_layers=32,
+                 window_size=(-1, -1),
+                 qk_norm=True,
+                 cross_attn_norm=True,
+                 eps=1e-6,
+                 image_model=None,
+                 device=None,
+                 dtype=None,
+                 operations=None):
+        super().__init__(
+            model_type=model_type, patch_size=patch_size, text_len=text_len,
+            in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim,
+            text_dim=text_dim, out_dim=out_dim, num_heads=num_heads,
+            num_layers=num_layers, window_size=window_size, qk_norm=qk_norm,
+            cross_attn_norm=cross_attn_norm, eps=eps, image_model=image_model,
+            wan_attn_block_class=CausalWanAttentionBlock,  # the only difference from the parent's construction
+            device=device, dtype=dtype, operations=operations)
+
+    def forward_block(self, x, timestep, context, start_frame,
+                      kv_caches, crossattn_caches, clip_fea=None):
+        """
+        Forward one temporal block for autoregressive inference.
+
+        Args:
+            x: [B, C, block_frames, H, W] input latent for the current block
+            timestep: [B, block_frames] per-frame timesteps
+            context: [B, L, text_dim] raw text embeddings (pre-text_embedding)
+            start_frame: temporal frame index for RoPE offset
+            kv_caches: list of per-layer KV cache dicts
+            crossattn_caches: list of per-layer cross-attention cache dicts
+            clip_fea: optional CLIP features for I2V
+
+        Returns:
+            flow_pred: [B, C_out, block_frames, H, W] flow prediction, cropped to the unpadded input size
+        """
+        bs, c, t, h, w = x.shape  # read the size BEFORE padding; otherwise the final crop below cannot strip the padding
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
+
+        x = self.patch_embedding(x.float()).to(x.dtype)
+        grid_sizes = x.shape[2:]
+        x = x.flatten(2).transpose(1, 2)
+
+        # Per-frame time embedding
+        e = self.time_embedding(
+            sinusoidal_embedding_1d(self.freq_dim, timestep.flatten()).to(dtype=x.dtype))
+        e = e.reshape(timestep.shape[0], -1, e.shape[-1])
+        e0 = self.time_projection(e).unflatten(2, (6, self.dim))
+
+        # Text embedding (reuses crossattn_cache after first block)
+        context = self.text_embedding(context)
+
+        context_img_len = None
+        if clip_fea is not None and self.img_emb is not None:
+            context_clip = self.img_emb(clip_fea)
+            context = torch.concat([context_clip, context], dim=1)
+            context_img_len = clip_fea.shape[-2]
+
+        # RoPE for current block's temporal position
+        freqs = self.rope_encode(t, h, w, t_start=start_frame, device=x.device, dtype=x.dtype)
+
+        # Transformer blocks
+        for i, block in enumerate(self.blocks):
+            x = block(x, e=e0, freqs=freqs, context=context,
+                      context_img_len=context_img_len,
+                      kv_cache=kv_caches[i],
+                      crossattn_cache=crossattn_caches[i])
+
+        # Head
+        x = self.head(x, e)
+
+        # Unpatchify
+        x = self.unpatchify(x, grid_sizes)
+        return x[:, :, :t, :h, :w]
+
+    def init_kv_caches(self, batch_size, max_seq_len, device, dtype):
+        """Create fresh KV caches for all layers: one dict per layer with zeroed "k"/"v" buffers and "end" = 0."""
+        caches = []
+        for _ in range(self.num_layers):
+            caches.append({
+                "k": torch.zeros(batch_size, max_seq_len, self.num_heads, self.head_dim, device=device, dtype=dtype),
+                "v": torch.zeros(batch_size, max_seq_len, self.num_heads, self.head_dim, device=device, dtype=dtype),
+                "end": 0,
+            })
+        return caches
+
+    def init_crossattn_caches(self, batch_size, device, dtype):
+        """Create fresh cross-attention caches for all layers (batch_size/device/dtype are unused; K/V are filled on first use)."""
+        caches = []
+        for _ in range(self.num_layers):
+            caches.append({"is_init": False})
+        return caches
+
+    def reset_kv_caches(self, kv_caches):
+        """Reset KV caches to empty (reuse allocated memory)."""
+        for cache in kv_caches:
+            cache["end"] = 0
+
+    def reset_crossattn_caches(self, crossattn_caches):
+        """Reset cross-attention caches."""
+        for cache in crossattn_caches:
+            cache["is_init"] = False
+
+    @property
+    def head_dim(self):
+        return self.dim // self.num_heads
+
+    def forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options={}, **kwargs):
+        ar_state = transformer_options.get("ar_state")  # presence of "ar_state" selects the KV-cached block path
+        if ar_state is not None:
+            bs = x.shape[0]
+            block_frames = x.shape[2]
+            t_per_frame = timestep.unsqueeze(1).expand(bs, block_frames)  # same timestep for every frame of the block
+            return self.forward_block(
+                x=x, timestep=t_per_frame, context=context,
+                start_frame=ar_state["start_frame"],
+                kv_caches=ar_state["kv_caches"],
+                crossattn_caches=ar_state["crossattn_caches"],
+                clip_fea=clip_fea,
+            )
+
+        return super().forward(x, timestep, context, clip_fea=clip_fea,
+                               time_dim_concat=time_dim_concat,
+                               transformer_options=transformer_options, **kwargs)
diff --git a/comfy/ldm/wan/model.py b/comfy/ldm/wan/model.py
index b2287dba9..70dfe7b16 100644
--- a/comfy/ldm/wan/model.py
+++ b/comfy/ldm/wan/model.py
@@ -1135,7 +1135,7 @@ class AudioInjector_WAN(nn.Module):
self.injector_adain_output_layers = nn.ModuleList(
[operations.Linear(dim, dim, dtype=dtype, device=device) for _ in range(audio_injector_id)])
- def forward(self, x, block_id, audio_emb, audio_emb_global, seq_len):
+ def forward(self, x, block_id, audio_emb, audio_emb_global, seq_len, scale=1.0):
audio_attn_id = self.injected_block_id.get(block_id, None)
if audio_attn_id is None:
return x
@@ -1148,12 +1148,15 @@ class AudioInjector_WAN(nn.Module):
attn_hidden_states = adain_hidden_states
else:
attn_hidden_states = self.injector_pre_norm_feat[audio_attn_id](input_hidden_states)
- audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames)
- attn_audio_emb = audio_emb
+
+ if audio_emb.dim() == 3: # WanDancer case
+ attn_audio_emb = rearrange(audio_emb, "b t c -> (b t) 1 c", t=num_frames)
+ else: # S2V case
+ attn_audio_emb = rearrange(audio_emb, "b t n c -> (b t) n c", t=num_frames)
+
residual_out = self.injector[audio_attn_id](x=attn_hidden_states, context=attn_audio_emb)
- residual_out = rearrange(
- residual_out, "(b t) n c -> b (t n) c", t=num_frames)
- x[:, :seq_len] = x[:, :seq_len] + residual_out
+ residual_out = rearrange(residual_out, "(b t) n c -> b (t n) c", t=num_frames)
+ x[:, :seq_len] = x[:, :seq_len] + residual_out * scale
return x
diff --git a/comfy/ldm/wan/model_wandancer.py b/comfy/ldm/wan/model_wandancer.py
new file mode 100644
index 000000000..3caef6dc5
--- /dev/null
+++ b/comfy/ldm/wan/model_wandancer.py
@@ -0,0 +1,251 @@
+import torch
+import torch.nn as nn
+import comfy
+from comfy.ldm.modules.attention import optimized_attention
+from comfy.ldm.flux.math import apply_rope1
+from comfy.ldm.flux.layers import EmbedND
+
+from .model import AudioInjector_WAN, WanModel, MLPProj, Head, sinusoidal_embedding_1d
+
+
+class MusicSelfAttention(nn.Module):
+    """Multi-head self-attention used by the music feature encoder.
+
+    Queries and keys are passed through ``apply_rope1`` with the supplied
+    ``freqs`` before attention; values are used as projected.
+    """
+
+    def __init__(self, dim, num_heads, device=None, dtype=None, operations=None):
+        assert dim % num_heads == 0
+        super().__init__()
+        self.embed_dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+
+        # All four projections are square (dim -> dim) and share device/dtype.
+        linear_kwargs = {"device": device, "dtype": dtype}
+        self.q_proj = operations.Linear(dim, dim, **linear_kwargs)
+        self.k_proj = operations.Linear(dim, dim, **linear_kwargs)
+        self.v_proj = operations.Linear(dim, dim, **linear_kwargs)
+        self.out_proj = operations.Linear(dim, dim, **linear_kwargs)
+
+    def forward(self, x, freqs):
+        """Run self-attention over ``x`` and return the output projection.
+
+        ``x`` is indexed as (batch, sequence, channels); ``freqs`` is handed
+        unchanged to ``apply_rope1`` for both queries and keys.
+        """
+        batch, seq_len = x.shape[:2]
+        heads, head_dim = self.num_heads, self.head_dim
+        split_shape = (batch, seq_len, heads, head_dim)
+        merged_shape = (batch, seq_len, heads * head_dim)
+
+        # Rotary embedding is applied per head, so split heads first.
+        query = apply_rope1(self.q_proj(x).view(split_shape), freqs)
+        key = apply_rope1(self.k_proj(x).view(split_shape), freqs)
+        value = self.v_proj(x)
+
+        # optimized_attention takes the heads merged back into the channel axis.
+        attended = optimized_attention(
+            query.view(merged_shape),
+            key.view(merged_shape),
+            value.view(merged_shape),
+            heads=heads,
+        )
+        return self.out_proj(attended)
+
+
+class MusicEncoderLayer(nn.Module):
+    """One music encoder layer: normalised self-attention, then a normalised GELU feed-forward, each added residually."""
+
+    def __init__(self, dim: int, num_heads: int, ffn_dim: int, device=None, dtype=None, operations=None):
+        super().__init__()
+        factory = {"device": device, "dtype": dtype}
+        self.self_attn = MusicSelfAttention(dim, num_heads, operations=operations, **factory)
+
+        # Feed-forward: dim -> ffn_dim -> dim.
+        self.linear1 = operations.Linear(dim, ffn_dim, **factory)
+        self.linear2 = operations.Linear(ffn_dim, dim, **factory)
+
+        self.norm1 = operations.LayerNorm(dim, **factory)
+        self.norm2 = operations.LayerNorm(dim, **factory)
+
+    def forward(self, x: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
+        """Apply the attention and feed-forward sub-layers; ``freqs`` is forwarded to the attention."""
+        attn_out = self.self_attn(self.norm1(x), freqs=freqs)
+        x = x + attn_out
+
+        hidden = torch.nn.functional.gelu(self.linear1(self.norm2(x)))
+        return x + self.linear2(hidden)  # ffn
+
+
+class WanDancerModel(WanModel):
+    """WanModel variant conditioned on music features (WanDancer).
+
+    On top of the parent model this adds:
+      * ``patch_embedding_global`` / ``head_global``, used in place of the
+        parent's ``patch_embedding`` / ``head`` whenever the rounded ``fps``
+        is not 30;
+      * ``img_emb_refimage``, a second projection applied to ``clip_fea_ref``;
+      * a music branch (projection, two encoder layers, rope embedder) whose
+        output is injected after selected blocks through ``music_injector``.
+    """
+
+    def __init__(self,
+                 model_type='wandancer',
+                 patch_size=(1, 2, 2),
+                 text_len=512,
+                 in_dim=16,
+                 dim=5120,
+                 ffn_dim=8192,
+                 freq_dim=256,
+                 text_dim=4096,
+                 out_dim=16,
+                 num_heads=16,
+                 num_layers=40,
+                 window_size=(-1, -1),
+                 qk_norm=True,
+                 cross_attn_norm=True,
+                 eps=1e-6,
+                 in_dim_ref_conv=None,
+                 image_model=None,
+                 device=None, dtype=None, operations=None,
+                 audio_inject_layers=[0, 4, 8, 12, 16, 20, 24, 27],
+                 music_dim=256,
+                 music_heads=4,
+                 music_feature_dim=35,
+                 music_latent_dim=256
+                 ):
+        """Build the parent WanModel plus the WanDancer-specific modules.
+
+        ``model_type`` is accepted for interface compatibility; the parent is
+        always constructed with ``model_type='i2v'``.
+        """
+        super().__init__(model_type='i2v', patch_size=patch_size, text_len=text_len, in_dim=in_dim, dim=dim, ffn_dim=ffn_dim, freq_dim=freq_dim, text_dim=text_dim, out_dim=out_dim,
+                         num_heads=num_heads, num_layers=num_layers, window_size=window_size, qk_norm=qk_norm, cross_attn_norm=cross_attn_norm, eps=eps, image_model=image_model, in_dim_ref_conv=in_dim_ref_conv,
+                         device=device, dtype=dtype, operations=operations)
+
+        self.dtype = dtype
+        operation_settings = {"operations": operations, "device": device, "dtype": dtype}
+
+        # Alternative embedding/head pair selected in forward_orig when rounded fps != 30.
+        self.patch_embedding_global = operations.Conv3d(in_dim, dim, kernel_size=patch_size, stride=patch_size, device=operation_settings.get("device"), dtype=torch.float32)
+        self.img_emb_refimage = MLPProj(1280, dim, operation_settings=operation_settings)
+        self.head_global = Head(dim, out_dim, patch_size, eps, operation_settings=operation_settings)
+
+        self.music_injector = AudioInjector_WAN(
+            dim=self.dim,
+            num_heads=self.num_heads,
+            inject_layer=audio_inject_layers,
+            root_net=self,
+            enable_adain=False,
+            dtype=dtype, device=device, operations=operations
+        )
+
+        self.music_projection = operations.Linear(music_feature_dim, music_latent_dim, device=device, dtype=dtype)
+        self.music_encoder = nn.ModuleList([MusicEncoderLayer(dim=music_dim, num_heads=music_heads, ffn_dim=1024, device=device, dtype=dtype, operations=operations) for _ in range(2)])
+        music_head_dim = music_dim // music_heads
+        self.music_rope_embedder = EmbedND(dim=music_head_dim, theta=10000.0, axes_dim=[music_head_dim])
+
+    def forward_orig(self, x, t, context, clip_fea=None, clip_fea_ref=None, freqs=None, audio_embed=None, fps=30, audio_inject_scale=1.0, transformer_options=None, **kwargs):
+        """Run the transformer on an already padded latent ``x``.
+
+        Args:
+            x: latent video passed to the 3D patch embedding.
+            t: timesteps, flattened for the sinusoidal embedding.
+            context: text conditioning, passed through ``text_embedding``.
+            clip_fea / clip_fea_ref: optional image features that are projected
+                and prepended to ``context``.
+            freqs: rotary frequencies forwarded to every block.
+            audio_embed: optional music features; when given they are encoded,
+                resized to ``latent_frames * 8`` steps and injected after each
+                block by ``music_injector``.
+            fps: rounded value other than 30 selects the ``*_global`` modules.
+            audio_inject_scale: multiplier forwarded to ``music_injector``.
+            transformer_options: per-call options dict; it is updated in place
+                with ``grid_sizes`` and block bookkeeping. ``None`` means a
+                fresh dict.
+            **kwargs: ``reference_latent`` is read when ``ref_conv`` exists.
+
+        Returns:
+            The unpatchified model output.
+        """
+        # A shared ``{}`` default would accumulate grid_sizes/block_index across
+        # calls, so a private dict is created when none is supplied.
+        if transformer_options is None:
+            transformer_options = {}
+
+        use_global = int(fps + 0.5) != 30
+
+        # embeddings
+        if use_global:
+            x = self.patch_embedding_global(x.float()).to(x.dtype)
+        else:
+            x = self.patch_embedding(x.float()).to(x.dtype)
+
+        grid_sizes = x.shape[2:]
+        latent_frames = grid_sizes[0]
+        transformer_options["grid_sizes"] = grid_sizes
+        x = x.flatten(2).transpose(1, 2)
+        seq_len = x.size(1)
+
+        # time embeddings
+        e = self.time_embedding(sinusoidal_embedding_1d(self.freq_dim, t.flatten()).to(dtype=x[0].dtype))
+        e = e.reshape(t.shape[0], -1, e.shape[-1])
+        e0 = self.time_projection(e).unflatten(2, (6, self.dim))
+
+        full_ref = None
+        if self.ref_conv is not None:  # model has the weight, but this wasn't used in the original pipeline
+            full_ref = kwargs.get("reference_latent", None)
+            if full_ref is not None:
+                full_ref = self.ref_conv(full_ref).flatten(2).transpose(1, 2)
+                x = torch.concat((full_ref, x), dim=1)
+
+        # context
+        context = self.text_embedding(context)
+
+        audio_emb = None
+        if audio_embed is not None:  # encode music feature,[1, frame_num, 35] -> [1, F*8, dim]
+            music_feature = self.music_projection(audio_embed)
+
+            music_seq_len = music_feature.shape[1]
+            # 1D position IDs are built in float32: half-precision dtypes cannot
+            # represent every integer (bfloat16 is only exact up to 256), which
+            # would give distinct music frames identical rotary positions.
+            music_ids = torch.arange(music_seq_len, device=music_feature.device, dtype=torch.float32).reshape(1, -1, 1)
+            music_freqs = self.music_rope_embedder(music_ids).movedim(1, 2)
+
+            # apply encoder layers
+            for layer in self.music_encoder:
+                music_feature = layer(music_feature, music_freqs)
+
+            # interpolate
+            audio_emb = torch.nn.functional.interpolate(music_feature.unsqueeze(1), size=(latent_frames * 8, self.dim), mode='bilinear').squeeze(1)
+
+        # Image tokens are prepended to the text context; their count is passed to the blocks.
+        context_img_len = 0
+        if self.img_emb is not None and clip_fea is not None:
+            context_clip = self.img_emb(clip_fea)  # bs x 257 x dim
+            context = torch.cat([context_clip, context], dim=1)
+            context_img_len += clip_fea.shape[-2]
+        if self.img_emb_refimage is not None and clip_fea_ref is not None:
+            context_clip_ref = self.img_emb_refimage(clip_fea_ref)
+            context = torch.cat([context_clip_ref, context], dim=1)
+            context_img_len += clip_fea_ref.shape[-2]
+
+        patches_replace = transformer_options.get("patches_replace", {})
+        blocks_replace = patches_replace.get("dit", {})
+        transformer_options["total_blocks"] = len(self.blocks)
+        transformer_options["block_type"] = "double"
+        for i, block in enumerate(self.blocks):
+            transformer_options["block_index"] = i
+            if ("double_block", i) in blocks_replace:
+                def block_wrap(args):
+                    out = {}
+                    out["img"] = block(args["img"], context=args["txt"], e=args["vec"], freqs=args["pe"], context_img_len=context_img_len, transformer_options=args["transformer_options"])
+                    return out
+                out = blocks_replace[("double_block", i)]({"img": x, "txt": context, "vec": e0, "pe": freqs, "transformer_options": transformer_options}, {"original_block": block_wrap})
+                x = out["img"]
+            else:
+                x = block(x, e=e0, freqs=freqs, context=context, context_img_len=context_img_len, transformer_options=transformer_options)
+            if audio_emb is not None:
+                x = self.music_injector(x, i, audio_emb, audio_emb_global=None, seq_len=seq_len, scale=audio_inject_scale)
+
+        # head
+        if use_global:
+            x = self.head_global(x, e)
+        else:
+            x = self.head(x, e)
+
+        # Drop the reference tokens that were prepended before the blocks.
+        if full_ref is not None:
+            x = x[:, full_ref.shape[1]:]
+
+        # unpatchify
+        x = self.unpatchify(x, grid_sizes)
+        return x
+
+    def _forward(self, x, timestep, context, clip_fea=None, time_dim_concat=None, transformer_options=None, clip_fea_ref=None, fps=30, audio_inject_scale=1.0, **kwargs):
+        """Pad ``x`` to the patch size, build rotary frequencies and call ``forward_orig``.
+
+        The result is cropped back to the original frame/height/width of ``x``.
+        ``transformer_options=None`` is treated as an empty dict.
+        """
+        if transformer_options is None:
+            transformer_options = {}
+
+        bs, c, t, h, w = x.shape
+        x = comfy.ldm.common_dit.pad_to_patch_size(x, self.patch_size)
+
+        t_len = t
+        if time_dim_concat is not None:
+            time_dim_concat = comfy.ldm.common_dit.pad_to_patch_size(time_dim_concat, self.patch_size)
+            x = torch.cat([x, time_dim_concat], dim=2)
+            t_len = x.shape[2]
+
+        freqs = self.rope_encode(t_len, h, w, device=x.device, dtype=x.dtype, fps=fps, transformer_options=transformer_options)
+        return self.forward_orig(x, timestep, context, clip_fea=clip_fea, clip_fea_ref=clip_fea_ref, freqs=freqs, fps=fps, audio_inject_scale=audio_inject_scale, transformer_options=transformer_options, **kwargs)[:, :, :t, :h, :w]
+
+    def rope_encode(self, t, h, w, t_start=0, steps_t=None, steps_h=None, steps_w=None, fps=30, device=None, dtype=None, transformer_options={}):
+        """Build (t, h, w) position ids and return their rotary frequencies.
+
+        ``rope_options`` in ``transformer_options`` may scale and shift each
+        axis. When the rounded ``fps`` is not 30 the temporal positions are
+        spaced by ``30 / fps`` and the last one is pinned to the final frame
+        index of the equivalent 30 fps clip; otherwise they are linearly
+        spaced. ``transformer_options`` is only read here.
+        """
+        patch_size = self.patch_size
+        t_len = ((t + (patch_size[0] // 2)) // patch_size[0])
+        h_len = ((h + (patch_size[1] // 2)) // patch_size[1])
+        w_len = ((w + (patch_size[2] // 2)) // patch_size[2])
+
+        if steps_t is None:
+            steps_t = t_len
+        if steps_h is None:
+            steps_h = h_len
+        if steps_w is None:
+            steps_w = w_len
+
+        h_start = 0
+        w_start = 0
+        rope_options = transformer_options.get("rope_options", None)
+        if rope_options is not None:
+            t_len = (t_len - 1.0) * rope_options.get("scale_t", 1.0) + 1.0
+            h_len = (h_len - 1.0) * rope_options.get("scale_y", 1.0) + 1.0
+            w_len = (w_len - 1.0) * rope_options.get("scale_x", 1.0) + 1.0
+
+            t_start += rope_options.get("shift_t", 0.0)
+            h_start += rope_options.get("shift_y", 0.0)
+            w_start += rope_options.get("shift_x", 0.0)
+
+        img_ids = torch.zeros((steps_t, steps_h, steps_w, 3), device=device, dtype=dtype)
+
+        if int(fps + 0.5) != 30:
+            time_scale = 30.0 / fps  # how many time units each frame represents relative to 30fps
+            positions_new = torch.arange(steps_t, device=device, dtype=dtype) * time_scale + t_start
+            total_frames_at_30fps = int(time_scale * steps_t + 0.5)
+            positions_new[-1] = t_start + (total_frames_at_30fps - 1)
+
+            img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + positions_new.reshape(-1, 1, 1)
+        else:
+            img_ids[:, :, :, 0] = img_ids[:, :, :, 0] + torch.linspace(t_start, t_start + (t_len - 1), steps=steps_t, device=device, dtype=dtype).reshape(-1, 1, 1)
+
+        img_ids[:, :, :, 1] = img_ids[:, :, :, 1] + torch.linspace(h_start, h_start + (h_len - 1), steps=steps_h, device=device, dtype=dtype).reshape(1, -1, 1)
+        img_ids[:, :, :, 2] = img_ids[:, :, :, 2] + torch.linspace(w_start, w_start + (w_len - 1), steps=steps_w, device=device, dtype=dtype).reshape(1, 1, -1)
+        img_ids = img_ids.reshape(1, -1, img_ids.shape[-1])
+
+        freqs = self.rope_embedder(img_ids).movedim(1, 2)
+        return freqs
diff --git a/comfy/lora.py b/comfy/lora.py
index 63ee85323..4e0ea29e0 100644
--- a/comfy/lora.py
+++ b/comfy/lora.py
@@ -16,7 +16,7 @@
along with this program. If not, see .
"""
-from __future__ import annotations
+import comfy.memory_management
import comfy.utils
import comfy.model_management
import comfy.model_base
@@ -96,12 +96,14 @@ def load_lora(lora, to_load, log_missing=True):
def model_lora_keys_clip(model, key_map={}):
sdk = model.state_dict().keys()
+ prefix_set = set()
for k in sdk:
if k.endswith(".weight"):
key_map["text_encoders.{}".format(k[:-len(".weight")])] = k #generic lora format without any weird key names
tp = k.find(".transformer.") #also map without wrapper prefix for composite text encoder models
if tp > 0 and not k.startswith("clip_"):
key_map["text_encoders.{}".format(k[tp + 1:-len(".weight")])] = k
+ prefix_set.add(k.split('.')[0])
text_model_lora_key = "lora_te_text_model_encoder_layers_{}_{}"
clip_l_present = False
@@ -162,6 +164,13 @@ def model_lora_keys_clip(model, key_map={}):
lora_key = "lora_te1_{}".format(l_key.replace(".", "_"))
key_map[lora_key] = k
+ if len(prefix_set) == 1:
+ full_prefix = "{}.transformer.model.".format(next(iter(prefix_set))) # kohya anima and maybe other single TE models that use a single llama arch based te
+ for k in sdk:
+ if k.endswith(".weight"):
+ if k.startswith(full_prefix):
+ l_key = k[len(full_prefix):-len(".weight")]
+ key_map["lora_te_{}".format(l_key.replace(".", "_"))] = k
k = "clip_g.transformer.text_projection.weight"
if k in sdk:
@@ -342,6 +351,12 @@ def model_lora_keys_unet(model, key_map={}):
key_map["base_model.model.{}".format(key_lora)] = k # Official base model loras
key_map["lycoris_{}".format(key_lora.replace(".", "_"))] = k # LyCORIS/LoKR format
+ if isinstance(model, comfy.model_base.ErnieImage):
+ for k in sdk:
+ if k.startswith("diffusion_model.") and k.endswith(".weight"):
+ key_lora = k[len("diffusion_model."):-len(".weight")]
+ key_map["transformer.{}".format(key_lora)] = k
+
return key_map
@@ -467,3 +482,24 @@ def calculate_weight(patches, weight, key, intermediate_dtype=torch.float32, ori
weight = old_weight
return weight
+
+def prefetch_prepared_value(value, counter, destination, stream, copy):
+    """Walk ``value`` and place every tensor at the next aligned offset of ``destination``.
+
+    ``counter`` is a one-element list holding the running offset; it is
+    advanced by the aligned size of every tensor visited, whether or not a
+    destination is given. With ``destination=None`` tensors are returned
+    unchanged, which lets a caller measure the total size first. Tuples, lists
+    and weight adapters are rebuilt around their converted members; any other
+    value is returned as-is.
+    """
+    if isinstance(value, torch.Tensor):
+        aligned_size = comfy.memory_management.vram_aligned_size(value)
+        start = counter[0]
+        counter[0] = start + aligned_size
+        if destination is None:
+            return value
+
+        target = destination[start:start + aligned_size]
+        if copy:
+            comfy.model_management.cast_to_gathered([value], target, non_blocking=True, stream=stream)
+        return comfy.memory_management.interpret_gathered_like([value], target)[0]
+
+    if isinstance(value, weight_adapter.WeightAdapterBase):
+        # Rebuild the same adapter type around the converted weights.
+        return type(value)(value.loaded_keys, prefetch_prepared_value(value.weights, counter, destination, stream, copy))
+
+    if isinstance(value, tuple):
+        return tuple(prefetch_prepared_value(member, counter, destination, stream, copy) for member in value)
+
+    if isinstance(value, list):
+        return [prefetch_prepared_value(member, counter, destination, stream, copy) for member in value]
+
+    return value
diff --git a/comfy/memory_management.py b/comfy/memory_management.py
index 48e3c11da..962addb27 100644
--- a/comfy/memory_management.py
+++ b/comfy/memory_management.py
@@ -1,6 +1,5 @@
import math
import ctypes
-import threading
import dataclasses
import torch
from typing import NamedTuple
@@ -10,12 +9,12 @@ from comfy.quant_ops import QuantizedTensor
class TensorFileSlice(NamedTuple):
file_ref: object
- thread_id: int
+ lock: object
offset: int
size: int
-def read_tensor_file_slice_into(tensor, destination):
+def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None):
if isinstance(tensor, QuantizedTensor):
if not isinstance(destination, QuantizedTensor):
@@ -23,12 +22,17 @@ def read_tensor_file_slice_into(tensor, destination):
if tensor._layout_cls != destination._layout_cls:
return False
- if not read_tensor_file_slice_into(tensor._qdata, destination._qdata):
+ if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream,
+ destination2=(destination2._qdata if destination2 is not None else None)):
return False
dst_orig_dtype = destination._params.orig_dtype
destination._params.copy_from(tensor._params, non_blocking=False)
destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype)
+ if destination2 is not None:
+ dst_orig_dtype = destination2._params.orig_dtype
+ destination2._params.copy_from(destination._params, non_blocking=True)
+ destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype)
return True
info = getattr(tensor.untyped_storage(), "_comfy_tensor_file_slice", None)
@@ -38,7 +42,6 @@ def read_tensor_file_slice_into(tensor, destination):
file_obj = info.file_ref
if (destination.device.type != "cpu"
or file_obj is None
- or threading.get_ident() != info.thread_id
or destination.numel() * destination.element_size() < info.size
or tensor.numel() * tensor.element_size() != info.size
or tensor.storage_offset() != 0
@@ -48,20 +51,33 @@ def read_tensor_file_slice_into(tensor, destination):
if info.size == 0:
return True
+ hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None)
+ if hostbuf is not None:
+ stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0
+ device_ptr = destination2.data_ptr() if destination2 is not None else 0
+ with info.lock:
+ hostbuf.read_file_slice(file_obj, info.offset, info.size,
+ offset=destination.data_ptr() - hostbuf.get_raw_address(),
+ stream=stream_ptr,
+ device_ptr=device_ptr,
+ device=None if destination2 is None else destination2.device.index)
+ return True
+
buf_type = ctypes.c_ubyte * info.size
view = memoryview(buf_type.from_address(destination.data_ptr()))
try:
- file_obj.seek(info.offset)
- done = 0
- while done < info.size:
- try:
- n = file_obj.readinto(view[done:])
- except OSError:
- return False
- if n <= 0:
- return False
- done += n
+ with info.lock:
+ file_obj.seek(info.offset)
+ done = 0
+ while done < info.size:
+ try:
+ n = file_obj.readinto(view[done:])
+ except OSError:
+ return False
+ if n <= 0:
+ return False
+ done += n
return True
finally:
view.release()
@@ -151,7 +167,7 @@ def set_ram_cache_release_state(callback, headroom):
extra_ram_release_callback = callback
RAM_CACHE_HEADROOM = max(0, int(headroom))
-def extra_ram_release(target):
+def extra_ram_release(target, free_active=False):
if extra_ram_release_callback is None:
return 0
- return extra_ram_release_callback(target)
+ return extra_ram_release_callback(target, free_active=free_active)
diff --git a/comfy/model_base.py b/comfy/model_base.py
index 5c2668ba9..205178911 100644
--- a/comfy/model_base.py
+++ b/comfy/model_base.py
@@ -35,6 +35,7 @@ import comfy.ldm.hydit.models
import comfy.ldm.audio.dit
import comfy.ldm.audio.embedders
import comfy.ldm.flux.model
+import comfy.ldm.lens.model
import comfy.ldm.lightricks.model
import comfy.ldm.hunyuan_video.model
import comfy.ldm.cosmos.model
@@ -42,18 +43,26 @@ import comfy.ldm.cosmos.predict2
import comfy.ldm.lumina.model
import comfy.ldm.wan.model
import comfy.ldm.wan.model_animate
+import comfy.ldm.wan.ar_model
+import comfy.ldm.wan.model_wandancer
import comfy.ldm.hunyuan3d.model
import comfy.ldm.hidream.model
import comfy.ldm.chroma.model
import comfy.ldm.chroma_radiance.model
+import comfy.ldm.pixeldit.model
+import comfy.ldm.pixeldit.pid
import comfy.ldm.ace.model
import comfy.ldm.omnigen.omnigen2
import comfy.ldm.qwen_image.model
import comfy.ldm.kandinsky5.model
import comfy.ldm.anima.model
import comfy.ldm.ace.ace_step15
+import comfy.ldm.cogvideo.model
import comfy.ldm.rt_detr.rtdetr_v4
import comfy.ldm.ernie.model
+import comfy.ldm.sam3.detector
+import comfy.ldm.hidream_o1.model
+from comfy.ldm.hidream_o1.conditioning import build_extra_conds
import comfy.model_management
import comfy.patcher_extension
@@ -80,6 +89,7 @@ class ModelType(Enum):
IMG_TO_IMG = 9
FLOW_COSMOS = 10
IMG_TO_IMG_FLOW = 11
+ V_PREDICTION_DDPM = 12
def model_sampling(model_config, model_type):
@@ -114,6 +124,8 @@ def model_sampling(model_config, model_type):
s = comfy.model_sampling.ModelSamplingCosmosRFlow
elif model_type == ModelType.IMG_TO_IMG_FLOW:
c = comfy.model_sampling.IMG_TO_IMG_FLOW
+ elif model_type == ModelType.V_PREDICTION_DDPM:
+ c = comfy.model_sampling.V_PREDICTION_DDPM
class ModelSampling(s, c):
pass
@@ -209,6 +221,11 @@ class BaseModel(torch.nn.Module):
if "latent_shapes" in extra_conds:
xc = utils.unpack_latents(xc, extra_conds.pop("latent_shapes"))
+ transformer_options = transformer_options.copy()
+ transformer_options["prefetch_dynamic_vbars"] = (
+ self.current_patcher is not None and self.current_patcher.is_dynamic()
+ )
+
model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds)
if len(model_output) > 1 and not torch.is_tensor(model_output):
model_output, _ = utils.pack_latents(model_output)
@@ -578,8 +595,8 @@ class Stable_Zero123(BaseModel):
def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
super().__init__(model_config, model_type, device=device)
self.cc_projection = comfy.ops.manual_cast.Linear(cc_projection_weight.shape[1], cc_projection_weight.shape[0], dtype=self.get_dtype(), device=device)
- self.cc_projection.weight.copy_(cc_projection_weight)
- self.cc_projection.bias.copy_(cc_projection_bias)
+ self.cc_projection.weight = torch.nn.Parameter(cc_projection_weight.clone())
+ self.cc_projection.bias = torch.nn.Parameter(cc_projection_bias.clone())
def extra_conds(self, **kwargs):
out = {}
@@ -799,6 +816,85 @@ class StableAudio1(BaseModel):
sd["{}{}".format(k, l)] = s[l]
return sd
+class StableAudio3(BaseModel):
+    """BaseModel wrapper around ``AudioDiffusionTransformer`` with a seconds-total conditioner.
+
+    Holds a ``NumberConditioner`` for the clip duration and, optionally, a
+    padding embedding used to pad the text conditioning to a fixed token count.
+    """
+
+    def __init__(self, model_config, seconds_total_embedder_weights, padding_embedding=None, model_type=ModelType.FLOW, device=None):
+        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.audio.dit.AudioDiffusionTransformer)
+        self.seconds_total_embedder = comfy.ldm.audio.embedders.NumberConditioner(768, min_val=0, max_val=384, fourier_features_type=model_config.unet_config["timestep_features_type"])
+        self.seconds_total_embedder.load_state_dict(seconds_total_embedder_weights)
+        if padding_embedding is not None:
+            self.padding_embedding = torch.nn.Parameter(padding_embedding, requires_grad=False)
+        else:
+            self.padding_embedding = None
+
+    def concat_cond(self, **kwargs):
+        """Return ``cat(mask, image)`` along the channel axis, on the device of ``noise``.
+
+        A missing image becomes zeros shaped like ``noise``; a missing mask
+        becomes a single zero channel. A supplied mask is averaged to one
+        channel if needed and inverted.
+        """
+        noise = kwargs.get("noise", None)
+        image = kwargs.get("concat_latent_image", None)
+
+        if image is None:
+            shape_image = list(noise.shape)
+            image = torch.zeros(shape_image, dtype=noise.dtype, layout=noise.layout, device=noise.device)
+        else:
+            # Supplied tensors may live on another device than ``noise``;
+            # torch.cat below requires a single device.
+            image = self.process_latent_in(image.to(noise.device))
+            # TODO: scale if not match
+            image = utils.resize_to_batch_size(image, noise.shape[0])
+
+        mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
+        if mask is None:
+            mask = torch.zeros_like(noise)[:, :1]
+        else:
+            mask = mask.to(noise.device)
+            if mask.shape[1] != 1:
+                mask = torch.mean(mask, dim=1, keepdim=True)
+            mask = 1.0 - mask
+            # TODO: scale if not match
+            mask = utils.resize_to_batch_size(mask, noise.shape[0])
+
+        return torch.cat((mask, image), dim=1)
+
+    def extra_conds(self, **kwargs):
+        """Build ``local_add_cond``, ``global_embed`` and, when text is given, ``c_crossattn``.
+
+        ``seconds_total`` falls back to ``int(noise.shape[-1] / 10.7666)`` when
+        it is absent or ``None``; the fallback is only evaluated in that case.
+        """
+        out = {}
+
+        concat_cond = self.concat_cond(**kwargs)
+        if concat_cond is not None:
+            out['local_add_cond'] = comfy.conds.CONDNoiseShape(concat_cond)
+
+        noise = kwargs.get("noise", None)
+        device = kwargs["device"]
+
+        seconds_total = kwargs.get("seconds_total", None)
+        if seconds_total is None:
+            # NOTE(review): 10.7666 looks like latent steps per second — confirm.
+            seconds_total = int(noise.shape[-1] / 10.7666)
+        seconds_total_embed = self.seconds_total_embedder([seconds_total])[0].to(device)
+
+        global_embed = seconds_total_embed.reshape((1, -1))
+        out['global_embed'] = comfy.conds.CONDRegular(global_embed)
+
+        cross_attn = kwargs.get("cross_attn", None)
+        if cross_attn is not None:
+            cross_attn = cross_attn.to(device)
+            if self.padding_embedding is not None:
+                # Pad the text tokens up to max_text_tokens with the padding embedding.
+                pe = self.padding_embedding.to(device=device, dtype=cross_attn.dtype)
+                max_text_tokens = self.model_config.unet_config.get("max_text_tokens", 256)
+                n_text = cross_attn.shape[1]
+                if n_text < max_text_tokens:
+                    pad = pe.view(1, 1, -1).expand(cross_attn.shape[0], max_text_tokens - n_text, -1)
+                    cross_attn = torch.cat([cross_attn, pad], dim=1)
+            # The duration embedding is appended after the (padded) text tokens.
+            cross_attn = torch.cat([cross_attn, seconds_total_embed.repeat((cross_attn.shape[0], 1, 1))], dim=1)
+            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
+
+        return out
+
+    def state_dict_for_saving(self, unet_state_dict, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
+        """Extend the base state dict with the seconds-total conditioner and the padding embedding."""
+        sd = super().state_dict_for_saving(unet_state_dict, clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)
+
+        prefix = "conditioner.conditioners.seconds_total."
+        for name, tensor in self.seconds_total_embedder.state_dict().items():
+            sd["{}{}".format(prefix, name)] = tensor
+
+        if self.padding_embedding is not None:
+            sd["conditioner.conditioners.prompt.padding_embedding"] = self.padding_embedding.data
+        return sd
+
class HunyuanDiT(BaseModel):
def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
@@ -965,6 +1061,27 @@ class Flux2(Flux):
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
return out
+
+class Lens(BaseModel):
+    """BaseModel wrapper around ``LensTransformer2DModel``."""
+
+    def __init__(self, model_config, model_type=ModelType.FLUX, device=None):
+        unet_cls = comfy.ldm.lens.model.LensTransformer2DModel
+        super().__init__(model_config, model_type, device=device, unet_model=unet_cls)
+
+    def encode_adm(self, **kwargs):
+        # Lens has no pooled/ADM conditioning.
+        return None
+
+    def extra_conds(self, **kwargs):
+        """Add ``c_crossattn`` and ``attention_mask`` to the base conds when the matching kwargs are present."""
+        out = super().extra_conds(**kwargs)
+        # (kwarg name, cond name) pairs, in the order they are added.
+        for kwarg_name, cond_name in (("cross_attn", "c_crossattn"), ("attention_mask", "attention_mask")):
+            tensor = kwargs.get(kwarg_name, None)
+            if tensor is not None:
+                out[cond_name] = comfy.conds.CONDRegular(tensor)
+        return out
+
class GenmoMochi(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.genmo.joint_model.asymm_models_joint.AsymmDiTJoint)
@@ -1282,6 +1399,53 @@ class ZImagePixelSpace(Lumina2):
BaseModel.__init__(self, model_config, model_type, device=device, unet_model=comfy.ldm.lumina.model.NextDiTPixelSpace)
self.memory_usage_factor_conds = ("ref_latents",)
+
+class PixelDiTT2I(BaseModel):
+    """BaseModel wrapper around ``PixDiT_T2I``."""
+
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        unet_cls = comfy.ldm.pixeldit.model.PixDiT_T2I
+        super().__init__(model_config, model_type, device=device, unet_model=unet_cls)
+
+    def extra_conds(self, **kwargs):
+        """Add ``attention_mask`` to the base conds when it is supplied."""
+        out = super().extra_conds(**kwargs)
+        mask = kwargs.get("attention_mask", None)
+        if mask is None:
+            return out
+        out["attention_mask"] = comfy.conds.CONDRegular(mask)
+        return out
+
+
+class PiD(PixelDiTT2I):
+    """PixelDiTT2I variant backed by ``PidNet`` that also forwards ``lq_latent`` and ``degrade_sigma``."""
+
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        # Bypasses PixelDiTT2I.__init__, which would pass PixDiT_T2I as unet_model.
+        unet_cls = comfy.ldm.pixeldit.pid.PidNet
+        BaseModel.__init__(self, model_config, model_type, device=device, unet_model=unet_cls)
+
+    def extra_conds(self, **kwargs):
+        """Add ``lq_latent`` and ``degrade_sigma`` to the parent's conds when supplied."""
+        out = super().extra_conds(**kwargs)
+        for name in ("lq_latent", "degrade_sigma"):
+            tensor = kwargs.get(name, None)
+            if tensor is not None:
+                out[name] = comfy.conds.CONDRegular(tensor)
+        return out
+
+    def resize_cond_for_context_window(self, cond_key, cond_value, window, x_in, device, retain_index_list=[]):
+        """Slice the ``lq_latent`` cond to the entries matching a context window.
+
+        Window indices are divided by ``sr_scale * latent_spatial_down_factor``
+        of ``lq_proj`` to map them onto the ``lq_latent`` axis. Returns ``None``
+        when the window axis does not exist on the tensor or no index falls
+        inside it. Any other cond is delegated to the parent implementation.
+        """
+        lq = getattr(cond_value, "cond", None) if cond_key == "lq_latent" else None
+        if not isinstance(lq, torch.Tensor):
+            return super().resize_cond_for_context_window(cond_key, cond_value, window, x_in, device, retain_index_list=retain_index_list)
+
+        dim = window.dim
+        if dim >= lq.ndim:
+            return None
+
+        lq_proj = self.diffusion_model.lq_proj
+        ratio = lq_proj.sr_scale * lq_proj.latent_spatial_down_factor
+
+        # Map x window indices -> lq indices (deduplicated, sorted, in-bounds).
+        lq_size = lq.size(dim)
+        selected = set()
+        for x_index in window.index_list:
+            lq_index = x_index // ratio
+            if 0 <= lq_index < lq_size:
+                selected.add(lq_index)
+        if not selected:
+            return None
+
+        # Full slices on the leading axes, index list on the window axis.
+        idx = (slice(None),) * dim + (sorted(selected),)
+        return cond_value._copy_with(lq[idx].to(device))
+
+
class WAN21(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.WanModel)
@@ -1355,6 +1519,13 @@ class WAN21(BaseModel):
return out
+class WAN21_CausalAR(WAN21):
+    """WAN21 variant whose diffusion model is ``CausalWanModel``; ``image_to_video`` is always False."""
+
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        # super(WAN21, self) skips WAN21.__init__ so the unet_model can be chosen here.
+        unet_cls = comfy.ldm.wan.ar_model.CausalWanModel
+        super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=unet_cls)
+        self.image_to_video = False
+
+
class WAN21_Vace(WAN21):
def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=False, device=None):
super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=comfy.ldm.wan.model.VaceWanModel)
@@ -1581,6 +1752,30 @@ class WAN21_SCAIL(WAN21):
return out
+class WAN22_WanDancer(WAN21):
+    """WAN21 variant whose diffusion model is ``WanDancerModel``."""
+
+    def __init__(self, model_config, model_type=ModelType.FLOW, image_to_video=True, device=None):
+        # super(WAN21, self) skips WAN21.__init__ so the unet_model can be chosen here.
+        unet_cls = comfy.ldm.wan.model_wandancer.WanDancerModel
+        super(WAN21, self).__init__(model_config, model_type, device=device, unet_model=unet_cls)
+        self.image_to_video = image_to_video
+
+    def extra_conds(self, **kwargs):
+        """Add the WanDancer conds (music features, reference CLIP features, fps, injection scale) when supplied."""
+        out = super().extra_conds(**kwargs)
+
+        music = kwargs.get("audio_embed", None)
+        if music is not None:
+            out['audio_embed'] = comfy.conds.CONDRegular(music)
+
+        ref_clip = kwargs.get("clip_vision_output_ref", None)
+        if ref_clip is not None:
+            out['clip_fea_ref'] = comfy.conds.CONDRegular(ref_clip.penultimate_hidden_states)
+
+        # Scalar settings are wrapped in one-element float tensors.
+        for name in ("fps", "audio_inject_scale"):
+            scalar = kwargs.get(name, None)
+            if scalar is not None:
+                out[name] = comfy.conds.CONDRegular(torch.FloatTensor([scalar]))
+        return out
+
class Hunyuan3Dv2(BaseModel):
def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.hunyuan3d.model.Hunyuan3Dv2)
@@ -1631,6 +1826,39 @@ class HiDream(BaseModel):
out['image_cond'] = comfy.conds.CONDNoiseShape(self.process_latent_in(image_cond))
return out
+class HiDreamO1(BaseModel):
+    """HiDream-O1-Image: pixel-space DiT (no VAE).
+
+    Reference images from HiDreamO1ReferenceImages and tokens from the stub
+    text encoder flow through ``extra_conds``; the heavy preprocessing lives
+    in ``comfy.ldm.hidream_o1.conditioning``.
+    """
+    PATCH_SIZE = 32
+
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        unet_cls = comfy.ldm.hidream_o1.model.HiDreamO1Transformer
+        super().__init__(model_config, model_type, device=device, unet_model=unet_cls)
+
+    def extra_conds(self, **kwargs):
+        """Add the conds produced by ``build_extra_conds``; base conds only if tokens or noise are missing."""
+        out = super().extra_conds(**kwargs)
+        text_input_ids = kwargs.get("text_input_ids", None)
+        noise = kwargs.get("noise", None)
+        if text_input_ids is None or noise is None:
+            return out
+
+        # handle area conds: replace noise with an uninitialised 3-channel
+        # tensor of the cropped size.
+        area = kwargs.get("area", None)
+        if area is not None:
+            height = min(noise.shape[-2] - area[2], area[0])
+            width = min(noise.shape[-1] - area[3], area[1])
+            noise = torch.empty((noise.shape[0], 3, height, width), dtype=noise.dtype, device=noise.device)
+
+        conds = build_extra_conds(
+            text_input_ids, noise,
+            ref_images=kwargs.get("reference_latents", None),
+            target_patch_size=self.PATCH_SIZE,
+        )
+        for name, value in conds.items():
+            if name == "ar_len":
+                # ar_len is a Python int (precomputed to avoid a GPU sync in forward).
+                out[name] = comfy.conds.CONDConstant(value)
+            else:
+                out[name] = comfy.conds.CONDRegular(value)
+        return out
+
class Chroma(Flux):
def __init__(self, model_config, model_type=ModelType.FLUX, device=None, unet_model=comfy.ldm.chroma.model.Chroma):
super().__init__(model_config, model_type, device=device, unet_model=unet_model)
@@ -1974,3 +2202,63 @@ class ErnieImage(BaseModel):
if cross_attn is not None:
out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
return out
+
+class SAM3(BaseModel):
+    """BaseModel wrapper around ``SAM3Model``."""
+
+    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
+        detector_cls = comfy.ldm.sam3.detector.SAM3Model
+        super().__init__(model_config, model_type, device=device, unet_model=detector_cls)
+
+class CogVideoX(BaseModel):
+    """BaseModel wrapper around ``CogVideoXTransformer3DModel``."""
+
+    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_DDPM, image_to_video=False, device=None):
+        unet_cls = comfy.ldm.cogvideo.model.CogVideoXTransformer3DModel
+        super().__init__(model_config, model_type, device=device, unet_model=unet_cls)
+        self.image_to_video = image_to_video
+
+    def concat_cond(self, **kwargs):
+        """Return the extra input channels the network expects beyond ``noise``.
+
+        Returns ``None`` when the model takes no extra channels, zeros when no
+        ``concat_latent_image`` is given, and otherwise the image resized to
+        the noise resolution, matched to its frame count, passed through
+        ``process_latent_in`` and cut or tiled to the required channel count.
+        """
+        noise = kwargs.get("noise", None)
+        # Detect extra channels needed (e.g. 32 - 16 = 16 for ref latent)
+        extra_channels = self.diffusion_model.in_channels - noise.shape[1]
+        if extra_channels == 0:
+            return None
+
+        image = kwargs.get("concat_latent_image", None)
+        device = kwargs["device"]
+
+        if image is None:
+            zeros_shape = list(noise.shape)
+            zeros_shape[1] = extra_channels
+            return torch.zeros(zeros_shape, dtype=noise.dtype, layout=noise.layout, device=noise.device)
+
+        latent_dim = self.latent_format.latent_channels
+        image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")
+
+        # For video latents, zero-pad or truncate the frame axis to match noise.
+        if noise.ndim == 5 and image.ndim == 5:
+            missing_frames = noise.shape[-3] - image.shape[-3]
+            if missing_frames > 0:
+                image = torch.nn.functional.pad(image, (0, 0, 0, 0, 0, missing_frames), "constant", 0)
+            elif missing_frames < 0:
+                image = image[:, :, :noise.shape[-3]]
+
+        # process_latent_in is applied to one latent-sized channel group at a time.
+        for start in range(0, image.shape[1], latent_dim):
+            stop = start + latent_dim
+            image[:, start:stop] = self.process_latent_in(image[:, start:stop])
+        image = utils.resize_to_batch_size(image, noise.shape[0])
+
+        channels = image.shape[1]
+        if channels > extra_channels:
+            image = image[:, :extra_channels]
+        elif channels < extra_channels:
+            # Tile whole copies, then a partial copy for the remainder.
+            repeats, remainder = divmod(extra_channels, channels)
+            parts = [image] * repeats
+            if remainder > 0:
+                parts.append(image[:, :remainder])
+            image = torch.cat(parts, dim=1)
+
+        return image
+
+    def extra_conds(self, **kwargs):
+        """Add the ``ofs`` cond when the diffusion model defines ``ofs_proj_dim``."""
+        out = super().extra_conds(**kwargs)
+        if self.diffusion_model.ofs_proj_dim is None:
+            return out
+
+        # OFS embedding (CogVideoX 1.5 I2V), default 2.0 as used by SparkVSR
+        ofs = kwargs.get("ofs", None)
+        if ofs is None:
+            noise = kwargs.get("noise", None)
+            ofs = torch.full((noise.shape[0],), 2.0, device=noise.device, dtype=noise.dtype)
+        out['ofs'] = comfy.conds.CONDRegular(ofs)
+        return out
diff --git a/comfy/model_detection.py b/comfy/model_detection.py
index ca06cdd1e..f0db7d388 100644
--- a/comfy/model_detection.py
+++ b/comfy/model_detection.py
@@ -116,6 +116,45 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
if '{}transformer.rotary_pos_emb.inv_freq'.format(key_prefix) in state_dict_keys: #stable audio dit
unet_config = {}
unet_config["audio_model"] = "dit1.0"
+ unet_config["global_cond_dim"] = state_dict['{}to_global_embed.0.weight'.format(key_prefix)].shape[1]
+ cond_embed = state_dict['{}to_cond_embed.0.weight'.format(key_prefix)]
+ unet_config["project_cond_tokens"] = cond_embed.shape[0] != cond_embed.shape[1]
+ unet_config["embed_dim"] = state_dict['{}to_timestep_embed.0.weight'.format(key_prefix)].shape[0]
+ mem_tokens = state_dict.get('{}transformer.memory_tokens'.format(key_prefix), None)
+ to_qkv = state_dict.get('{}transformer.layers.0.self_attn.to_qkv.weight'.format(key_prefix), None)
+ differential = False
+ if to_qkv is not None:
+ if to_qkv.shape[0] == to_qkv.shape[1] * 5:
+ differential = True
+ if mem_tokens is not None:
+ unet_config["num_memory_tokens"] = mem_tokens.shape[0]
+ if '{}transformer.layers.0.self_attn.q_norm.weight'.format(key_prefix) in state_dict:
+ unet_config["attn_kwargs"] = {"qk_norm": "ln", "feat_scale": True}
+ rms_norm = state_dict.get('{}transformer.layers.0.self_attn.q_norm.gamma'.format(key_prefix), None)
+ if rms_norm is not None:
+ unet_config["attn_kwargs"] = {"qk_norm": "rms", "differential": differential}
+ unet_config["norm_type"] = "rms_norm"
+ unet_config["num_heads"] = unet_config["embed_dim"] // rms_norm.shape[0]
+
+ if '{}timestep_features.weight'.format(key_prefix) in state_dict:
+ unet_config["timestep_features_type"] = "learned"
+ else:
+ unet_config["timestep_features_type"] = "expo"
+
+ io_channels = state_dict['{}postprocess_conv.weight'.format(key_prefix)].shape[0]
+ unet_config["io_channels"] = io_channels
+ unet_config["input_concat_dim"] = state_dict['{}transformer.project_in.weight'.format(key_prefix)].shape[1] - io_channels
+
+ local_add_cond = state_dict.get('{}transformer.layers.0.to_local_embed.0.weight'.format(key_prefix), None)
+ if local_add_cond is not None:
+ unet_config["local_add_cond_dim"] = local_add_cond.shape[1]
+
+ global_cond_embed = state_dict.get('{}transformer.global_cond_embedder.0.weight'.format(key_prefix), None)
+ if global_cond_embed is not None:
+ unet_config["global_cond_shared_embed"] = True
+ unet_config["global_cond_type"] = "adaLN"
+
+ unet_config["depth"] = count_blocks(state_dict_keys, '{}transformer.layers.'.format(key_prefix) + '{}.')
return unet_config
if '{}double_layers.0.attn.w1q.weight'.format(key_prefix) in state_dict_keys: #aura flow dit
@@ -424,6 +463,23 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["extra_per_block_abs_pos_emb_type"] = "learnable"
return dit_config
+ # PiD (Pixel Diffusion Decoder). Must check BEFORE plain PixelDiT_T2I.
+ _lq_w_key = '{}lq_proj.latent_proj.0.weight'.format(key_prefix)
+ if _lq_w_key in state_dict_keys:
+ in_ch = int(state_dict[_lq_w_key].shape[1])
+ _gate_prefix = '{}lq_proj.gate_modules.'.format(key_prefix)
+ num_gates = len({k[len(_gate_prefix):].split('.')[0]
+ for k in state_dict_keys if k.startswith(_gate_prefix)})
+ dit_config = {"image_model": "pid",
+ "lq_latent_channels": in_ch,
+ "latent_spatial_down_factor": 16 if in_ch >= 64 else 8}
+ if num_gates > 0:
+ dit_config["lq_interval"] = (14 + num_gates - 1) // num_gates
+ return dit_config
+
+ if '{}core.pixel_embedder.proj.weight'.format(key_prefix) in state_dict_keys: # PixelDiT T2I
+ return {"image_model": "pixeldit_t2i"}
+
if '{}cap_embedder.1.weight'.format(key_prefix) in state_dict_keys and '{}noise_refiner.0.attention.k_norm.weight'.format(key_prefix) in state_dict_keys: # Lumina 2
dit_config = {}
dit_config["image_model"] = "lumina2"
@@ -490,6 +546,54 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
return dit_config
+ if '{}blocks.0.norm1.linear.weight'.format(key_prefix) in state_dict_keys: # CogVideoX
+ dit_config = {}
+ dit_config["image_model"] = "cogvideox"
+
+ # Extract config from weight shapes
+ norm1_weight = state_dict['{}blocks.0.norm1.linear.weight'.format(key_prefix)]
+ time_embed_dim = norm1_weight.shape[1]
+ dim = norm1_weight.shape[0] // 6
+
+ dit_config["num_attention_heads"] = dim // 64
+ dit_config["attention_head_dim"] = 64
+ dit_config["time_embed_dim"] = time_embed_dim
+ dit_config["num_layers"] = count_blocks(state_dict_keys, '{}blocks.'.format(key_prefix) + '{}.')
+
+ # Detect in_channels from patch_embed
+ patch_proj_key = '{}patch_embed.proj.weight'.format(key_prefix)
+ if patch_proj_key in state_dict_keys:
+ w = state_dict[patch_proj_key]
+ if w.ndim == 4:
+ # Conv2d: [out, in, kh, kw] — CogVideoX 1.0
+ dit_config["in_channels"] = w.shape[1]
+ dit_config["patch_size"] = w.shape[2]
+ elif w.ndim == 2:
+ # Linear: [out, in_channels * patch_size * patch_size * patch_size_t] — CogVideoX 1.5
+ dit_config["patch_size"] = 2
+ dit_config["patch_size_t"] = 2
+ dit_config["in_channels"] = w.shape[1] // (2 * 2 * 2) # 256 // 8 = 32
+
+ text_proj_key = '{}patch_embed.text_proj.weight'.format(key_prefix)
+ if text_proj_key in state_dict_keys:
+ dit_config["text_embed_dim"] = state_dict[text_proj_key].shape[1]
+
+ # Detect OFS embedding
+ ofs_key = '{}ofs_embedding_linear_1.weight'.format(key_prefix)
+ if ofs_key in state_dict_keys:
+ dit_config["ofs_embed_dim"] = state_dict[ofs_key].shape[1]
+
+ # Detect positional embedding type
+ pos_key = '{}patch_embed.pos_embedding'.format(key_prefix)
+ if pos_key in state_dict_keys:
+ dit_config["use_learned_positional_embeddings"] = True
+ dit_config["use_rotary_positional_embeddings"] = False
+ else:
+ dit_config["use_learned_positional_embeddings"] = False
+ dit_config["use_rotary_positional_embeddings"] = True
+
+ return dit_config
+
if '{}head.modulation'.format(key_prefix) in state_dict_keys: # Wan 2.1
dit_config = {}
dit_config["image_model"] = "wan2.1"
@@ -524,6 +628,8 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["model_type"] = "animate"
elif '{}patch_embedding_pose.weight'.format(key_prefix) in state_dict_keys:
dit_config["model_type"] = "scail"
+ elif '{}patch_embedding_global.weight'.format(key_prefix) in state_dict_keys:
+ dit_config["model_type"] = "wandancer"
else:
if '{}img_emb.proj.0.bias'.format(key_prefix) in state_dict_keys:
dit_config["model_type"] = "i2v"
@@ -570,6 +676,9 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["guidance_cond_proj_dim"] = None#f"{key_prefix}t_embedder.cond_proj.weight" in state_dict_keys
return dit_config
+ if '{}t_embedder1.mlp.0.weight'.format(key_prefix) in state_dict_keys and '{}x_embedder.proj1.weight'.format(key_prefix) in state_dict_keys: # HiDream-O1
+ return {"image_model": "hidream_o1"}
+
if '{}caption_projection.0.linear.weight'.format(key_prefix) in state_dict_keys: # HiDream
dit_config = {}
dit_config["image_model"] = "hidream"
@@ -663,6 +772,30 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["timestep_scale"] = 1000.0
return dit_config
+ if '{}transformer_blocks.0.attn.norm_added_q.weight'.format(key_prefix) in state_dict_keys \
+ and '{}transformer_blocks.0.img_mlp.w1.weight'.format(key_prefix) in state_dict_keys: # Lens
+ img_in_w = state_dict['{}img_in.weight'.format(key_prefix)]
+ proj_out_w = state_dict['{}proj_out.weight'.format(key_prefix)]
+ multi_layer = '{}txt_norm.0.weight'.format(key_prefix) in state_dict_keys
+ if multi_layer:
+ enc_hidden_dim = state_dict['{}txt_norm.0.weight'.format(key_prefix)].shape[0]
+ # Indices are TE-side; the DiT just consumes L layers in order.
+ selected_layer_index = tuple(range(count_blocks(state_dict_keys, '{}txt_norm.'.format(key_prefix) + '{}.')))
+ else:
+ enc_hidden_dim = state_dict['{}txt_norm.weight'.format(key_prefix)].shape[0]
+ selected_layer_index = (0,)
+
+ return {
+ "image_model": "lens",
+ "in_channels": img_in_w.shape[1],
+ "out_channels": proj_out_w.shape[0] // 4, # patch_size ** 2 (=2² default)
+ "num_layers": count_blocks(state_dict_keys, '{}transformer_blocks.'.format(key_prefix) + '{}.'),
+ "num_attention_heads": img_in_w.shape[0] // 64, # // attention_head_dim default
+ "enc_hidden_dim": enc_hidden_dim,
+ "multi_layer_encoder_feature": multi_layer,
+ "selected_layer_index": selected_layer_index,
+ }
+
if '{}txt_norm.weight'.format(key_prefix) in state_dict_keys: # Qwen Image
dit_config = {}
dit_config["image_model"] = "qwen_image"
@@ -718,6 +851,14 @@ def detect_unet_config(state_dict, key_prefix, metadata=None):
dit_config["image_model"] = "ernie"
return dit_config
+ if 'detector.backbone.vision_backbone.trunk.blocks.0.attn.qkv.weight' in state_dict_keys: # SAM3 / SAM3.1
+ if 'detector.transformer.decoder.query_embed.weight' in state_dict_keys:
+ dit_config = {}
+ dit_config["image_model"] = "SAM3"
+ if 'detector.backbone.vision_backbone.propagation_convs.0.conv_1x1.weight' in state_dict_keys:
+ dit_config["image_model"] = "SAM31"
+ return dit_config
+
if '{}input_blocks.0.0.weight'.format(key_prefix) not in state_dict_keys:
return None
@@ -873,6 +1014,10 @@ def model_config_from_unet(state_dict, unet_key_prefix, use_base_if_no_match=Fal
return model_config
def unet_prefix_from_state_dict(state_dict):
+ # SAM3: detector.* and tracker.* at top level, no common prefix
+ if any(k.startswith("detector.") for k in state_dict) and any(k.startswith("tracker.") for k in state_dict):
+ return ""
+
candidates = ["model.diffusion_model.", #ldm/sgm models
"model.model.", #audio models
"net.", #cosmos
diff --git a/comfy/model_management.py b/comfy/model_management.py
index bcf1399c4..b01c4d7fa 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -15,6 +15,7 @@
You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
+from __future__ import annotations
import psutil
import logging
@@ -27,10 +28,17 @@ import platform
import weakref
import gc
import os
-from contextlib import nullcontext
+from contextlib import contextmanager, nullcontext
import comfy.memory_management
import comfy.utils
import comfy.quant_ops
+import comfy_aimdo.host_buffer
+import comfy_aimdo.vram_buffer
+
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+ from comfy.model_patcher import ModelPatcher
+
class VRAMState(Enum):
DISABLED = 0 #No vram present: no need to move models to vram
@@ -112,10 +120,6 @@ if args.directml is not None:
# torch_directml.disable_tiled_resources(True)
lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default.
-try:
- import intel_extension_for_pytorch as ipex # noqa: F401
-except:
- pass
try:
_ = torch.xpu.device_count()
@@ -206,6 +210,107 @@ def get_torch_device():
else:
return torch.device(torch.cuda.current_device())
+def get_all_torch_devices(exclude_current=False):
+    """List every torch device of the active backend.
+
+    In GPU mode the CUDA (NVIDIA and AMD/ROCm), XPU, NPU or MLU devices are
+    enumerated by index; any other GPU backend, and every non-GPU mode, yields
+    just ``get_torch_device()``. With ``exclude_current`` the device returned
+    by ``get_torch_device()`` is removed from the result when it is present.
+    """
+    global cpu_state
+
+    def _indexed(kind, count):
+        return [torch.device(kind, idx) for idx in range(count)]
+
+    if cpu_state != CPUState.GPU:
+        found = [get_torch_device()]
+    elif is_nvidia() or is_amd():
+        # NVIDIA + AMD/ROCm both expose their GPUs through torch.cuda.*;
+        # without the AMD arm, single-GPU ROCm users get an empty list
+        # which silently turns unload_all_models() into a no-op.
+        found = _indexed("cuda", torch.cuda.device_count())
+    elif is_intel_xpu():
+        found = _indexed("xpu", torch.xpu.device_count())
+    elif is_ascend_npu():
+        found = _indexed("npu", torch.npu.device_count())
+    elif is_mlu():
+        found = _indexed("mlu", torch.mlu.device_count())
+    else:
+        # Fallback for unhandled GPU backends (e.g. DirectML): at least
+        # report the current device so callers like unload_all_models()
+        # do not silently no-op.
+        found = [get_torch_device()]
+
+    if exclude_current:
+        active = get_torch_device()
+        if active in found:
+            found.remove(active)
+    return found
+
+def get_gpu_device_options():
+    """Build the device choices shown in node widgets.
+
+    The list always starts with "default" and "cpu". Only when more than one
+    device is reported by ``get_all_torch_devices()`` are the vendor-agnostic
+    labels "gpu:0", "gpu:1", ... appended, one per device.
+    """
+    device_count = len(get_all_torch_devices())
+    labels = ["default", "cpu"]
+    if device_count > 1:
+        labels.extend(f"gpu:{index}" for index in range(device_count))
+    return labels
+
+def get_gpu_device_options_no_cpu():
+    """Same choices as ``get_gpu_device_options()`` with the "cpu" entry dropped.
+
+    Meant for components such as the VAE selector, where running on CPU is
+    impractical and should not be offered.
+    """
+    choices = get_gpu_device_options()
+    return [choice for choice in choices if choice != "cpu"]
+
+def resolve_gpu_device_option(option: str):
+    """Translate a widget device option into a ``torch.device``.
+
+    * ``None`` / "default" -> ``None`` (the caller keeps its normal default).
+    * "cpu" -> ``torch.device("cpu")``.
+    * "gpu:N" -> the Nth entry of ``get_all_torch_devices()``.
+
+    ``None`` is also returned when N is not an integer, N is out of range, or
+    the option is not recognized; callers are expected to log their own
+    context-rich message before falling back to the default device.
+    """
+    if option is None or option == "default":
+        return None
+    if option == "cpu":
+        return torch.device("cpu")
+    if not option.startswith("gpu:"):
+        return None
+
+    try:
+        wanted = int(option[4:])
+    except ValueError:
+        return None
+
+    available = get_all_torch_devices()
+    return available[wanted] if 0 <= wanted < len(available) else None
+
+@contextmanager
+def cuda_device_context(device):
+    """Temporarily make *device* the current CUDA device.
+
+    Used when running operations on a non-default CUDA device so that custom
+    CUDA kernels (e.g. comfy_kitchen fp8 quantization) pick up the correct
+    device index. The device that was current on entry is restored on exit.
+
+    Nothing is changed when *device* is not CUDA, has no explicit index, or
+    already is the current device.
+    """
+    restore_to = None
+    if device.type == "cuda" and device.index is not None:
+        active = torch.cuda.current_device()
+        if active != device.index:
+            torch.cuda.set_device(device)
+            # Only remember the old device when we actually switched away from it.
+            restore_to = active
+    try:
+        yield
+    finally:
+        if restore_to is not None:
+            torch.cuda.set_device(restore_to)
+
def get_total_memory(dev=None, torch_total_too=False):
global directml_enabled
if dev is None:
@@ -494,9 +599,21 @@ try:
logging.info("Device: {}".format(get_torch_device_name(get_torch_device())))
except:
logging.warning("Could not pick default device.")
+try:
+    # Also report every device besides the default one picked above.
+    for device in get_all_torch_devices(exclude_current=True):
+        logging.info("Device: {}".format(get_torch_device_name(device)))
+except Exception:
+    # Listing secondary devices is best-effort and must never abort startup,
+    # but leave a trace instead of swallowing the failure. Catching Exception
+    # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
+    logging.warning("Could not list additional devices.")
+current_loaded_models: list[LoadedModel] = []
-current_loaded_models = []
+DIRTY_MMAPS = set()
+
+PIN_PRESSURE_HYSTERESIS = 256 * 1024 * 1024
+
+#Freeing registerables on pressure does imply a GPU sync, so go big on
+#the hysteresis so each expensive sync gives us back a good chunk.
+REGISTERABLE_PIN_HYSTERESIS = 2048 * 1024 * 1024
def module_size(module):
module_mem = 0
@@ -506,30 +623,49 @@ def module_size(module):
module_mem += t.nbytes
return module_mem
-def module_mmap_residency(module, free=False):
- mmap_touched_mem = 0
- module_mem = 0
- bounced_mmaps = set()
- sd = module.state_dict()
- for k in sd:
- t = sd[k]
- module_mem += t.nbytes
- storage = t._qdata.untyped_storage() if isinstance(t, comfy.quant_ops.QuantizedTensor) else t.untyped_storage()
- if not getattr(storage, "_comfy_tensor_mmap_touched", False):
- continue
- mmap_touched_mem += t.nbytes
- if not free:
- continue
- storage._comfy_tensor_mmap_touched = False
- mmap_obj = storage._comfy_tensor_mmap_refs[0]
- if mmap_obj in bounced_mmaps:
- continue
- mmap_obj.bounce()
- bounced_mmaps.add(mmap_obj)
- return mmap_touched_mem, module_mem
+def mark_mmap_dirty(storage):
+    """Add the first mmap reference attached to ``storage`` to ``DIRTY_MMAPS``.
+
+    Storages without a ``_comfy_tensor_mmap_refs`` attribute (or with it set
+    to ``None``) are ignored.
+    """
+    refs = getattr(storage, "_comfy_tensor_mmap_refs", None)
+    if refs is None:
+        return
+    DIRTY_MMAPS.add(refs[0])
+
+def free_pins(size, evict_active=False):
+    """Ask dynamic models to give back RAM until ``size`` bytes are recovered.
+
+    Walks ``current_loaded_models`` from the end and calls
+    ``partially_unload_ram`` on each dynamic model whose pins for its load
+    device are not flagged "active" (or on every dynamic model when
+    ``evict_active`` is true). Stops once the outstanding amount reaches zero.
+
+    Returns the sum of the amounts reported by ``partially_unload_ram``.
+    """
+    remaining = size
+    released = 0
+    for entry in reversed(current_loaded_models):
+        if remaining <= 0:
+            break
+        patcher = entry.model
+        if patcher is None or not patcher.is_dynamic():
+            continue
+        if not evict_active and patcher.model.dynamic_pins[patcher.load_device]["active"]:
+            continue
+        got = patcher.partially_unload_ram(remaining)
+        released += got
+        remaining -= got
+    return released
+
+def ensure_pin_budget(size, evict_active=False):
+    """Check that ``size`` bytes fit in available RAM, freeing pins when they do not.
+
+    The requirement is ``size`` plus half of ``RAM_CACHE_HEADROOM``, compared
+    against ``psutil.virtual_memory().available``. On a deficit, ``free_pins``
+    is asked for the deficit plus ``PIN_PRESSURE_HYSTERESIS``.
+
+    Returns True when there was no deficit or ``free_pins`` reported at least
+    the deficit as freed.
+    """
+    available = psutil.virtual_memory().available
+    reserve = comfy.memory_management.RAM_CACHE_HEADROOM / 2
+    deficit = size + reserve - available
+    if deficit > 0:
+        # Over-request by the hysteresis, but success only needs the deficit itself.
+        target = deficit + PIN_PRESSURE_HYSTERESIS
+        return free_pins(target, evict_active=evict_active) >= deficit
+    return True
+
+def ensure_pin_registerable(size, evict_active=False):
+    """Check that ``size`` more bytes fit under ``MAX_PINNED_MEMORY``.
+
+    Returns False outright when ``MAX_PINNED_MEMORY`` is not positive and True
+    when ``TOTAL_PINNED_MEMORY + size`` already fits. Otherwise dynamic models
+    are asked, last loaded first, to ``unregister_inactive_pins`` until the
+    deficit plus ``REGISTERABLE_PIN_HYSTERESIS`` is covered. Models whose pins
+    are flagged "active" are only touched when ``evict_active`` is true.
+    """
+    deficit = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    if MAX_PINNED_MEMORY <= 0:
+        return False
+    if deficit <= 0:
+        return True
+
+    deficit += REGISTERABLE_PIN_HYSTERESIS
+    for entry in reversed(current_loaded_models):
+        patcher = entry.model
+        if patcher is None or not patcher.is_dynamic():
+            continue
+        if not evict_active and patcher.model.dynamic_pins[patcher.load_device]["active"]:
+            continue
+        deficit -= patcher.unregister_inactive_pins(deficit)
+        if deficit <= 0:
+            return True
+    # Covering the original deficit is enough; the hysteresis part is optional.
+    return deficit <= REGISTERABLE_PIN_HYSTERESIS
class LoadedModel:
- def __init__(self, model):
+ def __init__(self, model: ModelPatcher):
self._set_model(model)
self.device = model.load_device
self.real_model = None
@@ -537,7 +673,7 @@ class LoadedModel:
self.model_finalizer = None
self._patcher_finalizer = None
- def _set_model(self, model):
+ def _set_model(self, model: ModelPatcher):
self._model = weakref.ref(model)
if model.parent is not None:
self._parent_model = weakref.ref(model.parent)
@@ -548,6 +684,7 @@ class LoadedModel:
model = self._parent_model()
if model is not None:
self._set_model(model)
+ self.device = model.load_device
@property
def model(self):
@@ -556,9 +693,6 @@ class LoadedModel:
def model_memory(self):
return self.model.model_size()
- def model_mmap_residency(self, free=False):
- return self.model.model_mmap_residency(free=free)
-
def model_loaded_memory(self):
return self.model.loaded_size()
@@ -583,9 +717,6 @@ class LoadedModel:
real_model = self.model.model
- if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and real_model is not None:
- with torch.no_grad():
- real_model = ipex.optimize(real_model.eval(), inplace=True, graph_mode=True, concat_linear=True)
self.real_model = weakref.ref(real_model)
self.model_finalizer = weakref.finalize(real_model, cleanup_models)
@@ -641,15 +772,9 @@ WINDOWS = any(platform.win32_ver())
EXTRA_RESERVED_VRAM = 400 * 1024 * 1024
if WINDOWS:
- import comfy.windows
EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue
if total_vram > (15 * 1024): # more extra reserved vram on 16GB+ cards
EXTRA_RESERVED_VRAM += 100 * 1024 * 1024
- def get_free_ram():
- return comfy.windows.get_free_ram()
-else:
- def get_free_ram():
- return psutil.virtual_memory().available
if args.reserve_vram is not None:
EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024
@@ -678,11 +803,9 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins
for x in can_unload_sorted:
i = x[-1]
memory_to_free = 1e32
- pins_to_free = 1e32
- if not DISABLE_SMART_MEMORY or device is None:
+ if current_loaded_models[i].model.is_dynamic() and (not DISABLE_SMART_MEMORY or device is None):
memory_to_free = 0 if device is None else memory_required - get_free_memory(device)
- pins_to_free = pins_required - get_free_ram()
- if current_loaded_models[i].model.is_dynamic() and for_dynamic:
+ if for_dynamic:
#don't actually unload dynamic models for the sake of other dynamic models
#as that works on-demand.
memory_required -= current_loaded_models[i].model.loaded_size()
@@ -690,18 +813,6 @@ def free_memory(memory_required, device, keep_loaded=[], for_dynamic=False, pins
if memory_to_free > 0 and current_loaded_models[i].model_unload(memory_to_free):
logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
unloaded_model.append(i)
- if pins_to_free > 0:
- logging.debug(f"PIN Unloading {current_loaded_models[i].model.model.__class__.__name__}")
- current_loaded_models[i].model.partially_unload_ram(pins_to_free)
-
- for x in can_unload_sorted:
- i = x[-1]
- ram_to_free = ram_required - psutil.virtual_memory().available
- if ram_to_free <= 0 and i not in unloaded_model:
- continue
- resident_memory, _ = current_loaded_models[i].model_mmap_residency(free=True)
- if resident_memory > 0:
- logging.debug(f"RAM Unloading {current_loaded_models[i].model.model.__class__.__name__}")
for i in sorted(unloaded_model, reverse=True):
unloaded_models.append(current_loaded_models.pop(i))
@@ -726,13 +837,15 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
else:
minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory())
- models_temp = set()
+ # Order-preserving dedup. A plain set() would randomize iteration order across runs
+ models_temp = {}
for m in models:
- models_temp.add(m)
+ models_temp[m] = None
for mm in m.model_patches_models():
- models_temp.add(mm)
+ models_temp[mm] = None
- models = models_temp
+ models = list(models_temp)
+ models.reverse()
models_to_load = []
@@ -765,29 +878,16 @@ def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimu
model_to_unload.model.detach(unpatch_all=False)
model_to_unload.model_finalizer.detach()
-
total_memory_required = {}
- total_pins_required = {}
- total_ram_required = {}
for loaded_model in models_to_load:
device = loaded_model.device
total_memory_required[device] = total_memory_required.get(device, 0) + loaded_model.model_memory_required(device)
- resident_memory, model_memory = loaded_model.model_mmap_residency()
- pinned_memory = loaded_model.model.pinned_memory_size()
- #FIXME: This can over-free the pins as it budgets to pin the entire model. We should
- #make this JIT to keep as much pinned as possible.
- pins_required = model_memory - pinned_memory
- ram_required = model_memory - resident_memory
- total_pins_required[device] = total_pins_required.get(device, 0) + pins_required
- total_ram_required[device] = total_ram_required.get(device, 0) + ram_required
for device in total_memory_required:
if device != torch.device("cpu"):
free_memory(total_memory_required[device] * 1.1 + extra_mem,
device,
- for_dynamic=free_for_dynamic,
- pins_required=total_pins_required[device],
- ram_required=total_ram_required[device])
+ for_dynamic=free_for_dynamic)
for device in total_memory_required:
if device != torch.device("cpu"):
@@ -1181,6 +1281,11 @@ stream_counters = {}
STREAM_CAST_BUFFERS = {}
LARGEST_CASTED_WEIGHT = (None, 0)
+STREAM_AIMDO_CAST_BUFFERS = {}
+LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
+STREAM_PIN_BUFFERS = {}
+
+DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3
def get_cast_buffer(offload_stream, device, size, ref):
global LARGEST_CASTED_WEIGHT
@@ -1214,13 +1319,71 @@ def get_cast_buffer(offload_stream, device, size, ref):
return cast_buffer
+def get_aimdo_cast_buffer(offload_stream, device):
+    """Return the ``VRAMBuffer`` cached for ``offload_stream``, creating it on first use.
+
+    A new buffer is constructed with ``DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE``
+    and ``device.index`` and stored in ``STREAM_AIMDO_CAST_BUFFERS``.
+    """
+    buf = STREAM_AIMDO_CAST_BUFFERS.get(offload_stream, None)
+    if buf is not None:
+        return buf
+    buf = comfy_aimdo.vram_buffer.VRAMBuffer(DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE, device.index)
+    STREAM_AIMDO_CAST_BUFFERS[offload_stream] = buf
+    return buf
+
+def get_pin_buffer(offload_stream):
+    """Return the ``HostBuffer`` cached for ``offload_stream``, creating it on first use.
+
+    When an existing buffer is reused for a real (non-None) stream and it
+    carries a ``_comfy_event`` attribute, that event is synchronized and the
+    attribute removed before the buffer is returned.
+    """
+    buf = STREAM_PIN_BUFFERS.get(offload_stream, None)
+    if buf is None:
+        buf = comfy_aimdo.host_buffer.HostBuffer(0, 0, pinned_hostbuf_size(8 * 1024**3), mark_cold=False)
+        STREAM_PIN_BUFFERS[offload_stream] = buf
+        return buf
+    if offload_stream is None:
+        return buf
+
+    pending = getattr(buf, "_comfy_event", None)
+    if pending is not None:
+        pending.synchronize()
+        delattr(buf, "_comfy_event")
+    return buf
+
+ # Grow pin_buffer to at least `size` bytes and update TOTAL_PINNED_MEMORY.
+ # Returns True when the buffer is already large enough or the extend() call
+ # succeeds, False when extend() raises RuntimeError.
+ def resize_pin_buffer(pin_buffer, size):
+ global TOTAL_PINNED_MEMORY
+ old_size = pin_buffer.size
+ if size <= old_size:
+ return True
+ growth = size - old_size
+ # Try to make room before growing. The return values of the two ensure_*
+ # calls are not checked: extend() is attempted either way and its
+ # RuntimeError is the only failure signal used here.
+ comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
+ ensure_pin_budget(growth, evict_active=True)
+ ensure_pin_registerable(growth, evict_active=True)
+ try:
+ pin_buffer.extend(size=size, reallocate=True)
+ except RuntimeError:
+ return False
+ # Account using the size the buffer reports after extend(), not the requested size.
+ TOTAL_PINNED_MEMORY += pin_buffer.size - old_size
+ return True
+
def reset_cast_buffers():
+ global TOTAL_PINNED_MEMORY
global LARGEST_CASTED_WEIGHT
+ global LARGEST_AIMDO_CASTED_WEIGHT
+
LARGEST_CASTED_WEIGHT = (None, 0)
- for offload_stream in STREAM_CAST_BUFFERS:
- offload_stream.synchronize()
+ LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
+ for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS):
+ if offload_stream is not None:
+ offload_stream.synchronize()
synchronize()
+
+ for mmap_obj in DIRTY_MMAPS:
+ mmap_obj.bounce()
+ DIRTY_MMAPS.clear()
+
+ for pin_buffer in STREAM_PIN_BUFFERS.values():
+ TOTAL_PINNED_MEMORY -= pin_buffer.size
+ TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY)
+
+ for loaded_model in current_loaded_models:
+ model = loaded_model.model
+ if model is not None and model.is_dynamic():
+ model.model.dynamic_pins[model.load_device]["active"] = False
+ model.partially_unload_ram(1e30, subsets=[ "patches" ])
+ model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0])
+
STREAM_CAST_BUFFERS.clear()
+ STREAM_AIMDO_CAST_BUFFERS.clear()
+ STREAM_PIN_BUFFERS.clear()
soft_empty_cache()
def get_offload_stream(device):
@@ -1266,7 +1429,7 @@ def sync_stream(device, stream):
current_stream(device).wait_stream(stream)
-def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
+def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None):
wf_context = nullcontext()
if stream is not None:
wf_context = stream
@@ -1274,17 +1437,20 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None):
wf_context = wf_context.as_context(stream)
dest_views = comfy.memory_management.interpret_gathered_like(tensors, r)
+ dest2_views = comfy.memory_management.interpret_gathered_like(tensors, r2) if r2 is not None else None
with wf_context:
for tensor in tensors:
dest_view = dest_views.pop(0)
+ dest2_view = dest2_views.pop(0) if dest2_views is not None else None
if tensor is None:
continue
- if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view):
+ if comfy.memory_management.read_tensor_file_slice_into(tensor, dest_view, stream=stream, destination2=dest2_view):
continue
storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
- if hasattr(storage, "_comfy_tensor_mmap_touched"):
- storage._comfy_tensor_mmap_touched = True
+ mark_mmap_dirty(storage)
dest_view.copy_(tensor, non_blocking=non_blocking)
+ if dest2_view is not None:
+ dest2_view.copy_(dest_view, non_blocking=non_blocking)
def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None):
@@ -1325,14 +1491,18 @@ TOTAL_PINNED_MEMORY = 0
MAX_PINNED_MEMORY = -1
if not args.disable_pinned_memory:
if is_nvidia() or is_amd():
+ ram = get_total_memory(torch.device("cpu"))
if WINDOWS:
- MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.40 # Windows limit is apparently 50%
+ MAX_PINNED_MEMORY = ram * 0.40 # Windows limit is apparently 50%
else:
- MAX_PINNED_MEMORY = get_total_memory(torch.device("cpu")) * 0.90
+ MAX_PINNED_MEMORY = ram * 0.90
logging.info("Enabled pinned memory {}".format(MAX_PINNED_MEMORY // (1024 * 1024)))
PINNING_ALLOWED_TYPES = set(["Tensor", "Parameter", "QuantizedTensor"])
+def pinned_hostbuf_size(size):
+    """Return twice ``size`` capped at ``MAX_PINNED_MEMORY``, as a non-negative int.
+
+    A negative cap (``MAX_PINNED_MEMORY`` starts out as -1) therefore gives 0.
+    """
+    capped = min(size, MAX_PINNED_MEMORY)
+    doubled = int(capped * 2)
+    return max(0, doubled)
+
def discard_cuda_async_error():
try:
a = torch.tensor([1], dtype=torch.uint8, device=get_torch_device())
@@ -1364,8 +1534,8 @@ def pin_memory(tensor):
return False
size = tensor.nbytes
- if (TOTAL_PINNED_MEMORY + size) > MAX_PINNED_MEMORY:
- return False
+ comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
+ ensure_pin_registerable(size)
ptr = tensor.data_ptr()
if ptr == 0:
@@ -1402,7 +1572,8 @@ def unpin_memory(tensor):
return False
if torch.cuda.cudart().cudaHostUnregister(ptr) == 0:
- TOTAL_PINNED_MEMORY -= PINNED_MEMORY.pop(ptr)
+ size = PINNED_MEMORY.pop(ptr)
+ TOTAL_PINNED_MEMORY -= size
return True
else:
logging.warning("Unpin error.")
@@ -1580,10 +1751,7 @@ def should_use_fp16(device=None, model_params=0, prioritize_performance=True, ma
return False
if is_intel_xpu():
- if torch_version_numeric < (2, 3):
- return True
- else:
- return torch.xpu.get_device_properties(device).has_fp16
+ return torch.xpu.get_device_properties(device).has_fp16
if is_ascend_npu():
return True
@@ -1649,10 +1817,7 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
return False
if is_intel_xpu():
- if torch_version_numeric < (2, 3):
- return True
- else:
- return torch.xpu.is_bf16_supported()
+ return torch.xpu.is_bf16_supported()
if is_ascend_npu():
return True
@@ -1783,6 +1948,7 @@ def soft_empty_cache(force=False):
if cpu_state == CPUState.MPS:
torch.mps.empty_cache()
elif is_intel_xpu():
+ torch.xpu.synchronize()
torch.xpu.empty_cache()
elif is_ascend_npu():
torch.npu.empty_cache()
@@ -1794,14 +1960,41 @@ def soft_empty_cache(force=False):
torch.cuda.ipc_collect()
def unload_all_models():
- free_memory(1e30, get_torch_device())
+ for device in get_all_torch_devices():
+ free_memory(1e30, device)
+
+ # Frees memory via free_memory(1e30, ...) while passing a keep_loaded list that
+ # leaves out `model`, its clones and (optionally) its nested additional models,
+ # so only those are candidates for unloading.
+ #   model: patcher whose clone_base_uuid identifies the clone family to unload.
+ #   unload_additional_models: also match model.get_nested_additional_models().
+ #   all_devices: run free_memory on every device from get_all_torch_devices()
+ #                instead of only get_torch_device().
+ def unload_model_and_clones(model: ModelPatcher, unload_additional_models=True, all_devices=False):
+ 'Unload only model and its clones - primarily for multigpu cloning purposes.'
+ # Snapshot the list so the loop below iterates over a stable copy.
+ initial_keep_loaded: list[LoadedModel] = current_loaded_models.copy()
+ additional_models = []
+ if unload_additional_models:
+ additional_models = model.get_nested_additional_models()
+ keep_loaded = []
+ for loaded_model in initial_keep_loaded:
+ if loaded_model.model is not None:
+ # Same clone family as `model`: leave it out of keep_loaded.
+ if model.clone_base_uuid == loaded_model.model.clone_base_uuid:
+ continue
+ # check additional models if they are a match
+ skip = False
+ for add_model in additional_models:
+ if add_model.clone_base_uuid == loaded_model.model.clone_base_uuid:
+ skip = True
+ break
+ if skip:
+ continue
+ # NOTE(review): indentation is lost in this view, so it is unclear whether
+ # entries whose model is None are also appended here - confirm against the file.
+ keep_loaded.append(loaded_model)
+ if not all_devices:
+ free_memory(1e30, get_torch_device(), keep_loaded)
+ else:
+ for device in get_all_torch_devices():
+ free_memory(1e30, device, keep_loaded)
def debug_memory_summary():
if is_amd() or is_nvidia():
return torch.cuda.memory.memory_summary()
return ""
-class InterruptProcessingException(Exception):
+class InterruptProcessingException(BaseException):
pass
interrupt_processing_mutex = threading.RLock()
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 6deb71e12..00a15fa63 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -26,13 +26,16 @@ import uuid
from typing import Callable, Optional
import torch
+import tqdm
import comfy.float
import comfy.hooks
import comfy.lora
import comfy.model_management
+import comfy.ops
import comfy.patcher_extension
import comfy.utils
+import comfy_aimdo.host_buffer
from comfy.comfy_types import UnetWrapperFunction
from comfy.quant_ops import QuantizedTensor
from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP
@@ -75,12 +78,15 @@ def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_
def create_model_options_clone(orig_model_options: dict):
return comfy.patcher_extension.copy_nested_dicts(orig_model_options)
-def create_hook_patches_clone(orig_hook_patches):
+def create_hook_patches_clone(orig_hook_patches, copy_tuples=False):
new_hook_patches = {}
for hook_ref in orig_hook_patches:
new_hook_patches[hook_ref] = {}
for k in orig_hook_patches[hook_ref]:
new_hook_patches[hook_ref][k] = orig_hook_patches[hook_ref][k][:]
+ if copy_tuples:
+ for i in range(len(new_hook_patches[hook_ref][k])):
+ new_hook_patches[hook_ref][k][i] = tuple(new_hook_patches[hook_ref][k][i])
return new_hook_patches
def wipe_lowvram_weight(m):
@@ -115,14 +121,37 @@ def string_to_seed(data):
return comfy.utils.string_to_seed(data)
class LowVramPatch:
+ is_lowvram_patch = True
+
def __init__(self, key, patches, convert_func=None, set_func=None):
self.key = key
self.patches = patches
self.convert_func = convert_func # TODO: remove
self.set_func = set_func
+ self.prepared_patches = None
+
+ def memory_required(self):
+ counter = [0]
+ for patch in self.patches[self.key]:
+ comfy.lora.prefetch_prepared_value(patch[1], counter, None, None, False)
+ return counter[0]
+
+ def prepare(self, destination, stream, copy=True, commit=True):
+ counter = [0]
+ prepared_patches = [
+ (patch[0], comfy.lora.prefetch_prepared_value(patch[1], counter, destination, stream, copy), patch[2], patch[3], patch[4])
+ for patch in self.patches[self.key]
+ ]
+ if commit:
+ self.prepared_patches = prepared_patches
+ return prepared_patches
+
+ def clear_prepared(self):
+ self.prepared_patches = None
def __call__(self, weight):
- return comfy.lora.calculate_weight(self.patches[self.key], weight, self.key, intermediate_dtype=weight.dtype)
+ patches = self.prepared_patches if self.prepared_patches is not None else self.patches[self.key]
+ return comfy.lora.calculate_weight(patches, weight, self.key, intermediate_dtype=weight.dtype)
LOWVRAM_PATCH_ESTIMATE_MATH_FACTOR = 2
@@ -229,6 +258,37 @@ class LazyCastingParam(torch.nn.Parameter):
return self.model.patch_weight_to_device(self.key, device_to=self.model.load_device, return_weight=True).to("cpu")
+class LazyCastingQuantizedParam:
+ def __init__(self, model, key):
+ self.model = model
+ self.key = key
+ self.cpu_state_dict = None
+
+ def state_dict_tensor(self, state_dict_key):
+ if self.cpu_state_dict is None:
+ weight = self.model.patch_weight_to_device(self.key, device_to=self.model.load_device, return_weight=True)
+ self.cpu_state_dict = {k: v.to("cpu") for k, v in weight.state_dict(self.key).items()}
+ return self.cpu_state_dict[state_dict_key]
+
+
+class LazyCastingParamPiece(torch.nn.Parameter):
+ def __new__(cls, caster, state_dict_key, tensor):
+ return super().__new__(cls, tensor)
+
+ def __init__(self, caster, state_dict_key, tensor):
+ self.caster = caster
+ self.state_dict_key = state_dict_key
+
+ @property
+ def device(self):
+ return CustomTorchDevice
+
+ def to(self, *args, **kwargs):
+ caster = self.caster
+ del self.caster
+ return caster.state_dict_tensor(self.state_dict_key)
+
+
class ModelPatcher:
def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
self.size = size
@@ -272,7 +332,10 @@ class ModelPatcher:
self.is_clip = False
self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed
- self.cached_patcher_init: tuple[Callable, tuple] | None = None
+ self.cached_patcher_init: tuple[Callable, tuple] | tuple[Callable, tuple, int] | None = None
+ self.is_multigpu_base_clone = False
+ self.clone_base_uuid = uuid.uuid4()
+
if not hasattr(self.model, 'model_loaded_weight_memory'):
self.model.model_loaded_weight_memory = 0
@@ -297,9 +360,6 @@ class ModelPatcher:
self.size = comfy.model_management.module_size(self.model)
return self.size
- def model_mmap_residency(self, free=False):
- return comfy.model_management.module_mmap_residency(self.model, free=free)
-
def loaded_size(self):
return self.model.model_loaded_weight_memory
@@ -312,7 +372,8 @@ class ModelPatcher:
#than pays for CFG. So return everything both torch and Aimdo could give us
aimdo_mem = 0
if comfy.memory_management.aimdo_enabled:
- aimdo_mem = comfy_aimdo.model_vbar.vbars_analyze()
+ aimdo_device = device.index if getattr(device, "type", None) == "cuda" else None
+ aimdo_mem = comfy_aimdo.model_vbar.vbars_analyze(aimdo_device)
return comfy.model_management.get_free_memory(device) + aimdo_mem
def get_clone_model_override(self):
@@ -326,6 +387,8 @@ class ModelPatcher:
if self.cached_patcher_init is None:
raise RuntimeError("Cannot create non-dynamic delegate: cached_patcher_init is not initialized.")
temp_model_patcher = self.cached_patcher_init[0](*self.cached_patcher_init[1], disable_dynamic=True)
+ if len(self.cached_patcher_init) > 2:
+ temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]]
model_override = temp_model_patcher.get_clone_model_override()
if model_override is None:
model_override = self.get_clone_model_override()
@@ -384,19 +447,113 @@ class ModelPatcher:
n.hook_mode = self.hook_mode
n.cached_patcher_init = self.cached_patcher_init
+ n.is_multigpu_base_clone = self.is_multigpu_base_clone
+ n.clone_base_uuid = self.clone_base_uuid
for callback in self.get_all_callbacks(CallbacksMP.ON_CLONE):
callback(self, n)
return n
+ def deepclone_multigpu(self, new_load_device=None, models_cache: dict[uuid.UUID,ModelPatcher]=None):
+ logging.info(f"Creating deepclone of {self.model.__class__.__name__} for {new_load_device if new_load_device else self.load_device}.")
+ if self.cached_patcher_init is None:
+ raise RuntimeError(
+ f"Cannot create multigpu deepclone of {self.model.__class__.__name__}: "
+ "the loader that produced this model does not support multigpu "
+ "(cached_patcher_init is not initialized). Use a core loader "
+ "(CheckpointLoaderSimple, UNETLoader, CLIPLoader/DualCLIPLoader, VAELoader), "
+ "or have the custom loader register a cached_patcher_init factory."
+ )
+ comfy.model_management.unload_model_and_clones(self)
+ # Produce a freshly-loaded patcher from the loader factory so the multigpu
+ # clone owns its own untainted model weights (rather than relying on
+ # copy.deepcopy of an already-patched/already-loaded module).
+ temp_model_patcher: ModelPatcher | list[ModelPatcher] = self.cached_patcher_init[0](*self.cached_patcher_init[1])
+ if len(self.cached_patcher_init) > 2:
+ temp_model_patcher = temp_model_patcher[self.cached_patcher_init[2]]
+ # Override clone()'s normal "share self.model + share backup containers" with
+ # the pristine model from temp_model_patcher plus empty backup containers --
+ # the fresh model has no patches applied, so any deepcopy of self's stale
+ # backup/object_patches_backup/pinned would just propagate dead state that
+ # no longer corresponds to anything in n.model.
+ model_override = (temp_model_patcher.model, ({}, {}, {}, set()))
+ n = self.clone(model_override=model_override)
+ # clone() copies hook_backup by reference from self; reset since model is pristine.
+ n.hook_backup = {}
+ # set load device, if present
+ if new_load_device is not None:
+ n.load_device = new_load_device
+ # Ensure any per-device bookkeeping (e.g. ModelPatcherDynamic.dynamic_pins)
+ # has an entry for n.load_device on the freshly-loaded n.model. temp_model_patcher's
+ # __init__ only registered its own (default) load_device.
+ if hasattr(n, "register_load_device"):
+ n.register_load_device(n.load_device)
+ # multigpu clone should not have multigpu additional_models entry
+ n.remove_additional_models("multigpu")
+ # multigpu_clone all stored additional_models; make sure circular references are properly handled
+ if models_cache is None:
+ models_cache = {}
+ for key, model_list in n.additional_models.items():
+ for i in range(len(model_list)):
+ add_model = n.additional_models[key][i]
+ if add_model.clone_base_uuid not in models_cache:
+ models_cache[add_model.clone_base_uuid] = add_model.deepclone_multigpu(new_load_device=new_load_device, models_cache=models_cache)
+ n.additional_models[key][i] = models_cache[add_model.clone_base_uuid]
+ for callback in self.get_all_callbacks(CallbacksMP.ON_DEEPCLONE_MULTIGPU):
+ callback(self, n)
+ return n
+
+ def match_multigpu_clones(self):
+ multigpu_models = self.get_additional_models_with_key("multigpu")
+ if len(multigpu_models) > 0:
+ new_multigpu_models = []
+ for mm in multigpu_models:
+ # clone main model, but bring over relevant props from existing multigpu clone
+ n = self.clone()
+ n.load_device = mm.load_device
+ n.backup = mm.backup
+ n.object_patches_backup = mm.object_patches_backup
+ n.hook_backup = mm.hook_backup
+ n.model = mm.model
+ n.is_multigpu_base_clone = mm.is_multigpu_base_clone
+ n.remove_additional_models("multigpu")
+ orig_additional_models: dict[str, list[ModelPatcher]] = comfy.patcher_extension.copy_nested_dicts(n.additional_models)
+ n.additional_models = comfy.patcher_extension.copy_nested_dicts(mm.additional_models)
+ # figure out which additional models are not present in multigpu clone
+ models_cache = {}
+ for mm_add_model in mm.get_additional_models():
+ models_cache[mm_add_model.clone_base_uuid] = mm_add_model
+ remove_models_uuids = set(list(models_cache.keys()))
+ for key, model_list in orig_additional_models.items():
+ for orig_add_model in model_list:
+ if orig_add_model.clone_base_uuid not in models_cache:
+ models_cache[orig_add_model.clone_base_uuid] = orig_add_model.deepclone_multigpu(new_load_device=n.load_device, models_cache=models_cache)
+ existing_list = n.get_additional_models_with_key(key)
+ existing_list.append(models_cache[orig_add_model.clone_base_uuid])
+ n.set_additional_models(key, existing_list)
+ if orig_add_model.clone_base_uuid in remove_models_uuids:
+ remove_models_uuids.remove(orig_add_model.clone_base_uuid)
+ # drop additional models that were found only on the existing multigpu clone (not matched by any of the main model's additional models)
+ for key, model_list in n.additional_models.items():
+ new_model_list = [x for x in model_list if x.clone_base_uuid not in remove_models_uuids]
+ n.set_additional_models(key, new_model_list)
+ for callback in self.get_all_callbacks(CallbacksMP.ON_MATCH_MULTIGPU_CLONES):
+ callback(self, n)
+ new_multigpu_models.append(n)
+ self.set_additional_models("multigpu", new_multigpu_models)
+
def is_clone(self, other):
if hasattr(other, 'model') and self.model is other.model:
return True
return False
- def clone_has_same_weights(self, clone: 'ModelPatcher'):
- if not self.is_clone(clone):
- return False
+ def clone_has_same_weights(self, clone: ModelPatcher, allow_multigpu=False):
+ if allow_multigpu:
+ if self.clone_base_uuid != clone.clone_base_uuid:
+ return False
+ else:
+ if not self.is_clone(clone):
+ return False
if self.current_hooks != clone.current_hooks:
return False
@@ -506,6 +663,10 @@ class ModelPatcher:
def set_model_noise_refiner_patch(self, patch):
self.set_model_patch(patch, "noise_refiner")
+ def set_model_middle_block_after_patch(self, patch):
+ self.set_model_patch(patch, "middle_block_after_patch")
+
+
def set_model_rope_options(self, scale_x, shift_x, scale_y, shift_y, scale_t, shift_t, **kwargs):
rope_options = self.model_options["transformer_options"].get("rope_options", {})
rope_options["scale_x"] = scale_x
@@ -681,9 +842,9 @@ class ModelPatcher:
sd.pop(k)
return sd
- def patch_weight_to_device(self, key, device_to=None, inplace_update=False, return_weight=False):
+ def patch_weight_to_device(self, key, device_to=None, inplace_update=False, return_weight=False, force_cast=False):
weight, set_func, convert_func = get_key_weight(self.model, key)
- if key not in self.patches:
+ if key not in self.patches and not force_cast:
return weight
inplace_update = self.weight_inplace_update or inplace_update
@@ -691,7 +852,7 @@ class ModelPatcher:
if key not in self.backup and not return_weight:
self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
- temp_dtype = comfy.model_management.lora_compute_dtype(device_to)
+ temp_dtype = comfy.model_management.lora_compute_dtype(device_to) if key in self.patches else None
if device_to is not None:
temp_weight = comfy.model_management.cast_to_device(weight, device_to, temp_dtype, copy=True)
else:
@@ -699,9 +860,10 @@ class ModelPatcher:
if convert_func is not None:
temp_weight = convert_func(temp_weight, inplace=True)
- out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
+ out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key) if key in self.patches else temp_weight
if set_func is None:
- out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=comfy.utils.string_to_seed(key))
+ if key in self.patches:
+ out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=comfy.utils.string_to_seed(key))
if return_weight:
return out_weight
elif inplace_update:
@@ -851,7 +1013,9 @@ class ModelPatcher:
if m.comfy_patched_weights == True:
continue
- for param in params:
+ for param, param_value in params.items():
+ if hasattr(m, "comfy_cast_weights") and getattr(param_value, "is_meta", False):
+ comfy.ops.disable_weight_init._zero_init_parameter(m, param)
key = key_param_name_to_key(n, param)
self.unpin_weight(key)
self.patch_weight_to_device(key, device_to=device_to)
@@ -1067,8 +1231,12 @@ class ModelPatcher:
# Pinned memory pressure tracking is only implemented for DynamicVram loading
return 0
+ def loaded_ram_size(self):
+ # Loaded RAM pressure tracking is only implemented for DynamicVram loading
+ return 0
+
def partially_unload_ram(self, ram_to_unload):
- pass
+ return 0
def detach(self, unpatch_all=True):
self.eject_model()
@@ -1167,7 +1335,7 @@ class ModelPatcher:
return self.additional_models.get(key, [])
def get_additional_models(self):
- all_models = []
+ all_models: list[ModelPatcher] = []
for models in self.additional_models.values():
all_models.extend(models)
return all_models
@@ -1221,9 +1389,18 @@ class ModelPatcher:
for callback in self.get_all_callbacks(CallbacksMP.ON_PRE_RUN):
callback(self)
- def prepare_state(self, timestep):
+ def prepare_state(self, timestep, model_options):
+ ignore_multigpu = model_options.get("ignore_multigpu", False)
for callback in self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE):
- callback(self, timestep)
+ callback(self, timestep, model_options)
+ if not ignore_multigpu and "multigpu_clones" in model_options:
+ model_options["ignore_multigpu"] = True
+ try:
+ for p in model_options["multigpu_clones"].values():
+ p: ModelPatcher
+ p.prepare_state(timestep, model_options)
+ finally:
+ model_options.pop("ignore_multigpu", None)
def restore_hook_patches(self):
if self.hook_patches_backup is not None:
@@ -1236,12 +1413,18 @@ class ModelPatcher:
def prepare_hook_patches_current_keyframe(self, t: torch.Tensor, hook_group: comfy.hooks.HookGroup, model_options: dict[str]):
curr_t = t[0]
reset_current_hooks = False
+ multigpu_kf_changed_cache = None
transformer_options = model_options.get("transformer_options", {})
for hook in hook_group.hooks:
changed = hook.hook_keyframe.prepare_current_keyframe(curr_t=curr_t, transformer_options=transformer_options)
# if keyframe changed, remove any cached HookGroups that contain hook with the same hook_ref;
# this will cause the weights to be recalculated when sampling
if changed:
+ # cache changed for multigpu usage
+ if "multigpu_clones" in model_options:
+ if multigpu_kf_changed_cache is None:
+ multigpu_kf_changed_cache = []
+ multigpu_kf_changed_cache.append(hook)
# reset current_hooks if contains hook that changed
if self.current_hooks is not None:
for current_hook in self.current_hooks.hooks:
@@ -1253,6 +1436,28 @@ class ModelPatcher:
self.cached_hook_patches.pop(cached_group)
if reset_current_hooks:
self.patch_hooks(None)
+ if "multigpu_clones" in model_options:
+ for p in model_options["multigpu_clones"].values():
+ p: ModelPatcher
+ p._handle_changed_hook_keyframes(multigpu_kf_changed_cache)
+
+ def _handle_changed_hook_keyframes(self, kf_changed_cache: list[comfy.hooks.Hook]):
+ 'Used to handle multigpu behavior inside prepare_hook_patches_current_keyframe.'
+ if kf_changed_cache is None:
+ return
+ reset_current_hooks = False
+ # reset current_hooks if contains hook that changed
+ for hook in kf_changed_cache:
+ if self.current_hooks is not None:
+ for current_hook in self.current_hooks.hooks:
+ if current_hook == hook:
+ reset_current_hooks = True
+ break
+ for cached_group in list(self.cached_hook_patches.keys()):
+ if cached_group.contains(hook):
+ self.cached_hook_patches.pop(cached_group)
+ if reset_current_hooks:
+ self.patch_hooks(None)
def register_all_hook_patches(self, hooks: comfy.hooks.HookGroup, target_dict: dict[str], model_options: dict=None,
registered: comfy.hooks.HookGroup = None):
@@ -1442,21 +1647,45 @@ class ModelPatcher:
self.unpatch_hooks()
self.clear_cached_hook_weights()
- def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
- unet_state_dict = self.model.diffusion_model.state_dict()
- for k, v in unet_state_dict.items():
+ def model_state_dict_for_saving(self, model=None, prefix=""):
+ if model is None:
+ model = self.model
+
+ original_state_dict = model.state_dict()
+ output_state_dict = {}
+ keys = list(original_state_dict)
+ while len(keys) > 0:
+ k = keys.pop(0)
+ v = original_state_dict[k]
op_keys = k.rsplit('.', 1)
if (len(op_keys) < 2) or op_keys[1] not in ["weight", "bias"]:
+ output_state_dict[k] = v
continue
try:
- op = comfy.utils.get_attr(self.model.diffusion_model, op_keys[0])
+ op = comfy.utils.get_attr(model, op_keys[0])
except:
+ output_state_dict[k] = v
continue
if not op or not hasattr(op, "comfy_cast_weights") or \
(hasattr(op, "comfy_patched_weights") and op.comfy_patched_weights == True):
+ output_state_dict[k] = v
continue
- key = "diffusion_model." + k
- unet_state_dict[k] = LazyCastingParam(self, key, comfy.utils.get_attr(self.model, key))
+ key = prefix + k
+ weight = comfy.utils.get_attr(self.model, key)
+ if isinstance(weight, QuantizedTensor) and k in original_state_dict:
+ qt_state_dict = weight.state_dict(k)
+ caster = LazyCastingQuantizedParam(self, key)
+ for group_key in (x for x in qt_state_dict if x in original_state_dict):
+ if group_key in keys:
+ keys.remove(group_key)
+ output_state_dict.pop(group_key, "")
+ output_state_dict[group_key] = LazyCastingParamPiece(caster, prefix + group_key, original_state_dict[group_key])
+ continue
+ output_state_dict[k] = LazyCastingParam(self, key, weight)
+ return output_state_dict
+
+ def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
+ unet_state_dict = self.model_state_dict_for_saving(self.model.diffusion_model, "diffusion_model.")
return self.model.state_dict_for_saving(unet_state_dict, clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)
def __del__(self):
@@ -1475,9 +1704,30 @@ class ModelPatcherDynamic(ModelPatcher):
super().__init__(model, load_device, offload_device, size, weight_inplace_update)
if not hasattr(self.model, "dynamic_vbars"):
self.model.dynamic_vbars = {}
+ if not hasattr(self.model, "dynamic_pins"):
+ self.model.dynamic_pins = {}
+ self.register_load_device(self.load_device)
self.non_dynamic_delegate_model = None
assert load_device is not None
+ def register_load_device(self, device):
+ """Ensure dynamic_pins has an entry for *device*.
+
+ Called from __init__ and also from any code that retargets an
+ already-constructed patcher to a new load_device (e.g. the
+ Select{Model,CLIP,VAE}Device selector nodes); without this entry
+ partially_unload_ram() raises KeyError when it tries to read the
+ per-device pin state.
+ """
+ if device not in self.model.dynamic_pins:
+ self.model.dynamic_pins[device] = {
+ "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]),
+ "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]),
+ "hostbufs_initialized": False,
+ "failed": False,
+ "active": False,
+ }
+
def is_dynamic(self):
return True
@@ -1514,6 +1764,16 @@ class ModelPatcherDynamic(ModelPatcher):
#use all ModelPatcherDynamic this is ignored and its all done dynamically.
return super().memory_required(input_shape=input_shape) * 1.3 + (1024 ** 3)
+ def restore_loaded_backups(self):
+ restored = self.model.model_loaded_weight_memory
+ for key in list(self.backup.keys()):
+ bk = self.backup.pop(key)
+ comfy.utils.set_attr_param(self.model, key, bk.weight)
+ for key in list(self.backup_buffers.keys()):
+ comfy.utils.set_attr_buffer(self.model, key, self.backup_buffers.pop(key))
+ self.model.model_loaded_weight_memory = 0
+ return restored
+
def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False, dirty=False):
@@ -1530,12 +1790,20 @@ class ModelPatcherDynamic(ModelPatcher):
num_patches = 0
allocated_size = 0
- self.model.model_loaded_weight_memory = 0
+ self.restore_loaded_backups()
with self.use_ejected():
self.unpatch_hooks()
vbar = self._vbar_get(create=True)
+ pin_state = self.model.dynamic_pins[self.load_device]
+ if not pin_state["hostbufs_initialized"]:
+ hostbuf_size = comfy.model_management.pinned_hostbuf_size(self.model_size())
+ pin_state["weights"] = (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024, hostbuf_size), [], [-1], [0])
+ pin_state["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, hostbuf_size), [], [-1], [0])
+ pin_state["hostbufs_initialized"] = True
+ pin_state["failed"] = False
+ pin_state["active"] = True
if vbar is not None:
vbar.prioritize()
@@ -1561,7 +1829,9 @@ class ModelPatcherDynamic(ModelPatcher):
if key in self.patches:
if comfy.lora.calculate_shape(self.patches[key], weight, key) != weight.shape:
return (True, 0)
- setattr(m, param_key + "_lowvram_function", LowVramPatch(key, self.patches))
+ lowvram_patch = LowVramPatch(key, self.patches)
+ lowvram_patch._pin_state = pin_state
+ setattr(m, param_key + "_lowvram_function", lowvram_patch)
num_patches += 1
else:
setattr(m, param_key + "_lowvram_function", None)
@@ -1578,26 +1848,38 @@ class ModelPatcherDynamic(ModelPatcher):
def force_load_param(self, param_key, device_to):
key = key_param_name_to_key(n, param_key)
+ weight, _, _ = get_key_weight(self.model, key)
+ if weight is None:
+ return
if key in self.backup:
comfy.utils.set_attr_param(self.model, key, self.backup[key].weight)
- self.patch_weight_to_device(key, device_to=device_to)
+ self.patch_weight_to_device(key, device_to=device_to, force_cast=True)
weight, _, _ = get_key_weight(self.model, key)
if weight is not None:
self.model.model_loaded_weight_memory += weight.numel() * weight.element_size()
if hasattr(m, "comfy_cast_weights"):
m.comfy_cast_weights = True
- m.pin_failed = False
m.seed_key = n
+ m._pin_state = pin_state
set_dirty(m, dirty)
- force_load, v_weight_size = setup_param(self, m, n, "weight")
- force_load_bias, v_weight_bias = setup_param(self, m, n, "bias")
- force_load = force_load or force_load_bias
- v_weight_size += v_weight_bias
+ #Models that mix tiny and giant weights can cause lopsided stream buffer
+ #rotations and stalls. Force-load the tiny ones instead.
+ if module_mem > 16 * 1024:
+ force_load, v_weight_size = setup_param(self, m, n, "weight")
+ force_load_bias, v_weight_bias = setup_param(self, m, n, "bias")
+ force_load = force_load or force_load_bias
+ v_weight_size += v_weight_bias
+ if force_load:
+ logging.info(f"Module {n} has resizing Lora - force loading")
+ else:
+ force_load=True
if force_load:
- logging.info(f"Module {n} has resizing Lora - force loading")
+ if hasattr(m, "_v"):
+ comfy_aimdo.model_vbar.vbar_unpin(m._v)
+ delattr(m, "_v")
force_load_param(self, "weight", device_to)
force_load_param(self, "bias", device_to)
else:
@@ -1605,6 +1887,10 @@ class ModelPatcherDynamic(ModelPatcher):
m._v = vbar.alloc(v_weight_size)
allocated_size += v_weight_size
+ for param in params:
+ if param not in ("weight", "bias"):
+ force_load_param(self, param, device_to)
+
else:
for param in params:
key = key_param_name_to_key(n, param)
@@ -1628,7 +1914,11 @@ class ModelPatcherDynamic(ModelPatcher):
self.model.model_loaded_weight_memory += casted_buf.numel() * casted_buf.element_size()
force_load_stat = f" Force pre-loaded {len(self.backup)} weights: {self.model.model_loaded_weight_memory // 1024} KB." if len(self.backup) > 0 else ""
- logging.info(f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.{force_load_stat}")
+ log_key = (self.patches_uuid, allocated_size, num_patches, len(self.backup), self.model.model_loaded_weight_memory)
+ in_loop = bool(getattr(tqdm.tqdm, "_instances", None))
+ level = logging.DEBUG if in_loop and getattr(self, "_last_prepare_log_key", None) == log_key else logging.INFO
+ self._last_prepare_log_key = log_key
+ logging.log(level, f"Model {self.model.__class__.__name__} prepared for dynamic VRAM loading. {allocated_size // (1024 ** 2)}MB Staged. {num_patches} patches attached.{force_load_stat}")
self.model.device = device_to
self.model.current_weight_patches_uuid = self.patches_uuid
@@ -1647,33 +1937,62 @@ class ModelPatcherDynamic(ModelPatcher):
freed = 0 if vbar is None else vbar.free_memory(memory_to_free)
if freed < memory_to_free:
- for key in list(self.backup.keys()):
- bk = self.backup.pop(key)
- comfy.utils.set_attr_param(self.model, key, bk.weight)
- for key in list(self.backup_buffers.keys()):
- comfy.utils.set_attr_buffer(self.model, key, self.backup_buffers.pop(key))
- freed += self.model.model_loaded_weight_memory
- self.model.model_loaded_weight_memory = 0
+ freed += self.restore_loaded_backups()
return freed
- def pinned_memory_size(self):
- total = 0
- loading = self._load_list(for_dynamic=True)
- for x in loading:
- _, _, _, _, m, _ = x
- pin = comfy.pinned_memory.get_pin(m)
- if pin is not None:
- total += pin.numel() * pin.element_size()
- return total
+ def loaded_ram_size(self):
+ return (self.model.dynamic_pins[self.load_device]["weights"][0].size +
+ self.model.dynamic_pins[self.load_device]["patches"][0].size)
- def partially_unload_ram(self, ram_to_unload):
- loading = self._load_list(for_dynamic=True, default_device=self.offload_device)
- for x in loading:
- *_, m, _ = x
- ram_to_unload -= comfy.pinned_memory.unpin_memory(m)
- if ram_to_unload <= 0:
- return
+ def pinned_memory_size(self):
+ return (self.model.dynamic_pins[self.load_device]["weights"][3][0] +
+ self.model.dynamic_pins[self.load_device]["patches"][3][0])
+
+ def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]):
+ freed = 0
+ pin_state = self.model.dynamic_pins[self.load_device]
+ for subset in subsets:
+ hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+ split = stack_split[0]
+ while split >= 0:
+ module, offset = stack[split]
+ split -= 1
+ stack_split[0] = split
+ if not module._pin_registered:
+ continue
+ size = module._pin.numel() * module._pin.element_size()
+ if torch.cuda.cudart().cudaHostUnregister(module._pin.data_ptr()) != 0:
+ comfy.model_management.discard_cuda_async_error()
+ continue
+ module._pin_registered = False
+ comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
+ pinned_size[0] = max(0, pinned_size[0] - size)
+ freed += size
+ ram_to_unload -= size
+ if ram_to_unload <= 0:
+ return freed
+ return freed
+
+ def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]):
+ freed = 0
+ pin_state = self.model.dynamic_pins[self.load_device]
+ for subset in subsets:
+ hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+ while len(stack) > 0:
+ module, offset = stack.pop()
+ size = module._pin.numel() * module._pin.element_size()
+ del module._pin
+ hostbuf.truncate(offset, do_unregister=module._pin_registered)
+ stack_split[0] = min(stack_split[0], len(stack) - 1)
+ if module._pin_registered:
+ comfy.model_management.TOTAL_PINNED_MEMORY = max(0, comfy.model_management.TOTAL_PINNED_MEMORY - size)
+ pinned_size[0] = max(0, pinned_size[0] - size)
+ freed += size
+ ram_to_unload -= size
+ if ram_to_unload <= 0:
+ return freed
+ return freed
def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
#This isn't used by the core at all and can only be to load a model out of
diff --git a/comfy/model_prefetch.py b/comfy/model_prefetch.py
new file mode 100644
index 000000000..72e11dec6
--- /dev/null
+++ b/comfy/model_prefetch.py
@@ -0,0 +1,66 @@
+import comfy_aimdo.model_vbar
+import comfy.model_management
+import comfy.ops
+
+PREFETCH_QUEUES = []
+
+def cleanup_prefetched_modules(comfy_modules):
+ for s in comfy_modules:
+ prefetch = getattr(s, "_prefetch", None)
+ if prefetch is None:
+ continue
+ for param_key in ("weight", "bias"):
+ lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
+ if lowvram_fn is not None:
+ lowvram_fn.clear_prepared()
+ if prefetch["signature"] is not None:
+ comfy_aimdo.model_vbar.vbar_unpin(s._v)
+ delattr(s, "_prefetch")
+
+def cleanup_prefetch_queues():
+ global PREFETCH_QUEUES
+
+ for queue in PREFETCH_QUEUES:
+ for entry in queue:
+ if entry is None or not isinstance(entry, tuple):
+ continue
+ _, prefetch_state = entry
+ comfy_modules = prefetch_state[1]
+ if comfy_modules is not None:
+ cleanup_prefetched_modules(comfy_modules)
+ PREFETCH_QUEUES = []
+
+def prefetch_queue_pop(queue, device, module):
+ if queue is None:
+ return
+
+ consumed = queue.pop(0)
+ if consumed is not None:
+ offload_stream, prefetch_state = consumed
+ if offload_stream is not None:
+ offload_stream.wait_stream(comfy.model_management.current_stream(device))
+ _, comfy_modules = prefetch_state
+ if comfy_modules is not None:
+ cleanup_prefetched_modules(comfy_modules)
+
+ prefetch = queue[0]
+ if prefetch is not None:
+ comfy_modules = []
+ for s in prefetch.modules():
+ if hasattr(s, "_v"):
+ comfy_modules.append(s)
+
+ offload_stream = comfy.ops.cast_modules_with_vbar(comfy_modules, None, device, None, True)
+ comfy.model_management.sync_stream(device, offload_stream)
+ queue[0] = (offload_stream, (prefetch, comfy_modules))
+
+def make_prefetch_queue(queue, device, transformer_options):
+ if (not transformer_options.get("prefetch_dynamic_vbars", False)
+ or comfy.model_management.NUM_STREAMS == 0
+ or comfy.model_management.is_device_cpu(device)
+ or not comfy.model_management.device_supports_non_blocking(device)):
+ return None
+
+ queue = [None] + queue + [None]
+ PREFETCH_QUEUES.append(queue)
+ return queue
diff --git a/comfy/model_sampling.py b/comfy/model_sampling.py
index 13860e6a2..5af336e76 100644
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
@@ -54,6 +54,30 @@ class V_PREDICTION(EPS):
sigma = reshape_sigma(sigma, model_output.ndim)
return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
+class V_PREDICTION_DDPM:
+ """CogVideoX v-prediction: model receives raw x_t (unscaled), predicts velocity v.
+ x_0 = sqrt(alpha) * x_t - sqrt(1-alpha) * v
+ = x_t / sqrt(sigma^2 + 1) - v * sigma / sqrt(sigma^2 + 1)
+ """
+ def calculate_input(self, sigma, noise):
+ return noise
+
+ def calculate_denoised(self, sigma, model_output, model_input):
+ sigma = reshape_sigma(sigma, model_output.ndim)
+ return model_input / (sigma ** 2 + 1.0) ** 0.5 - model_output * sigma / (sigma ** 2 + 1.0) ** 0.5
+
+ def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
+ sigma = reshape_sigma(sigma, noise.ndim)
+ if max_denoise:
+ noise = noise * torch.sqrt(1.0 + sigma ** 2.0)
+ else:
+ noise = noise * sigma
+ noise += latent_image
+ return noise
+
+ def inverse_noise_scaling(self, sigma, latent):
+ return latent
+
class EDM(V_PREDICTION):
def calculate_denoised(self, sigma, model_output, model_input):
sigma = reshape_sigma(sigma, model_output.ndim)
@@ -69,7 +93,8 @@ class CONST:
def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
sigma = reshape_sigma(sigma, noise.ndim)
- return sigma * noise + (1.0 - sigma) * latent_image
+ s = getattr(self, "noise_scale", 1.0)
+ return sigma * (s * noise) + (1.0 - sigma) * latent_image
def inverse_noise_scaling(self, sigma, latent):
sigma = reshape_sigma(sigma, latent.ndim)
@@ -264,7 +289,11 @@ class ModelSamplingDiscreteFlow(torch.nn.Module):
else:
sampling_settings = {}
- self.set_parameters(shift=sampling_settings.get("shift", 1.0), multiplier=sampling_settings.get("multiplier", 1000))
+ self.set_noise_scale(sampling_settings.get("noise_scale", 1.0))
+ self.set_parameters(
+ shift=sampling_settings.get("shift", 1.0),
+ multiplier=sampling_settings.get("multiplier", 1000),
+ )
def set_parameters(self, shift=1.0, timesteps=1000, multiplier=1000):
self.shift = shift
@@ -272,6 +301,9 @@ class ModelSamplingDiscreteFlow(torch.nn.Module):
ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps) * multiplier)
self.register_buffer('sigmas', ts)
+ def set_noise_scale(self, noise_scale):
+ self.noise_scale = float(noise_scale)
+
@property
def sigma_min(self):
return self.sigmas[0]
diff --git a/comfy/multigpu.py b/comfy/multigpu.py
new file mode 100644
index 000000000..e7f5b3d6f
--- /dev/null
+++ b/comfy/multigpu.py
@@ -0,0 +1,248 @@
+from __future__ import annotations
+import queue
+import threading
+import torch
+import logging
+
+from collections import namedtuple
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+ from comfy.model_patcher import ModelPatcher
+import comfy.utils
+import comfy.patcher_extension
+import comfy.model_management
+
+
+class MultiGPUThreadPool:
+ """Persistent thread pool for multi-GPU work distribution.
+
+ Maintains one worker thread per extra GPU device. Each thread calls
+ torch.cuda.set_device() once at startup so that compiled kernel caches
+ (inductor/triton) stay warm across diffusion steps.
+ """
+
+ def __init__(self, devices: list[torch.device]):
+ """Create one work queue, one result queue and one daemon worker thread per device, and start the threads."""
+ self._workers: list[threading.Thread] = []
+ self._work_queues: dict[torch.device, queue.Queue] = {}
+ self._result_queues: dict[torch.device, queue.Queue] = {}
+
+ for device in devices:
+ wq = queue.Queue()
+ rq = queue.Queue()
+ self._work_queues[device] = wq
+ self._result_queues[device] = rq
+ t = threading.Thread(target=self._worker_loop, args=(device, wq, rq), daemon=True)
+ t.start()
+ self._workers.append(t)
+
+ def _worker_loop(self, device: torch.device, work_q: queue.Queue, result_q: queue.Queue):
+ """Thread body: bind to `device`, then run queued callables until the None sentinel is received.
+
+ Every processed item produces one tuple on result_q: (result, None) on
+ success or (None, exception) on failure.
+ """
+ try:
+ torch.cuda.set_device(device)
+ except Exception as e:
+ logging.error(f"MultiGPUThreadPool: failed to set device {device}: {e}")
+ # NOTE(review): indentation is not visible in this view. Presumably this loop
+ # answers each queued item with the set_device error until the None sentinel
+ # arrives, which would make the trailing return unreachable - confirm.
+ while True:
+ item = work_q.get()
+ if item is None:
+ return
+ result_q.put((None, e))
+ return
+ while True:
+ item = work_q.get()
+ # None is the sentinel queued by shutdown()
+ if item is None:
+ break
+ fn, args, kwargs = item
+ try:
+ result = fn(*args, **kwargs)
+ result_q.put((result, None))
+ except Exception as e:
+ # the exception is handed back to the caller instead of ending the worker
+ result_q.put((None, e))
+
+ def submit(self, device: torch.device, fn, *args, **kwargs):
+ """Queue fn(*args, **kwargs) on the worker of `device`; returns without waiting for the call to run."""
+ self._work_queues[device].put((fn, args, kwargs))
+
+ def get_result(self, device: torch.device):
+ """Return the next (result, exception) tuple from the worker of `device`, blocking until one is available."""
+ return self._result_queues[device].get()
+
+ @property
+ def devices(self) -> list[torch.device]:
+ """Devices this pool was created with, in insertion order of the work queues."""
+ return list(self._work_queues.keys())
+
+ def shutdown(self):
+ """Send the stop sentinel to every worker, then wait up to 5 seconds for each thread to finish."""
+ for wq in self._work_queues.values():
+ wq.put(None) # sentinel
+ for t in self._workers:
+ t.join(timeout=5.0)
+
+
+class GPUOptions:
+    """Per-device MultiGPU setting: a device index paired with a relative speed."""
+
+    def __init__(self, device_index: int, relative_speed: float):
+        self.relative_speed = relative_speed
+        self.device_index = device_index
+
+    def clone(self):
+        """Return a new GPUOptions carrying the same device_index and relative_speed."""
+        return GPUOptions(device_index=self.device_index, relative_speed=self.relative_speed)
+
+    def create_dict(self):
+        """Return the per-device settings as a dict; only relative_speed is included."""
+        return dict(relative_speed=self.relative_speed)
+
+class GPUOptionsGroup:
+    """Collection of GPUOptions keyed by device index."""
+
+    def __init__(self):
+        self.options: dict[int, GPUOptions] = {}
+
+    def add(self, info: GPUOptions):
+        """Store `info` under its device_index, replacing any existing entry for that index."""
+        self.options[info.device_index] = info
+
+    def clone(self):
+        """Return an independent copy of this group.
+
+        Each GPUOptions is cloned as well, so changing an option on the copy
+        cannot alter the group it was cloned from.
+        """
+        c = GPUOptionsGroup()
+        for opt in self.options.values():
+            c.add(opt.clone())
+        return c
+
+    def register(self, model: ModelPatcher):
+        """Write per-device options into model.model_options['multigpu_options'].
+
+        The stored dict maps each torch device used by `model` (its own
+        load_device plus the load_device of every "multigpu" additional model)
+        to the dict produced by GPUOptions.create_dict(). Devices without an
+        entry in this group get relative_speed=1.0. All speeds are then divided
+        by the smallest one, so the slowest device ends up at 1.0.
+        """
+        opts_dict = {}
+        # get devices that are valid for this model
+        devices: list[torch.device] = [model.load_device]
+        for extra_model in model.get_additional_models_with_key("multigpu"):
+            extra_model: ModelPatcher
+            devices.append(extra_model.load_device)
+        # create dictionary with actual device mapped to its GPUOptions
+        device_opts_list: list[GPUOptions] = []
+        for device in devices:
+            device_opts = self.options.get(device.index, GPUOptions(device_index=device.index, relative_speed=1.0))
+            # create_dict() returns a fresh dict, so the normalization below
+            # does not modify the GPUOptions stored in this group
+            opts_dict[device] = device_opts.create_dict()
+            device_opts_list.append(device_opts)
+        # make relative_speed relative to 1.0
+        min_speed = min([x.relative_speed for x in device_opts_list])
+        for value in opts_dict.values():
+            value['relative_speed'] /= min_speed
+        model.model_options['multigpu_options'] = opts_dict
+
+
+def create_multigpu_deepclones(model: ModelPatcher, max_gpus: int, gpu_options: GPUOptionsGroup=None, reuse_loaded=False):
+    """Prepare ModelPatcher to contain deepclones of its BaseModel and related properties.
+
+    All changes are applied to model.clone(), which is what gets returned.
+
+    Args:
+        model: patcher whose load_device is treated as the primary device.
+        max_gpus: total number of devices to use, primary included; at most
+            max_gpus - 1 extra devices are selected. Values below 1 select
+            no extra devices.
+        gpu_options: per-device options, registered on the result only when at
+            least one new clone is added; an empty GPUOptionsGroup is used
+            when None.
+        reuse_loaded: when True, a currently loaded MultiGPU clone of the same
+            base model on the target device is cloned instead of deepcloning.
+
+    Returns:
+        The cloned ModelPatcher with its "multigpu" additional models updated.
+    """
+    model = model.clone()
+    # check if multigpu is already prepared - get the load devices from them if possible to exclude
+    skip_devices = {mm.load_device for mm in model.get_additional_models_with_key("multigpu")}
+
+    # Exclude the primary model's actual device, not the global current device:
+    # after SelectModelDevice(gpu:N) the primary may not live on the process's
+    # current CUDA device, and excluding the wrong device picks bad extras.
+    all_devices = comfy.model_management.get_all_torch_devices(exclude_current=False)
+    full_extra_devices = [d for d in all_devices if d != model.load_device]
+    # Clamp the slice bound: with max_gpus <= 0 a negative stop index would
+    # select all but the last extra device instead of none.
+    limit_extra_devices = full_extra_devices[:max(max_gpus - 1, 0)]
+    # exclude skipped devices
+    extra_devices = [d for d in limit_extra_devices if d not in skip_devices]
+    # create new deepclones
+    if len(extra_devices) > 0:
+        for device in extra_devices:
+            device_patcher = None
+            if reuse_loaded:
+                # Only reuse a previously-loaded MultiGPU clone. A SelectModelDevice
+                # patcher on the same device shares clone_base_uuid but has
+                # is_multigpu_base_clone=False, which would later be filtered out by
+                # prepare_model_patcher_multigpu_clones() and silently shrink the
+                # work split back to one GPU.
+                loaded_models: list[ModelPatcher] = comfy.model_management.loaded_models()
+                for lm in loaded_models:
+                    if lm.model is None:
+                        continue
+                    if lm.load_device != device:
+                        continue
+                    if lm.clone_base_uuid != model.clone_base_uuid:
+                        continue
+                    if not getattr(lm, "is_multigpu_base_clone", False):
+                        continue
+                    device_patcher = lm.clone()
+                    logging.info(f"Reusing loaded multigpu deepclone of {device_patcher.model.__class__.__name__} for {device}")
+                    break
+            if device_patcher is None:
+                device_patcher = model.deepclone_multigpu(new_load_device=device)
+            # Always flag the clone; whether reused or freshly deepcloned, it must
+            # advertise itself as a MultiGPU base clone so the cond scheduler picks
+            # it up in prepare_model_patcher_multigpu_clones().
+            device_patcher.is_multigpu_base_clone = True
+            multigpu_models = model.get_additional_models_with_key("multigpu")
+            multigpu_models.append(device_patcher)
+            model.set_additional_models("multigpu", multigpu_models)
+        model.match_multigpu_clones()
+        if gpu_options is None:
+            gpu_options = GPUOptionsGroup()
+        gpu_options.register(model)
+    else:
+        logging.info("No extra torch devices need initialization, skipping initializing MultiGPU Work Units.")
+    # only keep model clones that don't go 'past' the intended max_gpu count;
+    # this prunes any inherited multigpu clones whose load_device is no longer allowed
+    # when max_gpus is lowered between runs.
+    allowed_devices = set(limit_extra_devices)
+    allowed_devices.add(model.load_device)
+    multigpu_models = model.get_additional_models_with_key("multigpu")
+    new_multigpu_models = [m for m in multigpu_models if m.load_device in allowed_devices]
+    if len(new_multigpu_models) != len(multigpu_models):
+        model.set_additional_models("multigpu", new_multigpu_models)
+        model.match_multigpu_clones()
+    return model
+
+
+LoadBalance = namedtuple('LoadBalance', ['work_per_device', 'idle_time'])
+def load_balance_devices(model_options: dict[str], total_work: int, return_idle_time=False, work_normalized: int=None):
+    """Optimize work assigned to different devices, accounting for their relative speeds and splittable work.
+
+    Args:
+        model_options: must contain 'multigpu_clones' (its keys are the devices
+            that receive work) and 'multigpu_options' (device -> dict with a
+            'relative_speed' entry).
+        total_work: number of whole work units to split across the devices.
+        return_idle_time: when True, also estimate how long devices wait on
+            each other.
+        work_normalized: optional common total used to rescale idle_time so
+            that results for different total_work values can be compared.
+
+    Returns:
+        LoadBalance(work_per_device, idle_time): work_per_device maps each
+        device to an integer amount of work; idle_time is None unless
+        return_idle_time is True.
+    """
+    opts_dict = model_options['multigpu_options']
+    devices = list(model_options['multigpu_clones'].keys())
+    # per-device speeds, in the same order as `devices`
+    speed_per_device = [opts_dict[device]['relative_speed'] for device in devices]
+    # Sum only the devices that actually receive work. 'multigpu_options' and
+    # 'multigpu_clones' are separate keys whose device sets are not guaranteed
+    # to match; counting a device without a clone would shrink every share and
+    # leave part of total_work unassigned.
+    total_speed = sum(speed_per_device, 0.0)
+    # get relative work for each device;
+    # obtained by w = (W*r)/R
+    work_per_device = [(total_work*relative_speed) / total_speed for relative_speed in speed_per_device]
+    # relative work must be expressed in whole numbers, but likely is a decimal;
+    # perform rounding while maintaining total sum equal to total work (sum of relative works)
+    work_per_device = round_preserved(work_per_device)
+    dict_work_per_device = dict(zip(devices, work_per_device))
+    if not return_idle_time:
+        return LoadBalance(dict_work_per_device, None)
+    # divide relative work by relative speed to get estimated completion time of said work by each device;
+    # time here is relative and does not correspond to real-world units
+    completion_time = [w/r for w,r in zip(work_per_device, speed_per_device)]
+    # calculate relative time spent by the devices waiting on each other after their work is completed
+    idle_time = abs(min(completion_time) - max(completion_time))
+    # if need to compare work idle time, need to normalize to a common total work;
+    # with total_work == 0 there is no work and nothing to rescale, so skip the division
+    if work_normalized and total_work:
+        idle_time *= (work_normalized/total_work)
+
+    return LoadBalance(dict_work_per_device, idle_time)
+
+def round_preserved(values: list[float]):
+    """Round every value to a whole number while keeping the rounded total.
+
+    Each value is first truncated with int(); the difference between
+    round(sum(values)) and the truncated total is then handed out one unit at
+    a time, largest fractional part first (ties keep their input order).
+    """
+    whole_parts = [int(v) for v in values]
+    # number of units lost by truncation that still have to be assigned
+    leftover = round(sum(values)) - sum(whole_parts)
+    # positions ordered by descending fractional part; sorted() is stable,
+    # so equal fractions stay in their original order
+    order = sorted(range(len(values)), key=lambda pos: values[pos] - whole_parts[pos], reverse=True)
+    for rank in range(leftover):
+        whole_parts[order[rank]] += 1
+    return whole_parts
diff --git a/comfy/ops.py b/comfy/ops.py
index b5cd1d47e..56445be8d 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -18,6 +18,7 @@
import torch
import logging
+import contextlib
import comfy.model_management
from comfy.cli_args import args, PerformanceFeature
import comfy.float
@@ -75,41 +76,93 @@ except:
cast_to = comfy.model_management.cast_to #TODO: remove once no more references
+STREAM_PIN_BUFFER_HEADROOM = 8 * 1024 * 1024
+
def cast_to_input(weight, input, non_blocking=False, copy=True):
return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
-def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant):
+def materialize_meta_param(s, param_keys):
+    """Replace every listed attribute of `s` that is a meta tensor with a zero-filled Parameter.
+
+    The replacement keeps the shape, dtype and requires_grad of the meta
+    parameter and is created without an explicit device. Attributes that are
+    missing, None, or not on the meta device are left untouched.
+    """
+    for name in param_keys:
+        current = getattr(s, name, None)
+        if current is None or not getattr(current, "is_meta", False):
+            continue
+        zeros = torch.zeros(current.shape, dtype=current.dtype)
+        setattr(s, name, torch.nn.Parameter(zeros, requires_grad=current.requires_grad))
- #vbar doesn't support CPU weights, but some custom nodes have weird paths
- #that might switch the layer to the CPU and expect it to work. We have to take
- #a clone conservatively as we are mmapped and some SFT files are packed misaligned
- #If you are a custom node author reading this, please move your layer to the GPU
- #or declare your ModelPatcher as CPU in the first place.
- if comfy.model_management.is_device_cpu(device):
- weight = s.weight.to(dtype=dtype, copy=True)
- if isinstance(weight, QuantizedTensor):
- weight = weight.dequantize()
- bias = None
- if s.bias is not None:
- bias = s.bias.to(dtype=bias_dtype, copy=True)
- return weight, bias, (None, None, None)
+# FIXME: add n=1 cache hit fast path
+def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blocking):
offload_stream = None
- xfer_dest = None
+ cast_buffer = None
+ cast_buffer_offset = 0
+ stream_pin_hostbuf = None
+ stream_pin_offset = 0
+ stream_pin_queue = []
+
+ def ensure_offload_stream(module, required_size, check_largest):
+ nonlocal offload_stream
+ nonlocal cast_buffer
+
+ if offload_stream is None:
+ offload_stream = comfy.model_management.get_offload_stream(device)
+ if offload_stream is None or not check_largest or len(comfy_modules) != 1:
+ return
+
+ current_size = 0 if cast_buffer is None else cast_buffer.size()
+ if current_size < required_size and module is comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT[0]:
+ offload_stream = comfy.model_management.get_offload_stream(device)
+ cast_buffer = None
+ if required_size > comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT[1]:
+ comfy.model_management.LARGEST_AIMDO_CASTED_WEIGHT = (module, required_size)
+
+ def get_cast_buffer(buffer_size):
+ nonlocal offload_stream
+ nonlocal cast_buffer
+ nonlocal cast_buffer_offset
+
+ if buffer_size == 0:
+ return None
+
+ if offload_stream is None:
+ return torch.empty((buffer_size,), dtype=torch.uint8, device=device)
+
+ cast_buffer = comfy.model_management.get_aimdo_cast_buffer(offload_stream, device)
+ buffer = comfy_aimdo.torch.aimdo_to_tensor(cast_buffer.get(buffer_size, cast_buffer_offset), device)
+ cast_buffer_offset += buffer_size
+ return buffer
+
+ def get_stream_pin_buffer_offset(buffer_size):
+ nonlocal stream_pin_hostbuf
+ nonlocal stream_pin_offset
+
+ if buffer_size == 0 or offload_stream is None:
+ return None
+
+ if stream_pin_hostbuf is None:
+ stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream)
+ if stream_pin_hostbuf is None:
+ return None
+
+ offset = stream_pin_offset
+ stream_pin_offset += buffer_size
+ return offset
+
+ for s in comfy_modules:
+ signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
+ resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
+ prefetch = {
+ "signature": signature,
+ "resident": resident,
+ }
- signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
- resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
- if signature is not None:
if resident:
- weight = s._v_weight
- bias = s._v_bias
- else:
- xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device)
+ s._prefetch = prefetch
+ continue
- if not resident:
+ materialize_meta_param(s, ["weight", "bias"])
+ xfer_dest = comfy_aimdo.torch.aimdo_to_tensor(s._v, device) if signature is not None else None
cast_geometry = comfy.memory_management.tensors_to_geometries([ s.weight, s.bias ])
cast_dest = None
+ needs_cast = False
xfer_source = [ s.weight, s.bias ]
@@ -121,55 +174,116 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
if data is None:
continue
if data.dtype != geometry.dtype:
+ needs_cast = True
cast_dest = xfer_dest
- if cast_dest is None:
- cast_dest = torch.empty((comfy.memory_management.vram_aligned_size(cast_geometry),), dtype=torch.uint8, device=device)
xfer_dest = None
break
dest_size = comfy.memory_management.vram_aligned_size(xfer_source)
- offload_stream = comfy.model_management.get_offload_stream(device)
- if xfer_dest is None and offload_stream is not None:
- xfer_dest = comfy.model_management.get_cast_buffer(offload_stream, device, dest_size, s)
- if xfer_dest is None:
- offload_stream = comfy.model_management.get_offload_stream(device)
- xfer_dest = comfy.model_management.get_cast_buffer(offload_stream, device, dest_size, s)
+ ensure_offload_stream(s, dest_size if xfer_dest is None else 0, True)
if xfer_dest is None:
- xfer_dest = torch.empty((dest_size,), dtype=torch.uint8, device=device)
- offload_stream = None
+ xfer_dest = get_cast_buffer(dest_size)
- if signature is None and pin is None:
- comfy.pinned_memory.pin_memory(s)
- pin = comfy.pinned_memory.get_pin(s)
- else:
- pin = None
+ def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream):
+ if xfer_source is not None:
+ if getattr(xfer_source, "is_lowvram_patch", False):
+ xfer_source.prepare(xfer_dest, stream, copy=True, commit=False)
+ else:
+ comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream)
- if pin is not None:
- comfy.model_management.cast_to_gathered(xfer_source, pin)
- xfer_source = [ pin ]
- #send it over
- comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=offload_stream)
- comfy.model_management.sync_stream(device, offload_stream)
+ def handle_pin(m, pin, source, dest, subset="weights", size=None):
+ if pin is not None:
+ cast_maybe_lowvram_patch([pin], dest, offload_stream)
+ return
+ if signature is None:
+ comfy.pinned_memory.pin_memory(m, subset=subset, size=size)
+ pin = comfy.pinned_memory.get_pin(m, subset=subset)
+ if pin is not None:
+ if isinstance(source, list):
+ comfy.model_management.cast_to_gathered(source, pin, non_blocking=non_blocking, stream=offload_stream, r2=dest)
+ else:
+ cast_maybe_lowvram_patch(source, pin, None)
+ cast_maybe_lowvram_patch([ pin ], dest, offload_stream)
+ return
+ if pin is None:
+ pin_offset = get_stream_pin_buffer_offset(size)
+ if pin_offset is not None:
+ stream_pin_queue.append((source, pin_offset, size, dest))
+ return
+ cast_maybe_lowvram_patch(source, dest, offload_stream)
- if cast_dest is not None:
+ handle_pin(s, pin, xfer_source, xfer_dest, size=dest_size)
+
+ for param_key in ("weight", "bias"):
+ lowvram_source = getattr(s, param_key + "_lowvram_function", None)
+ if lowvram_source is not None:
+ ensure_offload_stream(s, cast_buffer_offset, False)
+ lowvram_size = lowvram_source.memory_required()
+ lowvram_dest = get_cast_buffer(lowvram_size)
+ lowvram_source.prepare(lowvram_dest, None, copy=False, commit=True)
+
+ pin = comfy.pinned_memory.get_pin(lowvram_source, subset="patches")
+ handle_pin(lowvram_source, pin, lowvram_source, lowvram_dest, subset="patches", size=lowvram_size)
+
+
+ prefetch["xfer_dest"] = xfer_dest
+ prefetch["cast_dest"] = cast_dest
+ prefetch["cast_geometry"] = cast_geometry
+ prefetch["needs_cast"] = needs_cast
+ s._prefetch = prefetch
+
+ if stream_pin_offset > 0:
+ if stream_pin_hostbuf.size < stream_pin_offset:
+ if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM):
+ for xfer_source, _, _, xfer_dest in stream_pin_queue:
+ cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream)
+ return offload_stream
+ stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf)
+ stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf
+ for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue:
+ pin = stream_pin_tensor[pin_offset:pin_offset + pin_size]
+ if isinstance(xfer_source, list):
+ comfy.model_management.cast_to_gathered(xfer_source, pin, non_blocking=non_blocking, stream=offload_stream, r2=xfer_dest)
+ else:
+ cast_maybe_lowvram_patch(xfer_source, pin, None)
+ comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream)
+ stream_pin_hostbuf._comfy_event = offload_stream.record_event()
+
+ return offload_stream
+
+
+def resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, want_requant):
+
+ prefetch = getattr(s, "_prefetch", None)
+
+ if prefetch["resident"]:
+ weight = s._v_weight
+ bias = s._v_bias
+ else:
+ xfer_dest = prefetch["xfer_dest"]
+ if prefetch["needs_cast"]:
+ cast_dest = prefetch["cast_dest"] if prefetch["cast_dest"] is not None else torch.empty((comfy.memory_management.vram_aligned_size(prefetch["cast_geometry"]),), dtype=torch.uint8, device=device)
for pre_cast, post_cast in zip(comfy.memory_management.interpret_gathered_like([s.weight, s.bias ], xfer_dest),
- comfy.memory_management.interpret_gathered_like(cast_geometry, cast_dest)):
+ comfy.memory_management.interpret_gathered_like(prefetch["cast_geometry"], cast_dest)):
if post_cast is not None:
post_cast.copy_(pre_cast)
xfer_dest = cast_dest
- params = comfy.memory_management.interpret_gathered_like(cast_geometry, xfer_dest)
+ params = comfy.memory_management.interpret_gathered_like(prefetch["cast_geometry"], xfer_dest)
weight = params[0]
bias = params[1]
- if signature is not None:
+ if prefetch["signature"] is not None:
s._v_weight = weight
s._v_bias = bias
- s._v_signature=signature
+ s._v_signature = prefetch["signature"]
def post_cast(s, param_key, x, dtype, resident, update_weight):
lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
fns = getattr(s, param_key + "_function", [])
+ if x is None:
+ return None
+
orig = x
def to_dequant(tensor, dtype):
@@ -197,18 +311,19 @@ def cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compu
x = f(x)
return x
- update_weight = signature is not None
+ update_weight = prefetch["signature"] is not None
+ weight = post_cast(s, "weight", weight, dtype, prefetch["resident"], update_weight)
+ if bias is not None:
+ bias = post_cast(s, "bias", bias, bias_dtype, prefetch["resident"], update_weight)
- weight = post_cast(s, "weight", weight, dtype, resident, update_weight)
- if s.bias is not None:
- bias = post_cast(s, "bias", bias, bias_dtype, resident, update_weight)
+ if prefetch["signature"] is not None:
+ prefetch["resident"] = True
- #FIXME: weird offload return protocol
- return weight, bias, (offload_stream, device if signature is not None else None, None)
+ return weight, bias
def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, offloadable=False, compute_dtype=None, want_requant=False):
- # NOTE: offloadable=False is a a legacy and if you are a custom node author reading this please pass
+ # NOTE: offloadable=False is a legacy mode and if you are a custom node author reading this please pass
# offloadable=True and call uncast_bias_weight() after your last usage of the weight/bias. This
# will add async-offload support to your cast and improve performance.
if input is not None:
@@ -222,10 +337,46 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
if device is None:
device = input.device
+ def format_return(result, offloadable):
+ weight, bias, offload_stream = result
+ return (weight, bias, offload_stream) if offloadable else (weight, bias)
+
non_blocking = comfy.model_management.device_supports_non_blocking(device)
if hasattr(s, "_v"):
- return cast_bias_weight_with_vbar(s, dtype, device, bias_dtype, non_blocking, compute_dtype, want_requant)
+
+ #vbar doesn't support CPU weights, but some custom nodes have weird paths
+ #that might switch the layer to the CPU and expect it to work. We have to take
+ #a clone conservatively as we are mmapped and some SFT files are packed misaligned
+ #If you are a custom node author reading this, please move your layer to the GPU
+ #or declare your ModelPatcher as CPU in the first place.
+ if comfy.model_management.is_device_cpu(device):
+ materialize_meta_param(s, ["weight", "bias"])
+ weight = s.weight.to(dtype=dtype, copy=True)
+ if isinstance(weight, QuantizedTensor):
+ weight = weight.dequantize()
+ bias = s.bias.to(dtype=bias_dtype, copy=True) if s.bias is not None else None
+ return format_return((weight, bias, (None, None, None)), offloadable)
+
+ prefetched = hasattr(s, "_prefetch")
+ offload_stream = None
+ offload_device = None
+ if not prefetched:
+ offload_stream = cast_modules_with_vbar([s], dtype, device, bias_dtype, non_blocking)
+ comfy.model_management.sync_stream(device, offload_stream)
+
+ weight, bias = resolve_cast_module_with_vbar(s, dtype, device, bias_dtype, compute_dtype, want_requant)
+
+ if not prefetched:
+ if getattr(s, "_prefetch")["signature"] is not None:
+ offload_device = device
+ for param_key in ("weight", "bias"):
+ lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
+ if lowvram_fn is not None:
+ lowvram_fn.clear_prepared()
+ delattr(s, "_prefetch")
+ return format_return((weight, bias, (offload_stream, offload_device, None)), offloadable)
+
if offloadable and (device != s.weight.device or
(s.bias is not None and device != s.bias.device)):
@@ -272,11 +423,7 @@ def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None, of
for f in s.weight_function:
weight = f(weight)
- if offloadable:
- return weight, bias, (offload_stream, weight_a, bias_a)
- else:
- #Legacy function signature
- return weight, bias
+ return format_return((weight, bias, (offload_stream, weight_a, bias_a)), offloadable)
def uncast_bias_weight(s, weight, bias, offload_stream):
@@ -306,6 +453,12 @@ class CastWeightBiasOp:
bias_function = []
class disable_weight_init:
+    @staticmethod
+    def _zero_init_parameter(module, name):
+        """Replace module.<name> with a zero-filled, non-trainable Parameter of the same shape and dtype.
+
+        The new tensor is placed on the old parameter's device, except for
+        meta parameters, where no device is passed to torch.zeros.
+        """
+        old = getattr(module, name)
+        if getattr(old, "is_meta", False):
+            target_device = None
+        else:
+            target_device = old.device
+        zeros = torch.zeros(old.shape, device=target_device, dtype=old.dtype)
+        setattr(module, name, torch.nn.Parameter(zeros, requires_grad=False))
+
@staticmethod
def _lazy_load_from_state_dict(module, state_dict, prefix, local_metadata,
missing_keys, unexpected_keys, weight_shape,
@@ -472,6 +625,25 @@ class disable_weight_init:
else:
return super().forward(*args, **kwargs)
+    class BatchNorm2d(torch.nn.BatchNorm2d, CastWeightBiasOp):
+        """torch.nn.BatchNorm2d without weight init, with an optional cast-on-forward path."""
+
+        def reset_parameters(self):
+            # intentionally a no-op: parameters are not initialized here
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            """Run batch norm with weight/bias cast through cast_bias_weight() and released afterwards."""
+            weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
+            # running statistics are moved to the input's device and the cast weight's dtype
+            stats = []
+            for buf in (self.running_mean, self.running_var):
+                stats.append(buf.to(device=input.device, dtype=weight.dtype) if buf is not None else None)
+            mean, var = stats
+            out = torch.nn.functional.batch_norm(input, mean, var, weight, bias, self.training, self.momentum, self.eps)
+            uncast_bias_weight(self, weight, bias, offload_stream)
+            return out
+
+        def forward(self, *args, **kwargs):
+            run_every_op()
+            # take the cast path when requested by flag or when any weight/bias function is attached
+            needs_cast = self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0
+            if not needs_cast:
+                return super().forward(*args, **kwargs)
+            return self.forward_comfy_cast_weights(*args, **kwargs)
+
class LayerNorm(torch.nn.LayerNorm, CastWeightBiasOp):
def reset_parameters(self):
return None
@@ -659,6 +831,9 @@ class manual_cast(disable_weight_init):
class Conv3d(disable_weight_init.Conv3d):
comfy_cast_weights = True
+ class BatchNorm2d(disable_weight_init.BatchNorm2d):
+ # With this flag set, disable_weight_init.BatchNorm2d.forward() always takes the forward_comfy_cast_weights() path.
+ comfy_cast_weights = True
+
class GroupNorm(disable_weight_init.GroupNorm):
comfy_cast_weights = True
@@ -873,6 +1048,144 @@ class QuantLinearFunc(torch.autograd.Function):
return grad_input, grad_weight, grad_bias, None, None, None
+# Quantized-weight module helpers
+
+def _quantized_apply(module, fn, recurse=True):
+    """Apply fn to the module's own parameters and buffers, re-wrapping parameters as Parameters.
+
+    Lets .to()/.cuda() propagate through QuantizedTensor weights. Child modules
+    are handled through their own _apply when recurse is True. Parameters are
+    re-registered with requires_grad=False. Returns the module.
+    """
+    if recurse:
+        for sub in module.children():
+            sub._apply(fn)
+    for name, tensor in module._parameters.items():
+        if tensor is not None:
+            converted = fn(tensor)
+            # an inference tensor met outside inference mode is cloned before re-wrapping
+            if (not torch.is_inference_mode_enabled()) and converted.is_inference():
+                converted = converted.clone()
+            module.register_parameter(name, torch.nn.Parameter(converted, requires_grad=False))
+    for name, buf in module._buffers.items():
+        if buf is None:
+            continue
+        module._buffers[name] = fn(buf)
+    return module
+
+
+def _load_quantized_module(module, super_load, state_dict, prefix, local_metadata, strict,
+ missing_keys, unexpected_keys, error_msgs, load_extra_params=False):
+ """Shared _load_from_state_dict body for quantized-weight modules.
+
+ Pops weight (+ scales, +/- extras), populates module.weight as a Parameter
+ or Parameter-wrapped QuantizedTensor, then calls super_load and strips
+ consumed keys from missing_keys. Reads compute_dtype from factory_kwargs
+ and disabled formats from module._disabled_formats.
+ """
+ device = module.factory_kwargs["device"]
+ compute_dtype = module.factory_kwargs["dtype"]
+ disabled_formats = module._disabled_formats
+ layer_name = prefix.rstrip('.')
+
+ # keys handled here are popped from state_dict as they are consumed
+ weight = state_dict.pop(f"{prefix}weight", None)
+ if weight is None:
+ logging.warning(f"Missing weight for layer {layer_name}")
+ module.weight = None
+ # NOTE(review): returning here skips super_load, so nothing else under this
+ # prefix is loaded for the layer - confirm that is intended.
+ return
+ manually_loaded_keys = [f"{prefix}weight"]
+
+ # Pops an optional scale tensor; when present it is moved to the target device,
+ # optionally reinterpreted via Tensor.view(dtype=...), and its key is recorded
+ # as manually loaded. Returns None when the key is absent.
+ def pop_scale(name, dtype=None):
+ key = f"{prefix}{name}"
+ v = state_dict.pop(key, None)
+ if v is not None:
+ v = v.to(device=device)
+ if dtype is not None:
+ v = v.view(dtype=dtype)
+ manually_loaded_keys.append(key)
+ return v
+
+ # comfy_quant, when present, is a tensor whose raw bytes are a JSON document
+ layer_conf = state_dict.pop(f"{prefix}comfy_quant", None)
+ if layer_conf is not None:
+ layer_conf = json.loads(layer_conf.numpy().tobytes())
+
+ if layer_conf is None:
+ # no quantization metadata: load a plain weight in the compute dtype
+ module.weight = torch.nn.Parameter(weight.to(device=device, dtype=compute_dtype), requires_grad=False)
+ else:
+ module.quant_format = layer_conf.get("format", None)
+ module._full_precision_mm_config = layer_conf.get("full_precision_matrix_mult", False)
+ # an already-set _full_precision_mm is kept; otherwise the per-layer config value is used
+ if not module._full_precision_mm:
+ module._full_precision_mm = module._full_precision_mm_config
+ # formats listed in _disabled_formats force full-precision matrix multiplication
+ if module.quant_format in disabled_formats:
+ module._full_precision_mm = True
+ if module.quant_format is None:
+ raise ValueError(f"Unknown quantization format for layer {layer_name}")
+
+ qconfig = QUANT_ALGOS[module.quant_format]
+ module.layout_type = qconfig["comfy_tensor_layout"]
+ layout_cls = get_layout_class(module.layout_type)
+
+ # Per-format scales; fp8 dtype views handle both legacy uint8-on-disk and native fp8.
+ if module.quant_format in ("float8_e4m3fn", "float8_e5m2"):
+ scales = {"scale": pop_scale("weight_scale")}
+ elif module.quant_format == "mxfp8":
+ bs = pop_scale("weight_scale", torch.float8_e8m0fnu)
+ if bs is None:
+ raise ValueError(f"Missing MXFP8 block scales for layer {layer_name}")
+ scales = {"scale": bs}
+ elif module.quant_format == "nvfp4":
+ ts = pop_scale("weight_scale_2")
+ bs = pop_scale("weight_scale", torch.float8_e4m3fn)
+ if ts is None or bs is None:
+ raise ValueError(f"Missing NVFP4 scales for layer {layer_name}")
+ scales = {"scale": ts, "block_scale": bs}
+ else:
+ raise ValueError(f"Unsupported quantization format: {module.quant_format}")
+
+ params = layout_cls.Params(**scales, orig_dtype=compute_dtype, orig_shape=module._orig_shape)
+ module.weight = torch.nn.Parameter(
+ QuantizedTensor(weight.to(device=device, dtype=qconfig["storage_t"]), module.layout_type, params),
+ requires_grad=False,
+ )
+
+ # Optionally load the remaining parameters the format declares, skipping the scales handled above.
+ # NOTE(review): qconfig is only assigned on the quantized path, so this section
+ # presumably sits inside the else branch - indentation is not visible here, confirm.
+ if load_extra_params:
+ for param_name in qconfig["parameters"]:
+ if param_name in {"weight_scale", "weight_scale_2"}:
+ continue
+ param_key = f"{prefix}{param_name}"
+ _v = state_dict.pop(param_key, None)
+ if _v is None:
+ continue
+ module.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
+ manually_loaded_keys.append(param_key)
+
+ super_load(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+ # keys consumed above were popped before super_load ran; drop any of them that were reported as missing
+ for key in manually_loaded_keys:
+ if key in missing_keys:
+ missing_keys.remove(key)
+
+
+def _quantized_weight_state_dict(module, sd, prefix, extra_quant_conf=None, extra_quant_params=()):
+    """Shared state_dict body for quantized-weight modules; fills and returns sd.
+
+    A non-None bias is written as-is. A plain weight is stored under
+    "<prefix>weight". A QuantizedTensor weight contributes its own state_dict
+    entries plus a "<prefix>comfy_quant" uint8 tensor holding the JSON config;
+    extra_quant_conf is merged into that JSON and extra_quant_params names
+    module attributes written as additional "<prefix><name>" keys.
+    """
+    if not hasattr(module, 'weight'):
+        logging.warning(f"Warning: state dict on uninitialized op {prefix}")
+        return sd
+
+    bias = getattr(module, 'bias', None)
+    if bias is not None:
+        sd[f"{prefix}bias"] = bias
+
+    weight = module.weight
+    if weight is None:
+        return sd
+
+    # plain (unquantized) weight: stored directly, nothing else to add
+    if not isinstance(weight, QuantizedTensor):
+        sd[f"{prefix}weight"] = weight
+        return sd
+
+    sd.update(weight.state_dict(f"{prefix}weight"))
+    conf = {"format": module.quant_format}
+    if getattr(module, '_full_precision_mm_config', False):
+        conf["full_precision_matrix_mult"] = True
+    if extra_quant_conf:
+        conf.update(extra_quant_conf)
+    # the JSON config is serialized as a tensor of its UTF-8 bytes
+    encoded = list(json.dumps(conf).encode("utf-8"))
+    sd[f"{prefix}comfy_quant"] = torch.tensor(encoded, dtype=torch.uint8)
+    for attr_name in extra_quant_params:
+        attr_value = getattr(module, attr_name, None)
+        if attr_value is not None:
+            sd[f"{prefix}{attr_name}"] = attr_value
+    return sd
+
def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_precision_mm=False, disabled=[]):
class MixedPrecisionOps(manual_cast):
@@ -882,21 +1195,16 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
_disabled = disabled
class Linear(torch.nn.Module, CastWeightBiasOp):
- def __init__(
- self,
- in_features: int,
- out_features: int,
- bias: bool = True,
- device=None,
- dtype=None,
- ) -> None:
+ _disabled_formats = disabled
+
+ def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None):
super().__init__()
self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype}
- # self.factory_kwargs = {"device": device, "dtype": dtype}
self.in_features = in_features
self.out_features = out_features
+ self._orig_shape = (out_features, in_features)
if bias:
self.bias = torch.nn.Parameter(torch.empty(out_features, **self.factory_kwargs))
else:
@@ -909,151 +1217,12 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
def reset_parameters(self):
return None
- def _load_scale_param(self, state_dict, prefix, param_name, device, manually_loaded_keys, dtype=None):
- key = f"{prefix}{param_name}"
- value = state_dict.pop(key, None)
- if value is not None:
- value = value.to(device=device)
- if dtype is not None:
- value = value.view(dtype=dtype)
- manually_loaded_keys.append(key)
- return value
-
- def _load_from_state_dict(self, state_dict, prefix, local_metadata,
- strict, missing_keys, unexpected_keys, error_msgs):
-
- device = self.factory_kwargs["device"]
- layer_name = prefix.rstrip('.')
- weight_key = f"{prefix}weight"
- weight = state_dict.pop(weight_key, None)
- if weight is None:
- logging.warning(f"Missing weight for layer {layer_name}")
- self.weight = None
- return
-
- manually_loaded_keys = [weight_key]
-
- layer_conf = state_dict.pop(f"{prefix}comfy_quant", None)
- if layer_conf is not None:
- layer_conf = json.loads(layer_conf.numpy().tobytes())
-
- if layer_conf is None:
- self.weight = torch.nn.Parameter(weight.to(device=device, dtype=MixedPrecisionOps._compute_dtype), requires_grad=False)
- else:
- self.quant_format = layer_conf.get("format", None)
- self._full_precision_mm_config = layer_conf.get("full_precision_matrix_mult", False)
- if not self._full_precision_mm:
- self._full_precision_mm = self._full_precision_mm_config
-
- if self.quant_format in MixedPrecisionOps._disabled:
- self._full_precision_mm = True
-
- if self.quant_format is None:
- raise ValueError(f"Unknown quantization format for layer {layer_name}")
-
- qconfig = QUANT_ALGOS[self.quant_format]
- self.layout_type = qconfig["comfy_tensor_layout"]
- layout_cls = get_layout_class(self.layout_type)
-
- # Load format-specific parameters
- if self.quant_format in ["float8_e4m3fn", "float8_e5m2"]:
- # FP8: single tensor scale
- scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys)
-
- params = layout_cls.Params(
- scale=scale,
- orig_dtype=MixedPrecisionOps._compute_dtype,
- orig_shape=(self.out_features, self.in_features),
- )
-
- elif self.quant_format == "mxfp8":
- # MXFP8: E8M0 block scales stored as uint8 in safetensors
- block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys,
- dtype=torch.uint8)
-
- if block_scale is None:
- raise ValueError(f"Missing MXFP8 block scales for layer {layer_name}")
-
- block_scale = block_scale.view(torch.float8_e8m0fnu)
-
- params = layout_cls.Params(
- scale=block_scale,
- orig_dtype=MixedPrecisionOps._compute_dtype,
- orig_shape=(self.out_features, self.in_features),
- )
-
- elif self.quant_format == "nvfp4":
- # NVFP4: tensor_scale (weight_scale_2) + block_scale (weight_scale)
- tensor_scale = self._load_scale_param(state_dict, prefix, "weight_scale_2", device, manually_loaded_keys)
- block_scale = self._load_scale_param(state_dict, prefix, "weight_scale", device, manually_loaded_keys,
- dtype=torch.float8_e4m3fn)
-
- if tensor_scale is None or block_scale is None:
- raise ValueError(f"Missing NVFP4 scales for layer {layer_name}")
-
- params = layout_cls.Params(
- scale=tensor_scale,
- block_scale=block_scale,
- orig_dtype=MixedPrecisionOps._compute_dtype,
- orig_shape=(self.out_features, self.in_features),
- )
- else:
- raise ValueError(f"Unsupported quantization format: {self.quant_format}")
-
- self.weight = torch.nn.Parameter(
- QuantizedTensor(weight.to(device=device, dtype=qconfig["storage_t"]), self.layout_type, params),
- requires_grad=False
- )
-
- for param_name in qconfig["parameters"]:
- if param_name in {"weight_scale", "weight_scale_2"}:
- continue # Already handled above
-
- param_key = f"{prefix}{param_name}"
- _v = state_dict.pop(param_key, None)
- if _v is None:
- continue
- self.register_parameter(param_name, torch.nn.Parameter(_v.to(device=device), requires_grad=False))
- manually_loaded_keys.append(param_key)
-
- super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
-
- for key in manually_loaded_keys:
- if key in missing_keys:
- missing_keys.remove(key)
+ def _load_from_state_dict(self, *args):  # *args are forwarded unchanged to the shared loader
+ _load_quantized_module(self, super()._load_from_state_dict, *args, load_extra_params=True)  # load_extra_params=True: also register the format's extra parameters found in the state dict
def state_dict(self, *args, destination=None, prefix="", **kwargs):
- if destination is not None:
- sd = destination
- else:
- sd = {}
-
- if not hasattr(self, 'weight'):
- logging.warning("Warning: state dict on uninitialized op {}".format(prefix))
- return sd
-
- if self.bias is not None:
- sd["{}bias".format(prefix)] = self.bias
-
- if self.weight is None:
- return sd
-
- if isinstance(self.weight, QuantizedTensor):
- sd_out = self.weight.state_dict("{}weight".format(prefix))
- for k in sd_out:
- sd[k] = sd_out[k]
-
- quant_conf = {"format": self.quant_format}
- if self._full_precision_mm_config:
- quant_conf["full_precision_matrix_mult"] = True
- sd["{}comfy_quant".format(prefix)] = torch.tensor(list(json.dumps(quant_conf).encode('utf-8')), dtype=torch.uint8)
-
- input_scale = getattr(self, 'input_scale', None)
- if input_scale is not None:
- sd["{}input_scale".format(prefix)] = input_scale
- else:
- sd["{}weight".format(prefix)] = self.weight
- return sd
+ sd = destination if destination is not None else {}
+ return _quantized_weight_state_dict(self, sd, prefix, extra_quant_params=("input_scale",))
def _forward(self, input, weight, bias):
return torch.nn.functional.linear(input, weight, bias)
@@ -1143,21 +1312,196 @@ def mixed_precision_ops(quant_config={}, compute_dtype=torch.bfloat16, full_prec
self.weight = torch.nn.Parameter(weight, requires_grad=False)
def _apply(self, fn, recurse=True): # This is to get torch.compile + moving weights to another device working
- if recurse:
- for module in self.children():
- module._apply(fn)
+ return _quantized_apply(self, fn, recurse)
- for key, param in self._parameters.items():
- if param is None:
- continue
- p = fn(param)
- if p.is_inference():
- p = p.clone()
- self.register_parameter(key, torch.nn.Parameter(p, requires_grad=False))
- for key, buf in self._buffers.items():
- if buf is not None:
- self._buffers[key] = fn(buf)
- return self
+ class MoEExperts(torch.nn.Module, CastWeightBiasOp):
+ """Container for E quantized expert weights, indexed via expert_weight(i).
+
+ The bank lives on self.weight as a single 3D tensor — either a
+ compute_dtype Parameter or a Parameter wrapping a QuantizedTensor
+ with leading expert dim.
+
+ State-dict layout matches mixed_precision_ops.Linear with a leading
+ expert dim:
+ {prefix}.weight quant data (storage_t), leading dim = E
+ {prefix}.weight_scale block / per-tensor scale
+ {prefix}.weight_scale_2 [E] or scalar NVFP4 only
+ {prefix}.bias [E, out_features] optional, compute_dtype
+ {prefix}.comfy_quant json -> {"format": "...", "num_experts": E}
+
+ Without comfy_quant the weight loads as a plain compute_dtype 3D Parameter [E, out, in].
+ """
+
+ _disabled_formats = disabled  # the `disabled` argument of mixed_precision_ops
+
+ def __init__(self, num_experts: int, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None):  # dtype is accepted but not read in this body
+ super().__init__()
+ self.num_experts = num_experts
+ self.in_features = in_features
+ self.out_features = out_features
+ self._orig_shape = (num_experts, out_features, in_features)  # full bank shape, including the leading expert dim
+ self.factory_kwargs = {"device": device, "dtype": MixedPrecisionOps._compute_dtype}  # always the ops-wide compute dtype
+ if bias:
+ self.bias = torch.nn.Parameter(torch.empty(num_experts, out_features, **self.factory_kwargs))  # uninitialized; one bias row per expert
+ else:
+ self.register_parameter("bias", None)
+
+ # Populated by _load_from_state_dict:
+ self.weight = None
+ self.quant_format = None
+ self.layout_type = None
+ self._full_precision_mm = MixedPrecisionOps._full_precision_mm
+ self._full_precision_mm_config = False
+ self._resident_bank = None  # (weight, bias) while inside bank_resident(), otherwise None
+
+ def reset_parameters(self):  # intentionally a no-op
+ return None
+
+ def _apply(self, fn, recurse=True):
+ return _quantized_apply(self, fn, recurse)  # delegates to the shared helper also used by Linear
+
+ def _load_from_state_dict(self, *args):
+ _load_quantized_module(self, super()._load_from_state_dict, *args, load_extra_params=False)  # unlike Linear, extra per-format parameters are not loaded
+
+ def expert_weight(self, i: int):
+ """Expert i's weight (Tensor or per-expert QuantizedTensor view)."""
+ if isinstance(self.weight, QuantizedTensor):
+ return self._expert_qt_from(self.weight, i)
+ return self.weight[i]  # plain tensor bank: index the leading dim
+
+ @contextlib.contextmanager
+ def bank_resident(self, input):
+ """Cast the whole bank once; expert_linear inside reuses the cast.
+ Not re-entrant — do not nest calls on the same instance.
+ """
+ weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)
+ self._resident_bank = (weight, bias)  # read by expert_linear while the context is active
+ try:
+ yield self
+ finally:
+ self._resident_bank = None  # cleared before the cast is handed back
+ uncast_bias_weight(self, weight, bias, offload_stream)
+
+ def expert_linear(self, input: torch.Tensor, i: int) -> torch.Tensor:
+ """Linear against expert i's weight (with optional bias)."""
+ resident = getattr(self, "_resident_bank", None)
+ if resident is not None:  # inside bank_resident(): reuse the already-cast bank
+ weight, bias = resident
+ return self._expert_linear_impl(input, weight, bias, i)
+ weight, bias, offload_stream = cast_bias_weight(self, input, offloadable=True)  # no resident bank: cast for this call only
+ try:
+ return self._expert_linear_impl(input, weight, bias, i)
+ finally:
+ uncast_bias_weight(self, weight, bias, offload_stream)
+
+ def _expert_linear_impl(self, input, weight, bias, i):  # weight/bias are the already-cast bank tensors
+ if isinstance(weight, QuantizedTensor):
+ qw = self._expert_qt_from(weight, i)
+ else:
+ qw = weight[i]
+ b = cast_to_input(bias[i], input, copy=False) if bias is not None else None  # expert i's bias row, or None
+
+ if isinstance(qw, QuantizedTensor):
+ use_fast = (  # quantized matmul only if not forced to full precision, the layout supports it, and input is 2D
+ not self._full_precision_mm
+ and qw.layout_cls.supports_fast_matmul()
+ and input.dim() == 2
+ )
+ if use_fast:
+ qin = QuantizedTensor.from_float(input, self.layout_type)  # quantize the activations with the same layout as the weight
+ return torch.nn.functional.linear(qin, qw, b)
+ out = input @ qw.dequantize().t()  # fallback: dequantize the expert weight, then a regular matmul
+ return out + b if b is not None else out
+ return torch.nn.functional.linear(input, qw, b)  # non-quantized bank
+
+ def _expert_qt_from(self, weight: QuantizedTensor, i: int) -> QuantizedTensor:
+ """Build a per-expert QuantizedTensor by indexing into a resident bank."""
+ params = weight._params
+ kwargs = {
+ "scale": params.scale[i] if params.scale.dim() else params.scale,  # a 0-dim scale is shared by all experts
+ "orig_dtype": params.orig_dtype,
+ "orig_shape": (self.out_features, self.in_features),  # per-expert shape, without the expert dim
+ }
+ if hasattr(params, "block_scale"): # NVFP4
+ kwargs["block_scale"] = params.block_scale[i]
+ return QuantizedTensor(weight._qdata[i], weight._layout_cls, type(params)(**kwargs))  # params rebuilt with the same Params class as the bank
+
+ def state_dict(self, *args, destination=None, prefix="", **kwargs):  # *args and **kwargs are accepted but not used
+ sd = destination if destination is not None else {}
+ return _quantized_weight_state_dict(self, sd, prefix, extra_quant_conf={"num_experts": self.num_experts})  # num_experts is merged into the comfy_quant JSON
+
+ class Embedding(manual_cast.Embedding):  # embedding op that can load and look up fp8-quantized weights
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+ weight_key = f"{prefix}weight"
+ layer_conf = state_dict.pop(f"{prefix}comfy_quant", None)
+ if layer_conf is not None:
+ layer_conf = json.loads(layer_conf.numpy().tobytes())  # the tensor's raw bytes are parsed as JSON
+
+ # Only fp8 makes sense for embeddings (per-row dequant via index select).
+ # Block-scaled formats (NVFP4, MXFP8) can't do per-row lookup efficiently.
+ quant_format = layer_conf.get("format") if layer_conf is not None else None
+ manually_loaded_keys = []  # keys consumed here; dropped from missing_keys after the default load
+
+ if quant_format in ("float8_e4m3fn", "float8_e5m2") and weight_key in state_dict:
+ self.quant_format = quant_format
+ qconfig = QUANT_ALGOS[quant_format]
+ self.layout_type = qconfig["comfy_tensor_layout"]
+ layout_cls = get_layout_class(self.layout_type)
+ weight = state_dict.pop(weight_key)
+ manually_loaded_keys.append(weight_key)
+
+ scale_key = f"{prefix}weight_scale"
+ scale = state_dict.pop(scale_key, None)
+ if scale is not None:
+ scale = scale.float()
+ manually_loaded_keys.append(scale_key)
+
+ params = layout_cls.Params(
+ scale=scale if scale is not None else torch.ones((), dtype=torch.float32),  # missing weight_scale falls back to a scalar 1.0
+ orig_dtype=MixedPrecisionOps._compute_dtype,
+ orig_shape=(self.num_embeddings, self.embedding_dim),
+ )
+ self.weight = torch.nn.Parameter(
+ QuantizedTensor(weight.to(dtype=qconfig["storage_t"]), qconfig["comfy_tensor_layout"], params),  # no device move here, only a dtype conversion
+ requires_grad=False)
+ elif layer_conf is not None:
+ # Unsupported format — restore the marker so it round-trips; fall through to default load.
+ state_dict[f"{prefix}comfy_quant"] = torch.tensor(
+ list(json.dumps(layer_conf).encode('utf-8')), dtype=torch.uint8)
+
+ super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
+ for k in manually_loaded_keys:  # keys popped above may be reported missing by the default loader
+ if k in missing_keys:
+ missing_keys.remove(k)
+
+ def state_dict(self, *args, destination=None, prefix="", **kwargs):  # *args and **kwargs are accepted but not used
+ sd = destination if destination is not None else {}
+ return _quantized_weight_state_dict(self, sd, prefix)
+
+ def forward_comfy_cast_weights(self, input, out_dtype=None):
+ weight = self.weight
+
+ # Optimized path: lookup in fp8, dequantize only the selected rows.
+ if isinstance(weight, QuantizedTensor) and len(self.weight_function) == 0:
+ qdata, _, offload_stream = cast_bias_weight(self, device=input.device, dtype=weight.dtype, offloadable=True)
+ if isinstance(qdata, QuantizedTensor):
+ scale = qdata._params.scale
+ qdata = qdata._qdata  # unwrap to the raw stored tensor for the lookup
+ else:
+ scale = None
+
+ x = torch.nn.functional.embedding(
+ input, qdata, self.padding_idx, self.max_norm,
+ self.norm_type, self.scale_grad_by_freq, self.sparse)
+ uncast_bias_weight(self, qdata, None, offload_stream)  # NOTE(review): qdata may have been rebound to the raw _qdata above — confirm this is what uncast_bias_weight expects
+ target_dtype = out_dtype if out_dtype is not None else weight._params.orig_dtype
+ x = x.to(dtype=target_dtype)  # convert the looked-up rows before scaling
+ if scale is not None and scale != 1.0:  # NOTE(review): tensor comparison used as a bool — assumes a single-element scale; confirm
+ x = x * scale.to(dtype=target_dtype)
+ return x
+
+ # Fallback for non-quantized or weight_function (LoRA) case
+ return super().forward_comfy_cast_weights(input, out_dtype=out_dtype)
return MixedPrecisionOps
@@ -1176,6 +1520,7 @@ def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_
if not fp8_compute:
disabled.add("float8_e4m3fn")
disabled.add("float8_e5m2")
+ logging.info("Native ops: {} {}".format(", ".join(QUANT_ALGOS.keys() - disabled), ", emulated ops: {}".format(", ".join(disabled)) if len(disabled) > 0 else ""))
return mixed_precision_ops(model_config.quant_config, compute_dtype, disabled=disabled)
if (
diff --git a/comfy/patcher_extension.py b/comfy/patcher_extension.py
index 5ee4d5ee5..189ee84ca 100644
--- a/comfy/patcher_extension.py
+++ b/comfy/patcher_extension.py
@@ -1,8 +1,9 @@
-from __future__ import annotations
from typing import Callable
class CallbacksMP:
ON_CLONE = "on_clone"
+ ON_DEEPCLONE_MULTIGPU = "on_deepclone_multigpu"
+ ON_MATCH_MULTIGPU_CLONES = "on_match_multigpu_clones"
ON_LOAD = "on_load_after"
ON_DETACH = "on_detach_after"
ON_CLEANUP = "on_cleanup"
diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py
index 6f142282d..0e8f573ba 100644
--- a/comfy/pinned_memory.py
+++ b/comfy/pinned_memory.py
@@ -2,48 +2,62 @@ import comfy.model_management
import comfy.memory_management
import comfy_aimdo.host_buffer
import comfy_aimdo.torch
-import psutil
+import torch
from comfy.cli_args import args
-def get_pin(module):
- return getattr(module, "_pin", None)
+def get_pin(module, subset="weights"):  # returns module._pin (or None), attempting host registration first when needed
+ pin = getattr(module, "_pin", None)
+ if pin is None or module._pin_registered or args.disable_pinned_memory:  # nothing to register in any of these cases
+ return pin
-def pin_memory(module):
- if module.pin_failed or args.disable_pinned_memory or get_pin(module) is not None:
+ _, _, stack_split, pinned_size = module._pin_state[subset]  # same 4-tuple that pin_memory unpacks as (hostbuf, stack, stack_split, pinned_size)
+ size = pin.nbytes
+ comfy.model_management.ensure_pin_registerable(size)  # return value is not checked here
+
+ if torch.cuda.cudart().cudaHostRegister(pin.data_ptr(), size, 1) != 0:  # flag 1 is presumably cudaHostRegisterPortable — TODO confirm; non-zero is treated as failure
+ comfy.model_management.discard_cuda_async_error()
+ return pin  # pin is still returned, with _pin_registered left False
+
+ module._pin_registered = True
+ stack_split[0] = max(stack_split[0], module._pin_stack_index)
+ comfy.model_management.TOTAL_PINNED_MEMORY += size  # global and per-subset counters are both updated
+ pinned_size[0] += size
+ return pin
+
+def pin_memory(module, subset="weights", size=None):  # size=None: computed from module.weight and module.bias below
+ pin_state = module._pin_state
+ if args.disable_pinned_memory:
return
- #FIXME: This is a RAM cache trigger event
- ram_headroom = comfy.memory_management.RAM_CACHE_HEADROOM
- #we split the difference and assume half the RAM cache headroom is for us
- if ram_headroom > 0 and psutil.virtual_memory().available < (ram_headroom * 0.5):
- comfy.memory_management.extra_ram_release(ram_headroom)
- size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
+ pin = get_pin(module, subset)  # may register an existing, unregistered pin as a side effect
+ if pin is not None or pin_state["failed"]:  # already pinned, or an earlier attempt recorded a failure
+ return
- if comfy.model_management.MAX_PINNED_MEMORY <= 0 or (comfy.model_management.TOTAL_PINNED_MEMORY + size) > comfy.model_management.MAX_PINNED_MEMORY:
- module.pin_failed = True
+ hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+ if size is None:
+ size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
+ offset = hostbuf.size  # the new slice starts at the current end of the buffer
+ registerable_size = size + max(0, hostbuf.size - pinned_size[0])  # new bytes plus any part of the buffer not counted in pinned_size
+
+ comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
+ if (not comfy.model_management.ensure_pin_budget(size) or
+ not comfy.model_management.ensure_pin_registerable(registerable_size)):
+ pin_state["failed"] = True  # recorded on the shared pin_state, not per subset
return False
try:
- hostbuf = comfy_aimdo.host_buffer.HostBuffer(size)
+ hostbuf.extend(size=size)  # grows the shared buffer instead of allocating one per module
except RuntimeError:
- module.pin_failed = True
+ pin_state["failed"] = True
return False
- module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)
- module._pin_hostbuf = hostbuf
+ module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]  # this module's slice of the shared buffer
+ module._pin.untyped_storage()._comfy_hostbuf = hostbuf  # presumably keeps the host buffer referenced from the tensor storage — TODO confirm
+ stack.append((module, offset))
+ module._pin_registered = True
+ module._pin_stack_index = len(stack) - 1  # index of the entry just appended
+ stack_split[0] = max(stack_split[0], module._pin_stack_index)
comfy.model_management.TOTAL_PINNED_MEMORY += size
+ pinned_size[0] += size
return True
-
-def unpin_memory(module):
- if get_pin(module) is None:
- return 0
- size = module._pin.numel() * module._pin.element_size()
-
- comfy.model_management.TOTAL_PINNED_MEMORY -= size
- if comfy.model_management.TOTAL_PINNED_MEMORY < 0:
- comfy.model_management.TOTAL_PINNED_MEMORY = 0
-
- del module._pin
- del module._pin_hostbuf
- return size
diff --git a/comfy/quant_ops.py b/comfy/quant_ops.py
index 42ee08fb2..b90bcfd25 100644
--- a/comfy/quant_ops.py
+++ b/comfy/quant_ops.py
@@ -1,6 +1,8 @@
import torch
import logging
+from comfy.cli_args import args
+
try:
import comfy_kitchen as ck
from comfy_kitchen.tensor import (
@@ -21,7 +23,15 @@ try:
ck.registry.disable("cuda")
logging.warning("WARNING: You need pytorch with cu130 or higher to use optimized CUDA operations.")
- ck.registry.disable("triton")
+ if args.enable_triton_backend:
+ try:
+ import triton
+ logging.info("Found triton %s. Enabling comfy-kitchen triton backend.", triton.__version__)
+ except ImportError as e:
+ logging.error(f"Failed to import triton, Error: {e}, the comfy-kitchen triton backend will not be available.")
+ ck.registry.disable("triton")
+ else:
+ ck.registry.disable("triton")
for k, v in ck.list_backends().items():
logging.info(f"Found comfy_kitchen backend {k}: {v}")
except ImportError as e:
diff --git a/comfy/rmsnorm.py b/comfy/rmsnorm.py
index ab7cf14fa..e54be98d6 100644
--- a/comfy/rmsnorm.py
+++ b/comfy/rmsnorm.py
@@ -3,6 +3,7 @@ import comfy.model_management
RMSNorm = torch.nn.RMSNorm
+# Note: torch's fused F.rms_norm is faster but produces slightly different output than manual implementations (rsqrt/reduction rounding).
def rms_norm(x, weight=None, eps=1e-6):
if weight is None:
return torch.nn.functional.rms_norm(x, (x.shape[-1],), eps=eps)
diff --git a/comfy/sample.py b/comfy/sample.py
index 653829582..2be0cae5f 100644
--- a/comfy/sample.py
+++ b/comfy/sample.py
@@ -37,11 +37,12 @@ def prepare_noise(latent_image, seed, noise_inds=None):
return noises
-def fix_empty_latent_channels(model, latent_image, downscale_ratio_spacial=None):
+def fix_empty_latent_channels(model, latent_image, downscale_ratio_spacial=None, downscale_ratio_temporal=None):
if latent_image.is_nested:
return latent_image
latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels
- if torch.count_nonzero(latent_image) == 0:
+ is_empty = torch.count_nonzero(latent_image) == 0
+ if is_empty:
if latent_format.latent_channels != latent_image.shape[1]:
latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
if downscale_ratio_spacial is not None:
@@ -51,6 +52,13 @@ def fix_empty_latent_channels(model, latent_image, downscale_ratio_spacial=None)
if latent_format.latent_dimensions == 3 and latent_image.ndim == 4:
latent_image = latent_image.unsqueeze(2)
+
+ if is_empty and downscale_ratio_temporal is not None:
+ if downscale_ratio_temporal != latent_format.temporal_downscale_ratio:
+ ratio = downscale_ratio_temporal / latent_format.temporal_downscale_ratio
+ new_t = max(1, round(latent_image.shape[2] * ratio))
+ latent_image = comfy.utils.repeat_to_batch_size(latent_image, new_t, dim=2)
+
return latent_image
def prepare_sampling(model, noise_shape, positive, negative, noise_mask):
diff --git a/comfy/sampler_helpers.py b/comfy/sampler_helpers.py
index bbba09e26..bdce2f2d8 100644
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
@@ -1,16 +1,18 @@
from __future__ import annotations
+import torch
import uuid
import math
import collections
import comfy.model_management
import comfy.conds
+import comfy.model_patcher
import comfy.utils
import comfy.hooks
import comfy.patcher_extension
from typing import TYPE_CHECKING
if TYPE_CHECKING:
- from comfy.model_patcher import ModelPatcher
from comfy.model_base import BaseModel
+ from comfy.model_patcher import ModelPatcher
from comfy.controlnet import ControlBase
def prepare_mask(noise_mask, shape, device):
@@ -89,7 +91,8 @@ def get_additional_models(conds, dtype):
gligen += get_models_from_cond(conds[k], "gligen")
add_models += get_models_from_cond(conds[k], "additional_models")
- control_nets = set(cnets)
+ # Order-preserving dedup. A plain set() would randomize iteration order across runs
+ control_nets = list(dict.fromkeys(cnets))
inference_memory = 0
control_models = []
@@ -118,6 +121,47 @@ def cleanup_additional_models(models):
if hasattr(m, 'cleanup'):
m.cleanup()
+def preprocess_multigpu_conds(conds: dict[str, list[dict[str]]], model: ModelPatcher, model_options: dict[str]):  # model_options is accepted but not read in this body
+ '''If multigpu acceleration required, creates deepclones of ControlNets and GLIGEN per device.'''
+ multigpu_models: list[ModelPatcher] = model.get_additional_models_with_key("multigpu")
+ if len(multigpu_models) == 0:  # no multigpu models attached: nothing to do
+ return
+ extra_devices = [x.load_device for x in multigpu_models]  # one entry per multigpu model
+ # handle controlnets
+ controlnets: set[ControlBase] = set()  # NOTE(review): set iteration order can vary between runs; get_additional_models dedups with dict.fromkeys — confirm order is irrelevant here
+ for k in conds:
+ for kk in conds[k]:
+ if 'control' in kk:
+ controlnets.add(kk['control'])
+ if len(controlnets) > 0:
+ # first, unload all controlnet clones
+ for cnet in list(controlnets):
+ cnet_models = cnet.get_models()
+ for cm in cnet_models:
+ comfy.model_management.unload_model_and_clones(cm, unload_additional_models=True)
+
+ # next, make sure each controlnet has a deepclone for all relevant devices
+ for cnet in controlnets:
+ curr_cnet = cnet
+ while curr_cnet is not None:  # walk the chain through previous_controlnet
+ for device in extra_devices:
+ if device not in curr_cnet.multigpu_clones:  # only create clones that do not exist yet
+ curr_cnet.deepclone_multigpu(device, autoregister=True)
+ curr_cnet = curr_cnet.previous_controlnet
+ # since all device clones are now present, recreate the linked list for cloned cnets per device
+ for cnet in controlnets:
+ curr_cnet = cnet
+ while curr_cnet is not None:
+ prev_cnet = curr_cnet.previous_controlnet
+ for device in extra_devices:
+ device_cnet = curr_cnet.get_instance_for_device(device)
+ prev_device_cnet = None  # stays None at the end of the chain
+ if prev_cnet is not None:
+ prev_device_cnet = prev_cnet.get_instance_for_device(device)
+ device_cnet.set_previous_controlnet(prev_device_cnet)  # link the per-device clone to its per-device predecessor
+ curr_cnet = prev_cnet
+ # potentially handle gligen - since not widely used, ignored for now
+
def estimate_memory(model, noise_shape, conds):
cond_shapes = collections.defaultdict(list)
cond_shapes_min = {}
@@ -142,7 +186,8 @@ def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None
return executor.execute(model, noise_shape, conds, model_options=model_options, force_full_load=force_full_load, force_offload=force_offload)
def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None, force_full_load=False, force_offload=False):
- real_model: BaseModel = None
+ model.match_multigpu_clones()
+ preprocess_multigpu_conds(conds, model, model_options)
models, inference_memory = get_additional_models(conds, model.model_dtype())
models += get_additional_models_from_model_options(model_options)
models += model.get_nested_additional_models() # TODO: does this require inference_memory update?
@@ -154,7 +199,7 @@ def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=Non
memory_required += inference_memory
minimum_memory_required += inference_memory
comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required, minimum_memory_required=minimum_memory_required, force_full_load=force_full_load)
- real_model = model.model
+ real_model: BaseModel = model.model
return real_model, conds, models
@@ -200,3 +245,18 @@ def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict):
comfy.patcher_extension.merge_nested_dicts(to_load_options.setdefault(wc_name, {}), model_options["transformer_options"][wc_name],
copy_dict1=False)
return to_load_options
+
+def prepare_model_patcher_multigpu_clones(model_patcher: ModelPatcher, loaded_models: list[ModelPatcher], model_options: dict):  # model_options is mutated in place
+ '''
+ In case multigpu acceleration is enabled, prep ModelPatchers for each device.
+ '''
+ multigpu_patchers: list[ModelPatcher] = [x for x in loaded_models if x.is_multigpu_base_clone]  # only patchers flagged as multigpu base clones
+ if len(multigpu_patchers) > 0:
+ multigpu_dict: dict[torch.device, ModelPatcher] = {}
+ multigpu_dict[model_patcher.load_device] = model_patcher  # the main patcher is registered under its own device
+ for x in multigpu_patchers:
+ x.hook_patches = comfy.model_patcher.create_hook_patches_clone(model_patcher.hook_patches, copy_tuples=True)  # each clone gets its own copy of the main patcher's hook patches
+ x.hook_mode = model_patcher.hook_mode # match main model's hook_mode
+ multigpu_dict[x.load_device] = x
+ model_options["multigpu_clones"] = multigpu_dict  # presence of this key makes _calc_cond_batch dispatch to the multigpu path
+ return multigpu_patchers
diff --git a/comfy/samplers.py b/comfy/samplers.py
index 0a4d062db..e31277f7b 100755
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
@@ -1,7 +1,9 @@
from __future__ import annotations
+
+import comfy.model_management
from .k_diffusion import sampling as k_diffusion_sampling
from .extra_samplers import uni_pc
-from typing import TYPE_CHECKING, Callable, NamedTuple
+from typing import TYPE_CHECKING, Callable, NamedTuple, Any
if TYPE_CHECKING:
from comfy.model_patcher import ModelPatcher
from comfy.model_base import BaseModel
@@ -16,6 +18,7 @@ import comfy.model_patcher
import comfy.patcher_extension
import comfy.hooks
import comfy.context_windows
+import comfy.multigpu
import comfy.utils
import scipy.stats
import numpy
@@ -141,7 +144,7 @@ def can_concat_cond(c1, c2):
return cond_equal_size(c1.conditioning, c2.conditioning)
-def cond_cat(c_list):
+def cond_cat(c_list, device=None):
temp = {}
for x in c_list:
for k in x:
@@ -153,6 +156,8 @@ def cond_cat(c_list):
for k in temp:
conds = temp[k]
out[k] = conds[0].concat(conds[1:])
+ if device is not None and hasattr(out[k], 'to'):
+ out[k] = out[k].to(device)
return out
@@ -212,7 +217,12 @@ def _calc_cond_batch_outer(model: BaseModel, conds: list[list[dict]], x_in: torc
)
return executor.execute(model, conds, x_in, timestep, model_options)
-def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep, model_options):
+def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
+ # NOTE: keep in sync with _calc_cond_batch_multigpu below. Shared logic
+ # (hooked_to_run accumulation, memory-fit batching, per-chunk output
+ # aggregation) is duplicated there with per-device scheduling layered on top.
+ if 'multigpu_clones' in model_options:
+ return _calc_cond_batch_multigpu(model, conds, x_in, timestep, model_options)
out_conds = []
out_counts = []
# separate conds by matching hooks
@@ -244,7 +254,7 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens
if has_default_conds:
finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options)
- model.current_patcher.prepare_state(timestep)
+ model.current_patcher.prepare_state(timestep, model_options)
# run every hooked_to_run separately
for hooks, to_run in hooked_to_run.items():
@@ -265,7 +275,6 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens
input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
cond_shapes = collections.defaultdict(list)
for tt in batch_amount:
- cond = {k: v.size() for k, v in to_run[tt][0].conditioning.items()}
for k, v in to_run[tt][0].conditioning.items():
cond_shapes[k].append(v.size())
@@ -345,6 +354,239 @@ def _calc_cond_batch(model: BaseModel, conds: list[list[dict]], x_in: torch.Tens
return out_conds
+def _calc_cond_batch_multigpu(model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
+ # NOTE: keep in sync with _calc_cond_batch above. Same conds-by-hooks
+ # accumulation, memory-fit batching, and output aggregation, but adds a
+ # per-device scheduler, per-device patcher/control lookup, tensor .to(device)
+ # placement, and MultiGPUThreadPool dispatch around the inner loop.
+ out_conds = []
+ out_counts = []
+ # separate conds by matching hooks
+ hooked_to_run: dict[comfy.hooks.HookGroup,list[tuple[tuple,int]]] = {}
+ default_conds = []
+ has_default_conds = False
+
+ output_device = x_in.device
+
+ for i in range(len(conds)):
+ out_conds.append(torch.zeros_like(x_in))
+ out_counts.append(torch.ones_like(x_in) * 1e-37)
+
+ cond = conds[i]
+ default_c = []
+ if cond is not None:
+ for x in cond:
+ if 'default' in x:
+ default_c.append(x)
+ has_default_conds = True
+ continue
+ p = get_area_and_mult(x, x_in, timestep)
+ if p is None:
+ continue
+ if p.hooks is not None:
+ model.current_patcher.prepare_hook_patches_current_keyframe(timestep, p.hooks, model_options)
+ hooked_to_run.setdefault(p.hooks, list())
+ hooked_to_run[p.hooks] += [(p, i)]
+ default_conds.append(default_c)
+
+ if has_default_conds:
+ finalize_default_conds(model, hooked_to_run, default_conds, x_in, timestep, model_options)
+
+ model.current_patcher.prepare_state(timestep, model_options)
+
+ devices = list(model_options['multigpu_clones'].keys())
+ device_batched_hooked_to_run: dict[torch.device, list[tuple[comfy.hooks.HookGroup, tuple]]] = {}
+ # Track conds currently scheduled per device; single source of truth for capacity checks.
+ device_load: dict[torch.device, int] = {d: 0 for d in devices}
+
+ total_conds = sum(len(to_run) for to_run in hooked_to_run.values())
+ conds_per_device = max(1, math.ceil(total_conds / len(devices)))
+
+ def next_available_device(start: int) -> tuple[int, torch.device]:
+ """Return (index, device) for the next device with remaining capacity, starting at `start`.
+
+ Scans at most len(devices) positions, so this always terminates. Raises if no device
+ has remaining capacity, which would indicate a bug in conds_per_device accounting.
+ """
+ for offset in range(len(devices)):
+ i = (start + offset) % len(devices)
+ if device_load[devices[i]] < conds_per_device:
+ return i, devices[i]
+ raise RuntimeError(
+ f"MultiGPU scheduler: all {len(devices)} devices at capacity "
+ f"({conds_per_device}) but conds remain to schedule"
+ )
+
+ # run every hooked_to_run separately
+ index_device = 0
+ for hooks, to_run in hooked_to_run.items():
+ while len(to_run) > 0:
+ index_device, current_device = next_available_device(index_device)
+ remaining_capacity = conds_per_device - device_load[current_device]
+
+ first = to_run[0]
+ first_shape = first[0][0].shape
+ # collect candidate indices that can be concatenated with `first`, up to remaining capacity
+ to_batch_temp = []
+ for x in range(len(to_run)):
+ if can_concat_cond(to_run[x][0], first[0]) and len(to_batch_temp) < remaining_capacity:
+ to_batch_temp += [x]
+
+ to_batch_temp.reverse()
+ to_batch = to_batch_temp[:1]
+
+ free_memory = comfy.model_management.get_free_memory(current_device)
+ for i in range(1, len(to_batch_temp) + 1):
+ batch_amount = to_batch_temp[:len(to_batch_temp)//i]
+ input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
+ cond_shapes = collections.defaultdict(list)
+ for tt in batch_amount:
+ for k, v in to_run[tt][0].conditioning.items():
+ cond_shapes[k].append(v.size())
+ if model.memory_required(input_shape, cond_shapes=cond_shapes) * 1.5 < free_memory:
+ to_batch = batch_amount
+ break
+
+ conds_to_batch = [to_run.pop(x) for x in to_batch]
+ device_load[current_device] += len(conds_to_batch)
+ device_batched_hooked_to_run.setdefault(current_device, []).append((hooks, conds_to_batch))
+
+ if device_load[current_device] >= conds_per_device:
+ index_device += 1
+
+ class thread_result(NamedTuple):
+ output: Any
+ mult: Any
+ area: Any
+ batch_chunks: int
+ cond_or_uncond: Any
+ error: Exception = None
+
+ def _handle_batch(device: torch.device, batch_tuple: tuple[comfy.hooks.HookGroup, tuple], results: list[thread_result]):
+ try:
+ # TODO: non-NVIDIA support -- guard with `if device.type == "cuda":` once
+ # we extend multigpu QA beyond CUDA. Unconditional call crashes on
+ # XPU/NPU/MPS/CPU/DirectML backends.
+ torch.cuda.set_device(device)
+ model_current: BaseModel = model_options["multigpu_clones"][device].model
+ # run every hooked_to_run separately
+ with torch.no_grad():
+ for hooks, to_batch in batch_tuple:
+ input_x = []
+ mult = []
+ c = []
+ cond_or_uncond = []
+ uuids = []
+ area = []
+ control: ControlBase = None
+ patches = None
+ for x in to_batch:
+ o = x
+ p = o[0]
+ input_x.append(p.input_x)
+ mult.append(p.mult)
+ c.append(p.conditioning)
+ area.append(p.area)
+ cond_or_uncond.append(o[1])
+ uuids.append(p.uuid)
+ control = p.control
+ patches = p.patches
+
+ batch_chunks = len(cond_or_uncond)
+ input_x = torch.cat(input_x).to(device)
+ c = cond_cat(c, device=device)
+ timestep_ = torch.cat([timestep.to(device)] * batch_chunks)
+
+ transformer_options = model_current.current_patcher.apply_hooks(hooks=hooks)
+ if 'transformer_options' in model_options:
+ transformer_options = comfy.patcher_extension.merge_nested_dicts(transformer_options,
+ model_options['transformer_options'],
+ copy_dict1=False)
+
+ if patches is not None:
+ transformer_options["patches"] = comfy.patcher_extension.merge_nested_dicts(
+ transformer_options.get("patches", {}),
+ patches
+ )
+
+ transformer_options["cond_or_uncond"] = cond_or_uncond[:]
+ transformer_options["uuids"] = uuids[:]
+ transformer_options["sigmas"] = timestep.to(device)
+ transformer_options["sample_sigmas"] = transformer_options["sample_sigmas"].to(device)
+ transformer_options["multigpu_thread_device"] = device
+
+ cast_transformer_options(transformer_options, device=device)
+ c['transformer_options'] = transformer_options
+
+ if control is not None:
+ device_control = control.get_instance_for_device(device)
+ c['control'] = device_control.get_control(input_x, timestep_, c, len(cond_or_uncond), transformer_options)
+
+ if 'model_function_wrapper' in model_options:
+ output = model_options['model_function_wrapper'](model_current.apply_model, {"input": input_x, "timestep": timestep_, "c": c, "cond_or_uncond": cond_or_uncond}).to(output_device).chunk(batch_chunks)
+ else:
+ output = model_current.apply_model(input_x, timestep_, **c).to(output_device).chunk(batch_chunks)
+ # TODO: non-NVIDIA support -- the `.to(output_device)` copies
+ # above are async on CUDA, so the main thread's aggregation
+ # could race with in-flight transfers. CUDA-only QA has not
+ # surfaced this in practice, but before extending multigpu
+ # beyond NVIDIA add a `torch.cuda.synchronize(output_device)`
+ # here (guarded by `output_device.type == "cuda"`).
+ results.append(thread_result(output, mult, area, batch_chunks, cond_or_uncond))
+ except Exception as e:
+ results.append(thread_result(None, None, None, None, None, error=e))
+ raise
+
+
+ def _handle_batch_pooled(device, batch_tuple):
+ worker_results = []
+ _handle_batch(device, batch_tuple, worker_results)
+ return worker_results
+
+ results: list[thread_result] = []
+ thread_pool: comfy.multigpu.MultiGPUThreadPool = model_options.get("multigpu_thread_pool")
+
+ # Submit all GPU work to pool threads
+ pool_devices = []
+ for device, batch_tuple in device_batched_hooked_to_run.items():
+ if thread_pool is not None:
+ thread_pool.submit(device, _handle_batch_pooled, device, batch_tuple)
+ pool_devices.append(device)
+ else:
+ # Fallback: no pool, run everything on main thread
+ _handle_batch(device, batch_tuple, results)
+
+ # Collect results from pool workers
+ for device in pool_devices:
+ worker_results, error = thread_pool.get_result(device)
+ if error is not None:
+ raise error
+ results.extend(worker_results)
+
+ for output, mult, area, batch_chunks, cond_or_uncond, error in results:
+ if error is not None:
+ raise error
+ for o in range(batch_chunks):
+ cond_index = cond_or_uncond[o]
+ a = area[o]
+ if a is None:
+ out_conds[cond_index] += output[o] * mult[o]
+ out_counts[cond_index] += mult[o]
+ else:
+ out_c = out_conds[cond_index]
+ out_cts = out_counts[cond_index]
+ dims = len(a) // 2
+ for i in range(dims):
+ out_c = out_c.narrow(i + 2, a[i + dims], a[i])
+ out_cts = out_cts.narrow(i + 2, a[i + dims], a[i])
+ out_c += output[o] * mult[o]
+ out_cts += mult[o]
+
+ for i in range(len(out_conds)):
+ out_conds[i] /= out_counts[i]
+
+ return out_conds
+
def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options): #TODO: remove
logging.warning("WARNING: The comfy.samplers.calc_cond_uncond_batch function is deprecated please use the calc_cond_batch one instead.")
return tuple(calc_cond_batch(model, [cond, uncond], x_in, timestep, model_options))
@@ -643,12 +885,21 @@ def calculate_start_end_timesteps(model, conds):
def pre_run_control(model, conds):
s = model.model_sampling
+ # Per-device model lookup so multigpu control clones get the matching
+ # diffusion_model (e.g. QwenFunControlNet stashes it into extra_args).
+ device_models: dict = {}
+ patcher = getattr(model, "current_patcher", None)
+ if patcher is not None:
+ for p in patcher.get_additional_models_with_key("multigpu"):
+ device_models[p.load_device] = p.model
for t in range(len(conds)):
x = conds[t]
percent_to_timestep_function = lambda a: s.percent_to_sigma(a)
if 'control' in x:
x['control'].pre_run(model, percent_to_timestep_function)
+ for device, device_cnet in x['control'].multigpu_clones.items():
+ device_cnet.pre_run(device_models.get(device, model), percent_to_timestep_function)
def apply_empty_x_to_equal_area(conds, uncond, name, uncond_fill_func):
cond_cnets = []
@@ -891,7 +1142,9 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None):
to_load_options = model_options.get("to_load_options", None)
if to_load_options is None:
return
+ cast_transformer_options(to_load_options, device, dtype)
+def cast_transformer_options(transformer_options: dict[str], device=None, dtype=None):
casts = []
if device is not None:
casts.append(device)
@@ -900,18 +1153,17 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None):
# if nothing to apply, do nothing
if len(casts) == 0:
return
-
# try to call .to on patches
- if "patches" in to_load_options:
- patches = to_load_options["patches"]
+ if "patches" in transformer_options:
+ patches = transformer_options["patches"]
for name in patches:
patch_list = patches[name]
for i in range(len(patch_list)):
if hasattr(patch_list[i], "to"):
for cast in casts:
patch_list[i] = patch_list[i].to(cast)
- if "patches_replace" in to_load_options:
- patches = to_load_options["patches_replace"]
+ if "patches_replace" in transformer_options:
+ patches = transformer_options["patches_replace"]
for name in patches:
patch_list = patches[name]
for k in patch_list:
@@ -921,8 +1173,8 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None):
# try to call .to on any wrappers/callbacks
wrappers_and_callbacks = ["wrappers", "callbacks"]
for wc_name in wrappers_and_callbacks:
- if wc_name in to_load_options:
- wc: dict[str, list] = to_load_options[wc_name]
+ if wc_name in transformer_options:
+ wc: dict[str, list] = transformer_options[wc_name]
for wc_dict in wc.values():
for wc_list in wc_dict.values():
for i in range(len(wc_list)):
@@ -930,7 +1182,6 @@ def cast_to_load_options(model_options: dict[str], device=None, dtype=None):
for cast in casts:
wc_list[i] = wc_list[i].to(cast)
-
class CFGGuider:
def __init__(self, model_patcher: ModelPatcher):
self.model_patcher = model_patcher
@@ -985,16 +1236,32 @@ class CFGGuider:
self.inner_model, self.conds, self.loaded_models = comfy.sampler_helpers.prepare_sampling(self.model_patcher, noise.shape, self.conds, self.model_options)
device = self.model_patcher.load_device
- noise = noise.to(device=device, dtype=torch.float32)
- latent_image = latent_image.to(device=device, dtype=torch.float32)
- sigmas = sigmas.to(device)
- cast_to_load_options(self.model_options, device=device, dtype=self.model_patcher.model_dtype())
+ multigpu_patchers = comfy.sampler_helpers.prepare_model_patcher_multigpu_clones(self.model_patcher, self.loaded_models, self.model_options)
- try:
- self.model_patcher.pre_run()
- output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
- finally:
- self.model_patcher.cleanup()
+ # Create persistent thread pool for all GPU devices (main + extras)
+ if multigpu_patchers:
+ extra_devices = [p.load_device for p in multigpu_patchers]
+ all_devices = [device] + extra_devices
+ self.model_options["multigpu_thread_pool"] = comfy.multigpu.MultiGPUThreadPool(all_devices)
+
+ with comfy.model_management.cuda_device_context(device):
+ try:
+ noise = noise.to(device=device, dtype=torch.float32)
+ latent_image = latent_image.to(device=device, dtype=torch.float32)
+ sigmas = sigmas.to(device)
+ cast_to_load_options(self.model_options, device=device, dtype=self.model_patcher.model_dtype())
+
+ self.model_patcher.pre_run()
+ for multigpu_patcher in multigpu_patchers:
+ multigpu_patcher.pre_run()
+ output = self.inner_sample(noise, latent_image, device, sampler, sigmas, denoise_mask, callback, disable_pbar, seed, latent_shapes=latent_shapes)
+ finally:
+ thread_pool = self.model_options.pop("multigpu_thread_pool", None)
+ if thread_pool is not None:
+ thread_pool.shutdown()
+ self.model_patcher.cleanup()
+ for multigpu_patcher in multigpu_patchers:
+ multigpu_patcher.cleanup()
comfy.sampler_helpers.cleanup_models(self.conds, self.loaded_models)
del self.inner_model
diff --git a/comfy/sd.py b/comfy/sd.py
index e573804a5..30b877b85 100644
--- a/comfy/sd.py
+++ b/comfy/sd.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
import json
import torch
from enum import Enum
@@ -12,13 +11,16 @@ from .ldm.cascade.stage_c_coder import StageC_coder
from .ldm.audio.autoencoder import AudioOobleckVAE
import comfy.ldm.genmo.vae.model
import comfy.ldm.lightricks.vae.causal_video_autoencoder
+import comfy.ldm.lightricks.vae.audio_vae
import comfy.ldm.cosmos.vae
import comfy.ldm.wan.vae
import comfy.ldm.wan.vae2_2
import comfy.ldm.hunyuan3d.vae
import comfy.ldm.ace.vae.music_dcae_pipeline
+import comfy.ldm.cogvideo.vae
import comfy.ldm.hunyuan_video.vae
import comfy.ldm.mmaudio.vae.autoencoder
+import comfy.ldm.audio.vae_sa3
import comfy.pixel_space_convert
import comfy.weight_adapter
import yaml
@@ -47,6 +49,7 @@ import comfy.text_encoders.lt
import comfy.text_encoders.hunyuan_video
import comfy.text_encoders.cosmos
import comfy.text_encoders.lumina2
+import comfy.text_encoders.pixeldit
import comfy.text_encoders.wan
import comfy.text_encoders.hidream
import comfy.text_encoders.ace
@@ -63,6 +66,10 @@ import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image
import comfy.text_encoders.qwen35
import comfy.text_encoders.ernie
+import comfy.text_encoders.gemma4
+import comfy.text_encoders.cogvideo
+import comfy.text_encoders.sa3
+import comfy.text_encoders.gpt_oss
import comfy.model_patcher
import comfy.lora
@@ -75,7 +82,7 @@ import comfy.latent_formats
import comfy.ldm.flux.redux
-def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
+def load_lora_for_models(model, clip, lora, strength_model, strength_clip, lora_metadata=None):
key_map = {}
if model is not None:
key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
@@ -87,6 +94,8 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
if model is not None:
new_modelpatcher = model.clone()
k = new_modelpatcher.add_patches(loaded, strength_model)
+ if lora_metadata:
+ new_modelpatcher.set_attachments("lora_metadata", lora_metadata)
else:
k = ()
new_modelpatcher = None
@@ -94,6 +103,8 @@ def load_lora_for_models(model, clip, lora, strength_model, strength_clip):
if clip is not None:
new_clip = clip.clone()
k1 = new_clip.add_patches(loaded, strength_clip)
+ if lora_metadata:
+ new_clip.patcher.set_attachments("lora_metadata", lora_metadata)
else:
k1 = ()
new_clip = None
@@ -235,7 +246,8 @@ class CLIP:
model_management.archive_model_dtypes(self.cond_stage_model)
self.tokenizer = tokenizer(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
- ModelPatcher = comfy.model_patcher.ModelPatcher if disable_dynamic else comfy.model_patcher.CoreModelPatcher
+ te_disable_dynamic = disable_dynamic or getattr(self.cond_stage_model, "disable_offload", False)
+ ModelPatcher = comfy.model_patcher.ModelPatcher if te_disable_dynamic else comfy.model_patcher.CoreModelPatcher
self.patcher = ModelPatcher(self.cond_stage_model, load_device=load_device, offload_device=offload_device)
#Match torch.float32 hardcode upcast in TE implemention
self.patcher.set_model_compute_dtype(torch.float32)
@@ -324,41 +336,43 @@ class CLIP:
self.cond_stage_model.set_clip_options({"projected_pooled": False})
self.load_model(tokens)
- self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
+ device = self.patcher.load_device
+ self.cond_stage_model.set_clip_options({"execution_device": device})
all_hooks.reset()
self.patcher.patch_hooks(None)
if show_pbar:
pbar = ProgressBar(len(scheduled_keyframes))
- for scheduled_opts in scheduled_keyframes:
- t_range = scheduled_opts[0]
- # don't bother encoding any conds outside of start_percent and end_percent bounds
- if "start_percent" in add_dict:
- if t_range[1] < add_dict["start_percent"]:
- continue
- if "end_percent" in add_dict:
- if t_range[0] > add_dict["end_percent"]:
- continue
- hooks_keyframes = scheduled_opts[1]
- for hook, keyframe in hooks_keyframes:
- hook.hook_keyframe._current_keyframe = keyframe
- # apply appropriate hooks with values that match new hook_keyframe
- self.patcher.patch_hooks(all_hooks)
- # perform encoding as normal
- o = self.cond_stage_model.encode_token_weights(tokens)
- cond, pooled = o[:2]
- pooled_dict = {"pooled_output": pooled}
- # add clip_start_percent and clip_end_percent in pooled
- pooled_dict["clip_start_percent"] = t_range[0]
- pooled_dict["clip_end_percent"] = t_range[1]
- # add/update any keys with the provided add_dict
- pooled_dict.update(add_dict)
- # add hooks stored on clip
- self.add_hooks_to_dict(pooled_dict)
- all_cond_pooled.append([cond, pooled_dict])
- if show_pbar:
- pbar.update(1)
- model_management.throw_exception_if_processing_interrupted()
+ with model_management.cuda_device_context(device):
+ for scheduled_opts in scheduled_keyframes:
+ t_range = scheduled_opts[0]
+ # don't bother encoding any conds outside of start_percent and end_percent bounds
+ if "start_percent" in add_dict:
+ if t_range[1] < add_dict["start_percent"]:
+ continue
+ if "end_percent" in add_dict:
+ if t_range[0] > add_dict["end_percent"]:
+ continue
+ hooks_keyframes = scheduled_opts[1]
+ for hook, keyframe in hooks_keyframes:
+ hook.hook_keyframe._current_keyframe = keyframe
+ # apply appropriate hooks with values that match new hook_keyframe
+ self.patcher.patch_hooks(all_hooks)
+ # perform encoding as normal
+ o = self.cond_stage_model.encode_token_weights(tokens)
+ cond, pooled = o[:2]
+ pooled_dict = {"pooled_output": pooled}
+ # add clip_start_percent and clip_end_percent in pooled
+ pooled_dict["clip_start_percent"] = t_range[0]
+ pooled_dict["clip_end_percent"] = t_range[1]
+ # add/update any keys with the provided add_dict
+ pooled_dict.update(add_dict)
+ # add hooks stored on clip
+ self.add_hooks_to_dict(pooled_dict)
+ all_cond_pooled.append([cond, pooled_dict])
+ if show_pbar:
+ pbar.update(1)
+ model_management.throw_exception_if_processing_interrupted()
all_hooks.reset()
return all_cond_pooled
@@ -372,8 +386,12 @@ class CLIP:
self.cond_stage_model.set_clip_options({"projected_pooled": False})
self.load_model(tokens)
- self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
- o = self.cond_stage_model.encode_token_weights(tokens)
+ device = self.patcher.load_device
+ self.cond_stage_model.set_clip_options({"execution_device": device})
+
+ with model_management.cuda_device_context(device):
+ o = self.cond_stage_model.encode_token_weights(tokens)
+
cond, pooled = o[:2]
if return_dict:
out = {"cond": cond, "pooled_output": pooled}
@@ -414,6 +432,13 @@ class CLIP:
sd_clip[k] = sd_tokenizer[k]
return sd_clip
+ def state_dict_for_saving(self):
+ sd_clip = self.patcher.model_state_dict_for_saving()
+ sd_tokenizer = self.tokenizer.state_dict()
+ for k in sd_tokenizer:
+ sd_clip[k] = sd_tokenizer[k]
+ return sd_clip
+
def load_model(self, tokens={}):
memory_used = 0
if hasattr(self.cond_stage_model, "memory_estimation_function"):
@@ -428,9 +453,12 @@ class CLIP:
self.cond_stage_model.reset_clip_options()
self.load_model(tokens)
+ device = self.patcher.load_device
self.cond_stage_model.set_clip_options({"layer": None})
- self.cond_stage_model.set_clip_options({"execution_device": self.patcher.load_device})
- return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, presence_penalty=presence_penalty)
+ self.cond_stage_model.set_clip_options({"execution_device": device})
+
+ with model_management.cuda_device_context(device):
+ return self.cond_stage_model.generate(tokens, do_sample=do_sample, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p, min_p=min_p, repetition_penalty=repetition_penalty, seed=seed, presence_penalty=presence_penalty)
def decode(self, token_ids, skip_special_tokens=True):
return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
@@ -477,7 +505,10 @@ class VAE:
encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': encoder_config},
decoder_config={'target': "comfy.ldm.modules.temporal_ae.VideoDecoder", 'params': decoder_config})
elif "taesd_decoder.1.weight" in sd:
- self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
+ if isinstance(metadata, dict) and "tae_latent_channels" in metadata:
+ self.latent_channels = metadata["tae_latent_channels"]
+ else:
+ self.latent_channels = sd["taesd_decoder.1.weight"].shape[1]
self.first_stage_model = comfy.taesd.taesd.TAESD(latent_channels=self.latent_channels)
elif "vquantizer.codebook.weight" in sd: #VQGan: stage a of stable cascade
self.first_stage_model = StageA()
@@ -651,6 +682,17 @@ class VAE:
self.memory_used_encode = lambda shape, dtype: (1400 * 9 * shape[-2] * shape[-1]) * model_management.dtype_size(dtype)
self.memory_used_decode = lambda shape, dtype: (3600 * 4 * shape[-2] * shape[-1] * 16 * 16) * model_management.dtype_size(dtype)
+ elif "decoder.conv_in.conv.weight" in sd and "decoder.mid_block.resnets.0.norm1.norm_layer.weight" in sd: # CogVideoX VAE
+ self.upscale_ratio = (lambda a: max(0, a * 4 - 3), 8, 8)
+ self.upscale_index_formula = (4, 8, 8)
+ self.downscale_ratio = (lambda a: max(0, math.floor((a + 3) / 4)), 8, 8)
+ self.downscale_index_formula = (4, 8, 8)
+ self.latent_dim = 3
+ self.latent_channels = sd["encoder.conv_out.conv.weight"].shape[0] // 2
+ self.first_stage_model = comfy.ldm.cogvideo.vae.AutoencoderKLCogVideoX(latent_channels=self.latent_channels)
+ self.memory_used_decode = lambda shape, dtype: (2800 * max(2, ((shape[2] - 1) * 4) + 1) * shape[3] * shape[4] * (8 * 8)) * model_management.dtype_size(dtype)
+ self.memory_used_encode = lambda shape, dtype: (1400 * max(1, shape[2]) * shape[3] * shape[4]) * model_management.dtype_size(dtype)
+ self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
elif "decoder.conv_in.conv.weight" in sd:
ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
ddconfig["conv3d"] = True
@@ -758,6 +800,7 @@ class VAE:
self.latent_channels = 3
self.latent_dim = 2
self.output_channels = 3
+ self.disable_offload = True
elif "vocoder.activation_post.downsample.lowpass.filter" in sd: #MMAudio VAE
sample_rate = 16000
if sample_rate == 16000:
@@ -805,6 +848,52 @@ class VAE:
self.downscale_index_formula = (4, 8, 8)
self.memory_used_encode = lambda shape, dtype: (700 * (max(1, (shape[-3] ** 0.66 * 0.11)) * shape[-2] * shape[-1]) * model_management.dtype_size(dtype))
self.memory_used_decode = lambda shape, dtype: (50 * (max(1, (shape[-3] ** 0.65 * 0.26)) * shape[-2] * shape[-1] * 32 * 32) * model_management.dtype_size(dtype))
+ elif "vocoder.resblocks.0.convs1.0.weight" in sd or "vocoder.vocoder.resblocks.0.convs1.0.weight" in sd: # LTX Audio
+ sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder."})
+ self.first_stage_model = comfy.ldm.lightricks.vae.audio_vae.AudioVAE(metadata=metadata)
+ self.memory_used_encode = lambda shape, dtype: (shape[2] * 330) * model_management.dtype_size(dtype)
+ self.memory_used_decode = lambda shape, dtype: (shape[2] * shape[3] * 87000) * model_management.dtype_size(dtype)
+ self.latent_channels = self.first_stage_model.latent_channels
+ self.audio_sample_rate_output = self.first_stage_model.output_sample_rate
+ self.autoencoder = self.first_stage_model.autoencoder # TODO: remove hack for ltxv custom nodes
+ self.output_channels = 2
+ self.pad_channel_value = "replicate"
+ self.upscale_ratio = 4096
+ self.downscale_ratio = 4096
+ self.latent_dim = 2
+ self.process_output = lambda audio: audio
+ self.process_input = lambda audio: audio
+ self.working_dtypes = [torch.float32]
+ self.disable_offload = True
+ self.extra_1d_channel = 16
+ elif "decoder.layers.3.transformers.0.pre_norm.alpha" in sd: # Stable Audio 3 VAE
+ if "decoder.layers.3.transformers.11.self_attn.to_out.weight" in sd:
+ config = {"channels": 256, "transformer_depths": 12, "sinusoidal_blocks": 8,
+ "sliding_window": [1, 1], "decoder_conv_mapping": False,
+ "chunk_size": 128, "chunk_midpoint_shift": False}
+ self.memory_used_encode = lambda shape, dtype: (1500 * shape[2]) * model_management.dtype_size(dtype)
+ self.memory_used_decode = lambda shape, dtype: (1500 * shape[2] * 4096) * model_management.dtype_size(dtype)
+ else:
+ config = {"channels": 128, "transformer_depths": 6, "sinusoidal_blocks": 0,
+ "sliding_window": None, "decoder_conv_mapping": True,
+ "chunk_size": 32, "chunk_midpoint_shift": True}
+ self.memory_used_encode = lambda shape, dtype: (72 * shape[2]) * model_management.dtype_size(dtype)
+ self.memory_used_decode = lambda shape, dtype: (72 * shape[2] * 4096) * model_management.dtype_size(dtype)
+
+ self.first_stage_model = comfy.ldm.audio.vae_sa3.SA3AudioVAE(**config)
+ self.latent_channels = 256
+ self.output_channels = 2
+ self.upscale_ratio = 4096
+ self.downscale_ratio = 4096
+ self.latent_dim = 1
+ self.audio_sample_rate = 44100
+ self.process_output = lambda audio: audio
+ self.process_input = lambda audio: audio
+ self.working_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+ #This VAE has Parameters and Buffers the non-dynamic caster cannot handle
+ #Force cast it for --disable-dynamic-vram users until there is a true core fix.
+ if not comfy.memory_management.aimdo_enabled:
+ self.disable_offload = True
else:
logging.warning("WARNING: No VAE weights detected, VAE not initalized.")
self.first_stage_model = None
@@ -947,50 +1036,52 @@ class VAE:
do_tile = False
if self.latent_dim == 2 and samples_in.ndim == 5:
samples_in = samples_in[:, :, 0]
- try:
- memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
- model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
- free_memory = self.patcher.get_free_memory(self.device)
- batch_number = int(free_memory / memory_used)
- batch_number = max(1, batch_number)
- # Pre-allocate output for VAEs that support direct buffer writes
- preallocated = False
- if getattr(self.first_stage_model, 'comfy_has_chunked_io', False):
- pixel_samples = torch.empty(self.first_stage_model.decode_output_shape(samples_in.shape), device=self.output_device, dtype=self.vae_output_dtype())
- preallocated = True
+ with model_management.cuda_device_context(self.device):
+ try:
+ memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
+ model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
+ free_memory = self.patcher.get_free_memory(self.device)
+ batch_number = int(free_memory / memory_used)
+ batch_number = max(1, batch_number)
- for x in range(0, samples_in.shape[0], batch_number):
- samples = samples_in[x:x + batch_number].to(device=self.device, dtype=self.vae_dtype)
- if preallocated:
- self.first_stage_model.decode(samples, output_buffer=pixel_samples[x:x+batch_number], **vae_options)
- else:
- out = self.first_stage_model.decode(samples, **vae_options).to(device=self.output_device, dtype=self.vae_output_dtype(), copy=True)
- if pixel_samples is None:
- pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype())
- pixel_samples[x:x+batch_number].copy_(out)
- del out
- self.process_output(pixel_samples[x:x+batch_number])
- except Exception as e:
- model_management.raise_non_oom(e)
- logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
- #NOTE: We don't know what tensors were allocated to stack variables at the time of the
- #exception and the exception itself refs them all until we get out of this except block.
- #So we just set a flag for tiler fallback so that tensor gc can happen once the
- #exception is fully off the books.
- do_tile = True
+ # Pre-allocate output for VAEs that support direct buffer writes
+ preallocated = False
+ if getattr(self.first_stage_model, 'comfy_has_chunked_io', False):
+ pixel_samples = torch.empty(self.first_stage_model.decode_output_shape(samples_in.shape), device=self.output_device, dtype=self.vae_output_dtype())
+ preallocated = True
- if do_tile:
- comfy.model_management.soft_empty_cache()
- dims = samples_in.ndim - 2
- if dims == 1 or self.extra_1d_channel is not None:
- pixel_samples = self.decode_tiled_1d(samples_in)
- elif dims == 2:
- pixel_samples = self.decode_tiled_(samples_in)
- elif dims == 3:
- tile = 256 // self.spacial_compression_decode()
- overlap = tile // 4
- pixel_samples = self.decode_tiled_3d(samples_in, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap))
+ for x in range(0, samples_in.shape[0], batch_number):
+ samples = samples_in[x:x + batch_number].to(device=self.device, dtype=self.vae_dtype)
+ if preallocated:
+ self.first_stage_model.decode(samples, output_buffer=pixel_samples[x:x+batch_number], **vae_options)
+ else:
+ out = self.first_stage_model.decode(samples, **vae_options).to(device=self.output_device, dtype=self.vae_output_dtype(), copy=True)
+ if pixel_samples is None:
+ pixel_samples = torch.empty((samples_in.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype())
+ pixel_samples[x:x+batch_number].copy_(out)
+ del out
+ self.process_output(pixel_samples[x:x+batch_number])
+ except Exception as e:
+ model_management.raise_non_oom(e)
+ logging.warning("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
+ #NOTE: We don't know what tensors were allocated to stack variables at the time of the
+ #exception and the exception itself refs them all until we get out of this except block.
+ #So we just set a flag for tiler fallback so that tensor gc can happen once the
+ #exception is fully off the books.
+ do_tile = True
+
+ if do_tile:
+ comfy.model_management.soft_empty_cache()
+ dims = samples_in.ndim - 2
+ if dims == 1 or self.extra_1d_channel is not None:
+ pixel_samples = self.decode_tiled_1d(samples_in)
+ elif dims == 2:
+ pixel_samples = self.decode_tiled_(samples_in)
+ elif dims == 3:
+ tile = 256 // self.spacial_compression_decode()
+ overlap = tile // 4
+ pixel_samples = self.decode_tiled_3d(samples_in, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap))
pixel_samples = pixel_samples.to(self.output_device).movedim(1,-1)
return pixel_samples
@@ -1008,20 +1099,21 @@ class VAE:
if overlap is not None:
args["overlap"] = overlap
- if dims == 1 or self.extra_1d_channel is not None:
- args.pop("tile_y")
- output = self.decode_tiled_1d(samples, **args)
- elif dims == 2:
- output = self.decode_tiled_(samples, **args)
- elif dims == 3:
- if overlap_t is None:
- args["overlap"] = (1, overlap, overlap)
- else:
- args["overlap"] = (max(1, overlap_t), overlap, overlap)
- if tile_t is not None:
- args["tile_t"] = max(2, tile_t)
+ with model_management.cuda_device_context(self.device):
+ if dims == 1 or self.extra_1d_channel is not None:
+ args.pop("tile_y")
+ output = self.decode_tiled_1d(samples, **args)
+ elif dims == 2:
+ output = self.decode_tiled_(samples, **args)
+ elif dims == 3:
+ if overlap_t is None:
+ args["overlap"] = (1, overlap, overlap)
+ else:
+ args["overlap"] = (max(1, overlap_t), overlap, overlap)
+ if tile_t is not None:
+ args["tile_t"] = max(2, tile_t)
- output = self.decode_tiled_3d(samples, **args)
+ output = self.decode_tiled_3d(samples, **args)
return output.movedim(1, -1)
def encode(self, pixel_samples):
@@ -1034,44 +1126,46 @@ class VAE:
pixel_samples = pixel_samples.movedim(1, 0).unsqueeze(0)
else:
pixel_samples = pixel_samples.unsqueeze(2)
- try:
- memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
- model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
- free_memory = self.patcher.get_free_memory(self.device)
- batch_number = int(free_memory / max(1, memory_used))
- batch_number = max(1, batch_number)
- samples = None
- for x in range(0, pixel_samples.shape[0], batch_number):
- pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype)
- if getattr(self.first_stage_model, 'comfy_has_chunked_io', False):
- out = self.first_stage_model.encode(pixels_in, device=self.device)
+
+ with model_management.cuda_device_context(self.device):
+ try:
+ memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
+ model_management.load_models_gpu([self.patcher], memory_required=memory_used, force_full_load=self.disable_offload)
+ free_memory = self.patcher.get_free_memory(self.device)
+ batch_number = int(free_memory / max(1, memory_used))
+ batch_number = max(1, batch_number)
+ samples = None
+ for x in range(0, pixel_samples.shape[0], batch_number):
+ pixels_in = self.process_input(pixel_samples[x:x + batch_number]).to(self.vae_dtype)
+ if getattr(self.first_stage_model, 'comfy_has_chunked_io', False):
+ out = self.first_stage_model.encode(pixels_in, device=self.device)
+ else:
+ pixels_in = pixels_in.to(self.device)
+ out = self.first_stage_model.encode(pixels_in)
+ out = out.to(self.output_device).to(dtype=self.vae_output_dtype())
+ if samples is None:
+ samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype())
+ samples[x:x + batch_number] = out
+
+ except Exception as e:
+ model_management.raise_non_oom(e)
+ logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
+ #NOTE: We don't know what tensors were allocated to stack variables at the time of the
+ #exception and the exception itself refs them all until we get out of this except block.
+ #So we just set a flag for tiler fallback so that tensor gc can happen once the
+ #exception is fully off the books.
+ do_tile = True
+
+ if do_tile:
+ comfy.model_management.soft_empty_cache()
+ if self.latent_dim == 3:
+ tile = 256
+ overlap = tile // 4
+ samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap))
+ elif self.latent_dim == 1 or self.extra_1d_channel is not None:
+ samples = self.encode_tiled_1d(pixel_samples)
else:
- pixels_in = pixels_in.to(self.device)
- out = self.first_stage_model.encode(pixels_in)
- out = out.to(self.output_device).to(dtype=self.vae_output_dtype())
- if samples is None:
- samples = torch.empty((pixel_samples.shape[0],) + tuple(out.shape[1:]), device=self.output_device, dtype=self.vae_output_dtype())
- samples[x:x + batch_number] = out
-
- except Exception as e:
- model_management.raise_non_oom(e)
- logging.warning("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
- #NOTE: We don't know what tensors were allocated to stack variables at the time of the
- #exception and the exception itself refs them all until we get out of this except block.
- #So we just set a flag for tiler fallback so that tensor gc can happen once the
- #exception is fully off the books.
- do_tile = True
-
- if do_tile:
- comfy.model_management.soft_empty_cache()
- if self.latent_dim == 3:
- tile = 256
- overlap = tile // 4
- samples = self.encode_tiled_3d(pixel_samples, tile_x=tile, tile_y=tile, overlap=(1, overlap, overlap))
- elif self.latent_dim == 1 or self.extra_1d_channel is not None:
- samples = self.encode_tiled_1d(pixel_samples)
- else:
- samples = self.encode_tiled_(pixel_samples)
+ samples = self.encode_tiled_(pixel_samples)
return samples
@@ -1097,26 +1191,27 @@ class VAE:
if overlap is not None:
args["overlap"] = overlap
- if dims == 1:
- args.pop("tile_y")
- samples = self.encode_tiled_1d(pixel_samples, **args)
- elif dims == 2:
- samples = self.encode_tiled_(pixel_samples, **args)
- elif dims == 3:
- if tile_t is not None:
- tile_t_latent = max(2, self.downscale_ratio[0](tile_t))
- else:
- tile_t_latent = 9999
- args["tile_t"] = self.upscale_ratio[0](tile_t_latent)
+ with model_management.cuda_device_context(self.device):
+ if dims == 1:
+ args.pop("tile_y")
+ samples = self.encode_tiled_1d(pixel_samples, **args)
+ elif dims == 2:
+ samples = self.encode_tiled_(pixel_samples, **args)
+ elif dims == 3:
+ if tile_t is not None:
+ tile_t_latent = max(2, self.downscale_ratio[0](tile_t))
+ else:
+ tile_t_latent = 9999
+ args["tile_t"] = self.upscale_ratio[0](tile_t_latent)
- if overlap_t is None:
- args["overlap"] = (1, overlap, overlap)
- else:
- args["overlap"] = (self.upscale_ratio[0](max(1, min(tile_t_latent // 2, self.downscale_ratio[0](overlap_t)))), overlap, overlap)
- maximum = pixel_samples.shape[2]
- maximum = self.upscale_ratio[0](self.downscale_ratio[0](maximum))
+ if overlap_t is None:
+ args["overlap"] = (1, overlap, overlap)
+ else:
+ args["overlap"] = (self.upscale_ratio[0](max(1, min(tile_t_latent // 2, self.downscale_ratio[0](overlap_t)))), overlap, overlap)
+ maximum = pixel_samples.shape[2]
+ maximum = self.upscale_ratio[0](self.downscale_ratio[0](maximum))
- samples = self.encode_tiled_3d(pixel_samples[:,:,:maximum], **args)
+ samples = self.encode_tiled_3d(pixel_samples[:,:,:maximum], **args)
return samples
@@ -1189,6 +1284,9 @@ class CLIPType(Enum):
NEWBIE = 24
FLUX2 = 25
LONGCAT_IMAGE = 26
+ COGVIDEOX = 27
+ LENS = 28
+ PIXELDIT = 29
@@ -1237,6 +1335,11 @@ class TEModel(Enum):
QWEN35_9B = 26
QWEN35_27B = 27
MINISTRAL_3_3B = 28
+ GEMMA_4_E4B = 29
+ GEMMA_4_E2B = 30
+ GEMMA_4_31B = 31
+ T5_GEMMA = 32
+ GPT_OSS_20B = 33
def detect_te_model(sd):
@@ -1261,7 +1364,15 @@ def detect_te_model(sd):
if weight.shape[0] == 384:
return TEModel.BYT5_SMALL_GLYPH
return TEModel.T5_BASE
+ if "model.encoder.layers.0.pre_self_attn_layernorm.weight" in sd:
+ return TEModel.T5_GEMMA
if 'model.layers.0.post_feedforward_layernorm.weight' in sd:
+ if 'model.layers.59.self_attn.q_norm.weight' in sd:
+ return TEModel.GEMMA_4_31B
+ if 'model.layers.41.self_attn.q_norm.weight' in sd and 'model.layers.47.self_attn.q_norm.weight' not in sd:
+ return TEModel.GEMMA_4_E4B
+ if 'model.layers.34.self_attn.q_norm.weight' in sd and 'model.layers.41.self_attn.q_norm.weight' not in sd:
+ return TEModel.GEMMA_4_E2B
if 'model.layers.47.self_attn.q_norm.weight' in sd:
return TEModel.GEMMA_3_12B
if 'model.layers.0.self_attn.q_norm.weight' in sd:
@@ -1270,6 +1381,9 @@ def detect_te_model(sd):
else:
return TEModel.GEMMA_3_4B
return TEModel.GEMMA_2_2B
+ # Must precede the Qwen2.5-7B k_proj.bias=512 check (GPT-OSS also has 8*64=512).
+ if "layers.0.self_attn.sinks" in sd and "layers.0.mlp.experts.gate_up_proj.weight" in sd:
+ return TEModel.GPT_OSS_20B
if 'model.layers.0.self_attn.k_proj.bias' in sd:
weight = sd['model.layers.0.self_attn.k_proj.bias']
if weight.shape[0] == 256:
@@ -1384,6 +1498,9 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
clip_target.clip = comfy.text_encoders.hidream.hidream_clip(**t5xxl_detect(clip_data),
clip_l=False, clip_g=False, t5=True, llama=False, dtype_llama=None)
clip_target.tokenizer = comfy.text_encoders.hidream.HiDreamTokenizer
+ elif clip_type == CLIPType.COGVIDEOX:
+ clip_target.clip = comfy.text_encoders.cogvideo.cogvideo_te(**t5xxl_detect(clip_data))
+ clip_target.tokenizer = comfy.text_encoders.cogvideo.CogVideoXTokenizer
else: #CLIPType.MOCHI
clip_target.clip = comfy.text_encoders.genmo.mochi_te(**t5xxl_detect(clip_data))
clip_target.tokenizer = comfy.text_encoders.genmo.MochiT5Tokenizer
@@ -1401,9 +1518,24 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
else:
clip_target.clip = comfy.text_encoders.sa_t5.SAT5Model
clip_target.tokenizer = comfy.text_encoders.sa_t5.SAT5Tokenizer
+ elif te_model == TEModel.T5_GEMMA:
+ clip_target.clip = comfy.text_encoders.sa3.SAT5GemmaModel
+ clip_target.tokenizer = comfy.text_encoders.sa3.SAT5GemmaTokenizer
+ tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
+ elif te_model in (TEModel.GEMMA_4_E4B, TEModel.GEMMA_4_E2B, TEModel.GEMMA_4_31B):
+ variant = {TEModel.GEMMA_4_E4B: comfy.text_encoders.gemma4.Gemma4_E4B,
+ TEModel.GEMMA_4_E2B: comfy.text_encoders.gemma4.Gemma4_E2B,
+ TEModel.GEMMA_4_31B: comfy.text_encoders.gemma4.Gemma4_31B}[te_model]
+ clip_target.clip = comfy.text_encoders.gemma4.gemma4_te(**llama_detect(clip_data), model_class=variant)
+ clip_target.tokenizer = variant.tokenizer
+ tokenizer_data["tokenizer_json"] = clip_data[0].get("tokenizer_json", None)
elif te_model == TEModel.GEMMA_2_2B:
- clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data))
- clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer
+ if clip_type == CLIPType.PIXELDIT:
+ clip_target.clip = comfy.text_encoders.pixeldit.pixeldit_te(**llama_detect(clip_data))
+ clip_target.tokenizer = comfy.text_encoders.pixeldit.PixelDiTGemma2Tokenizer
+ else:
+ clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data))
+ clip_target.tokenizer = comfy.text_encoders.lumina2.LuminaTokenizer
tokenizer_data["spiece_model"] = clip_data[0].get("spiece_model", None)
elif te_model == TEModel.GEMMA_3_4B:
clip_target.clip = comfy.text_encoders.lumina2.te(**llama_detect(clip_data), model_type="gemma3_4b")
@@ -1438,6 +1570,10 @@ def load_text_encoder_state_dicts(state_dicts=[], embedding_directory=None, clip
clip_target.clip = comfy.text_encoders.flux.flux2_te(**llama_detect(clip_data), pruned=te_model == TEModel.MISTRAL3_24B_PRUNED_FLUX2)
clip_target.tokenizer = comfy.text_encoders.flux.Flux2Tokenizer
tokenizer_data["tekken_model"] = clip_data[0].get("tekken_model", None)
+ elif te_model == TEModel.GPT_OSS_20B:
+ clip_target.clip = comfy.text_encoders.gpt_oss.lens_te(**llama_detect(clip_data))
+ clip_target.tokenizer = comfy.text_encoders.gpt_oss.LensTokenizer
+ tokenizer_data["tokenizer_json"] = clip_data[0].get("tokenizer_json", None)
elif te_model == TEModel.QWEN3_4B:
if clip_type == CLIPType.FLUX or clip_type == CLIPType.FLUX2:
clip_target.clip = comfy.text_encoders.flux.klein_te(**llama_detect(clip_data), model_type="qwen3_4b")
@@ -1604,12 +1740,52 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
out = load_state_dict_guess_config(sd, output_vae, output_clip, output_clipvision, embedding_directory, output_model, model_options, te_model_options=te_model_options, metadata=metadata, disable_dynamic=disable_dynamic)
if out is None:
raise RuntimeError("ERROR: Could not detect model type of: {}\n{}".format(ckpt_path, model_detection_error_hint(ckpt_path, sd)))
- if output_model and out[0] is not None:
- out[0].cached_patcher_init = (load_checkpoint_guess_config_model_only, (ckpt_path, embedding_directory, model_options, te_model_options))
- if output_clip and out[1] is not None:
- out[1].patcher.cached_patcher_init = (load_checkpoint_guess_config_clip_only, (ckpt_path, embedding_directory, model_options, te_model_options))
+ if out[0] is not None:
+ out[0].cached_patcher_init = (load_checkpoint_guess_config, (ckpt_path, False, False, False, embedding_directory, output_model, model_options, te_model_options), 0)
+ # Register reload factories for the CLIP and VAE produced by the same checkpoint so
+ # ModelPatcher.deepclone_multigpu can spawn per-device copies (Select{CLIP,VAE}Device,
+ # MultiGPU work-units, etc.) without falling back to copy.deepcopy of an
+ # already-loaded module.
+ if out[1] is not None and getattr(out[1], "patcher", None) is not None:
+ out[1].patcher.cached_patcher_init = (load_checkpoint_clip_patcher, (ckpt_path, embedding_directory, model_options, te_model_options))
+ if out[2] is not None and getattr(out[2], "patcher", None) is not None:
+ out[2].patcher.cached_patcher_init = (load_checkpoint_vae_patcher, (ckpt_path, embedding_directory, model_options, te_model_options))
return out
+
+def load_checkpoint_clip_patcher(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False):
+    """Reload ``ckpt_path`` with only the CLIP output enabled and return ``clip.patcher``.
+
+    ``load_checkpoint_guess_config`` registers this function as the
+    ``cached_patcher_init`` factory on the patcher of the CLIP it returns.
+    Every argument is forwarded unchanged to that loader.
+    """
+    # Request the text encoder alone: model, VAE and CLIP-vision outputs are switched off.
+    reload_kwargs = dict(
+        output_vae=False,
+        output_clip=True,
+        output_clipvision=False,
+        output_model=False,
+        embedding_directory=embedding_directory,
+        model_options=model_options,
+        te_model_options=te_model_options,
+        disable_dynamic=disable_dynamic,
+    )
+    # The loader hands back four values; the CLIP is the second one.
+    _model, clip, _vae, _other = load_checkpoint_guess_config(ckpt_path, **reload_kwargs)
+    return clip.patcher
+
+
+def load_checkpoint_vae_patcher(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False):
+    """Reload ``ckpt_path`` with only the VAE output enabled and return ``vae.patcher``.
+
+    ``load_checkpoint_guess_config`` registers this function as the
+    ``cached_patcher_init`` factory on the patcher of the VAE it returns.
+    Every argument is forwarded unchanged to that loader.
+    """
+    # Request the VAE alone: model, CLIP and CLIP-vision outputs are switched off.
+    reload_kwargs = dict(
+        output_vae=True,
+        output_clip=False,
+        output_clipvision=False,
+        output_model=False,
+        embedding_directory=embedding_directory,
+        model_options=model_options,
+        te_model_options=te_model_options,
+        disable_dynamic=disable_dynamic,
+    )
+    # The loader hands back four values; the VAE is the third one.
+    _model, _clip, vae, _other = load_checkpoint_guess_config(ckpt_path, **reload_kwargs)
+    return vae.patcher
+
def load_checkpoint_guess_config_model_only(ckpt_path, embedding_directory=None, model_options={}, te_model_options={}, disable_dynamic=False):
model, *_ = load_checkpoint_guess_config(ckpt_path, False, False, False,
embedding_directory=embedding_directory,
@@ -1636,7 +1812,7 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
diffusion_model_prefix = model_detection.unet_prefix_from_state_dict(sd)
parameters = comfy.utils.calculate_parameters(sd, diffusion_model_prefix)
weight_dtype = comfy.utils.weight_dtype(sd, diffusion_model_prefix)
- load_device = model_management.get_torch_device()
+ load_device = model_options.get("load_device", model_management.get_torch_device())
custom_operations = model_options.get("custom_operations", None)
if custom_operations is None:
@@ -1676,13 +1852,15 @@ def load_state_dict_guess_config(sd, output_vae=True, output_clip=True, output_c
inital_load_device = model_management.unet_inital_load_device(parameters, unet_dtype)
model = model_config.get_model(sd, diffusion_model_prefix, device=inital_load_device)
ModelPatcher = comfy.model_patcher.ModelPatcher if disable_dynamic else comfy.model_patcher.CoreModelPatcher
- model_patcher = ModelPatcher(model, load_device=load_device, offload_device=model_management.unet_offload_device())
+ offload_device = model_options.get("offload_device", model_management.unet_offload_device())
+ model_patcher = ModelPatcher(model, load_device=load_device, offload_device=offload_device)
model.load_model_weights(sd, diffusion_model_prefix, assign=model_patcher.is_dynamic())
if output_vae:
vae_sd = comfy.utils.state_dict_prefix_replace(sd, {k: "" for k in model_config.vae_key_prefix}, filter_keys=True)
vae_sd = model_config.process_vae_state_dict(vae_sd)
- vae = VAE(sd=vae_sd, metadata=metadata)
+ vae_device = model_options.get("load_device", None)
+ vae = VAE(sd=vae_sd, metadata=metadata, device=vae_device)
if output_clip:
if te_model_options.get("custom_operations", None) is None:
@@ -1766,7 +1944,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
parameters = comfy.utils.calculate_parameters(sd)
weight_dtype = comfy.utils.weight_dtype(sd)
- load_device = model_management.get_torch_device()
+ load_device = model_options.get("load_device", model_management.get_torch_device())
model_config = model_detection.model_config_from_unet(sd, "", metadata=metadata)
if model_config is not None:
@@ -1791,7 +1969,7 @@ def load_diffusion_model_state_dict(sd, model_options={}, metadata=None, disable
else:
logging.warning("{} {}".format(diffusers_keys[k], k))
- offload_device = model_management.unet_offload_device()
+ offload_device = model_options.get("offload_device", model_management.unet_offload_device())
unet_weight_dtype = list(model_config.supported_inference_dtypes)
if model_config.quant_config is not None:
weight_dtype = None
@@ -1833,6 +2011,26 @@ def load_diffusion_model(unet_path, model_options={}, disable_dynamic=False):
model.cached_patcher_init = (load_diffusion_model, (unet_path, model_options))
return model
+
+def load_vae_patcher(vae_path, metadata=None, device=None, disable_dynamic=False):
+    """Load the VAE stored at ``vae_path`` and return its patcher.
+
+    Intended as the ``cached_patcher_init`` factory on ``VAE.patcher`` so that
+    :meth:`comfy.model_patcher.ModelPatcher.deepclone_multigpu` (multigpu
+    work-units, the SelectVAEDevice node) can obtain a freshly loaded VAE
+    patcher instead of reusing one that already carries per-device load state.
+
+    Args:
+        vae_path: file handed to ``comfy.utils.load_torch_file``.
+        metadata: metadata passed to ``VAE``; when ``None`` it is read from the
+            file together with the state dict.
+        device: forwarded to ``VAE(device=...)``, matching the source loader's
+            VAE initialization path. The deepclone's ``load_device`` still
+            controls where the cloned patcher is targeted.
+        disable_dynamic: accepted for signature compatibility; this function
+            does not read it.
+
+    Raises:
+        Whatever ``VAE.throw_exception_if_invalid`` raises for an unusable VAE.
+    """
+    if metadata is not None:
+        # Caller already has the metadata; only the weights are needed.
+        state = comfy.utils.load_torch_file(vae_path)
+    else:
+        state, metadata = comfy.utils.load_torch_file(vae_path, return_metadata=True)
+    reloaded = VAE(sd=state, metadata=metadata, device=device)
+    reloaded.throw_exception_if_invalid()
+    return reloaded.patcher
+
def load_unet(unet_path, dtype=None):
logging.warning("The load_unet function has been deprecated and will be removed please switch to: load_diffusion_model")
return load_diffusion_model(unet_path, model_options={"dtype": dtype})
@@ -1846,7 +2044,7 @@ def save_checkpoint(output_path, model, clip=None, vae=None, clip_vision=None, m
load_models = [model]
if clip is not None:
load_models.append(clip.load_model())
- clip_sd = clip.get_sd()
+ clip_sd = clip.state_dict_for_saving()
vae_sd = None
if vae is not None:
vae_sd = vae.get_sd()
diff --git a/comfy/supported_models.py b/comfy/supported_models.py
index 58d4ce731..00941da53 100644
--- a/comfy/supported_models.py
+++ b/comfy/supported_models.py
@@ -7,6 +7,7 @@ from . import sdxl_clip
import comfy.text_encoders.sd2_clip
import comfy.text_encoders.sd3_clip
import comfy.text_encoders.sa_t5
+import comfy.text_encoders.sa3
import comfy.text_encoders.aura_t5
import comfy.text_encoders.pixart_t5
import comfy.text_encoders.hydit
@@ -27,6 +28,9 @@ import comfy.text_encoders.anima
import comfy.text_encoders.ace15
import comfy.text_encoders.longcat_image
import comfy.text_encoders.ernie
+import comfy.text_encoders.cogvideo
+import comfy.text_encoders.hidream_o1
+import comfy.text_encoders.pixeldit
from . import supported_models_base
from . import latent_formats
@@ -601,6 +605,29 @@ class StableAudio(supported_models_base.BASE):
def clip_target(self, state_dict={}):
return supported_models_base.ClipTarget(comfy.text_encoders.sa_t5.SAT5Tokenizer, comfy.text_encoders.sa_t5.SAT5Model)
+class StableAudio3(StableAudio):
+    """Supported-model entry for the ``dit1.0`` audio model with a shared global-cond embed."""
+
+    unet_config = dict(
+        audio_model="dit1.0",
+        global_cond_shared_embed=True,
+    )
+
+    sampling_settings = dict(
+        multiplier=1.0,
+        shift=2.0,
+    )
+
+    latent_format = latent_formats.StableAudio3
+
+    memory_usage_factor = 7
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Build ``model_base.StableAudio3`` from conditioner entries found in ``state_dict``.
+
+        The ``seconds_total`` conditioner weights are extracted with their prefix
+        removed, and the prompt padding embedding is passed along when present
+        (``None`` otherwise).
+        """
+        seconds_prefix = {"conditioner.conditioners.seconds_total.": ""}
+        embedder_weights = utils.state_dict_prefix_replace(state_dict, seconds_prefix, filter_keys=True)
+        pad_embedding = state_dict.get("conditioner.conditioners.prompt.padding_embedding")
+        return model_base.StableAudio3(
+            self,
+            seconds_total_embedder_weights=embedder_weights,
+            padding_embedding=pad_embedding,
+            device=device,
+        )
+
+    def clip_target(self, state_dict={}):
+        """Pair the SA3 T5-Gemma tokenizer with its text-encoder model class."""
+        sa3 = comfy.text_encoders.sa3
+        return supported_models_base.ClipTarget(sa3.SAT5GemmaTokenizer, sa3.SAT5GemmaModel)
+
class AuraFlow(supported_models_base.BASE):
unet_config = {
"cond_seq_dim": 2048,
@@ -803,6 +830,50 @@ class Flux2(Flux):
return None
+
+class Lens(supported_models_base.BASE):
+    """Microsoft Lens (3.8B dual-stream MMDiT, GPT-OSS-20B text features, Flux2 VAE)."""
+
+    unet_config = {
+        "image_model": "lens",
+    }
+
+    sampling_settings = {
+        "shift": 1.829,  # Default mu for 1440x1440 (and any seq_len > 4300)
+    }
+
+    unet_extra_config = {}
+    latent_format = latent_formats.Flux2
+
+    memory_usage_factor = 4.0
+
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]  # fp16 causes NaNs
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def __init__(self, unet_config):
+        super().__init__(unet_config)
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Build the ``model_base.Lens`` wrapper, using the FLUX model type, on ``device``."""
+        return model_base.Lens(self, model_type=model_base.ModelType.FLUX, device=device)
+
+    def clip_target(self, state_dict={}):
+        """Return the tokenizer / text-encoder pair for the GPT-OSS text encoder.
+
+        The state dict is probed for ``layers.0.self_attn.sinks`` under
+        ``text_encoders.gpt_oss.transformer.`` first and under the bare
+        ``text_encoders.`` prefix second. On the first hit, the keyword arguments
+        reported by ``llama_detect`` for that prefix are passed to ``lens_te``;
+        with no hit, ``lens_te`` is called without arguments.
+        """
+        # Imported here so the attribute lookup does not depend on some other
+        # module having imported this submodule first.
+        from comfy.text_encoders import gpt_oss
+
+        pref = self.text_encoder_key_prefix[0]
+        te_kwargs = {}
+        for hint in ("gpt_oss.transformer.", ""):
+            full_prefix = "{}{}".format(pref, hint)
+            if "{}layers.0.self_attn.sinks".format(full_prefix) in state_dict:
+                te_kwargs = comfy.text_encoders.hunyuan_video.llama_detect(state_dict, full_prefix)
+                break
+        return supported_models_base.ClipTarget(
+            gpt_oss.LensTokenizer,
+            gpt_oss.lens_te(**te_kwargs),
+        )
+
+
class GenmoMochi(supported_models_base.BASE):
unet_config = {
"image_model": "mochi_preview",
@@ -1133,6 +1204,72 @@ class ZImagePixelSpace(ZImage):
def get_model(self, state_dict, prefix="", device=None):
return model_base.ZImagePixelSpace(self, device=device)
+class PixelDiTT2I(supported_models_base.BASE):
+    """Supported-model entry declaring ``image_model: pixeldit_t2i``."""
+
+    unet_config = dict(image_model="pixeldit_t2i")
+
+    unet_extra_config = {}
+
+    sampling_settings = dict(
+        shift=4.0,  # 1024px stage 3 default; 2.0 for 512px
+    )
+
+    latent_format = latent_formats.PixelDiTPixel
+    memory_usage_factor = 0.04
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Build the ``model_base.PixelDiTT2I`` wrapper on ``device``."""
+        return model_base.PixelDiTT2I(self, device=device)
+
+    def process_unet_state_dict(self, state_dict):
+        """Return a new dict with keys renamed and pixel-block adaLN tensors split in two.
+
+        * keys starting with ``_repa_projector`` or ``net_ema.`` are dropped;
+        * one leading ``core.`` or ``net.`` prefix is stripped;
+        * each entry whose key contains both ``pixel_blocks.`` and
+          ``.adaLN_modulation.0.`` is stored as ``adaLN_modulation_msa``
+          (chunks 0-2) and ``adaLN_modulation_mlp`` (chunks 3-5);
+        * everything else is copied through under its (possibly stripped) key.
+        """
+        # pixel_dim from pixel_embedder.proj.weight = (pixel_dim, in_channels); p2 derived per-weight from total // (6 * pixel_dim).
+        embed_weight = next(t for name, t in state_dict.items() if name.endswith("pixel_embedder.proj.weight"))
+        pixel_dim = embed_weight.shape[0]
+
+        marker = ".adaLN_modulation.0."
+        converted = {}
+        for name, tensor in state_dict.items():
+            if name.startswith(("_repa_projector", "net_ema.")):
+                continue
+            # Strip at most one wrapper prefix, trying "core." before "net.".
+            for wrapper in ("core.", "net."):
+                if name.startswith(wrapper):
+                    name = name[len(wrapper):]
+                    break
+            if not ("pixel_blocks." in name and marker in name):
+                converted[name] = tensor
+                continue
+            # Split into msa (chunks 0-2) and mlp (chunks 3-5) for the two-Linear PiTBlock to reduce peak VRAM
+            p2 = tensor.shape[0] // (6 * pixel_dim)
+            trail = tensor.shape[1:]  # () for bias, (in_dim,) for weight
+            chunks = tensor.view(p2, 6, pixel_dim, *trail)
+            half_rows = 3 * p2 * pixel_dim
+            base, suffix = name.split(marker)
+            converted[f"{base}.adaLN_modulation_msa.{suffix}"] = chunks[:, 0:3].reshape(half_rows, *trail).contiguous()
+            converted[f"{base}.adaLN_modulation_mlp.{suffix}"] = chunks[:, 3:6].reshape(half_rows, *trail).contiguous()
+        return converted
+
+    def clip_target(self, state_dict={}):
+        """Pair the PixelDiT Gemma2 tokenizer with its text-encoder class."""
+        pixeldit = comfy.text_encoders.pixeldit
+        return supported_models_base.ClipTarget(pixeldit.PixelDiTGemma2Tokenizer, pixeldit.PixelDiTGemma2TE)
+
+class PiD(PixelDiTT2I):
+    """``PixelDiTT2I`` variant declaring ``image_model: pid`` with its own sampling shift."""
+
+    unet_config = dict(image_model="pid")
+
+    sampling_settings = dict(
+        shift=1.5,  # close approximation of the original distill 4 steps [0.999, 0.866, 0.634, 0.342, 0]
+    )
+
+    memory_usage_factor = 0.04
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Build the ``model_base.PiD`` wrapper on ``device``."""
+        return model_base.PiD(self, device=device)
+
class WAN21_T2V(supported_models_base.BASE):
unet_config = {
"image_model": "wan2.1",
@@ -1166,6 +1303,25 @@ class WAN21_T2V(supported_models_base.BASE):
t5_detect = comfy.text_encoders.sd3_clip.t5_xxl_detect(state_dict, "{}umt5xxl.transformer.".format(pref))
return supported_models_base.ClipTarget(comfy.text_encoders.wan.WanT5Tokenizer, comfy.text_encoders.wan.te(**t5_detect))
+class WAN21_CausalAR_T2V(WAN21_T2V):
+    """``wan2.1`` text-to-video entry selected by the extra ``causal_ar`` config flag."""
+
+    unet_config = dict(
+        image_model="wan2.1",
+        model_type="t2v",
+        causal_ar=True,
+    )
+
+    sampling_settings = dict(shift=5.0)
+
+    def __init__(self, unet_config):
+        super().__init__(unet_config)
+        # Remove the "causal_ar" flag from the stored config once the parent
+        # initializer has run; a missing key is tolerated.
+        self.unet_config.pop("causal_ar", None)
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Build the ``model_base.WAN21_CausalAR`` wrapper on ``device``."""
+        return model_base.WAN21_CausalAR(self, device=device)
+
+
class WAN21_I2V(WAN21_T2V):
unet_config = {
"image_model": "wan2.1",
@@ -1293,6 +1449,37 @@ class WAN21_SCAIL(WAN21_T2V):
out = model_base.WAN21_SCAIL(self, image_to_video=False, device=device)
return out
+class WAN22_WanDancer(WAN21_T2V):
+    """``wan2.1`` entry with ``model_type: wandancer`` and a 36-channel input."""
+
+    unet_config = dict(
+        image_model="wan2.1",
+        model_type="wandancer",
+        in_dim=36,
+    )
+
+    def __init__(self, unet_config):
+        super().__init__(unet_config)
+        self.memory_usage_factor = 1.8
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Build ``model_base.WAN22_WanDancer`` with ``image_to_video=True`` on ``device``."""
+        return model_base.WAN22_WanDancer(self, image_to_video=True, device=device)
+
+    def process_unet_state_dict(self, state_dict):
+        """Return a new dict in which fused music-encoder attention projections are split.
+
+        An entry whose key contains both ``music_encoder`` and
+        ``self_attn.in_proj`` is cut along its first axis into three parts stored
+        as ``q_proj``, ``k_proj`` and ``v_proj``. Every other entry is copied
+        through unchanged.
+        """
+        converted = {}
+        for key, tensor in state_dict.items():
+            if not ("music_encoder" in key and "self_attn.in_proj" in key):
+                converted[key] = tensor
+                continue
+            # split music_encoder in_proj into q_proj, k_proj, v_proj
+            kind = "weight" if key.endswith("weight") else "bias"
+            third = tensor.shape[0] // 3
+            stem = key.replace(f"in_proj_{kind}", "")
+            converted[f"{stem}q_proj.{kind}"] = tensor[:third]
+            converted[f"{stem}k_proj.{kind}"] = tensor[third:2 * third]
+            # The last slice runs to the end of the tensor.
+            converted[f"{stem}v_proj.{kind}"] = tensor[2 * third:]
+        return converted
+
class Hunyuan3Dv2(supported_models_base.BASE):
unet_config = {
"image_model": "hunyuan3d2",
@@ -1380,6 +1567,50 @@ class HiDream(supported_models_base.BASE):
def clip_target(self, state_dict={}):
return None # TODO
+class HiDreamO1(supported_models_base.BASE):
+    """Supported-model entry declaring ``image_model: hidream_o1`` (pixel-space latent format)."""
+
+    unet_config = dict(image_model="hidream_o1")
+
+    sampling_settings = dict(
+        shift=3.0,
+        noise_scale=8.0,
+    )
+
+    latent_format = latent_formats.HiDreamO1Pixel
+    memory_usage_factor = 0.033
+    # fp16 not supported: LM MLP down_proj activations fp16 overflow, causing NaNs
+    supported_inference_dtypes = [torch.bfloat16, torch.float32]
+
+    vae_key_prefix = ["vae."]
+    text_encoder_key_prefix = ["text_encoders."]
+
+    optimizations = {"fp8": False}
+
+    def get_model(self, state_dict, prefix="", device=None):
+        """Build the ``model_base.HiDreamO1`` wrapper on ``device``."""
+        return model_base.HiDreamO1(self, device=device)
+
+    def process_unet_state_dict(self, state_dict):
+        """Delete deepstack-merger entries from ``state_dict`` in place and return it."""
+        # Drop unused Qwen3-VL deepstack merger weights; upstream discards them at inference.
+        unused = [key for key in state_dict if "visual.deepstack_merger_list" in key]
+        for key in unused:
+            del state_dict[key]
+        return state_dict
+
+    def process_vae_state_dict(self, state_dict):
+        """Ignore the incoming dict and return a one-entry sentinel dict."""
+        # Pixel-space model: inject sentinel so VAE construction picks PixelspaceConversionVAE.
+        sentinel = torch.tensor(1.0)
+        return {"pixel_space_vae": sentinel}
+
+    def process_clip_state_dict(self, state_dict):
+        """Ignore the incoming dict and return a one-entry sentinel dict."""
+        # Tokenizer-only TE: inject sentinel so load_state_dict_guess_config triggers CLIP init.
+        sentinel = torch.zeros(1)
+        return {"_hidream_o1_te_sentinel": sentinel}
+
+    def clip_target(self, state_dict={}):
+        """Pair the HiDream-O1 tokenizer with its text-encoder class."""
+        hidream_o1 = comfy.text_encoders.hidream_o1
+        return supported_models_base.ClipTarget(hidream_o1.HiDreamO1Tokenizer, hidream_o1.HiDreamO1TE)
+
class Chroma(supported_models_base.BASE):
unet_config = {
"image_model": "chroma",
@@ -1781,6 +2012,213 @@ class ErnieImage(supported_models_base.BASE):
return supported_models_base.ClipTarget(comfy.text_encoders.ernie.ErnieTokenizer, comfy.text_encoders.ernie.te(**hunyuan_detect))
-models = [LotusD, Stable_Zero123, SD15_instructpix2pix, SD15, SD20, SD21UnclipL, SD21UnclipH, SDXL_instructpix2pix, SDXLRefiner, SDXL, SSD1B, KOALA_700M, KOALA_1B, Segmind_Vega, SD_X4Upscaler, Stable_Cascade_C, Stable_Cascade_B, SV3D_u, SV3D_p, SD3, StableAudio, AuraFlow, PixArtAlpha, PixArtSigma, HunyuanDiT, HunyuanDiT1, FluxInpaint, Flux, LongCatImage, FluxSchnell, GenmoMochi, LTXV, LTXAV, HunyuanVideo15_SR_Distilled, HunyuanVideo15, HunyuanImage21Refiner, HunyuanImage21, HunyuanVideoSkyreelsI2V, HunyuanVideoI2V, HunyuanVideo, CosmosT2V, CosmosI2V, CosmosT2IPredict2, CosmosI2VPredict2, ZImagePixelSpace, ZImage, Lumina2, WAN22_T2V, WAN21_T2V, WAN21_I2V, WAN21_FunControl2V, WAN21_Vace, WAN21_Camera, WAN22_Camera, WAN22_S2V, WAN21_HuMo, WAN22_Animate, WAN21_FlowRVS, WAN21_SCAIL, Hunyuan3Dv2mini, Hunyuan3Dv2, Hunyuan3Dv2_1, HiDream, Chroma, ChromaRadiance, ACEStep, ACEStep15, Omnigen2, QwenImage, Flux2, Kandinsky5Image, Kandinsky5, Anima, RT_DETR_v4, ErnieImage]
+class SAM3(supported_models_base.BASE):
+ unet_config = {"image_model": "SAM3"}
+ supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]
+ text_encoder_key_prefix = ["detector.backbone.language_backbone."]
+ unet_extra_prefix = ""
-models += [SVD_img2vid]
+ def process_clip_state_dict(self, state_dict):
+ clip_keys = getattr(self, "_clip_stash", {})
+ clip_keys = utils.state_dict_prefix_replace(clip_keys, {"detector.backbone.language_backbone.": "", "backbone.language_backbone.": ""}, filter_keys=True)
+ clip_keys = utils.clip_text_transformers_convert(clip_keys, "encoder.", "sam3_clip.transformer.")
+ return {k: v for k, v in clip_keys.items() if not k.startswith("resizer.")}
+
+ def process_unet_state_dict(self, state_dict):
+ self._clip_stash = {k: state_dict.pop(k) for k in list(state_dict.keys()) if "language_backbone" in k and "resizer" not in k}
+ # SAM3.1: remap tracker.model.* -> tracker.*
+ for k in list(state_dict.keys()):
+ if k.startswith("tracker.model."):
+ state_dict["tracker." + k[len("tracker.model."):]] = state_dict.pop(k)
+ # SAM3.1: remove per-block freqs_cis buffers (computed dynamically)
+ for k in [k for k in list(state_dict.keys()) if ".attn.freqs_cis" in k]:
+ state_dict.pop(k)
+ # Split fused QKV projections
+ for k in [k for k in list(state_dict.keys()) if k.endswith((".in_proj_weight", ".in_proj_bias"))]:
+ t = state_dict.pop(k)
+ base, suffix = k.rsplit(".in_proj_", 1)
+ s = ".weight" if suffix == "weight" else ".bias"
+ d = t.shape[0] // 3
+ state_dict[base + ".q_proj" + s] = t[:d]
+ state_dict[base + ".k_proj" + s] = t[d:2*d]
+ state_dict[base + ".v_proj" + s] = t[2*d:]
+ # Remap tracker SAM decoder transformer key names to match sam.py TwoWayTransformer
+ for k in list(state_dict.keys()):
+ if "sam_mask_decoder.transformer." not in k:
+ continue
+ new_k = k.replace(".mlp.lin1.", ".mlp.0.").replace(".mlp.lin2.", ".mlp.2.").replace(".norm_final_attn.", ".norm_final.")
+ if new_k != k:
+ state_dict[new_k] = state_dict.pop(k)
+ return state_dict
+
+ def get_model(self, state_dict, prefix="", device=None):
+ return model_base.SAM3(self, device=device)
+
+ def clip_target(self, state_dict={}):
+ import comfy.text_encoders.sam3_clip
+ return supported_models_base.ClipTarget(comfy.text_encoders.sam3_clip.SAM3TokenizerWrapper, comfy.text_encoders.sam3_clip.SAM3ClipModelWrapper)
+
+
+class SAM31(SAM3):
+ unet_config = {"image_model": "SAM31"}
+
+
+class CogVideoX_T2V(supported_models_base.BASE):
+ unet_config = {
+ "image_model": "cogvideox",
+ }
+
+ sampling_settings = {
+ "linear_start": 0.00085,
+ "linear_end": 0.012,
+ "beta_schedule": "linear",
+ "zsnr": True,
+ }
+
+ unet_extra_config = {}
+ latent_format = latent_formats.CogVideoX
+
+ supported_inference_dtypes = [torch.bfloat16, torch.float16, torch.float32]
+
+ vae_key_prefix = ["vae."]
+ text_encoder_key_prefix = ["text_encoders."]
+
+ def __init__(self, unet_config):
+ # 2b-class (dim=1920, heads=30) uses scale_factor=1.15258426.
+ # 5b-class (dim=3072, heads=48) — incl. CogVideoX-5b, 1.5-5B, and
+ # Fun-V1.5 inpainting — uses scale_factor=0.7 per vae/config.json.
+ if unet_config.get("num_attention_heads", 0) >= 48:
+ self.latent_format = latent_formats.CogVideoX1_5
+ super().__init__(unet_config)
+
+ def get_model(self, state_dict, prefix="", device=None):
+ # CogVideoX 1.5 (patch_size_t=2) has different training base dimensions for RoPE
+ if self.unet_config.get("patch_size_t") is not None:
+ self.unet_config.setdefault("sample_height", 96)
+ self.unet_config.setdefault("sample_width", 170)
+ self.unet_config.setdefault("sample_frames", 81)
+ out = model_base.CogVideoX(self, device=device)
+ return out
+
+ def clip_target(self, state_dict={}):
+ return supported_models_base.ClipTarget(comfy.text_encoders.cogvideo.CogVideoXT5Tokenizer, comfy.text_encoders.sd3_clip.T5XXLModel)
+
+class CogVideoX_I2V(CogVideoX_T2V):
+ unet_config = {
+ "image_model": "cogvideox",
+ "in_channels": 32,
+ }
+
+ def get_model(self, state_dict, prefix="", device=None):
+ if self.unet_config.get("patch_size_t") is not None:
+ self.unet_config.setdefault("sample_height", 96)
+ self.unet_config.setdefault("sample_width", 170)
+ self.unet_config.setdefault("sample_frames", 81)
+ out = model_base.CogVideoX(self, image_to_video=True, device=device)
+ return out
+
+class CogVideoX_Inpaint(CogVideoX_T2V):
+ unet_config = {
+ "image_model": "cogvideox",
+ "in_channels": 48,
+ }
+
+ def get_model(self, state_dict, prefix="", device=None):
+ if self.unet_config.get("patch_size_t") is not None:
+ self.unet_config.setdefault("sample_height", 96)
+ self.unet_config.setdefault("sample_width", 170)
+ self.unet_config.setdefault("sample_frames", 81)
+ out = model_base.CogVideoX(self, image_to_video=True, device=device)
+ return out
+
+
+models = [
+ LotusD,
+ Stable_Zero123,
+ SD15_instructpix2pix,
+ SD15,
+ SD20,
+ SD21UnclipL,
+ SD21UnclipH,
+ SDXL_instructpix2pix,
+ SDXLRefiner,
+ SDXL,
+ SSD1B,
+ KOALA_700M,
+ KOALA_1B,
+ Segmind_Vega,
+ SD_X4Upscaler,
+ Stable_Cascade_C,
+ Stable_Cascade_B,
+ SV3D_u,
+ SV3D_p,
+ SD3,
+ StableAudio3,
+ StableAudio,
+ AuraFlow,
+ PixArtAlpha,
+ PixArtSigma,
+ HunyuanDiT,
+ HunyuanDiT1,
+ FluxInpaint,
+ Flux,
+ LongCatImage,
+ FluxSchnell,
+ GenmoMochi,
+ LTXV,
+ LTXAV,
+ HunyuanVideo15_SR_Distilled,
+ HunyuanVideo15,
+ HunyuanImage21Refiner,
+ HunyuanImage21,
+ HunyuanVideoSkyreelsI2V,
+ HunyuanVideoI2V,
+ HunyuanVideo,
+ CosmosT2V,
+ CosmosI2V,
+ CosmosT2IPredict2,
+ CosmosI2VPredict2,
+ ZImagePixelSpace,
+ ZImage,
+ PiD,
+ PixelDiTT2I,
+ Lumina2,
+ WAN22_T2V,
+ WAN21_CausalAR_T2V,
+ WAN21_T2V,
+ WAN21_I2V,
+ WAN21_FunControl2V,
+ WAN21_Vace,
+ WAN21_Camera,
+ WAN22_Camera,
+ WAN22_S2V,
+ WAN21_HuMo,
+ WAN22_Animate,
+ WAN21_FlowRVS,
+ WAN21_SCAIL,
+ WAN22_WanDancer,
+ Hunyuan3Dv2mini,
+ Hunyuan3Dv2,
+ Hunyuan3Dv2_1,
+ HiDream,
+ HiDreamO1,
+ Chroma,
+ ChromaRadiance,
+ ACEStep,
+ ACEStep15,
+ Omnigen2,
+ QwenImage,
+ Flux2,
+ Lens,
+ Kandinsky5Image,
+ Kandinsky5,
+ Anima,
+ RT_DETR_v4,
+ ErnieImage,
+ SAM3,
+ SAM31,
+ CogVideoX_Inpaint,
+ CogVideoX_I2V,
+ CogVideoX_T2V,
+ SVD_img2vid,
+]
diff --git a/comfy/taesd/taehv.py b/comfy/taesd/taehv.py
index 6c06ce19d..696013200 100644
--- a/comfy/taesd/taehv.py
+++ b/comfy/taesd/taehv.py
@@ -7,6 +7,7 @@ from tqdm.auto import tqdm
from collections import namedtuple, deque
import comfy.ops
+import comfy.model_management
operations=comfy.ops.disable_weight_init
DecoderResult = namedtuple("DecoderResult", ("frame", "memory"))
@@ -47,11 +48,14 @@ class TGrow(nn.Module):
x = self.conv(x)
return x.reshape(-1, C, H, W)
-def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
+def apply_model_with_memblocks(model, x, parallel, show_progress_bar, output_device=None,
+ patch_size=1, decode=False):
B, T, C, H, W = x.shape
if parallel:
x = x.reshape(B*T, C, H, W)
+ if not decode and patch_size > 1:
+ x = F.pixel_unshuffle(x, patch_size)
# parallel over input timesteps, iterate over blocks
for b in tqdm(model, disable=not show_progress_bar):
if isinstance(b, MemBlock):
@@ -62,20 +66,27 @@ def apply_model_with_memblocks(model, x, parallel, show_progress_bar):
x = b(x, mem)
else:
x = b(x)
- BT, C, H, W = x.shape
- T = BT // B
- x = x.view(B, T, C, H, W)
+ if decode and patch_size > 1:
+ x = F.pixel_shuffle(x, patch_size)
+ x = x.view(B, x.shape[0] // B, *x.shape[1:])
+ x = x.to(output_device)
else:
out = []
- work_queue = deque([TWorkItem(xt, 0) for t, xt in enumerate(x.reshape(B, T * C, H, W).chunk(T, dim=1))])
+ # Chunk along the time dim directly (chunks are [B,1,C,H,W] views, squeeze to [B,C,H,W] views).
+ # Avoids forcing a contiguous copy when x is non-contiguous (e.g. after movedim in encode/decode).
+ work_queue = deque([TWorkItem(xt.squeeze(1), 0) for xt in x.chunk(T, dim=1)])
progress_bar = tqdm(range(T), disable=not show_progress_bar)
mem = [None] * len(model)
while work_queue:
xt, i = work_queue.popleft()
if i == 0:
progress_bar.update(1)
+ if not decode and patch_size > 1:
+ xt = F.pixel_unshuffle(xt, patch_size)
if i == len(model):
- out.append(xt)
+ if decode and patch_size > 1:
+ xt = F.pixel_shuffle(xt, patch_size)
+ out.append(xt.to(output_device))
del xt
else:
b = model[i]
@@ -165,24 +176,20 @@ class TAEHV(nn.Module):
def encode(self, x, **kwargs):
x = x.movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
- if self.patch_size > 1:
- B, T, C, H, W = x.shape
- x = x.reshape(B * T, C, H, W)
- x = F.pixel_unshuffle(x, self.patch_size)
- x = x.reshape(B, T, C * self.patch_size ** 2, H // self.patch_size, W // self.patch_size)
if x.shape[1] % self.t_downscale != 0:
# pad at end to multiple of t_downscale
n_pad = self.t_downscale - x.shape[1] % self.t_downscale
padding = x[:, -1:].repeat_interleave(n_pad, dim=1)
x = torch.cat([x, padding], 1)
- x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar).movedim(2, 1)
+ x = apply_model_with_memblocks(self.encoder, x, self.parallel, self.show_progress_bar,
+ patch_size=self.patch_size).movedim(2, 1)
return self.process_out(x)
def decode(self, x, **kwargs):
x = x.unsqueeze(0) if x.ndim == 4 else x # [T, C, H, W] -> [1, T, C, H, W]
x = x.movedim(1, 2) if x.shape[1] != self.latent_channels else x # [B, T, C, H, W] or [B, C, T, H, W]
x = self.process_in(x).movedim(2, 1) # [B, C, T, H, W] -> [B, T, C, H, W]
- x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar)
- if self.patch_size > 1:
- x = F.pixel_shuffle(x, self.patch_size)
+ x = apply_model_with_memblocks(self.decoder, x, self.parallel, self.show_progress_bar,
+ output_device=comfy.model_management.intermediate_device(),
+ patch_size=self.patch_size, decode=True)
return x[:, self.frames_to_trim:].movedim(2, 1)
diff --git a/comfy/taesd/taesd.py b/comfy/taesd/taesd.py
index ce36f1a84..05d370209 100644
--- a/comfy/taesd/taesd.py
+++ b/comfy/taesd/taesd.py
@@ -17,32 +17,79 @@ class Clamp(nn.Module):
return torch.tanh(x / 3) * 3
class Block(nn.Module):
- def __init__(self, n_in, n_out):
+ def __init__(self, n_in: int, n_out: int, use_midblock_gn: bool = False):
super().__init__()
self.conv = nn.Sequential(conv(n_in, n_out), nn.ReLU(), conv(n_out, n_out), nn.ReLU(), conv(n_out, n_out))
self.skip = comfy.ops.disable_weight_init.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity()
self.fuse = nn.ReLU()
- def forward(self, x):
+ if not use_midblock_gn:
+ self.pool = None
+ return
+ n_gn = n_in * 4
+ self.pool = nn.Sequential(
+ comfy.ops.disable_weight_init.Conv2d(n_in, n_gn, 1, bias=False),
+ comfy.ops.disable_weight_init.GroupNorm(4, n_gn),
+ nn.ReLU(inplace=True),
+ comfy.ops.disable_weight_init.Conv2d(n_gn, n_in, 1, bias=False),
+ )
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if self.pool is not None:
+ x = x + self.pool(x)
return self.fuse(self.conv(x) + self.skip(x))
-def Encoder(latent_channels=4):
- return nn.Sequential(
- conv(3, 64), Block(64, 64),
- conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
- conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
- conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
- conv(64, latent_channels),
- )
+class Encoder(nn.Sequential):
+ def __init__(self, latent_channels: int = 4, use_gn: bool = False):
+ super().__init__(
+ conv(3, 64), Block(64, 64),
+ conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
+ conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64),
+ conv(64, 64, stride=2, bias=False), Block(64, 64, use_gn), Block(64, 64, use_gn), Block(64, 64, use_gn),
+ conv(64, latent_channels),
+ )
+class Decoder(nn.Sequential):
+ def __init__(self, latent_channels: int = 4, use_gn: bool = False):
+ super().__init__(
+ Clamp(), conv(latent_channels, 64), nn.ReLU(),
+ Block(64, 64, use_gn), Block(64, 64, use_gn), Block(64, 64, use_gn), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
+ Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
+ Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
+ Block(64, 64), conv(64, 3),
+ )
+
+class DecoderFlux2(Decoder):
+ def __init__(self, latent_channels: int = 128, use_gn: bool = True):
+ if latent_channels != 128 or not use_gn:
+ raise ValueError("Unexpected parameters for Flux2 TAE module")
+ super().__init__(latent_channels=32, use_gn=True)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ B, C, H, W = x.shape
+ x = (
+ x
+ .reshape(B, 32, 2, 2, H, W)
+ .permute(0, 1, 4, 2, 5, 3)
+ .reshape(B, 32, H * 2, W * 2)
+ )
+ return super().forward(x)
+
+class EncoderFlux2(Encoder):
+ def __init__(self, latent_channels: int = 128, use_gn: bool = True):
+ if latent_channels != 128 or not use_gn:
+ raise ValueError("Unexpected parameters for Flux2 TAE module")
+ super().__init__(latent_channels=32, use_gn=True)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ result = super().forward(x)
+ B, C, H, W = result.shape
+ return (
+ result
+ .reshape(B, C, H // 2, 2, W // 2, 2)
+ .permute(0, 1, 3, 5, 2, 4)
+ .reshape(B, 128, H // 2, W // 2)
+ )
-def Decoder(latent_channels=4):
- return nn.Sequential(
- Clamp(), conv(latent_channels, 64), nn.ReLU(),
- Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
- Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
- Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False),
- Block(64, 64), conv(64, 3),
- )
class TAESD(nn.Module):
latent_magnitude = 3
@@ -51,8 +98,15 @@ class TAESD(nn.Module):
def __init__(self, encoder_path=None, decoder_path=None, latent_channels=4):
"""Initialize pretrained TAESD on the given device from the given checkpoints."""
super().__init__()
- self.taesd_encoder = Encoder(latent_channels=latent_channels)
- self.taesd_decoder = Decoder(latent_channels=latent_channels)
+ if latent_channels == 128:
+ encoder_class = EncoderFlux2
+ decoder_class = DecoderFlux2
+ else:
+ encoder_class = Encoder
+ decoder_class = Decoder
+ self.taesd_encoder = encoder_class(latent_channels=latent_channels)
+ self.taesd_decoder = decoder_class(latent_channels=latent_channels)
+
self.vae_scale = torch.nn.Parameter(torch.tensor(1.0))
self.vae_shift = torch.nn.Parameter(torch.tensor(0.0))
if encoder_path is not None:
@@ -61,19 +115,19 @@ class TAESD(nn.Module):
self.taesd_decoder.load_state_dict(comfy.utils.load_torch_file(decoder_path, safe_load=True))
@staticmethod
- def scale_latents(x):
+ def scale_latents(x: torch.Tensor) -> torch.Tensor:
"""raw latents -> [0, 1]"""
return x.div(2 * TAESD.latent_magnitude).add(TAESD.latent_shift).clamp(0, 1)
@staticmethod
- def unscale_latents(x):
+ def unscale_latents(x: torch.Tensor) -> torch.Tensor:
"""[0, 1] -> raw latents"""
return x.sub(TAESD.latent_shift).mul(2 * TAESD.latent_magnitude)
- def decode(self, x):
+ def decode(self, x: torch.Tensor) -> torch.Tensor:
x_sample = self.taesd_decoder((x - self.vae_shift) * self.vae_scale)
x_sample = x_sample.sub(0.5).mul(2)
return x_sample
- def encode(self, x):
+ def encode(self, x: torch.Tensor) -> torch.Tensor:
return (self.taesd_encoder(x * 0.5 + 0.5) / self.vae_scale) + self.vae_shift
diff --git a/comfy/text_encoders/cogvideo.py b/comfy/text_encoders/cogvideo.py
new file mode 100644
index 000000000..b97310709
--- /dev/null
+++ b/comfy/text_encoders/cogvideo.py
@@ -0,0 +1,48 @@
+import comfy.text_encoders.sd3_clip
+from comfy import sd1_clip
+
+
+class CogVideoXT5Tokenizer(comfy.text_encoders.sd3_clip.T5XXLTokenizer):
+ """Inner T5 tokenizer for CogVideoX.
+
+ CogVideoX was trained with T5 embeddings padded to 226 tokens (not 77 like SD3).
+ Used both directly by supported_models.CogVideoX_T2V.clip_target (paired with
+ the raw T5XXLModel) and by the CogVideoXTokenizer outer wrapper below.
+ """
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
+ super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, min_length=226)
+
+
+class CogVideoXTokenizer(sd1_clip.SD1Tokenizer):
+ """Outer tokenizer wrapper for CLIPLoader (type="cogvideox")."""
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
+ super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data,
+ clip_name="t5xxl", tokenizer=CogVideoXT5Tokenizer)
+
+
+class CogVideoXT5XXL(sd1_clip.SD1ClipModel):
+ """Outer T5XXL model wrapper for CLIPLoader (type="cogvideox").
+
+ Wraps the raw T5XXL model in the SD1ClipModel interface so that CLIP.__init__
+ (which reads self.dtypes) works correctly. The inner model is the standard
+ sd3_clip.T5XXLModel (no attention_mask change needed for CogVideoX).
+ """
+ def __init__(self, device="cpu", dtype=None, model_options={}):
+ super().__init__(device=device, dtype=dtype, name="t5xxl",
+ clip_model=comfy.text_encoders.sd3_clip.T5XXLModel,
+ model_options=model_options)
+
+
+def cogvideo_te(dtype_t5=None, t5_quantization_metadata=None):
+ """Factory that returns a CogVideoXT5XXL class configured with the detected
+ T5 dtype and optional quantization metadata, for use in load_text_encoder_state_dicts.
+ """
+ class CogVideoXTEModel_(CogVideoXT5XXL):
+ def __init__(self, device="cpu", dtype=None, model_options={}):
+ if t5_quantization_metadata is not None:
+ model_options = model_options.copy()
+ model_options["t5xxl_quantization_metadata"] = t5_quantization_metadata
+ if dtype_t5 is not None:
+ dtype = dtype_t5
+ super().__init__(device=device, dtype=dtype, model_options=model_options)
+ return CogVideoXTEModel_
diff --git a/comfy/text_encoders/ernie.py b/comfy/text_encoders/ernie.py
index 8c56c1c11..46d24d222 100644
--- a/comfy/text_encoders/ernie.py
+++ b/comfy/text_encoders/ernie.py
@@ -3,7 +3,7 @@ from comfy import sd1_clip
import comfy.text_encoders.llama
class Ministral3_3BTokenizer(Mistral3Tokenizer):
- def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='mistral3_24b', tokenizer_data={}):
+ def __init__(self, embedding_directory=None, embedding_size=5120, embedding_key='ministral3_3b', tokenizer_data={}):
return super().__init__(embedding_directory=embedding_directory, embedding_size=embedding_size, embedding_key=embedding_key, tokenizer_data=tokenizer_data)
class ErnieTokenizer(sd1_clip.SD1Tokenizer):
@@ -35,4 +35,4 @@ def te(dtype_llama=None, llama_quantization_metadata=None):
model_options = model_options.copy()
model_options["quantization_metadata"] = llama_quantization_metadata
super().__init__(device=device, dtype=dtype, model_options=model_options)
- return ErnieTEModel
+ return ErnieTEModel_
diff --git a/comfy/text_encoders/gemma4.py b/comfy/text_encoders/gemma4.py
new file mode 100644
index 000000000..f050061ed
--- /dev/null
+++ b/comfy/text_encoders/gemma4.py
@@ -0,0 +1,1298 @@
+import torch
+import torch.nn as nn
+import numpy as np
+from dataclasses import dataclass
+import math
+
+from comfy import sd1_clip
+import comfy.model_management
+from comfy.ldm.modules.attention import optimized_attention_for_device
+from comfy.rmsnorm import rms_norm
+from comfy.text_encoders.llama import RMSNorm, MLP, BaseLlama, BaseGenerate, _make_scaled_embedding
+
+
+# Intentional minor divergences from transformers -reference implementation:
+# - Embedding sqrt(hidden_size) scale applied as a Python scalar (full precision) instead of dtype-matched buffer tensor.
+# - RMSNorm uses torch fused F.rms_norm, very slight numerical differences, but considerably faster
+# - Input image and audio resizing/resampling slightly different numerically
+
+
+GEMMA4_VISION_CONFIG = {"hidden_size": 768, "image_size": 896, "intermediate_size": 3072, "num_attention_heads": 12, "num_hidden_layers": 16, "patch_size": 16, "head_dim": 64, "rms_norm_eps": 1e-6, "position_embedding_size": 10240, "pooling_kernel_size": 3}
+GEMMA4_VISION_31B_CONFIG = {"hidden_size": 1152, "image_size": 896, "intermediate_size": 4304, "num_attention_heads": 16, "num_hidden_layers": 27, "patch_size": 16, "head_dim": 72, "rms_norm_eps": 1e-6, "position_embedding_size": 10240, "pooling_kernel_size": 3}
+GEMMA4_AUDIO_CONFIG = {"hidden_size": 1024, "num_hidden_layers": 12, "num_attention_heads": 8, "intermediate_size": 4096, "conv_kernel_size": 5, "attention_chunk_size": 12, "attention_context_left": 13, "attention_context_right": 0, "attention_logit_cap": 50.0, "output_proj_dims": 1536, "rms_norm_eps": 1e-6, "residual_weight": 0.5}
+
+@dataclass
+class Gemma4Config:
+ vocab_size: int = 262144
+ hidden_size: int = 2560
+ intermediate_size: int = 10240
+ num_hidden_layers: int = 42
+ num_attention_heads: int = 8
+ num_key_value_heads: int = 2
+ max_position_embeddings: int = 131072
+ rms_norm_eps: float = 1e-6
+ rope_theta = [1000000.0, 10000.0]
+ transformer_type: str = "gemma4"
+ head_dim = 256
+ global_head_dim = 512
+ rms_norm_add = False
+ mlp_activation = "gelu_pytorch_tanh"
+ qkv_bias = False
+ rope_dims = None
+ q_norm = "gemma3"
+ k_norm = "gemma3"
+ sliding_attention = [512, 512, 512, 512, 512, False]
+ rope_scale = None
+ partial_rotary_factor: float = 0.25
+ final_norm: bool = True
+ lm_head: bool = False
+ final_logit_softcapping: float = 30.0
+ hidden_size_per_layer_input: int = 256
+ num_kv_shared_layers: int = 18
+ use_double_wide_mlp: bool = False
+ stop_tokens = [1, 50, 106]
+ vision_config = GEMMA4_VISION_CONFIG
+ audio_config = GEMMA4_AUDIO_CONFIG
+ mm_tokens_per_image = 280
+
+@dataclass
+class Gemma4_E2B_Config(Gemma4Config):
+ hidden_size: int = 1536
+ intermediate_size: int = 6144
+ num_hidden_layers: int = 35
+ num_key_value_heads: int = 1
+ sliding_attention = [512, 512, 512, 512, False]
+ num_kv_shared_layers: int = 20
+ use_double_wide_mlp: bool = True
+
+@dataclass
+class Gemma4_31B_Config(Gemma4Config):
+ hidden_size: int = 5376
+ intermediate_size: int = 21504
+ num_hidden_layers: int = 60
+ num_attention_heads: int = 32
+ num_key_value_heads: int = 16
+ sliding_attention = [1024, 1024, 1024, 1024, 1024, False]
+ hidden_size_per_layer_input: int = 0
+ num_kv_shared_layers: int = 0
+ audio_config = None
+ vision_config = GEMMA4_VISION_31B_CONFIG
+
+
+# unfused RoPE as addcmul_ RoPE diverges from reference code
+def _apply_rotary_pos_emb(x, freqs_cis):
+ cos, sin = freqs_cis[0], freqs_cis[1]
+ half = x.shape[-1] // 2
+ out = x * cos
+ out[..., :half] -= x[..., half:] * sin[..., :half]
+ out[..., half:] += x[..., :half] * sin[..., half:]
+ return out
+
+class Gemma4Attention(nn.Module):
+ def __init__(self, config, head_dim, device=None, dtype=None, ops=None):
+ super().__init__()
+ self.num_heads = config.num_attention_heads
+ self.num_kv_heads = config.num_key_value_heads
+ self.hidden_size = config.hidden_size
+ self.head_dim = head_dim
+ self.inner_size = self.num_heads * head_dim
+
+ self.q_proj = ops.Linear(config.hidden_size, self.inner_size, bias=config.qkv_bias, device=device, dtype=dtype)
+ self.k_proj = ops.Linear(config.hidden_size, self.num_kv_heads * head_dim, bias=config.qkv_bias, device=device, dtype=dtype)
+ self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * head_dim, bias=config.qkv_bias, device=device, dtype=dtype)
+ self.o_proj = ops.Linear(self.inner_size, config.hidden_size, bias=False, device=device, dtype=dtype)
+
+ self.q_norm = None
+ self.k_norm = None
+ if config.q_norm == "gemma3":
+ self.q_norm = RMSNorm(head_dim, eps=config.rms_norm_eps, device=device, dtype=dtype)
+ if config.k_norm == "gemma3":
+ self.k_norm = RMSNorm(head_dim, eps=config.rms_norm_eps, device=device, dtype=dtype)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask=None,
+ freqs_cis=None,
+ past_key_value=None,
+ sliding_window=None,
+ shared_kv=None,
+ ):
+ batch_size, seq_length, _ = hidden_states.shape
+
+ xq = self.q_proj(hidden_states)
+ xq = xq.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+ if self.q_norm is not None:
+ xq = self.q_norm(xq)
+
+ if shared_kv is not None:
+ xk, xv = shared_kv
+ # Apply RoPE to Q only (K already has RoPE from source layer)
+ xq = _apply_rotary_pos_emb(xq, freqs_cis)
+ present_key_value = None
+ shareable_kv = None
+ else:
+ xk = self.k_proj(hidden_states).view(batch_size, seq_length, self.num_kv_heads, self.head_dim)
+ xv = self.v_proj(hidden_states).view(batch_size, seq_length, self.num_kv_heads, self.head_dim)
+ if self.k_norm is not None:
+ xk = self.k_norm(xk)
+ xv = rms_norm(xv)
+ xk = xk.transpose(1, 2)
+ xv = xv.transpose(1, 2)
+ xq = _apply_rotary_pos_emb(xq, freqs_cis)
+ xk = _apply_rotary_pos_emb(xk, freqs_cis)
+
+ present_key_value = None
+ if past_key_value is not None:
+ cumulative_len = 0
+ if len(past_key_value) > 0:
+ past_key, past_value, cumulative_len = past_key_value
+ xk = torch.cat((past_key, xk), dim=2)
+ xv = torch.cat((past_value, xv), dim=2)
+ new_cumulative = cumulative_len + seq_length
+ if sliding_window is not None and xk.shape[2] > sliding_window - 1:
+ cache_k = xk[:, :, -(sliding_window - 1):]
+ cache_v = xv[:, :, -(sliding_window - 1):]
+ else:
+ cache_k = xk
+ cache_v = xv
+ present_key_value = (cache_k, cache_v, new_cumulative)
+
+ # KV for sharing: full xk/xv that SDPA sees (not evicted cache)
+ shareable_kv = (xk, xv)
+
+ # GQA: pass unexpanded KV with enable_gqa when no sliding mask,
+ # expand heads when sliding mask is present
+ # has to be done within SDPA itself to match the reference code, pre-scaling expansion causes numerical differences
+ expand_kv = (self.num_heads != self.num_kv_heads and
+ sliding_window is not None and
+ xk.shape[2] >= sliding_window)
+ if expand_kv:
+ xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
+ xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
+ gqa_kwargs = {} if expand_kv else ({"enable_gqa": True} if self.num_heads != self.num_kv_heads else {})
+ output = optimized_attention_for_device(xq.device, mask=attention_mask is not None, small_input=True)(xq, xk, xv, self.num_heads, mask=attention_mask, skip_reshape=True, scale=1.0, **gqa_kwargs)
+
+ return self.o_proj(output), present_key_value, shareable_kv
+
+
+class TransformerBlockGemma4(nn.Module):
+ def __init__(self, config, index, device=None, dtype=None, ops=None):
+ super().__init__()
+ if config.sliding_attention is not None:
+ self.sliding_attention = config.sliding_attention[index % len(config.sliding_attention)]
+ else:
+ self.sliding_attention = False
+
+ head_dim = config.head_dim if self.sliding_attention else config.global_head_dim
+
+ self.self_attn = Gemma4Attention(config, head_dim=head_dim, device=device, dtype=dtype, ops=ops)
+
+ num_kv_shared = config.num_kv_shared_layers
+ first_kv_shared = config.num_hidden_layers - num_kv_shared
+ mlp_size = config.intermediate_size * 2 if config.use_double_wide_mlp and index >= first_kv_shared else None
+ self.mlp = MLP(config, device=device, dtype=dtype, ops=ops, intermediate_size=mlp_size)
+
+ self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+ self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+ self.pre_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+ self.post_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+
+ self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+ if self.hidden_size_per_layer_input:
+ self.per_layer_input_gate = ops.Linear(config.hidden_size, self.hidden_size_per_layer_input, bias=False, device=device, dtype=dtype)
+ self.per_layer_projection = ops.Linear(self.hidden_size_per_layer_input, config.hidden_size, bias=False, device=device, dtype=dtype)
+ self.post_per_layer_input_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+ self.register_buffer("layer_scalar", torch.ones(1, device=device, dtype=dtype))
+ else:
+ self.layer_scalar = None
+
+ def forward(self, x, attention_mask=None, freqs_cis=None, past_key_value=None, per_layer_input=None, shared_kv=None):
+ sliding_window = None
+ if self.sliding_attention:
+ sliding_window = self.sliding_attention
+ # For prefill > sliding window, add sliding window restriction to the causal mask.
+ if x.shape[1] > self.sliding_attention:
+ sw_mask = torch.zeros(x.shape[1], x.shape[1], dtype=x.dtype, device=x.device)
+ sw_mask.masked_fill_(torch.ones_like(sw_mask, dtype=torch.bool).tril_(-self.sliding_attention), torch.finfo(x.dtype).min)
+ attention_mask = attention_mask + sw_mask if attention_mask is not None else sw_mask
+ freqs_cis = freqs_cis[1]
+ else:
+ freqs_cis = freqs_cis[0]
+
+ residual = x
+ x = self.input_layernorm(x)
+ x, present_key_value, shareable_kv = self.self_attn(
+ hidden_states=x, attention_mask=attention_mask, freqs_cis=freqs_cis,
+ past_key_value=past_key_value, sliding_window=sliding_window, shared_kv=shared_kv,
+ )
+ x = self.post_attention_layernorm(x)
+ x = residual + x
+
+ residual = x
+ x = self.pre_feedforward_layernorm(x)
+ x = self.mlp(x)
+ x = self.post_feedforward_layernorm(x)
+ x = residual + x
+
+ if self.hidden_size_per_layer_input and per_layer_input is not None:
+ residual = x
+ x = self.per_layer_input_gate(x)
+ x = torch.nn.functional.gelu(x, approximate="tanh")
+ x = x * per_layer_input
+ x = self.per_layer_projection(x)
+ x = self.post_per_layer_input_norm(x)
+ x = residual + x
+
+ if self.layer_scalar is not None:
+ x = x * self.layer_scalar
+
+ return x, present_key_value, shareable_kv
+
+
+class Gemma4Transformer(nn.Module):  # Gemma4 text stack: scaled token embedding -> TransformerBlockGemma4 layers -> optional final RMSNorm
+ def __init__(self, config, device=None, dtype=None, ops=None):  # builds embeddings, layers, RoPE buffers and the optional per-layer-input modules
+ super().__init__()
+ self.config = config
+
+ self.embed_tokens = _make_scaled_embedding(ops, config.vocab_size, config.hidden_size, config.hidden_size ** 0.5, device, dtype)  # sqrt(hidden_size) is handed to the helper, presumably as the embedding scale — confirm in _make_scaled_embedding
+
+ self.layers = nn.ModuleList([
+ TransformerBlockGemma4(config, index=i, device=device, dtype=dtype, ops=ops)  # every block is told its own layer index
+ for i in range(config.num_hidden_layers)
+ ])
+
+ self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype) if config.final_norm else None  # final norm is optional; None when config.final_norm is falsy
+
+ # Precompute RoPE inv_freq on CPU to match reference code's exact value
+ rope_angles_global = int(config.partial_rotary_factor * config.global_head_dim // 2)  # number of frequency slots that actually rotate on global layers
+ nope_global = config.global_head_dim // 2 - rope_angles_global  # remaining slots of the half-dimension get no rotation
+ global_inv = 1.0 / (config.rope_theta[0] ** (torch.arange(0, 2 * rope_angles_global, 2).float() / config.global_head_dim))  # rope_theta[0] is the base used for global layers
+ if nope_global > 0:
+ global_inv = torch.cat([global_inv, torch.zeros(nope_global)])  # zero frequency => angle 0 => cos 1 / sin 0 for the unrotated slots
+ self.register_buffer("_global_inv_freq", global_inv, persistent=False)  # non-persistent buffer: excluded from the state dict
+
+ sliding_inv = 1.0 / (config.rope_theta[1] ** (torch.arange(0, config.head_dim, 2).float() / config.head_dim))  # rope_theta[1] is the base used for sliding layers, over the full head_dim
+ self.register_buffer("_sliding_inv_freq", sliding_inv, persistent=False)
+
+ # Per-layer input mechanism
+ self.hidden_size_per_layer_input = config.hidden_size_per_layer_input
+ if self.hidden_size_per_layer_input:  # the three modules below exist only when the per-layer width is non-zero
+ self.embed_tokens_per_layer = _make_scaled_embedding(ops, config.vocab_size, config.num_hidden_layers * self.hidden_size_per_layer_input, self.hidden_size_per_layer_input ** 0.5, device, dtype)  # one embedding row holds the inputs for all layers, concatenated
+ self.per_layer_model_projection = ops.Linear(
+ config.hidden_size, config.num_hidden_layers * self.hidden_size_per_layer_input,
+ bias=False, device=device, dtype=dtype)
+ self.per_layer_projection_norm = RMSNorm(
+ self.hidden_size_per_layer_input, eps=config.rms_norm_eps,
+ device=device, dtype=dtype)
+
+ def get_past_len(self, past_key_values):  # returns element [2] of the first cache entry holding at least 3 items, else 0
+ for kv in past_key_values:
+ if len(kv) >= 3:
+ return kv[2]  # presumably the cached sequence length — confirm against the attention block that fills the cache
+ return 0
+
+ def _freqs_from_inv(self, inv_freq, position_ids, device, dtype):
+ """Compute cos/sin from stored inv_freq"""
+ inv_exp = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(device)  # [batch, n_freq, 1]
+ pos_exp = position_ids[:, None, :].float()  # [batch, 1, seq] for 2D position_ids
+ freqs = (inv_exp @ pos_exp).transpose(1, 2)  # outer product per batch -> [batch, seq, n_freq]
+ emb = torch.cat((freqs, freqs), dim=-1)  # angles duplicated so the table spans 2 * n_freq
+ return emb.cos().unsqueeze(1).to(dtype), emb.sin().unsqueeze(1).to(dtype)  # (cos, sin), each with an extra axis at dim 1
+
+ def compute_freqs_cis(self, position_ids, device, dtype=None):  # returns [global (cos, sin), sliding (cos, sin)]
+ global_freqs = self._freqs_from_inv(self._global_inv_freq, position_ids, device, dtype)
+ sliding_freqs = self._freqs_from_inv(self._sliding_inv_freq, position_ids, device, dtype)
+ return [global_freqs, sliding_freqs]
+
+ def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None,  # NOTE: num_tokens, final_layer_norm_intermediate and embeds_info are accepted but never read in this method
+ final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=None,
+ past_key_values=None, input_ids=None):  # returns (x, intermediate, next_key_values) when there is at least one layer, else (x, intermediate)
+ if embeds is not None:
+ x = embeds  # precomputed embeddings take precedence over token ids
+ else:
+ x = self.embed_tokens(x, out_dtype=dtype)
+
+ seq_len = x.shape[1]
+ past_len = 0
+ if past_key_values is not None and len(past_key_values) > 0:
+ past_len = self.get_past_len(past_key_values)
+
+ if position_ids is None:
+ position_ids = torch.arange(past_len, past_len + seq_len, device=x.device).unsqueeze(0)  # default positions continue right after the cached prefix
+
+ freqs_cis = self.compute_freqs_cis(position_ids, x.device, dtype=x.dtype)
+
+ mask = None
+ min_val = torch.finfo(x.dtype).min  # most negative finite value of the working dtype, used as the additive mask value
+ if attention_mask is not None:
+ mask = 1.0 - attention_mask.to(x.dtype).reshape((attention_mask.shape[0], 1, -1, attention_mask.shape[-1])).expand(attention_mask.shape[0], 1, seq_len, attention_mask.shape[-1])  # inverted mask (1 where attention_mask is 0), expanded to seq_len query rows
+ mask = mask.masked_fill(mask.to(torch.bool), min_val)  # non-zero (masked) entries -> min_val, kept entries stay 0
+
+ if seq_len > 1:  # single-token steps get no causal mask
+ causal_mask = torch.zeros(past_len + seq_len, past_len + seq_len, dtype=x.dtype, device=x.device)
+ causal_mask.masked_fill_(torch.ones_like(causal_mask, dtype=torch.bool).triu_(1), min_val)  # strictly-upper triangle (future positions) is blocked
+ mask = mask + causal_mask if mask is not None else causal_mask  # NOTE(review): causal_mask has past_len + seq_len rows while mask has seq_len rows; they only broadcast when past_len == 0 — confirm multi-token steps with a cache
+
+ # Per-layer inputs
+ per_layer_inputs = None
+ if self.hidden_size_per_layer_input:
+ num_layers = self.config.num_hidden_layers
+ hpl = self.hidden_size_per_layer_input
+ per_layer_proj = self.per_layer_model_projection(x) * (1.0 / (self.config.hidden_size ** 0.5))  # projection of the embeddings, scaled by 1/sqrt(hidden_size)
+ per_layer_proj = self.per_layer_projection_norm(per_layer_proj.reshape(*x.shape[:-1], num_layers, hpl))  # split into [..., num_layers, hpl] before the norm
+ if input_ids is not None and input_ids.shape[1] == x.shape[1]:  # the token lookup is used only when the ids line up with the sequence
+ per_layer_emb = self.embed_tokens_per_layer(input_ids).reshape(*input_ids.shape, num_layers, hpl)
+ per_layer_inputs = (per_layer_proj + per_layer_emb) * (0.5 ** 0.5)  # sum of both sources scaled by sqrt(0.5)
+ else:
+ per_layer_inputs = per_layer_proj  # no usable ids: projection only
+
+ # KV sharing: later layers reuse KV from the last non-shared sliding/global layer
+ num_kv_shared = self.config.num_kv_shared_layers
+ first_kv_shared = self.config.num_hidden_layers - num_kv_shared if num_kv_shared > 0 else self.config.num_hidden_layers  # index of the first sharing layer; equals num_hidden_layers when sharing is off
+ shared_sliding_kv = None # KV from last non-shared sliding layer
+ shared_global_kv = None # KV from last non-shared global layer
+
+ intermediate = None
+ next_key_values = []
+ for i, layer in enumerate(self.layers):
+ past_kv = past_key_values[i] if past_key_values is not None and len(past_key_values) > 0 else None  # this layer's cache entry, or None without a cache
+
+ layer_kwargs = {}
+ if per_layer_inputs is not None:
+ layer_kwargs['per_layer_input'] = per_layer_inputs[:, :, i, :]  # slice out layer i's input
+
+ is_sliding = hasattr(layer, 'sliding_attention') and layer.sliding_attention  # layers without the attribute are treated as global
+ if i >= first_kv_shared and num_kv_shared > 0:
+ shared = shared_sliding_kv if is_sliding else shared_global_kv  # pick the donor KV of the matching attention type
+ if shared is not None:
+ layer_kwargs['shared_kv'] = shared
+
+ x, current_kv, shareable_kv = layer(x=x, attention_mask=mask, freqs_cis=freqs_cis, past_key_value=past_kv, **layer_kwargs)
+
+ next_key_values.append(current_kv if current_kv is not None else ())  # () placeholder keeps the list aligned with layer indices
+
+ # Only track the last sliding/global before the sharing boundary
+ if i < first_kv_shared and shareable_kv is not None:
+ if is_sliding:
+ shared_sliding_kv = shareable_kv
+ else:
+ shared_global_kv = shareable_kv
+
+ if i == intermediate_output:  # exact index comparison; never true when intermediate_output is None
+ intermediate = x.clone()  # snapshot taken before the final norm
+
+ if self.norm is not None:
+ x = self.norm(x)
+
+ if len(next_key_values) > 0:  # one entry is appended per layer, so this holds whenever the model has layers
+ return x, intermediate, next_key_values
+ return x, intermediate
+
+
+class Gemma4Base(BaseLlama, BaseGenerate, torch.nn.Module):
+ """Common base for all Gemma4 variants: text model + vision."""
+ def _init_model(self, config, dtype, device, operations):  # creates the text transformer, the vision encoder and the vision->text projector
+ self.num_layers = config.num_hidden_layers
+ self.model = Gemma4Transformer(config, device=device, dtype=dtype, ops=operations)
+ self.dtype = dtype
+ self.multi_modal_projector = Gemma4MultiModalProjector(config, dtype=dtype, device=device, ops=operations)
+ self.vision_model = Gemma4VisionEncoder(config.vision_config, dtype=dtype, device=device, ops=operations)  # vision_config is indexed like a dict inside the encoder
+
+ def logits(self, x):  # parent logits with optional tanh soft-capping
+ logits = super().logits(x)
+ cap = self.model.config.final_logit_softcapping
+ if cap:  # a falsy cap (None / 0) disables soft-capping
+ logits = cap * torch.tanh(logits / cap)  # squashes logits into (-cap, cap)
+ return logits
+
+ def init_kv_cache(self, batch, max_cache_len, device, execution_dtype):  # none of the four arguments is used: the cache starts as empty tuples
+ past_key_values = []
+ for _ in range(self.model.config.num_hidden_layers):
+ past_key_values.append(())  # one empty entry per layer
+ return past_key_values
+
+ def preprocess_embed(self, embed, device):  # returns (embedding, None) for image embeds, (None, None) for anything else
+ if embed["type"] == "image":
+ image = embed.pop("data").movedim(-1, 1) # [B, H, W, C] -> [B, C, H, W]
+ max_soft_tokens = embed.get("max_soft_tokens", None)
+ vision_out = self.vision_model(image.to(device, dtype=torch.float32), max_soft_tokens=max_soft_tokens)  # pixels are always passed to the vision encoder as float32
+ return self.multi_modal_projector(vision_out), None
+ return None, None  # lets callers such as Gemma4AudioMixin.preprocess_embed handle other embed types
+
+
+class Gemma4AudioMixin:
+ """Adds audio support to a Gemma4 model."""
+ def _init_audio(self, config, dtype, device, operations):  # creates the audio encoder and the audio->text projector
+ self.audio_model = Gemma4AudioEncoder(config.audio_config, dtype=dtype, device=device, ops=operations)
+ self.audio_projector = Gemma4AudioProjector({"audio_output_proj_dims": config.audio_config["output_proj_dims"], "text_hidden_size": config.hidden_size, "rms_norm_eps": config.rms_norm_eps}, dtype=dtype, device=device, ops=operations)  # the projector takes a plain dict assembled here from both configs
+
+ def preprocess_embed(self, embed, device):  # tries the next class in the MRO first, then handles "audio" embeds
+ result, extra = super().preprocess_embed(embed, device)
+ if result is not None:
+ return result, extra  # already handled upstream
+ if embed["type"] == "audio":
+ audio = embed.pop("data").to(device, dtype=torch.float32)  # pop removes "data" from the caller's dict
+ audio_mask = embed.pop("mask", None)  # the mask is optional
+ if audio_mask is not None:
+ audio_mask = audio_mask.to(device)
+ audio_out = self.audio_model(audio, audio_mask=audio_mask)
+ return self.audio_projector(audio_out), None
+ return None, None  # unknown embed type
+
+
+# Vision Encoder
+
+def _compute_vision_2d_rope(head_dim, pixel_position_ids, theta=100.0, device=None):  # NOTE(review): positions are not moved to `device`; with device=None inv_freq lands on the default device, so this presumably relies on callers passing the positions' own device — confirm
+ """Compute 2D RoPE for vision: separate frequencies for x and y dimensions.
+
+ Args:
+ head_dim: dimension per head (e.g. 64)
+ pixel_position_ids: [batch, num_patches, 2] with (x, y) coords
+ theta: RoPE base frequency
+ Returns:
+ (cos, sin) each of shape [batch, num_patches, head_dim]
+ """
+ rotary_dim_per_axis = head_dim // 2  # each axis owns half of the head dimension
+ freq_indices = torch.arange(0, rotary_dim_per_axis, 2, device=device).float()
+ inv_freq = 1.0 / (theta ** (freq_indices / rotary_dim_per_axis))  # the same frequency table is shared by both axes
+
+ all_cos, all_sin = [], []
+ for i in range(2): # x and y
+ dim_positions = pixel_position_ids[:, :, i].float() # [batch, num_patches]
+ freqs = torch.einsum('bi,j->bij', dim_positions, inv_freq.to(device)) # [batch, num_patches, rotary_dim/2]
+ emb = torch.cat([freqs, freqs], dim=-1) # [batch, num_patches, rotary_dim]
+ all_cos.append(emb.cos())
+ all_sin.append(emb.sin())
+
+ cos = torch.cat(all_cos, dim=-1).to(pixel_position_ids.device) # [batch, num_patches, head_dim]
+ sin = torch.cat(all_sin, dim=-1).to(pixel_position_ids.device)
+ return cos, sin  # last dim is laid out as [x-axis part | y-axis part]
+
+
+def _apply_vision_2d_rope(x, freqs):
+ """Apply 2D RoPE (multidimensional) to vision query/key states.
+
+ Splits x and cos/sin into ndim=2 parts, applies 1D RoPE to each independently.
+
+ x: [batch, heads, seq, head_dim]
+ freqs: (cos, sin) each [batch, seq, head_dim]
+ """
+ cos = freqs[0].unsqueeze(1) # [batch, 1, seq, head_dim]
+ sin = freqs[1].unsqueeze(1)
+ half = x.shape[-1] // 2  # split point between the two axis halves
+ a = _apply_rotary_pos_emb(x[..., :half], (cos[..., :half], sin[..., :half]))  # first half of the features uses the first half of the tables
+ b = _apply_rotary_pos_emb(x[..., half:], (cos[..., half:], sin[..., half:]))  # second half uses the second half of the tables
+ return torch.cat([a, b], dim=-1)  # output keeps the shape of x
+
+
+class ClippedLinear(nn.Module):
+ """Linear layer with activation clipping (from quantization-aware training).
+
+ Stores input_max/min and output_max/min as buffers loaded from checkpoint.
+ """
+ def __init__(self, in_features, out_features, bias=False, device=None, dtype=None, ops=None):
+ super().__init__()
+ self.linear = ops.Linear(in_features, out_features, bias=bias, device=device, dtype=dtype)  # wrapped layer, so its parameters sit under the "linear." prefix
+ self.register_buffer('input_max', torch.tensor(float('inf'), device=device, dtype=dtype))  # +/-inf defaults make both clamps no-ops unless other bounds are loaded
+ self.register_buffer('input_min', torch.tensor(float('-inf'), device=device, dtype=dtype))
+ self.register_buffer('output_max', torch.tensor(float('inf'), device=device, dtype=dtype))
+ self.register_buffer('output_min', torch.tensor(float('-inf'), device=device, dtype=dtype))
+
+ @property
+ def weight(self):  # exposes the wrapped layer's weight so callers can read .weight directly
+ return self.linear.weight
+
+ def forward(self, x):  # clamp input -> linear -> clamp output
+ x = x.clamp(min=self.input_min, max=self.input_max)  # out-of-place: the caller's tensor is not modified
+ x = self.linear(x)
+ return x.clamp_(min=self.output_min, max=self.output_max)  # in-place clamp on the tensor returned by the linear layer
+
+
+class Gemma4VisionMLP(nn.Module):
+ """SwiGLU MLP matching gate_proj/up_proj/down_proj structure."""
+ def __init__(self, config, device=None, dtype=None, ops=None):  # config is indexed like a dict
+ super().__init__()
+ hidden_size = config["hidden_size"]
+ intermediate_size = config["intermediate_size"]
+ self.gate_proj = ClippedLinear(hidden_size, intermediate_size, device=device, dtype=dtype, ops=ops)
+ self.up_proj = ClippedLinear(hidden_size, intermediate_size, device=device, dtype=dtype, ops=ops)
+ self.down_proj = ClippedLinear(intermediate_size, hidden_size, device=device, dtype=dtype, ops=ops)
+
+ def forward(self, x):
+ return self.down_proj(torch.nn.functional.gelu(self.gate_proj(x), approximate="tanh") * self.up_proj(x))  # NOTE(review): the gate activation is tanh-approximated GELU, although the class docstring says "SwiGLU"
+
+
+class Gemma4VisionAttention(nn.Module):  # vision self-attention with clipped projections, q/k RMSNorm and 2D RoPE
+ def __init__(self, config, device=None, dtype=None, ops=None):
+ super().__init__()
+ self.hidden_size = config["hidden_size"]
+ self.num_heads = config["num_attention_heads"]
+ self.head_dim = config.get("head_dim", self.hidden_size // self.num_heads)  # falls back to hidden_size / num_heads when the config omits head_dim
+
+ self.q_proj = ClippedLinear(self.hidden_size, self.num_heads * self.head_dim, device=device, dtype=dtype, ops=ops)
+ self.k_proj = ClippedLinear(self.hidden_size, self.num_heads * self.head_dim, device=device, dtype=dtype, ops=ops)
+ self.v_proj = ClippedLinear(self.hidden_size, self.num_heads * self.head_dim, device=device, dtype=dtype, ops=ops)
+ self.o_proj = ClippedLinear(self.num_heads * self.head_dim, self.hidden_size, device=device, dtype=dtype, ops=ops)
+
+ self.q_norm = RMSNorm(self.head_dim, eps=config["rms_norm_eps"], device=device, dtype=dtype)  # norms act on the per-head dimension
+ self.k_norm = RMSNorm(self.head_dim, eps=config["rms_norm_eps"], device=device, dtype=dtype)
+
+ def forward(self, x, freqs, attention_mask=None):  # x: [batch, seq, hidden]; freqs: (cos, sin) tables as consumed by _apply_vision_2d_rope
+ batch_size, seq_length, _ = x.shape
+
+ xq = self.q_proj(x).view(batch_size, seq_length, self.num_heads, self.head_dim)  # [B, S, H, D]
+ xk = self.k_proj(x).view(batch_size, seq_length, self.num_heads, self.head_dim)
+ xv = self.v_proj(x).view(batch_size, seq_length, self.num_heads, self.head_dim)
+
+ xq = self.q_norm(xq).transpose(1, 2)  # normalised, then moved to [B, H, S, D]
+ xk = self.k_norm(xk).transpose(1, 2)
+ xv = rms_norm(xv)  # values go through the rms_norm helper with no weight argument passed here
+
+ xq = _apply_vision_2d_rope(xq, freqs)  # positional rotation is applied to queries and keys only
+ xk = _apply_vision_2d_rope(xk, freqs)
+
+ xv = xv.to(xq.dtype).transpose(1, 2)  # match the q/k dtype and layout
+
+ output = optimized_attention_for_device(xq.device, mask=attention_mask is not None, small_input=True)(xq, xk, xv, self.num_heads, mask=attention_mask, skip_reshape=True, scale=1.0)  # scale=1.0 is passed explicitly; skip_reshape=True because q/k/v are already split per head
+ return self.o_proj(output)
+
+
+class Gemma4VisionLayer(nn.Module):  # one vision block: attention and MLP, each wrapped in a pre-norm, a post-norm and a residual add
+ def __init__(self, config, device=None, dtype=None, ops=None):
+ super().__init__()
+ self.self_attn = Gemma4VisionAttention(config, device=device, dtype=dtype, ops=ops)
+ self.mlp = Gemma4VisionMLP(config, device=device, dtype=dtype, ops=ops)
+ norm_kwargs = dict(eps=config["rms_norm_eps"], device=device, dtype=dtype)  # shared by the four norms below
+ hidden = config["hidden_size"]
+ self.input_layernorm = RMSNorm(hidden, **norm_kwargs)
+ self.post_attention_layernorm = RMSNorm(hidden, **norm_kwargs)
+ self.pre_feedforward_layernorm = RMSNorm(hidden, **norm_kwargs)
+ self.post_feedforward_layernorm = RMSNorm(hidden, **norm_kwargs)
+
+ def forward(self, x, freqs, attention_mask=None):  # returns a tensor with the same shape as x
+ residual = x
+ x = self.input_layernorm(x)
+ x = self.self_attn(x, freqs, attention_mask=attention_mask)
+ x = self.post_attention_layernorm(x)  # the branch output is normalised before the residual add
+ x = residual + x
+
+ residual = x
+ x = self.pre_feedforward_layernorm(x)
+ x = self.mlp(x)
+ x = self.post_feedforward_layernorm(x)
+ x = residual + x
+ return x
+
+
+class Gemma4PatchEmbedder(nn.Module):
+ """Patch embedding with learned 2D position embeddings via one-hot lookup."""
+ def __init__(self, config, device=None, dtype=None, ops=None):
+ super().__init__()
+ hidden_size = config["hidden_size"]
+ patch_size = config["patch_size"]
+ self.patch_size = patch_size
+ self.position_embedding_size = config.get("position_embedding_size", 10240)  # number of rows per axis table; 10240 when the config omits it
+
+ self.input_proj = ops.Linear(3 * patch_size * patch_size, hidden_size, bias=False, device=device, dtype=dtype)  # flattened 3-channel patch -> hidden
+ self.position_embedding_table = nn.Parameter(
+ torch.empty(2, self.position_embedding_size, hidden_size, device=device, dtype=dtype)  # one table per axis; torch.empty leaves it uninitialised here
+ )
+
+ def forward(self, patches, pixel_position_ids):
+ """
+ patches: [B, num_patches, 3*patch_size²] in [0,1] range (normalized to [-1,1] inside, matching HF)
+ pixel_position_ids: [B, num_patches, 2] with (x,y) positions, (-1,-1) for padding
+ """
+ hidden_states = self.input_proj((2.0 * (patches - 0.5)).to(self.input_proj.weight.dtype))  # rescale [0,1] -> [-1,1], then cast to the projection weight dtype
+
+ clamped_positions = pixel_position_ids.clamp(min=0)  # -1 padding ids become 0 so the lookup stays in range
+ pos_table = comfy.model_management.cast_to_device(self.position_embedding_table, hidden_states.device, hidden_states.dtype)
+ position_embeddings = pos_table[0][clamped_positions[..., 0]] + pos_table[1][clamped_positions[..., 1]]  # table 0 is indexed by coordinate 0, table 1 by coordinate 1; the two are summed
+
+ # Zero out position embeddings for padding patches (matching HF)
+ padding_positions = (pixel_position_ids == -1).all(dim=-1)  # True only where both coordinates are -1
+ position_embeddings = torch.where(padding_positions.unsqueeze(-1), 0.0, position_embeddings)
+
+ return hidden_states + position_embeddings
+
+
+class Gemma4VisionEncoderLayers(nn.Module):
+ """Wrapper to produce state dict keys as encoder.layers.X.*"""
+ def __init__(self, config, dtype=None, device=None, ops=None):  # no forward(): the owning encoder iterates over .layers itself
+ super().__init__()
+ self.layers = nn.ModuleList([
+ Gemma4VisionLayer(config, device=device, dtype=dtype, ops=ops)
+ for _ in range(config["num_hidden_layers"])
+ ])
+
+
+class Gemma4VisionEncoder(nn.Module):  # patchify -> embed -> transformer layers -> k x k average pooling -> scale by sqrt(hidden_size)
+ def __init__(self, config, dtype=None, device=None, ops=None):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config["hidden_size"]
+ self.head_dim = config.get("head_dim", config["hidden_size"] // config["num_attention_heads"])  # same fallback as Gemma4VisionAttention
+ self.patch_size = config["patch_size"]
+ self.pooling_kernel_size = config.get("pooling_kernel_size", 3)  # side length k of the pooling window; 3 when the config omits it
+ self.root_hidden_size = self.hidden_size ** 0.5  # output scale applied at the end of forward()
+
+ self.patch_embedder = Gemma4PatchEmbedder(config, device=device, dtype=dtype, ops=ops)
+ self.encoder = Gemma4VisionEncoderLayers(config, dtype=dtype, device=device, ops=ops)
+
+ def forward(self, pixel_values, max_soft_tokens=None):
+ """
+ pixel_values: [B, C, H, W] in [0,1] range
+ max_soft_tokens: if provided, pad to max_soft_tokens * k² total patches
+ """
+ batch_size, _, height, width = pixel_values.shape
+ ps = self.patch_size
+ k = self.pooling_kernel_size
+ patches_h, patches_w = height // ps, width // ps  # assumes H and W are multiples of patch_size — the reshape below depends on it; TODO confirm the caller guarantees this
+ num_patches = patches_h * patches_w
+ output_length = max_soft_tokens if max_soft_tokens is not None else num_patches // (k * k)  # number of pooled output slots
+ n_padding = output_length * k * k - num_patches  # NOTE(review): negative when max_soft_tokens is too small for the image; that case is not handled here
+
+ # Patchify and build position grid
+ patches = pixel_values.reshape(batch_size, -1, patches_h, ps, patches_w, ps)  # [B, C, ph, ps, pw, ps]
+ patches = patches.permute(0, 2, 4, 3, 5, 1).reshape(batch_size, num_patches, -1)  # row-major patch order; each patch flattened as (row, col, channel)
+ grid_y, grid_x = torch.meshgrid(torch.arange(patches_h, device=pixel_values.device), torch.arange(patches_w, device=pixel_values.device), indexing='ij')
+ position_ids = torch.stack([grid_x.flatten(), grid_y.flatten()], dim=-1).unsqueeze(0).expand(batch_size, -1, -1)  # [B, num_patches, 2] holding (x, y)
+
+ # Append zero-pixel padding with (-1,-1) positions
+ if n_padding > 0:
+ patches = torch.cat([patches, patches.new_zeros(batch_size, n_padding, patches.shape[-1])], dim=1)
+ position_ids = torch.cat([position_ids, position_ids.new_full((batch_size, n_padding, 2), -1)], dim=1)
+
+ padding = (position_ids == -1).all(dim=-1)  # [B, N] bool, True for padding patches
+
+ # Embed, encode, pool
+ x = self.patch_embedder(patches, position_ids)
+ freqs = _compute_vision_2d_rope(self.head_dim, position_ids, device=pixel_values.device)
+ freqs = tuple(t.to(x.dtype) for t in freqs)  # cos/sin tables cast to the activation dtype
+ if n_padding > 0:
+ mask = padding.unsqueeze(1).unsqueeze(2).expand(-1, 1, position_ids.shape[1], -1)  # [B, 1, N, N]: padded key columns flagged for every query row
+ mask = torch.zeros_like(mask, dtype=x.dtype).masked_fill_(mask, torch.finfo(x.dtype).min)  # additive mask: 0 for real keys, dtype minimum for padded keys
+ else:
+ mask = None
+
+ for layer in self.encoder.layers:
+ x = layer(x, freqs, attention_mask=mask)
+
+ if n_padding > 0:
+ x = x.masked_fill(padding.unsqueeze(-1), 0.0)  # padded rows are zeroed so they add nothing to the pooled sums
+
+ # Average pool by spatial position
+ clamped = position_ids.clamp(min=0)  # padding ids (-1, -1) become (0, 0)
+ max_x = clamped[:, :, 0].max(dim=-1, keepdim=True)[0] + 1  # largest x index + 1, i.e. grid width in patches
+ ki = torch.div(clamped, k, rounding_mode="floor")  # pooled-cell coordinates (x // k, y // k)
+ ki = ki[:, :, 0] + (max_x // k) * ki[:, :, 1]  # flattened cell index = x // k + (width // k) * (y // k)
+ weights = torch.nn.functional.one_hot(ki.long(), output_length).float() / (k * k)  # [B, N, output_length]; every patch carries weight 1/k² in its cell
+ x = (weights.transpose(1, 2) @ x.float()).to(x.dtype)  # weighted sum computed in float32, then cast back
+
+ # Strip empty output tokens
+ valid_out = ~((weights == 0).all(dim=1))  # [B, output_length]: True for slots that received at least one patch
+ if valid_out.any() and not valid_out.all():
+ x = x[:, valid_out[0]] if batch_size > 1 else x[valid_out].unsqueeze(0)  # NOTE(review): for batch_size > 1 the first sample's validity mask is applied to every sample
+
+ return x * self.root_hidden_size
+
+
+class Gemma4RMSNormProjector(nn.Module):
+ """Shared projector: parameterless RMSNorm → linear. Used for both vision and audio."""
+ def __init__(self, in_dim, out_dim, dtype=None, device=None, ops=None):
+ super().__init__()
+ self.embedding_projection = ops.Linear(in_dim, out_dim, bias=False, device=device, dtype=dtype)  # the only parameter-bearing module of the projector
+
+ def forward(self, x):
+ return self.embedding_projection(rms_norm(x))  # rms_norm is called without a weight argument
+
+
+class Gemma4MultiModalProjector(Gemma4RMSNormProjector):  # vision -> text projector; only fixes the in/out widths of the shared projector
+ def __init__(self, config, dtype=None, device=None, ops=None):
+ super().__init__(config.vision_config["hidden_size"], config.hidden_size, dtype=dtype, device=device, ops=ops)  # vision hidden size in, text hidden size out
+
+
+# Audio Encoder
+
+class Gemma4AudioConvSubsampler(nn.Module):
+ """2D convolution subsampling for audio features"""
+ def __init__(self, config, device=None, dtype=None, ops=None):
+ super().__init__()
+ eps = config["rms_norm_eps"]
+ self.layer0 = nn.ModuleDict({
+ 'conv': ops.Conv2d(1, 128, kernel_size=3, stride=2, padding=1, bias=False, device=device, dtype=dtype),  # stride 2 on both spatial axes
+ 'norm': ops.LayerNorm(128, eps=eps, elementwise_affine=True, bias=False, device=device, dtype=dtype),  # normalises over the 128 channels (applied channels-last in _conv_layer)
+ })
+ self.layer1 = nn.ModuleDict({
+ 'conv': ops.Conv2d(128, 32, kernel_size=3, stride=2, padding=1, bias=False, device=device, dtype=dtype),
+ 'norm': ops.LayerNorm(32, eps=eps, elementwise_affine=True, bias=False, device=device, dtype=dtype),
+ })
+ # proj_input_dim = (128 // 4) * 32 = 1024
+ self.input_proj_linear = ops.Linear(1024, config["hidden_size"], bias=False, device=device, dtype=dtype)  # the input width 1024 is hard-coded
+
+ def _conv_layer(self, x, layer, mask):  # one masked conv -> LayerNorm -> ReLU stage; returns (features, subsampled mask)
+ if mask is not None:
+ x = x * mask[:, None, :, None].to(x.device)  # mask is broadcast over dims 1 and 3 and multiplied in before the conv
+ x = layer['conv'](x.to(layer['conv'].weight.dtype))
+ x = torch.relu(layer['norm'](x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2).contiguous())  # channels-last for the LayerNorm, then back to channels-first
+ if mask is not None:
+ mask = mask[:, ::2]  # keep every second mask entry to follow the stride-2 conv
+ return x, mask
+
+ def forward(self, x, mask=None):  # returns (projected features, mask subsampled twice)
+ x = x.unsqueeze(1)  # add the single input channel expected by layer0's conv
+ x, mask = self._conv_layer(x, self.layer0, mask)
+ x, mask = self._conv_layer(x, self.layer1, mask)
+ batch_size, _, seq_len, _ = x.shape
+ x = x.permute(0, 2, 3, 1).contiguous().reshape(batch_size, seq_len, -1)  # fold dims 3 and 1 (channels) into one feature vector per step of dim 2
+ return self.input_proj_linear(x), mask
+
+
+class Gemma4AudioFeedForward(nn.Module):
+ """Conformer feed-forward with residual scaling."""
+ def __init__(self, config, device=None, dtype=None, ops=None):
+ super().__init__()
+ hidden_size = config["hidden_size"]
+ intermediate_size = config.get("intermediate_size", hidden_size * 4)  # 4 * hidden_size when the config omits it
+ self.pre_layer_norm = RMSNorm(hidden_size, eps=config["rms_norm_eps"], device=device, dtype=dtype)
+ self.ffw_layer_1 = ClippedLinear(hidden_size, intermediate_size, device=device, dtype=dtype, ops=ops)
+ self.ffw_layer_2 = ClippedLinear(intermediate_size, hidden_size, device=device, dtype=dtype, ops=ops)
+ self.post_layer_norm = RMSNorm(hidden_size, eps=config["rms_norm_eps"], device=device, dtype=dtype)
+ self.post_layer_scale = config.get("residual_weight", 0.5)  # plain Python number, not a parameter; 0.5 when the config omits it
+
+ def forward(self, x):  # x + scale * post_norm(ffw2(silu(ffw1(pre_norm(x)))))
+ residual = x
+ x = self.pre_layer_norm(x)
+ x = torch.nn.functional.silu(self.ffw_layer_1(x))
+ x = self.ffw_layer_2(x)
+ x = self.post_layer_norm(x)
+ x = x * self.post_layer_scale  # only the branch is scaled, not the residual
+ return x + residual
+
+
+class Gemma4AudioRelPositionalEncoding(nn.Module):
+ """Sinusoidal relative positional encoding for audio attention."""
+ def __init__(self, config, device=None, dtype=None):  # the device argument is not used in this constructor
+ super().__init__()
+ hidden_size = config["hidden_size"]
+ context_left = config.get("attention_context_left", 13)
+ context_right = config.get("attention_context_right", 0)
+ self.chunk_size = config.get("attention_chunk_size", 12)
+ self.context_size = self.chunk_size + context_left - 1 + context_right  # stored but not read by forward() in this class
+
+ num_timescales = hidden_size // 2  # half of the features become sin, the other half cos
+ log_inc = math.log(10000.0) / max(num_timescales - 1, 1)  # max(..., 1) avoids a division by zero when there is a single timescale
+ inv_timescales = torch.exp(torch.arange(num_timescales) * -log_inc).to(dtype=dtype).unsqueeze(0).unsqueeze(0)  # [1, 1, num_timescales], geometric from 1 down to 1/10000
+ self.register_buffer("inv_timescales", inv_timescales, persistent=False)  # non-persistent buffer: excluded from the state dict
+
+ def forward(self, hidden_states):  # hidden_states only supplies device and dtype; the result does not depend on its values or length
+ positions = torch.arange(self.chunk_size, -1, -1, device=hidden_states.device).unsqueeze(-1)  # chunk_size, ..., 0 (chunk_size + 1 entries); NOTE(review): count comes from chunk_size, not attention_context_left — confirm against the reference
+ scaled = positions * self.inv_timescales.to(device=hidden_states.device)  # broadcasts to [1, chunk_size + 1, num_timescales]
+ return torch.cat([torch.sin(scaled), torch.cos(scaled)], dim=-1).to(dtype=hidden_states.dtype)  # last dim is [sin | cos]
+
+
+class Gemma4AudioAttention(nn.Module):
+ """Chunked block attention with relative position bias and softcap."""
+ def __init__(self, config, device=None, dtype=None, ops=None):
+ super().__init__()
+ self.hidden_size = config["hidden_size"]
+ self.num_heads = config["num_attention_heads"]
+ self.head_dim = self.hidden_size // self.num_heads
+ self.chunk_size = config.get("attention_chunk_size", 12)  # number of queries per block
+ self.max_past_horizon = config.get("attention_context_left", 13) - 1  # left padding added to each block's key context
+ self.max_future_horizon = config.get("attention_context_right", 0)
+ self.context_size = self.chunk_size + self.max_past_horizon + self.max_future_horizon  # number of keys visible to one block
+
+ self.q_scale = (self.head_dim ** -0.5) / math.log(2)  # head_dim^-0.5 divided by ln 2
+ self.k_scale = math.log(1 + math.e) / math.log(2)  # ln(1 + e) / ln 2, a fixed constant
+ self.register_buffer("softcap", torch.tensor(config.get("attention_logit_cap", 50.0), dtype=dtype), persistent=False)  # logit cap as a non-persistent buffer
+
+ self.q_proj = ClippedLinear(self.hidden_size, self.hidden_size, device=device, dtype=dtype, ops=ops)
+ self.k_proj = ClippedLinear(self.hidden_size, self.hidden_size, device=device, dtype=dtype, ops=ops)
+ self.v_proj = ClippedLinear(self.hidden_size, self.hidden_size, device=device, dtype=dtype, ops=ops)
+ self.post = ClippedLinear(self.hidden_size, self.hidden_size, device=device, dtype=dtype, ops=ops)  # output projection
+ self.per_dim_scale = nn.Parameter(torch.empty(self.head_dim, device=device, dtype=dtype))  # torch.empty leaves the parameter uninitialised here
+ self.relative_k_proj = ops.Linear(self.hidden_size, self.hidden_size, bias=False, device=device, dtype=dtype)  # projects the position embeddings into key space
+
+ def _convert_to_block(self, x):  # [B, S, H, D] -> [B, num_blocks, chunk_size, H, D], zero-padded at the end of S
+ B, S, H, D = x.shape
+ num_blocks = (S + self.chunk_size - 1) // self.chunk_size  # ceil(S / chunk_size)
+ pad = num_blocks * self.chunk_size - S
+ x = torch.nn.functional.pad(x, (0, 0, 0, 0, 0, pad))  # pad spec runs from the last dim backwards: D and H untouched, S padded on the right
+ return x.reshape(B, num_blocks, self.chunk_size, H, D).contiguous()
+
+ def _extract_block_context(self, x):  # [B, S, H, D] -> [B, num_blocks, context_size, H, D] of overlapping windows
+ x = torch.nn.functional.pad(x, (0, 0, 0, 0, self.max_past_horizon, self.max_future_horizon + self.chunk_size - 1))  # left pad = past horizon; right pad covers the future horizon plus the last partial chunk
+ x = x.unfold(1, self.context_size, self.chunk_size)  # windows of context_size taken every chunk_size steps; the window axis is appended last
+ return torch.movedim(x, -1, 2).contiguous()  # bring the window axis next to the block axis
+
+ def _rel_shift(self, x):  # pad-and-reshape shift: row i of the result is row i of x shifted right by i positions
+ B, H, NB, BS, PL = x.shape
+ CS = self.context_size
+ x = torch.nn.functional.pad(x, (0, CS + 1 - PL))  # widen the last dim to CS + 1
+ x = x.view(B, H, NB, BS * (CS + 1))  # flatten the rows of each block
+ x = x[..., :BS * CS]  # drop the tail so re-splitting into rows of width CS shifts each row by one more column
+ return x.view(B, H, NB, BS, CS)
+
+ def _build_blocked_mask(self, seq_len, num_blocks, device, audio_mask=None):
+ """Build 5D boolean blocked attention mask (True=attend, False=mask)"""
+ q = torch.arange(seq_len, device=device)
+ dist = q[:, None] - q[None, :]  # dist[i, j] = i - j (query index minus key index)
+ mask = (dist >= 0) & (dist < self.max_past_horizon)  # key j is visible when 0 <= i - j < max_past_horizon
+ if self.max_future_horizon > 0:
+ mask = mask | ((dist < 0) & ((-dist) < self.max_future_horizon))
+ if audio_mask is not None:
+ mask = mask & audio_mask[0, None, :].bool()  # NOTE(review): only the first batch element of audio_mask is used, applied along the key axis
+ m = mask[None, None]  # [1, 1, S, S]
+ # Reshape to blocked 5D matching reference code
+ p = num_blocks * self.chunk_size - seq_len
+ m = torch.nn.functional.pad(m, (0, p, 0, p), value=False)  # pad both axes to num_blocks * chunk_size
+ m = m.reshape(1, 1, num_blocks, self.chunk_size, -1)  # split the query axis into blocks
+ m = torch.nn.functional.pad(m, (self.max_past_horizon, self.max_future_horizon), value=False)  # pad the key axis by the past/future horizons
+ idx = (torch.arange(num_blocks, device=device) * self.chunk_size)[:, None] + torch.arange(self.context_size, device=device)[None, :]  # idx[b, c] = b * chunk_size + c: padded key index of context slot c for block b
+ return m.gather(-1, idx[None, None, :, None, :].expand(1, 1, -1, self.chunk_size, -1))  # [1, 1, num_blocks, chunk_size, context_size]
+
+ def forward(self, x, position_embeddings=None, attn_mask=None):  # x: [B, S, hidden]; returns a tensor of the same shape
+ B, S, _ = x.shape
+
+ q = self.q_proj(x).float().view(B, S, self.num_heads, self.head_dim)  # attention math is done in float32
+ k = self.k_proj(x).float().view(B, S, self.num_heads, self.head_dim)
+ v = self.v_proj(x).float().view(B, S, self.num_heads, self.head_dim)
+
+ q = q * self.q_scale * torch.nn.functional.softplus(self.per_dim_scale)  # learned per-dimension scale passed through softplus
+ k = k * self.k_scale
+
+ q_blocks = self._convert_to_block(q)  # [B, NB, chunk, H, D]
+ k_context = self._extract_block_context(k)  # [B, NB, context, H, D]
+ v_context = self._extract_block_context(v)
+ num_blocks = q_blocks.shape[1]
+
+ rel_k = self.relative_k_proj(position_embeddings).view(-1, self.num_heads, self.head_dim).to(q.dtype)  # [P, H, D]; position_embeddings must be provided despite the None default
+
+ queries = q_blocks.permute(0, 3, 1, 2, 4) # [B, H, NB, CS, D]
+ matrix_ac = queries @ k_context.permute(0, 3, 1, 4, 2)  # content term: [B, H, NB, chunk, context]
+
+ queries_flat = queries.reshape(B, self.num_heads, -1, self.head_dim)
+ matrix_bd = queries_flat @ rel_k.permute(1, 2, 0)  # position term: [B, H, NB * chunk, P]
+ matrix_bd = matrix_bd.reshape(B, self.num_heads, num_blocks, self.chunk_size, -1)
+ matrix_bd = self._rel_shift(matrix_bd)  # shift rows onto the context axis -> [B, H, NB, chunk, context]
+
+ attn_weights = matrix_ac + matrix_bd
+ attn_weights = torch.tanh(attn_weights / self.softcap) * self.softcap  # soft-cap the logits into (-softcap, softcap)
+
+ # Mask out invalid positions in chunk context (matching reference's masked_fill approach)
+ if attn_mask is None:
+ attn_mask = self._build_blocked_mask(S, num_blocks, x.device)  # fallback when the encoder did not prebuild one
+ attn_weights = attn_weights.masked_fill(attn_mask.logical_not(), -1e9)  # applied after the soft-cap so masked logits are not squashed back into range
+
+ attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(v.dtype)
+ out = attn_weights @ v_context.permute(0, 3, 1, 2, 4)  # [B, H, NB, chunk, D]
+ out = out.permute(0, 2, 3, 1, 4).reshape(B, num_blocks * self.chunk_size, -1)  # merge blocks and heads -> [B, NB * chunk, H * D]
+ out = out[:, :S].contiguous()  # drop the rows added by block padding
+ return self.post(out.to(self.post.linear.weight.dtype))  # cast from float32 to the projection weight dtype
+
+
+class Gemma4AudioLConv1d(nn.Module):
+ """Lightweight convolution with standard GLU."""
+ def __init__(self, config, device=None, dtype=None, ops=None):
+ super().__init__()
+ hidden_size = config["hidden_size"]
+ conv_kernel_size = config.get("conv_kernel_size", 5)  # 5 when the config omits it
+ self.pre_layer_norm = RMSNorm(hidden_size, eps=config["rms_norm_eps"], device=device, dtype=dtype)
+ self.linear_start = ClippedLinear(hidden_size, hidden_size * 2, device=device, dtype=dtype, ops=ops)  # doubled width: GLU halves it again in forward()
+ # Causal conv: left-pad only
+ self.depthwise_conv1d = ops.Conv1d(hidden_size, hidden_size, kernel_size=conv_kernel_size, padding=0, groups=hidden_size, bias=False, device=device, dtype=dtype)  # groups == channels: one filter per channel
+ self.conv_left_pad = conv_kernel_size - 1 # causal: pad left by kernel-1
+ self.conv_norm = RMSNorm(hidden_size, eps=config["rms_norm_eps"], device=device, dtype=dtype)
+ self.linear_end = ClippedLinear(hidden_size, hidden_size, device=device, dtype=dtype, ops=ops)
+
+ def forward(self, x):  # residual branch: norm -> linear -> GLU -> depthwise conv -> norm -> SiLU -> linear
+ residual = x
+ x = self.pre_layer_norm(x)
+ x = self.linear_start(x)
+ x = torch.nn.functional.glu(x, dim=-1)  # halves the last dim back to hidden_size
+ x = x.transpose(1, 2)  # swap dims 1 and 2 so the conv runs along the original dim 1
+ x = torch.nn.functional.pad(x, (self.conv_left_pad, 0))  # left-only padding keeps the output length equal to the input length
+ x = self.depthwise_conv1d(x).transpose(1, 2)
+ x = self.conv_norm(x)
+ x = torch.nn.functional.silu(x)
+ x = self.linear_end(x)
+ return x + residual
+
+
+class Gemma4AudioLayer(nn.Module):
+ """Conformer block: FFN1 -> Attention -> LConv -> FFN2."""
+ def __init__(self, config, device=None, dtype=None, ops=None):
+ super().__init__()
+ self.feed_forward1 = Gemma4AudioFeedForward(config, device=device, dtype=dtype, ops=ops)
+ self.self_attn = Gemma4AudioAttention(config, device=device, dtype=dtype, ops=ops)
+ norm_kwargs = dict(eps=config["rms_norm_eps"], device=device, dtype=dtype)  # shared by the three norms of this layer
+ hidden_size = config["hidden_size"]
+ self.norm_pre_attn = RMSNorm(hidden_size, **norm_kwargs)
+ self.norm_post_attn = RMSNorm(hidden_size, **norm_kwargs)
+ self.lconv1d = Gemma4AudioLConv1d(config, device=device, dtype=dtype, ops=ops)
+ self.feed_forward2 = Gemma4AudioFeedForward(config, device=device, dtype=dtype, ops=ops)
+ self.norm_out = RMSNorm(hidden_size, **norm_kwargs)
+
+ def forward(self, x, position_embeddings=None, attn_mask=None):  # returns a tensor with the same shape as x
+ x = self.feed_forward1(x)  # the feed-forward modules add their own residual internally
+
+ residual = x
+ x = self.norm_pre_attn(x)
+ x = self.self_attn(x, position_embeddings=position_embeddings, attn_mask=attn_mask)
+ x = self.norm_post_attn(x)  # the attention output is normalised before the residual add
+ x = x + residual
+
+ x = self.lconv1d(x)  # the conv module also adds its own residual
+ x = self.feed_forward2(x)
+
+ x = self.norm_out(x)  # final norm of the block
+ return x
+
+
+class Gemma4AudioEncoder(nn.Module):  # audio tower: conv subsampling -> Gemma4AudioLayer stack -> linear output projection
+ def __init__(self, config, dtype=None, device=None, ops=None):
+ super().__init__()
+ self.hidden_size = config["hidden_size"]
+ self.output_proj_dims = config.get("output_proj_dims", 1536)  # 1536 when the config omits it
+
+ self.subsample_conv_projection = Gemma4AudioConvSubsampler(config, device=device, dtype=dtype, ops=ops)
+ self.rel_pos_enc = Gemma4AudioRelPositionalEncoding(config, device=device, dtype=dtype)
+
+ self.layers = nn.ModuleList([
+ Gemma4AudioLayer(config, device=device, dtype=dtype, ops=ops)
+ for _ in range(config["num_hidden_layers"])
+ ])
+
+ self.output_proj = ops.Linear(self.hidden_size, self.output_proj_dims, bias=True, device=device, dtype=dtype)  # this projection carries a bias
+
+ def forward(self, audio_features, audio_mask=None):  # returns features of width output_proj_dims; the subsampled mask is not returned
+ x, audio_mask = self.subsample_conv_projection(audio_features, audio_mask)  # audio_mask is replaced by its subsampled version
+ position_embeddings = self.rel_pos_enc(x)  # computed once and reused by every layer
+
+ # Build blocked attention mask once for all layers
+ attn_mask = self.layers[0].self_attn._build_blocked_mask(
+ x.shape[1], (x.shape[1] + self.layers[0].self_attn.chunk_size - 1) // self.layers[0].self_attn.chunk_size,  # sequence length and ceil(length / chunk_size) blocks, from layer 0's settings
+ x.device, audio_mask=audio_mask)
+
+ for layer in self.layers:
+ x = layer(x, position_embeddings=position_embeddings, attn_mask=attn_mask)
+
+ x = self.output_proj(x)
+ return x
+
+
+class Gemma4AudioProjector(Gemma4RMSNormProjector):  # audio -> text projector; config is a plain dict here
+ def __init__(self, config, dtype=None, device=None, ops=None):
+ super().__init__(config.get("audio_output_proj_dims", 1536), config.get("text_hidden_size", 2560), dtype=dtype, device=device, ops=ops)  # defaults 1536 -> 2560 apply only when the keys are missing
+
+
+# Tokenizer and Wrappers
+
+class Gemma4_Tokenizer():
+ tokenizer_json_data = None
+
+ def state_dict(self):  # exposes the tokenizer JSON under "tokenizer_json" when it is set, otherwise an empty dict
+ if self.tokenizer_json_data is not None:
+ return {"tokenizer_json": self.tokenizer_json_data}
+ return {}
+
+ def _extract_mel_spectrogram(self, waveform, sample_rate):
+ """Extract 128-bin log mel spectrogram.
+ Uses numpy for FFT/matmul/log to produce bit-identical results with reference code.
+ """
+ # Mix to mono first, then resample to 16kHz
+ if waveform.dim() > 1 and waveform.shape[0] > 1:
+ waveform = waveform.mean(dim=0, keepdim=True)
+ if waveform.dim() == 1:
+ waveform = waveform.unsqueeze(0)
+ audio = waveform.squeeze(0).float().numpy()
+ if sample_rate != 16000:
+ # Use scipy's resample_poly with a high-quality FIR filter to get as close as possible to librosa's resampling (while still not full match)
+ from scipy.signal import resample_poly, firwin
+ from math import gcd
+ g = gcd(sample_rate, 16000)
+ up, down = 16000 // g, sample_rate // g
+ L = max(up, down)
+ h = firwin(160 * L + 1, 0.96 / L, window=('kaiser', 6.5))
+ audio = resample_poly(audio, up, down, window=h).astype(np.float32)
+ n = len(audio)
+
+ # Pad to multiple of 128, build sample-level mask
+ if n % 128 != 0:
+ audio = np.pad(audio, (0, 128 - n % 128))
+ mask_raw = np.ones(len(audio), dtype=np.float32)
+ mask_raw[n:] = 0.0
+
+ # Semicausal padding: 160 zeros prepended
+ audio = np.pad(audio, (160, 0))
+ mask_raw = np.pad(mask_raw, (160, 0))
+
+ # Extract 321-sample frames via stride tricks, drop last → 320
+ nf = (len(audio) - 321) // 160 + 1
+ strides = (audio.strides[0] * 160, audio.strides[0])
+ frames = np.lib.stride_tricks.as_strided(audio, (nf, 321), strides)[..., :-1].copy()
+
+ # Periodic Hann window, FFT magnitude, mel filterbank, log
+ window = (0.5 - 0.5 * np.cos(2 * np.pi * np.arange(320) / 320)).astype(np.float32)
+ magnitude = np.abs(np.fft.rfft(frames * window, n=512, axis=-1))
+ mel_fb = self._build_mel_filterbank()
+ log_mel = np.log(np.matmul(magnitude, mel_fb) + np.float64(0.001)).astype(np.float32)
+
+ # Frame mask: valid when last sample in window is real audio
+ mask = mask_raw[np.arange(nf) * 160 + 320].astype(bool)
+ log_mel = log_mel * mask[:, None]
+ return torch.from_numpy(log_mel), torch.from_numpy(mask) # [T, 128], [T]
+
+ @staticmethod
+ def _build_mel_filterbank():
+ """Build 128-bin HTK mel filterbank [257, 128] for 512-pt FFT at 16kHz."""
+ mel_freqs = np.linspace(0.0, 2595.0 * np.log10(1.0 + 8000.0 / 700.0), 130)
+ filter_freqs = 700.0 * (10.0 ** (mel_freqs / 2595.0) - 1.0)
+ fft_freqs = np.linspace(0, 16000 // 2, 257)
+ filter_diff = np.diff(filter_freqs)
+ slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1)
+ down_slopes = -slopes[:, :-2] / filter_diff[:-1]
+ up_slopes = slopes[:, 2:] / filter_diff[1:]
+ return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes))
+
+ def tokenize_with_weights(self, text, return_word_ids=False, image=None, audio=None, video=None, llama_template=None, skip_template=True, thinking=False, **kwargs):
+
+ # Process audio
+ audio_features = []
+ if audio is not None:
+ waveform = audio["waveform"].squeeze(0) if hasattr(audio, "__getitem__") else audio
+ sample_rate = audio.get("sample_rate", 16000) if hasattr(audio, "get") else 16000
+ mel, mel_mask = self._extract_mel_spectrogram(waveform, sample_rate)
+ audio_features = [(mel.unsqueeze(0), mel_mask.unsqueeze(0))] # ([1, T, 128], [1, T])
+
+ # Process image/video frames
+ is_video = video is not None
+ source = video if is_video else image
+ images = []
+ if source is not None:
+ samples = source.movedim(-1, 1) # [B, C, H, W]
+ num_frames = samples.shape[0]
+
+ # Subsample video to 1fps
+ if is_video:
+ fps = kwargs.get("fps", 24)
+ step = max(1, round(fps))
+ indices = list(range(0, num_frames, step))
+ if len(indices) == 0:
+ indices = [0]
+ samples = samples[indices]
+ num_frames = len(indices)
+
+ h, w = samples.shape[2], samples.shape[3]
+ patch_size = 16
+ pooling_k = 3
+ max_soft_tokens = 70 if is_video else 280 # video uses smaller token budget per frame
+ max_patches = max_soft_tokens * pooling_k * pooling_k
+ target_px = max_patches * patch_size * patch_size
+ factor = (target_px / (h * w)) ** 0.5
+ side_mult = pooling_k * patch_size
+ target_h = max(int(factor * h // side_mult) * side_mult, side_mult)
+ target_w = max(int(factor * w // side_mult) * side_mult, side_mult)
+
+ import torchvision.transforms.functional as TVF
+ for i in range(num_frames):
+ # rescaling to match reference code
+ s = (samples[i].clamp(0, 1) * 255).to(torch.uint8) # [C, H, W] uint8
+ if target_h != h or target_w != w:
+ s = TVF.resize(s, [target_h, target_w], interpolation=TVF.InterpolationMode.BICUBIC, antialias=True)
+ s = s.float() * (1.0 / 255.0)
+ images.append({"pixels": s.unsqueeze(0).movedim(1, -1)[:, :, :, :3], "max_soft_tokens": max_soft_tokens})
+
+ if text.startswith('<|turn>'):
+ skip_template = True
+
+ if skip_template:
+ llama_text = text
+ else:
+ if llama_template is not None:
+ llama_text = llama_template.format(text)
+ else:
+ # Build template from modalities present
+ system = "<|turn>system\n<|think|>\n" if thinking else ""
+ media = ""
+ if len(images) > 0:
+ if is_video:
+ media += "\n\n"
+ for i in range(len(images)):
+ ts = f"{int(i // 60):02d}:{int(i % 60):02d}"
+ sep = "" if i == 0 else " "
+ media += f"{sep}{ts} <|image><|video|>"
+ media += "\n\n"
+ else:
+ media += "\n\n"
+ for i in range(len(images)):
+ if i > 0:
+ media += "\n\n\n\n"
+ media += "<|image><|image|>"
+ media += "\n\n"
+ if len(audio_features) > 0:
+ # Compute audio token count (always at 16kHz)
+ num_samples = int(waveform.shape[-1] * 16000 / sample_rate) if sample_rate != 16000 else waveform.shape[-1]
+ _fl = 320 # int(round(16000 * 20.0 / 1000.0))
+ _hl = 160 # int(round(16000 * 10.0 / 1000.0))
+ _nmel = (num_samples + _fl // 2 - (_fl + 1)) // _hl + 1
+ _t = _nmel
+ for _ in range(2):
+ _t = (_t + 2 - 3) // 2 + 1
+ n_audio_tokens = min(_t, 750)
+ media += "<|audio>" + "<|audio|>" * n_audio_tokens + ""
+ llama_text = f"{system}<|turn>user\n{media}{text}\n<|turn>model\n"
+
+ text_tokens = super().tokenize_with_weights(llama_text, return_word_ids)
+
+ def _replace_placeholders(token_list, token_id, embeds):
+ """Replace first placeholder with embed dict, remove remaining consecutive ones."""
+ embed_idx = 0
+ i = 0
+ while i < len(token_list):
+ if token_list[i][0] == token_id and embed_idx < len(embeds):
+ token_list[i] = (embeds[embed_idx],) + token_list[i][1:]
+ embed_idx += 1
+ i += 1
+ while i < len(token_list) and token_list[i][0] == token_id:
+ token_list.pop(i)
+ else:
+ i += 1
+
+ if len(images) > 0:
+ img_token_id = 258884 if is_video else 258880
+ img_embeds = [{"type": "image", "data": img["pixels"], "max_soft_tokens": img["max_soft_tokens"]} for img in images]
+ for r in text_tokens:
+ _replace_placeholders(r, img_token_id, img_embeds)
+
+ if len(audio_features) > 0:
+ aud_embeds = [{"type": "audio", "data": mel, "mask": mask} for mel, mask in audio_features]
+ for r in text_tokens:
+ _replace_placeholders(r, 258881, aud_embeds)
+
+ return text_tokens
+
+
+class _Gemma4Tokenizer:
+    """Thin adapter over a ``tokenizers.Tokenizer`` built from tokenizer JSON bytes (Gemma4 doesn't come with a sentencepiece model)."""
+    def __init__(self, tokenizer_json_bytes=None, **kwargs):
+        from tokenizers import Tokenizer
+        if isinstance(tokenizer_json_bytes, torch.Tensor):  # the JSON may arrive as a tensor of byte values
+            tokenizer_json_bytes = bytes(tokenizer_json_bytes.tolist())
+        self.tokenizer = Tokenizer.from_str(tokenizer_json_bytes.decode("utf-8"))  # NOTE(review): the default None fails here with AttributeError
+
+    @classmethod
+    def from_pretrained(cls, tokenizer_data, **kwargs):
+        return cls(tokenizer_json_bytes=tokenizer_data, **kwargs)
+
+    def __call__(self, text):
+        return {"input_ids": self.tokenizer.encode(text, add_special_tokens=False).ids}  # ids only; no special tokens are added here
+
+    def get_vocab(self):
+        return self.tokenizer.get_vocab()
+
+    def convert_tokens_to_ids(self, tokens):
+        return [self.tokenizer.token_to_id(t) for t in tokens]
+
+    def decode(self, ids, **kwargs):
+        return self.tokenizer.decode(ids, skip_special_tokens=kwargs.get("skip_special_tokens", False))  # special tokens are kept unless the caller asks otherwise
+
+
+# Tokenizer
+class Gemma4SDTokenizer(Gemma4_Tokenizer, sd1_clip.SDTokenizer):
+    embedding_size = 2560
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer_json = tokenizer_data.get("tokenizer_json", None)
+        self.tokenizer_json_data = tokenizer_json
+        super().__init__(tokenizer_json, pad_with_end=False, embedding_size=self.embedding_size, embedding_key='gemma4', tokenizer_class=_Gemma4Tokenizer, has_start_token=True, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1, pad_left=True, disable_weights=True, start_token=2, tokenizer_data=tokenizer_data)
+
+    def decode(self, token_ids, **kwargs):
+        text = super().decode(token_ids, skip_special_tokens=False)
+        # Translate thinking channel markers to standard <think> / </think> tags
+        text = text.replace("<|channel>thought\n", "<think>\n")
+        text = text.replace("<channel|>", "</think>")  # NOTE(review): marker literals were reconstructed; replacing "" would insert text between every character
+        # Strip remaining special tokens
+        text = text.replace("<turn|>", "").replace("<bos>", "").strip()  # NOTE(review): token names reconstructed — confirm against the tokenizer vocab
+        return text
+
+
+class Gemma4Tokenizer(sd1_clip.SD1Tokenizer):
+    tokenizer_class = Gemma4SDTokenizer  # class attribute so subclasses can swap the inner tokenizer (as _make_variant does)
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, name="gemma4", tokenizer=self.tokenizer_class)
+
+
+# Model wrappers
+class Gemma4Model(sd1_clip.SDClipModel):
+    model_class = None  # supplied by gemma4_te() through a dynamically created subclass
+    def __init__(self, device="cpu", layer="all", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        self.dtypes = set()
+        self.dtypes.add(dtype)
+        super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=self.model_class, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
+
+    def process_tokens(self, tokens, device):
+        embeds, _, _, _ = super().process_tokens(tokens, device)  # keep only the embeddings from the parent's 4-tuple
+        return embeds
+
+    def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty=0.0):
+        if isinstance(tokens, dict):
+            tokens = next(iter(tokens.values()))  # use the first value of the dict
+        tokens_only = [[t[0] for t in b] for b in tokens]  # drop weights; keep token ids / embed dicts
+        embeds, _, _, embeds_info = sd1_clip.SDClipModel.process_tokens(self, tokens_only, self.execution_device)  # base implementation called directly because the override above discards embeds_info
+        seq_len = embeds.shape[1]
+        ids = [0] * seq_len  # one id per embedded position; positions covered by media embeds stay 0
+        expanded_idx = 0
+        embed_map = {info["index"]: info["size"] for info in embeds_info}
+        for t in tokens_only[0]:  # only the first batch entry is walked
+            if expanded_idx in embed_map:
+                expanded_idx += embed_map[expanded_idx]  # jump over the span this embed occupies
+            elif isinstance(t, int):
+                if expanded_idx < seq_len:
+                    ids[expanded_idx] = t
+                expanded_idx += 1
+            else:
+                expanded_idx += 1
+        initial_token_ids = [ids]
+        input_ids = torch.tensor(initial_token_ids, device=self.execution_device)
+        return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, initial_tokens=initial_token_ids[0], presence_penalty=presence_penalty, initial_input_ids=input_ids)
+
+
+def gemma4_te(dtype_llama=None, llama_quantization_metadata=None, model_class=None):
+    clip_model = type('Gemma4Model_', (Gemma4Model,), {'model_class': model_class})  # Gemma4Model subclass bound to the requested model_class
+    class Gemma4TEModel_(sd1_clip.SD1ClipModel):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            if llama_quantization_metadata is not None:
+                model_options = model_options.copy()  # copy first so neither the caller's dict nor the shared default is mutated
+                model_options["quantization_metadata"] = llama_quantization_metadata
+            if dtype_llama is not None:
+                dtype = dtype_llama  # dtype_llama, when given, overrides the dtype passed by the caller
+            super().__init__(device=device, dtype=dtype, name="gemma4", clip_model=clip_model, model_options=model_options)
+    return Gemma4TEModel_
+
+
+# Variants
+
+def _make_variant(config_cls):
+    audio = config_cls.audio_config is not None  # audio support is decided by the config class attribute
+    bases = (Gemma4AudioMixin, Gemma4Base) if audio else (Gemma4Base,)
+    class Variant(*bases):
+        def __init__(self, config_dict, dtype, device, operations):
+            super().__init__()
+            self._init_model(config_cls(**config_dict), dtype, device, operations)
+            if audio:
+                self._init_audio(self.model.config, dtype, device, operations)
+    embedding_size = config_cls.hidden_size
+    if embedding_size != Gemma4SDTokenizer.embedding_size:
+        tok_cls = type('T', (Gemma4SDTokenizer,), {'embedding_size': embedding_size})  # inner tokenizer subclass carrying this variant's embedding size
+        class Tokenizer(Gemma4Tokenizer):
+            tokenizer_class = tok_cls
+        Variant.tokenizer = Tokenizer
+    else:
+        Variant.tokenizer = Gemma4Tokenizer  # hidden size matches the default tokenizer; reuse it as-is
+    return Variant
+
+Gemma4_E4B = _make_variant(Gemma4Config)  # one model class (with matching tokenizer attribute) per config class
+Gemma4_E2B = _make_variant(Gemma4_E2B_Config)
+Gemma4_31B = _make_variant(Gemma4_31B_Config)
diff --git a/comfy/text_encoders/gpt_oss.py b/comfy/text_encoders/gpt_oss.py
new file mode 100644
index 000000000..d596ef9a0
--- /dev/null
+++ b/comfy/text_encoders/gpt_oss.py
@@ -0,0 +1,600 @@
+"""GPT-OSS text encoder for Lens."""
+
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from typing import Any, List, Optional, Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ops
+from comfy import sd1_clip
+from comfy.ldm.modules.attention import TORCH_HAS_GQA, optimized_attention_for_device
+from comfy.text_encoders.llama import RMSNorm, apply_rope
+
+
+@dataclass
+class GptOss20BConfig:
+    vocab_size: int = 201088
+    hidden_size: int = 2880
+    intermediate_size: int = 2880
+    num_hidden_layers: int = 24
+    num_attention_heads: int = 64
+    num_key_value_heads: int = 8
+    head_dim: int = 64
+    num_local_experts: int = 32
+    num_experts_per_tok: int = 4  # top-k used by the router
+    sliding_window: int = 128  # window size for "sliding_attention" layers
+    original_max_position_embeddings: int = 4096
+    rope_theta: float = 150000.0  # rope_* values are forwarded to _yarn_inv_freq
+    rope_factor: float = 32.0
+    rope_beta_fast: float = 32.0
+    rope_beta_slow: float = 1.0
+    rope_truncate: bool = False
+    rms_norm_eps: float = 1e-5
+    attention_bias: bool = True
+    layer_types: Optional[List[str]] = None  # filled in by __post_init__ when left as None
+    moe_alpha: float = 1.702  # sigmoid scale in the expert gate
+    moe_limit: float = 7.0  # clamp bound applied to gate/up activations
+
+    def __post_init__(self):
+        if self.layer_types is None:  # default pattern: even layer indices slide, odd indices use full attention
+            self.layer_types = [
+                "sliding_attention" if (i + 1) % 2 else "full_attention"
+                for i in range(self.num_hidden_layers)
+            ]
+
+
+def _yarn_inv_freq(head_dim: int, base: float, factor: float, beta_fast: float, beta_slow: float,
+                   original_max_position_embeddings: int, truncate: bool, device=None) -> tuple[torch.Tensor, float]:
+    """YARN inv_freq + attention scaling (matches transformers). Returns (inv_freq of length head_dim // 2, scaling float)."""
+    dim = head_dim
+
+    def find_correction_dim(num_rotations: float) -> float:
+        return (dim * math.log(original_max_position_embeddings / (num_rotations * 2 * math.pi))) / (
+            2 * math.log(base)
+        )
+
+    def find_correction_range() -> tuple[float, float]:
+        low = find_correction_dim(beta_fast)
+        high = find_correction_dim(beta_slow)
+        if truncate:  # optionally snap the bounds outward to integers
+            low = math.floor(low)
+            high = math.ceil(high)
+        return max(low, 0), min(high, dim - 1)
+
+    def linear_ramp_factor(min_: float, max_: float, n: int) -> torch.Tensor:
+        if min_ == max_:
+            max_ += 0.001  # avoid a zero-width ramp (division by zero below)
+        linear = (torch.arange(n, dtype=torch.float32, device=device) - min_) / (max_ - min_)
+        return torch.clamp(linear, 0, 1)
+
+    def get_mscale(scale: float) -> float:
+        if scale <= 1:
+            return 1.0
+        return 0.1 * math.log(scale) + 1.0
+
+    attention_scaling = get_mscale(factor)
+
+    pos_freqs = base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
+    inv_freq_extrapolation = 1.0 / pos_freqs
+    inv_freq_interpolation = 1.0 / (factor * pos_freqs)
+
+    low, high = find_correction_range()
+    extrap_factor = 1 - linear_ramp_factor(low, high, dim // 2)  # 1 below `low` (pure extrapolation), 0 above `high` (pure interpolation)
+    inv_freq = inv_freq_interpolation * (1 - extrap_factor) + inv_freq_extrapolation * extrap_factor
+    return inv_freq, attention_scaling
+
+
+def _build_freqs_cis(inv_freq: torch.Tensor, attention_scaling: float, position_ids: torch.Tensor, dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Return (cos, first half of sin, negated second half of sin), each scaled, cast to ``dtype`` and given a size-1 axis at dim 1."""
+    per_batch_inv_freq = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+    angles = torch.matmul(per_batch_inv_freq, position_ids[:, None, :].float()).transpose(1, 2)
+    doubled = torch.cat((angles, angles), dim=-1)
+    cos_table = (doubled.cos() * attention_scaling).to(dtype).unsqueeze(1)
+    sin_table = (doubled.sin() * attention_scaling).to(dtype).unsqueeze(1)
+    half = sin_table.shape[-1] // 2
+    return cos_table, sin_table[..., :half], -sin_table[..., half:]
+
+
+def _attention_with_sinks(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, sinks: torch.Tensor,
+                          attention_mask: Optional[torch.Tensor], num_heads: int, num_kv_groups: int) -> torch.Tensor:
+    """Attention with per-head sinks.
+
+    Sinks add a learned term to each row's softmax denominator but contribute
+    nothing to the output. We fake this by appending one zero k/v position and
+    putting the sink logit in the mask at that column.
+    """
+
+    if num_kv_groups > 1 and not TORCH_HAS_GQA:  # presumably: no native grouped-query support, so expand k/v heads manually
+        k = k.repeat_interleave(num_kv_groups, dim=1)
+        v = v.repeat_interleave(num_kv_groups, dim=1)
+
+    B, _, S_q, D = q.shape
+    H_kv = k.shape[1]
+    S_kv = k.shape[-2]
+
+    k = torch.cat([k, k.new_zeros(B, H_kv, 1, D)], dim=-2)  # extra all-zero key: its q.k score is 0, so the logit is just the mask value
+    v = torch.cat([v, v.new_zeros(B, H_kv, 1, D)], dim=-2)  # extra all-zero value: the sink column adds nothing to the output
+
+    sinks_col = sinks.to(q.dtype).view(1, num_heads, 1, 1).expand(B, num_heads, S_q, 1)
+    if attention_mask is not None:
+        mask_left = attention_mask[..., :S_kv].expand(B, num_heads, S_q, S_kv)
+    else:
+        mask_left = q.new_zeros(B, num_heads, S_q, S_kv)
+    mask = torch.cat([mask_left, sinks_col], dim=-1)  # [B, num_heads, S_q, S_kv + 1]
+
+    op = optimized_attention_for_device(q.device, mask=True, small_input=True)
+    return op(q, k, v, num_heads, mask=mask, skip_reshape=True, enable_gqa=True)  # NOTE(review): enable_gqa=True is passed even when k/v were repeated above — confirm the op accepts that
+
+
+class GptOssAttention(nn.Module):
+    def __init__(self, config: GptOss20BConfig, layer_idx: int, device=None, dtype=None, ops: Any = None):
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.layer_type = config.layer_types[layer_idx]
+        self.num_heads = config.num_attention_heads
+        self.num_kv_heads = config.num_key_value_heads
+        self.num_kv_groups = self.num_heads // self.num_kv_heads  # query heads sharing one k/v head
+        self.head_dim = config.head_dim
+        self.hidden_size = config.hidden_size
+        self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None  # stored only; forward() relies on the mask it is given
+
+        bias = config.attention_bias
+        self.q_proj = ops.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=bias, device=device, dtype=dtype)
+        self.k_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=bias, device=device, dtype=dtype)
+        self.v_proj = ops.Linear(config.hidden_size, self.num_kv_heads * self.head_dim, bias=bias, device=device, dtype=dtype)
+        self.o_proj = ops.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=bias, device=device, dtype=dtype)
+        self.sinks = nn.Parameter(torch.empty(self.num_heads, device=device, dtype=dtype))  # one sink logit per head; uninitialised until weights are loaded
+
+    def forward(self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor], freqs_cis) -> torch.Tensor:
+        B, S, _ = hidden_states.shape
+
+        q = self.q_proj(hidden_states).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)  # [B, num_heads, S, head_dim]
+        k = self.k_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)  # [B, num_kv_heads, S, head_dim]
+        v = self.v_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)
+
+        q, k = apply_rope(q, k, freqs_cis)
+
+        out = _attention_with_sinks(q, k, v, self.sinks, attention_mask, self.num_heads, self.num_kv_groups)
+        return self.o_proj(out)
+
+
+# Mixture of Experts
+
+class GptOssTopKRouter(nn.Module):
+    def __init__(self, config: GptOss20BConfig, device=None, dtype=None):
+        super().__init__()
+        self.top_k = config.num_experts_per_tok
+        self.num_experts = config.num_local_experts
+        self.weight = nn.Parameter(torch.empty(config.num_local_experts, config.hidden_size, device=device, dtype=dtype))  # raw parameters (not ops.Linear); cast per call in forward
+        self.bias = nn.Parameter(torch.empty(config.num_local_experts, device=device, dtype=dtype))
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        weight = comfy.ops.cast_to_input(self.weight, hidden_states, copy=False)
+        bias = comfy.ops.cast_to_input(self.bias, hidden_states, copy=False)
+        logits = F.linear(hidden_states, weight, bias)  # [..., num_experts]
+        top_vals, top_idx = torch.topk(logits, self.top_k, dim=-1)
+        # Softmax over top-k slice only
+        scores = F.softmax(top_vals, dim=-1, dtype=top_vals.dtype)
+        return scores, top_idx  # (routing weights, expert indices), both [..., top_k]
+
+
+class GptOssExperts(nn.Module):
+    def __init__(self, config: GptOss20BConfig, device=None, dtype=None, ops: Any = None):
+        super().__init__()
+        self.num_experts = config.num_local_experts
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.alpha = config.moe_alpha
+        self.limit = config.moe_limit
+
+        E = self.num_experts
+        H = self.hidden_size
+        I = self.intermediate_size
+
+        self.gate_up_proj = ops.MoEExperts(num_experts=E, in_features=H, out_features=2 * I, bias=True, device=device, dtype=dtype)  # gate and up projections fused into 2*I outputs
+        self.down_proj = ops.MoEExperts(num_experts=E, in_features=I, out_features=H, bias=True, device=device, dtype=dtype)
+
+    def _apply_gate(self, gate_up: torch.Tensor) -> torch.Tensor:
+        gate = gate_up[..., ::2]  # gate/up channels are interleaved: even = gate, odd = up
+        up = gate_up[..., 1::2]
+        gate = gate.clamp(max=self.limit)  # gate is clamped from above only
+        up = up.clamp(min=-self.limit, max=self.limit)
+        glu = gate * torch.sigmoid(gate * self.alpha)
+        return torch.addcmul(glu, up, glu)  # glu + up * glu == (up + 1) * glu
+
+    def forward(self, hidden_states: torch.Tensor, router_indices: torch.Tensor, routing_weights: torch.Tensor) -> torch.Tensor:
+        N = hidden_states.shape[0]
+        top_k = router_indices.shape[-1]
+        H = hidden_states.shape[-1]
+
+        per_pair = torch.zeros((N * top_k, H), dtype=hidden_states.dtype, device=hidden_states.device)  # one output row per (token, top-k slot) pair
+
+        expert_mask = F.one_hot(router_indices, num_classes=self.num_experts).permute(2, 1, 0)  # [num_experts, top_k, N]
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()  # indices of experts chosen by at least one token
+
+        with self.gate_up_proj.bank_resident(hidden_states) as gate_up_bank, \
+                self.down_proj.bank_resident(hidden_states) as down_bank:
+            for ei in expert_hit:
+                expert_idx = int(ei.item())
+                top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
+                current = hidden_states[token_idx]
+
+                gate_up = gate_up_bank.expert_linear(current, expert_idx)
+                gated = self._apply_gate(gate_up)
+                expert_out = down_bank.expert_linear(gated, expert_idx)
+
+                weighted = expert_out * routing_weights[token_idx, top_k_pos, None]
+
+                flat_idx = token_idx * top_k + top_k_pos  # row of this (token, slot) pair in per_pair
+                per_pair[flat_idx] = weighted.to(per_pair.dtype)
+
+        return per_pair.view(N, top_k, H).sum(dim=1)  # sum the top_k expert contributions per token
+
+
+class GptOssMLP(nn.Module):
+    def __init__(self, config: GptOss20BConfig, device=None, dtype=None, ops: Any = None):
+        super().__init__()
+        self.router = GptOssTopKRouter(config, device=device, dtype=dtype)
+        self.experts = GptOssExperts(config, device=device, dtype=dtype, ops=ops)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        B, S, H = hidden_states.shape
+        flat = hidden_states.reshape(-1, H)  # route every token independently: [B * S, H]
+        scores, idx = self.router(flat)
+        out = self.experts(flat, idx, scores)
+        return out.reshape(B, S, H)
+
+
+# Decoder layer + model
+
+class GptOssDecoderLayer(nn.Module):
+    def __init__(self, config: GptOss20BConfig, layer_idx: int, device=None, dtype=None, ops: Any = None):
+        super().__init__()
+        self.self_attn = GptOssAttention(config, layer_idx, device=device, dtype=dtype, ops=ops)
+        self.mlp = GptOssMLP(config, device=device, dtype=dtype, ops=ops)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+        self.layer_type = config.layer_types[layer_idx]  # key used to pick this layer's mask in forward()
+
+    def forward(self, x: torch.Tensor, attention_masks: dict[str, Optional[torch.Tensor]], freqs_cis) -> torch.Tensor:
+        residual = x  # pre-norm residual block 1: attention
+        x = self.input_layernorm(x)
+        x = self.self_attn(x, attention_masks[self.layer_type], freqs_cis)
+        x = residual + x
+
+        residual = x  # pre-norm residual block 2: mixture-of-experts MLP
+        x = self.post_attention_layernorm(x)
+        x = self.mlp(x)
+        x = residual + x
+        return x
+
+
+def _make_full_causal_mask(B: int, S: int, key_padding_mask: Optional[torch.Tensor], dtype, device):
+    """Additive causal mask [B, 1, S, S]: 0 where a query may attend, finfo(dtype).min penalties for future and padded keys."""
+    lowest = torch.finfo(dtype).min
+    causal = torch.triu(torch.full((S, S), lowest, dtype=dtype, device=device), diagonal=1)
+    causal = causal[None, None].expand(B, 1, S, S).contiguous()
+    if key_padding_mask is None:
+        return causal
+    pad_bias = (1.0 - key_padding_mask.to(dtype=dtype)).reshape(B, 1, 1, S) * lowest
+    return causal + pad_bias
+
+
+def _make_sliding_causal_mask(B: int, S: int, window: int, key_padding_mask: Optional[torch.Tensor], dtype, device):
+    """Additive causal mask [B, 1, S, S] where each query sees only its last ``window`` keys (itself included)."""
+    lowest = torch.finfo(dtype).min
+    positions = torch.arange(S, device=device)
+    offset = positions[:, None] - positions[None, :]  # query index minus key index
+    visible = (offset >= 0) & (offset < window)
+    blocked = torch.full((), lowest, dtype=dtype, device=device)
+    local = torch.where(visible, torch.zeros((), dtype=dtype, device=device), blocked)
+    local = local[None, None].expand(B, 1, S, S).contiguous()
+    if key_padding_mask is None:
+        return local
+    return local + (1.0 - key_padding_mask.to(dtype=dtype)).reshape(B, 1, 1, S) * lowest
+
+
+class GptOssModel(nn.Module):
+    """GPT-OSS decoder with multi-layer hidden-state capture + early exit (captured states are returned without the final norm)."""
+
+    def __init__(self, config: GptOss20BConfig, device=None, dtype=None, ops: Any = None):
+        super().__init__()
+        self.config = config
+        self.dtype = dtype
+        self.embed_tokens = ops.Embedding(config.vocab_size, config.hidden_size, device=device, dtype=dtype)
+        self.layers = nn.ModuleList(
+            [
+                GptOssDecoderLayer(config, i, device=device, dtype=dtype, ops=ops)
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, device=device, dtype=dtype)
+
+        # Always build on CPU so the buffer survives meta-device construction.
+        inv_freq, attn_scaling = _yarn_inv_freq(
+            head_dim=config.head_dim,
+            base=config.rope_theta,
+            factor=config.rope_factor,
+            beta_fast=config.rope_beta_fast,
+            beta_slow=config.rope_beta_slow,
+            original_max_position_embeddings=config.original_max_position_embeddings,
+            truncate=config.rope_truncate,
+            device=torch.device("cpu"),
+        )
+        self.register_buffer("rope_inv_freq", inv_freq, persistent=False)
+        self.rope_attention_scaling = float(attn_scaling)
+
+    @property
+    def num_layers(self) -> int:
+        return self.config.num_hidden_layers
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def _build_attention_masks(self, B: int, S: int, attention_mask: Optional[torch.Tensor], dtype: torch.dtype, device,
+    ) -> dict[str, torch.Tensor]:
+        full = _make_full_causal_mask(B, S, attention_mask, dtype, device)
+        masks = {"full_attention": full}
+        if any(t == "sliding_attention" for t in self.config.layer_types):  # build the sliding mask only if some layer needs it
+            masks["sliding_attention"] = _make_sliding_causal_mask(
+                B, S, self.config.sliding_window, attention_mask, dtype, device
+            )
+        return masks
+
+    def forward(self, input_ids: torch.LongTensor, attention_mask: Optional[torch.Tensor] = None,
+                capture_layers: Optional[Sequence[int]] = None) -> dict[str, Any]:
+        B, S = input_ids.shape
+        device = input_ids.device
+
+        hidden_states = self.embed_tokens(input_ids, out_dtype=self.dtype)
+        dtype = self.dtype if self.dtype is not None else hidden_states.dtype  # dtype=None at construction would otherwise break torch.finfo() in the mask builders
+
+        position_ids = torch.arange(S, device=device).unsqueeze(0).expand(B, -1)
+        freqs_cis = _build_freqs_cis(self.rope_inv_freq.to(device=device), self.rope_attention_scaling, position_ids, dtype)
+
+        attn_masks = self._build_attention_masks(B, S, attention_mask, dtype, device)
+
+        capture_layers = list(capture_layers) if capture_layers else None
+        if capture_layers:
+            max_layer = max(capture_layers)  # layers after the deepest requested one are never run
+            wanted = {idx: pos for pos, idx in enumerate(capture_layers)}  # layer index -> slot in `captured`
+            captured: List[Optional[torch.Tensor]] = [None] * len(capture_layers)
+        else:
+            max_layer = self.config.num_hidden_layers - 1
+            wanted = None
+            captured = None
+
+        for i, layer in enumerate(self.layers):
+            hidden_states = layer(hidden_states, attn_masks, freqs_cis)
+            if wanted is not None and i in wanted:
+                captured[wanted[i]] = hidden_states
+            if i >= max_layer:
+                break
+
+        if captured is not None:
+            return {"hidden_states": captured}
+        return {"last_hidden_state": self.norm(hidden_states)}
+
+
+# Lens chat-template constants (verbatim from the reference pipeline).
+_LENS_CHAT_SYSTEM = (
+    "Describe the image by detailing the color, shape, size, texture, "
+    "quantity, text, spatial relationships of the objects and background."
+)
+_LENS_CHAT_ASSISTANT_THINKING = "Need to generate one image according to the description."
+LENS_TXT_OFFSET = 97  # default number of leading positions LensGptOssClipModel drops from the encoder output
+LENS_SELECTED_LAYERS = (5, 11, 17, 23)  # default decoder layers whose hidden states are captured and stacked
+LENS_MAX_TOKENS = 512  # rendered prompts are truncated to this many token ids
+
+
+# The reference GPT-OSS Harmony template injects today's date here; this module uses a fixed date instead.
+_LENS_CHAT_DATE = "2026-05-23"
+
+
+def _lens_render_chat(prompt: str) -> str:
+    """Wrap ``prompt`` in the fixed system/developer/user/assistant messages of the GPT-OSS Harmony format."""
+    system_msg = (
+        "<|start|>system<|message|>"
+        "You are ChatGPT, a large language model trained by OpenAI.\n"
+        "Knowledge cutoff: 2024-06\n"
+        f"Current date: {_LENS_CHAT_DATE}\n\n"
+        "Reasoning: medium\n\n"
+        "# Valid channels: analysis, commentary, final. "
+        "Channel must be included for every message.<|end|>"
+    )
+    developer_msg = f"<|start|>developer<|message|># Instructions\n\n{_LENS_CHAT_SYSTEM}\n\n<|end|>"
+    user_msg = f"<|start|>user<|message|>{prompt}<|end|>"
+    analysis_msg = f"<|start|>assistant<|channel|>analysis<|message|>{_LENS_CHAT_ASSISTANT_THINKING}<|end|>"
+    # The last assistant message is only a header: it is left open with no <|end|>.
+    final_header = "<|start|>assistant<|channel|>final<|message|>"
+    return "".join((system_msg, developer_msg, user_msg, analysis_msg, final_header))
+
+
+# GPT-OSS-20B fixed token IDs (from the tokenizer's added-tokens table).
+_LENS_PAD_TOKEN_ID = 199999 # <|endoftext|>
+
+
+class _GptOssRawTokenizer:
+    """Raw ``tokenizers.Tokenizer`` wrapper.
+
+    The tokenizer JSON ships as a byte tensor inside the encoder checkpoint
+    (``tokenizer_json`` key) rather than as a committed file. ``sd.py``
+    extracts it and passes it here via ``tokenizer_data``.
+    """
+
+    def __init__(self, tokenizer_json_bytes=None, **kwargs):
+        from tokenizers import Tokenizer
+        if isinstance(tokenizer_json_bytes, torch.Tensor):  # convert the byte tensor back into a bytes object
+            tokenizer_json_bytes = bytes(tokenizer_json_bytes.tolist())
+        if tokenizer_json_bytes is None:
+            raise ValueError(
+                "Lens tokenizer requires the ``tokenizer_json`` byte tensor in the "
+                "encoder state dict. Re-bundle the encoder via bundle_te.py so it "
+                "embeds the tokenizer."
+            )
+        self.tokenizer = Tokenizer.from_str(tokenizer_json_bytes.decode("utf-8"))
+
+    @classmethod
+    def from_pretrained(cls, tokenizer_data, **kwargs):
+        return cls(tokenizer_json_bytes=tokenizer_data, **kwargs)
+
+    def __call__(self, text):
+        return {"input_ids": self.tokenizer.encode(text, add_special_tokens=False).ids}  # ids only; no special tokens are added here
+
+    def get_vocab(self):
+        return self.tokenizer.get_vocab()
+
+    def convert_tokens_to_ids(self, tokens):
+        return [self.tokenizer.token_to_id(t) for t in tokens]
+
+    def decode(self, ids, **kwargs):
+        return self.tokenizer.decode(ids, skip_special_tokens=kwargs.get("skip_special_tokens", False))
+
+
+class LensGptOssTokenizer(sd1_clip.SDTokenizer):
+    tokenizer_json_data = None
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        tokenizer_json = tokenizer_data.get("tokenizer_json", None)
+        self.tokenizer_json_data = tokenizer_json  # kept so state_dict() can hand the tokenizer JSON back
+        super().__init__(
+            tokenizer_json,
+            embedding_directory=embedding_directory,
+            pad_with_end=False,
+            embedding_size=2880,
+            embedding_key="gpt_oss",
+            tokenizer_class=_GptOssRawTokenizer,
+            has_start_token=False,
+            has_end_token=False,
+            pad_to_max_length=False,
+            max_length=99999999,
+            min_length=1,
+            pad_left=False,
+            disable_weights=True,
+            tokenizer_data=tokenizer_data,
+        )
+        self.pad_token_id = _LENS_PAD_TOKEN_ID
+
+    def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
+        # Empty prompt -> empty list; encode_token_weights returns zeros (uncond).
+        if not text or not text.strip():
+            return [[]]
+        rendered = _lens_render_chat(text)
+        ids = self.tokenizer(rendered)["input_ids"]
+        if len(ids) > LENS_MAX_TOKENS:
+            ids = ids[:LENS_MAX_TOKENS]  # truncation keeps the head of the rendered chat and drops the tail
+        return [[(int(t), 1.0) for t in ids]]  # single batch of (token_id, weight) pairs; return_word_ids is not used
+
+    def state_dict(self):
+        if self.tokenizer_json_data is not None:
+            return {"tokenizer_json": self.tokenizer_json_data}
+        return {}
+
+
+class LensTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(
+            embedding_directory=embedding_directory,
+            tokenizer_data=tokenizer_data,
+            name="gpt_oss",  # same name that LensTEModel passes for the matching model
+            tokenizer=LensGptOssTokenizer,
+        )
+
+
+class LensGptOssClipModel(nn.Module):
+    """SDClipModel-shaped Lens GPT-OSS encoder (multi-layer feature extractor)."""
+
+    def __init__(self, device="cpu", dtype=None, model_options=None, **kwargs):
+        super().__init__()
+        model_options = dict(model_options or {})  # shallow copy; also tolerates None
+
+        operations = model_options.get("custom_operations")
+        if operations is None:  # no ops supplied: build mixed-precision ops from the quantization metadata, if any
+            quant_config = model_options.get("quantization_metadata") or {}
+            operations = comfy.ops.mixed_precision_ops(quant_config, dtype, full_precision_mm=True)
+        self.operations = operations
+
+        cfg_overrides = model_options.get("gpt_oss_config", {})
+        self.config = GptOss20BConfig(**cfg_overrides)
+        self.selected_layers = tuple(model_options.get("selected_layers", LENS_SELECTED_LAYERS))
+        self.txt_offset = int(model_options.get("txt_offset", LENS_TXT_OFFSET))
+
+        self.transformer = GptOssModel(self.config, device=device, dtype=dtype, ops=operations)
+        self.num_layers = self.config.num_hidden_layers
+        self.dtype = dtype
+        self.execution_device = None
+        self._pad_token_id = _LENS_PAD_TOKEN_ID
+
+    def set_clip_options(self, options):
+        self.execution_device = options.get("execution_device", self.execution_device)  # only this option is read
+
+    def reset_clip_options(self):
+        self.execution_device = None
+
+    def _gather_tokens(self, token_weight_pairs):
+        ids_list = [[int(t[0]) for t in batch] for batch in token_weight_pairs]  # weights are ignored
+        pad_id = self._pad_token_id
+        max_len = max(len(x) for x in ids_list)
+        device = self.execution_device
+        ids = torch.full((len(ids_list), max_len), pad_id, dtype=torch.long, device=device)
+        mask = torch.zeros((len(ids_list), max_len), dtype=torch.long, device=device)
+        for i, x in enumerate(ids_list):  # right-pad: real tokens first, pad ids (mask 0) after
+            ids[i, : len(x)] = torch.tensor(x, dtype=torch.long, device=device)
+            mask[i, : len(x)] = 1
+        return ids, mask
+
+    def encode_token_weights(self, token_weight_pairs):
+        # Empty negative: emit zero-length features + zero mask
+        if all(len(batch) == 0 for batch in token_weight_pairs):
+            device = self.execution_device
+            B = len(token_weight_pairs)
+            L = len(self.selected_layers)
+            H = self.config.hidden_size
+            flat = torch.zeros(B, 0, L * H, dtype=self.dtype, device=device)
+            mask = torch.zeros(B, 0, dtype=torch.long, device=device)
+            return flat, None, {"attention_mask": mask, "num_layers_stacked": L}
+
+        input_ids, attn_mask = self._gather_tokens(token_weight_pairs)
+        out = self.transformer(input_ids, attention_mask=attn_mask, capture_layers=self.selected_layers)
+        layers = out["hidden_states"]  # list of L × [B, S, H]
+        stacked = torch.stack(layers, dim=2)  # [B, S, L, H]
+
+        offset = self.txt_offset  # leading positions to drop from the features and mask
+        if stacked.shape[1] > offset:
+            stacked = stacked[:, offset:].contiguous()
+            mask_trim = attn_mask[:, offset:]
+        else:  # sequence no longer than the offset: nothing remains
+            stacked = stacked[:, :0]
+            mask_trim = attn_mask[:, :0]
+
+        B, S, L, H = stacked.shape
+        flat = stacked.reshape(B, S, L * H)  # concatenate the captured layers along the feature axis
+        extra = {"attention_mask": mask_trim, "num_layers_stacked": L}
+        return flat, None, extra
+
+    def load_sd(self, sd):
+        return self.transformer.load_state_dict(sd, strict=False, assign=True)  # non-strict: missing/unexpected keys are returned, not raised
+
+
+class LensTEModel(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options=None):  # model_options=None is normalised to {} before reaching the base class
+        super().__init__(device=device, dtype=dtype, name="gpt_oss", clip_model=LensGptOssClipModel, model_options=model_options or {})
+
+
+def lens_te(dtype_llama=None, llama_quantization_metadata=None):
+    class LensTEModel_(LensTEModel):
+        def __init__(self, device="cpu", dtype=None, model_options=None):
+            mo = dict(model_options or {})  # private copy so the caller's dict is never mutated
+            if llama_quantization_metadata is not None:
+                mo["quantization_metadata"] = llama_quantization_metadata
+            if dtype is None and dtype_llama is not None:  # dtype_llama is only a fallback; an explicit dtype wins
+                dtype = dtype_llama
+            super().__init__(device=device, dtype=dtype, model_options=mo)
+
+    return LensTEModel_
diff --git a/comfy/text_encoders/hidream_o1.py b/comfy/text_encoders/hidream_o1.py
new file mode 100644
index 000000000..5d287b784
--- /dev/null
+++ b/comfy/text_encoders/hidream_o1.py
@@ -0,0 +1,119 @@
+"""HiDream-O1-Image tokenizer-only text encoder.
+
+The real Qwen3-VL backbone runs inside diffusion_model.* every step, so this
+module just tokenizes the prompt into text_input_ids and emits them as
+conditioning. Position ids / token_types / vinput_mask depend on target H/W
+and are built later in model_base.HiDreamO1.extra_conds.
+"""
+
+import os
+
+import torch
+from transformers import Qwen2Tokenizer
+
+from comfy import sd1_clip
+
+
+# Qwen3-VL special tokens
+IM_START_ID = 151644  # <|im_start|> in the chat template built by HiDreamO1Tokenizer
+IM_END_ID = 151645  # <|im_end|>
+ASSISTANT_ID = 77091  # the "assistant" role token of the template
+USER_ID = 872  # the "user" role token of the template
+NEWLINE_ID = 198  # the "\n" separators of the template
+VISION_START_ID = 151652  # the vision/image/video ids are not referenced anywhere else in this module
+VISION_END_ID = 151653
+IMAGE_TOKEN_ID = 151655
+VIDEO_TOKEN_ID = 151656
+# HiDream-O1-specific tokens (this module's template only emits BOI and TMS)
+BOI_TOKEN_ID = 151669
+BOR_TOKEN_ID = 151670
+EOR_TOKEN_ID = 151671
+BOT_TOKEN_ID = 151672
+TMS_TOKEN_ID = 151673
+
+
+class HiDreamO1QwenTokenizer(sd1_clip.SDTokenizer):
+    """SDTokenizer over the bundled "qwen25_tokenizer" files: no start/end token, no padding to max length."""
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        # Tokenizer files sit next to this module (symlinks resolved); embedding_directory is accepted but not forwarded.
+        module_dir = os.path.dirname(os.path.realpath(__file__))
+        settings = dict(
+            pad_with_end=False,
+            embedding_size=4096,
+            embedding_key="hidream_o1",
+            tokenizer_class=Qwen2Tokenizer,
+            has_start_token=False,
+            has_end_token=False,
+            pad_to_max_length=False,
+            max_length=99999999,
+            min_length=1,
+            pad_token=151643,
+        )
+        super().__init__(os.path.join(module_dir, "qwen25_tokenizer"), tokenizer_data=tokenizer_data, **settings)
+
+
+class HiDreamO1Tokenizer(sd1_clip.SD1Tokenizer):
+    """Tokenizes a prompt and wraps it in the chat template that ends in the boi/tms markers.
+
+    Image tokens are not added here; they get spliced in at sample time once
+    the target H/W is known.
+    """
+
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data,
+                         name="hidream_o1", tokenizer=HiDreamO1QwenTokenizer)
+
+    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
+        """Return {"hidream_o1": [tokens]} with tokens being one template-wrapped sequence.
+
+        Only the first chunk from the parent tokenizer is used, pad tokens
+        (id 151643) are dropped, and each template token gets weight 1.0
+        (plus word id 0 when return_word_ids is set).
+        """
+        parent_out = super().tokenize_with_weights(text, return_word_ids=return_word_ids, disable_weights=True, **kwargs)
+        body = [entry for entry in parent_out["hidream_o1"][0] if int(entry[0]) != 151643]
+
+        # Fields that follow the id in a template tuple: weight, then word id if requested.
+        extra_fields = (1.0, 0) if return_word_ids else (1.0,)
+
+        # <|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n<|boi|><|tms|>
+        head_ids = (IM_START_ID, USER_ID, NEWLINE_ID)
+        tail_ids = (IM_END_ID, NEWLINE_ID, IM_START_ID, ASSISTANT_ID, NEWLINE_ID, BOI_TOKEN_ID, TMS_TOKEN_ID)
+
+        wrapped = [(tid, *extra_fields) for tid in head_ids]
+        wrapped.extend(body)
+        wrapped.extend((tid, *extra_fields) for tid in tail_ids)
+        return {"hidream_o1": [wrapped]}
+
+
+class HiDreamO1TE(torch.nn.Module):
+    """Parameter-free text "encoder": passes integer token ids through; the Qwen3-VL backbone in diffusion_model does the encoding."""
+
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__()
+        self.dtypes = {torch.float32}
+        self.disable_offload = True  # opt out of dynamic VRAM management; this module owns no parameters
+        self.device = torch.device(device if device is not None else "cpu")
+
+    def encode_token_weights(self, token_weight_pairs):
+        """Return (surrogate cross_attn, None, {"text_input_ids": ids}) for the first "hidream_o1" sequence."""
+        first_sequence = token_weight_pairs["hidream_o1"][0]
+        input_ids = torch.tensor([[int(pair[0]) for pair in first_sequence]], dtype=torch.long)
+        # The float copy only keeps the cross_attn slot of CONDITIONING
+        # non-empty; the model reads text_input_ids from the extras dict.
+        surrogate = input_ids.to(torch.float32).unsqueeze(-1)
+        return surrogate, None, {"text_input_ids": input_ids}
+
+    # The remaining hooks are no-ops: no weights to load or save, no clip options to track.
+    def load_sd(self, sd):
+        return []
+
+    def get_sd(self):
+        return {}
+
+    def reset_clip_options(self):
+        pass
+
+    def set_clip_options(self, options):
+        pass
diff --git a/comfy/text_encoders/llama.py b/comfy/text_encoders/llama.py
index 6cdc47757..5087228ca 100644
--- a/comfy/text_encoders/llama.py
+++ b/comfy/text_encoders/llama.py
@@ -82,6 +82,7 @@ class Ministral3_3BConfig:
rope_scale = None
final_norm: bool = True
lm_head: bool = False
+ stop_tokens = [2]
@dataclass
class Qwen25_3BConfig:
@@ -396,7 +397,7 @@ class RMSNorm(nn.Module):
-def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None):
+def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_dims=None, device=None, interleaved_mrope=False):
if not isinstance(theta, list):
theta = [theta]
@@ -414,16 +415,27 @@ def precompute_freqs_cis(head_dim, position_ids, theta, rope_scale=None, rope_di
inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
position_ids_expanded = position_ids[:, None, :].float()
freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
- emb = torch.cat((freqs, freqs), dim=-1)
- cos = emb.cos()
- sin = emb.sin()
- if rope_dims is not None and position_ids.shape[0] > 1:
- mrope_section = rope_dims * 2
- cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
- sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
+ if rope_dims is not None and position_ids.shape[0] > 1 and interleaved_mrope:
+ # Qwen3-VL interleaved MRoPE: T-freqs by default, H/W replace every 3rd dim.
+ freqs_inter = freqs[0].clone()
+ for axis_idx, offset in ((1, 1), (2, 2)):
+ length = rope_dims[axis_idx] * 3
+ idx = slice(offset, length, 3)
+ freqs_inter[..., idx] = freqs[axis_idx, ..., idx]
+ emb = torch.cat((freqs_inter, freqs_inter), dim=-1)
+ cos = emb.cos().unsqueeze(0)
+ sin = emb.sin().unsqueeze(0)
else:
- cos = cos.unsqueeze(1)
- sin = sin.unsqueeze(1)
+ emb = torch.cat((freqs, freqs), dim=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+ if rope_dims is not None and position_ids.shape[0] > 1:
+ mrope_section = rope_dims * 2
+ cos = torch.cat([m[i % 3] for i, m in enumerate(cos.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
+ sin = torch.cat([m[i % 3] for i, m in enumerate(sin.split(mrope_section, dim=-1))], dim=-1).unsqueeze(0)
+ else:
+ cos = cos.unsqueeze(1)
+ sin = sin.unsqueeze(1)
sin_split = sin.shape[-1] // 2
out.append((cos, sin[..., : sin_split], -sin[..., sin_split :]))
@@ -520,7 +532,7 @@ class Attention(nn.Module):
else:
present_key_value = (xk, xv, index + num_tokens)
- if sliding_window is not None and xk.shape[2] > sliding_window:
+ if sliding_window is not None and xk.shape[2] > sliding_window and seq_length == 1:
xk = xk[:, :, -sliding_window:]
xv = xv[:, :, -sliding_window:]
attention_mask = attention_mask[..., -sliding_window:] if attention_mask is not None else None
@@ -532,12 +544,12 @@ class Attention(nn.Module):
return self.o_proj(output), present_key_value
class MLP(nn.Module):
- def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None):
+ def __init__(self, config: Llama2Config, device=None, dtype=None, ops: Any = None, intermediate_size=None):
super().__init__()
- ops = ops or nn
- self.gate_proj = ops.Linear(config.hidden_size, config.intermediate_size, bias=False, device=device, dtype=dtype)
- self.up_proj = ops.Linear(config.hidden_size, config.intermediate_size, bias=False, device=device, dtype=dtype)
- self.down_proj = ops.Linear(config.intermediate_size, config.hidden_size, bias=False, device=device, dtype=dtype)
+ intermediate_size = intermediate_size or config.intermediate_size
+ self.gate_proj = ops.Linear(config.hidden_size, intermediate_size, bias=False, device=device, dtype=dtype)
+ self.up_proj = ops.Linear(config.hidden_size, intermediate_size, bias=False, device=device, dtype=dtype)
+ self.down_proj = ops.Linear(intermediate_size, config.hidden_size, bias=False, device=device, dtype=dtype)
if config.mlp_activation == "silu":
self.activation = torch.nn.functional.silu
elif config.mlp_activation == "gelu_pytorch_tanh":
@@ -646,24 +658,25 @@ class TransformerBlockGemma2(nn.Module):
return x, present_key_value
+def _make_scaled_embedding(ops, vocab_size, hidden_size, scale, device, dtype):
+    class ScaledEmbedding(ops.Embedding):  # ops.Embedding whose forward output is multiplied by `scale`
+        def forward(self, input_ids, out_dtype=None):
+            return scale * super().forward(input_ids, out_dtype=out_dtype)
+    return ScaledEmbedding(vocab_size, hidden_size, device=device, dtype=dtype)
+
+
class Llama2_(nn.Module):
def __init__(self, config, device=None, dtype=None, ops=None):
super().__init__()
self.config = config
self.vocab_size = config.vocab_size
- self.embed_tokens = ops.Embedding(
- config.vocab_size,
- config.hidden_size,
- device=device,
- dtype=dtype
- )
if self.config.transformer_type == "gemma2" or self.config.transformer_type == "gemma3":
transformer = TransformerBlockGemma2
- self.normalize_in = True
+ self.embed_tokens = _make_scaled_embedding(ops, config.vocab_size, config.hidden_size, config.hidden_size ** 0.5, device, dtype)
else:
transformer = TransformerBlock
- self.normalize_in = False
+ self.embed_tokens = ops.Embedding(config.vocab_size, config.hidden_size, device=device, dtype=dtype)
self.layers = nn.ModuleList([
transformer(config, index=i, device=device, dtype=dtype, ops=ops)
@@ -687,17 +700,15 @@ class Llama2_(nn.Module):
self.config.rope_theta,
self.config.rope_scale,
self.config.rope_dims,
+ interleaved_mrope=getattr(self.config, "interleaved_mrope", False),
device=device)
- def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None):
+ def forward(self, x, attention_mask=None, embeds=None, num_tokens=None, intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, position_ids=None, embeds_info=[], past_key_values=None, input_ids=None):
if embeds is not None:
x = embeds
else:
x = self.embed_tokens(x, out_dtype=dtype)
- if self.normalize_in:
- x *= self.config.hidden_size ** 0.5
-
seq_len = x.shape[1]
past_len = 0
if past_key_values is not None and len(past_key_values) > 0:
@@ -849,7 +860,7 @@ class BaseGenerate:
torch.empty([batch, model_config.num_key_value_heads, max_cache_len, model_config.head_dim], device=device, dtype=execution_dtype), 0))
return past_key_values
- def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0):
+ def generate(self, embeds=None, do_sample=True, max_length=256, temperature=1.0, top_k=50, top_p=0.9, min_p=0.0, repetition_penalty=1.0, seed=42, stop_tokens=None, initial_tokens=[], execution_dtype=None, min_tokens=0, presence_penalty=0.0, initial_input_ids=None):
device = embeds.device
if stop_tokens is None:
@@ -874,14 +885,16 @@ class BaseGenerate:
pbar = comfy.utils.ProgressBar(max_length)
# Generation loop
+ current_input_ids = initial_input_ids
for step in tqdm(range(max_length), desc="Generating tokens"):
- x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values)
+ x, _, past_key_values = self.model.forward(None, embeds=embeds, attention_mask=None, past_key_values=past_key_values, input_ids=current_input_ids)
logits = self.logits(x)[:, -1]
next_token = self.sample_token(logits, temperature, top_k, top_p, min_p, repetition_penalty, initial_tokens + generated_token_ids, generator, do_sample=do_sample, presence_penalty=presence_penalty)
token_id = next_token[0].item()
generated_token_ids.append(token_id)
embeds = self.model.embed_tokens(next_token).to(execution_dtype)
+ current_input_ids = next_token if initial_input_ids is not None else None
pbar.update(1)
if token_id in stop_tokens:
@@ -969,7 +982,7 @@ class Mistral3Small24B(BaseLlama, torch.nn.Module):
self.model = Llama2_(config, device=device, dtype=dtype, ops=operations)
self.dtype = dtype
-class Ministral3_3B(BaseLlama, torch.nn.Module):
+class Ministral3_3B(BaseLlama, BaseQwen3, BaseGenerate, torch.nn.Module):
def __init__(self, config_dict, dtype, device, operations):
super().__init__()
config = Ministral3_3BConfig(**config_dict)
diff --git a/comfy/text_encoders/lt.py b/comfy/text_encoders/lt.py
index 5aee1f4c0..bc5cbae28 100644
--- a/comfy/text_encoders/lt.py
+++ b/comfy/text_encoders/lt.py
@@ -93,8 +93,7 @@ class Gemma3_12BModel(sd1_clip.SDClipModel):
def generate(self, tokens, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, presence_penalty):
tokens_only = [[t[0] for t in b] for b in tokens]
- embeds, _, _, embeds_info = self.process_tokens(tokens_only, self.execution_device)
- comfy.utils.normalize_image_embeddings(embeds, embeds_info, self.transformer.model.config.hidden_size ** 0.5)
+ embeds, _, _, _ = self.process_tokens(tokens_only, self.execution_device)
return self.transformer.generate(embeds, do_sample, max_length, temperature, top_k, top_p, min_p, repetition_penalty, seed, stop_tokens=[106], presence_penalty=presence_penalty) # 106 is
class DualLinearProjection(torch.nn.Module):
diff --git a/comfy/text_encoders/lumina2.py b/comfy/text_encoders/lumina2.py
index 01ebdfabe..b1f1dbb9f 100644
--- a/comfy/text_encoders/lumina2.py
+++ b/comfy/text_encoders/lumina2.py
@@ -50,8 +50,7 @@ class Gemma3_4B_Vision_Model(sd1_clip.SDClipModel):
super().__init__(device=device, layer=layer, layer_idx=layer_idx, textmodel_json_config={}, dtype=dtype, special_tokens={"start": 2, "pad": 0}, layer_norm_hidden_state=False, model_class=comfy.text_encoders.llama.Gemma3_4B_Vision, enable_attention_masks=attention_mask, return_attention_masks=attention_mask, model_options=model_options)
def process_tokens(self, tokens, device):
- embeds, _, _, embeds_info = super().process_tokens(tokens, device)
- comfy.utils.normalize_image_embeddings(embeds, embeds_info, self.transformer.model.config.hidden_size ** 0.5)
+ embeds, _, _, _ = super().process_tokens(tokens, device)
return embeds
class LuminaModel(sd1_clip.SD1ClipModel):
diff --git a/comfy/text_encoders/pixeldit.py b/comfy/text_encoders/pixeldit.py
new file mode 100644
index 000000000..3539711e4
--- /dev/null
+++ b/comfy/text_encoders/pixeldit.py
@@ -0,0 +1,104 @@
+import torch
+
+from comfy import sd1_clip
+from .lumina2 import Gemma2BTokenizer, LuminaModel
+import comfy.text_encoders.llama
+
+
+class PixelDiTGemma2_2BModel(sd1_clip.SDClipModel):
+    """SDClipModel configured around comfy.text_encoders.llama.Gemma2_2B for PixelDiT."""
+
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, attention_mask=True, model_options={}):
+        # Re-publish the llama quantization metadata under the key "quantization_metadata",
+        # on a copy so the caller's dict (or the shared default) is left untouched.
+        quant_meta = model_options.get("llama_quantization_metadata", None)
+        if quant_meta is not None:
+            model_options = {**model_options, "quantization_metadata": quant_meta}
+        super().__init__(
+            device=device, dtype=dtype, layer=layer, layer_idx=layer_idx,
+            textmodel_json_config={}, layer_norm_hidden_state=False,
+            special_tokens={"start": 2, "pad": 0},
+            model_class=comfy.text_encoders.llama.Gemma2_2B,
+            enable_attention_masks=attention_mask, return_attention_masks=attention_mask,
+            model_options=model_options,
+        )
+
+
+_PIXELDIT_CHI_PROMPT = (
+ 'Given a user prompt, generate an "Enhanced prompt" that provides detailed visual descriptions '
+ "suitable for image generation. Evaluate the level of detail in the user prompt:\n"
+ "- If the prompt is simple, focus on adding specifics about colors, shapes, sizes, textures, "
+ "and spatial relationships to create vivid and concrete scenes.\n"
+ "- If the prompt is already detailed, refine and enhance the existing details slightly without "
+ "overcomplicating.\n"
+ "Here are examples of how to transform or refine prompts:\n"
+ "- User Prompt: A cat sleeping -> Enhanced: A small, fluffy white cat curled up in a round shape, "
+ "sleeping peacefully on a warm sunny windowsill, surrounded by pots of blooming red flowers.\n"
+ "- User Prompt: A busy city street -> Enhanced: A bustling city street scene at dusk, featuring "
+ "glowing street lamps, a diverse crowd of people in colorful clothing, and a double-decker bus "
+ "passing by towering glass skyscrapers.\n"
+ "Please generate only the enhanced description for the prompt below and avoid including any "
+ "additional commentary or evaluations:\n"
+ "User Prompt: "
+)
+
+_PIXELDIT_MAX_LENGTH = 300  # pad/clip length used by PixelDiTGemma2Tokenizer and cap on the embeddings kept by PixelDiTGemma2TE
+_PIXELDIT_CHI_PROMPT_DETECT_PREFIX = 'Given a user prompt, generate an "Enhanced prompt"'  # opening words of _PIXELDIT_CHI_PROMPT; text starting with them is not prefixed again
+
+
+class PixelDiTGemma2Tokenizer(sd1_clip.SD1Tokenizer):
+    """Gemma2 tokenizer wrapper that puts the PixelDiT instruction prompt in front of non-empty text."""
+
+    def __init__(self, embedding_directory=None, tokenizer_data=None):
+        super().__init__(embedding_directory=embedding_directory, name="gemma2_2b", tokenizer=Gemma2BTokenizer,
+                         tokenizer_data={} if tokenizer_data is None else tokenizer_data)
+
+    def tokenize_with_weights(self, text, return_word_ids=False, **kwargs):
+        shared = dict(return_word_ids=return_word_ids, disable_weights=True)  # note: **kwargs is not forwarded
+        if text.strip() == "":
+            # Blank prompt: no instruction prefix, only the fixed minimum length.
+            return super().tokenize_with_weights("", min_length=_PIXELDIT_MAX_LENGTH, **shared)
+        budget = len(self.gemma2_2b.tokenizer(_PIXELDIT_CHI_PROMPT)["input_ids"]) + _PIXELDIT_MAX_LENGTH - 2
+        already_prefixed = text.startswith(_PIXELDIT_CHI_PROMPT_DETECT_PREFIX)
+        prompt = text if already_prefixed else _PIXELDIT_CHI_PROMPT + text
+        tokens = super().tokenize_with_weights(prompt, min_length=budget, **shared)
+        tokens["gemma2_2b"] = [tokens["gemma2_2b"][0][:budget]]  # first chunk only, clipped to the budget
+        return tokens
+
+    def untokenize(self, token_weight_pair):
+        return self.gemma2_2b.untokenize(token_weight_pair)
+
+    def state_dict(self):
+        return self.gemma2_2b.state_dict()
+
+
+class PixelDiTGemma2TE(LuminaModel):
+    """LuminaModel over PixelDiTGemma2_2BModel that trims over-long outputs to position 0 plus the trailing positions."""
+    def __init__(self, device="cpu", dtype=None, model_options={}):
+        super().__init__(device=device, dtype=dtype, model_options=model_options,
+                         name="gemma2_2b", clip_model=PixelDiTGemma2_2BModel)
+
+    def encode_token_weights(self, token_weight_pairs):
+        # PixelDiT's select_index: keep BOS + last 299 embeddings of the padded sequence.
+        encoded = super().encode_token_weights(token_weight_pairs)
+        cond, pooled, *rest = encoded
+        extra = rest[0] if rest else None
+        tail = _PIXELDIT_MAX_LENGTH - 1
+        if cond.shape[1] > _PIXELDIT_MAX_LENGTH:
+            cond = torch.cat((cond[:, :1], cond[:, -tail:]), dim=1)
+            if extra is not None and "attention_mask" in extra:
+                mask = extra["attention_mask"]
+                extra["attention_mask"] = torch.cat((mask[..., :1], mask[..., -tail:]), dim=-1)
+        return (cond, pooled) if extra is None else (cond, pooled, extra)
+
+
+def pixeldit_te(dtype_llama=None, llama_quantization_metadata=None):
+    """Build a PixelDiTGemma2TE subclass that applies the given dtype and quantization metadata on construction."""
+    class PixelDiTTE_(PixelDiTGemma2TE):
+        def __init__(self, device="cpu", dtype=None, model_options={}):
+            options = model_options
+            if llama_quantization_metadata is not None:  # work on a copy so the caller's dict stays untouched
+                options = {**model_options, "llama_quantization_metadata": llama_quantization_metadata}
+            # When dtype_llama is set it replaces even an explicitly passed dtype.
+            super().__init__(device=device, dtype=dtype if dtype_llama is None else dtype_llama, model_options=options)
+    return PixelDiTTE_
diff --git a/comfy/text_encoders/qwen35.py b/comfy/text_encoders/qwen35.py
index ce9b07464..416ce9d18 100644
--- a/comfy/text_encoders/qwen35.py
+++ b/comfy/text_encoders/qwen35.py
@@ -408,8 +408,6 @@ class Qwen35Transformer(Llama2_):
nn.Module.__init__(self)
self.config = config
self.vocab_size = config.vocab_size
- self.normalize_in = False
-
self.embed_tokens = ops.Embedding(config.vocab_size, config.hidden_size, device=device, dtype=dtype)
self.layers = nn.ModuleList([
Qwen35TransformerBlock(config, index=i, device=device, dtype=dtype, ops=ops)
@@ -453,9 +451,8 @@ class Qwen35VisionPatchEmbed(nn.Module):
self.proj = ops.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True, device=device, dtype=dtype)
def forward(self, x):
- target_dtype = self.proj.weight.dtype
x = x.view(-1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size)
- return self.proj(x.to(target_dtype)).view(-1, self.embed_dim)
+ return self.proj(x).view(-1, self.embed_dim)
class Qwen35VisionMLP(nn.Module):
@@ -653,7 +650,7 @@ class Qwen35VisionModel(nn.Module):
x = self.patch_embed(x)
pos_embeds = self.fast_pos_embed_interpolate(grid_thw).to(x.device)
x = x + pos_embeds
- rotary_pos_emb = self.rot_pos_emb(grid_thw)
+ rotary_pos_emb = self.rot_pos_emb(grid_thw).to(x.device)
seq_len = x.shape[0]
x = x.reshape(seq_len, -1)
rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
@@ -763,7 +760,7 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
def tokenize_with_weights(self, text, return_word_ids=False, llama_template=None, images=[], prevent_empty_text=False, thinking=False, **kwargs):
image = kwargs.get("image", None)
if image is not None and len(images) == 0:
- images = [image]
+ images = [image[i:i + 1] for i in range(image.shape[0])]
skip_template = False
if text.startswith('<|im_start|>'):
@@ -774,13 +771,16 @@ class Qwen35ImageTokenizer(sd1_clip.SD1Tokenizer):
if skip_template:
llama_text = text
else:
- if llama_template is None:
- if len(images) > 0:
- llama_text = self.llama_template_images.format(text)
- else:
- llama_text = self.llama_template.format(text)
+ if llama_template is not None:
+ template = llama_template
+ elif len(images) == 0:
+ template = self.llama_template
else:
- llama_text = llama_template.format(text)
+ template = self.llama_template_images
+ if len(images) > 1:
+ vision_block = "<|vision_start|><|image_pad|><|vision_end|>"
+ template = template.replace(vision_block, vision_block * len(images), 1)
+ llama_text = template.format(text)
if not thinking:
llama_text += "\n \n"
diff --git a/comfy/text_encoders/sa3.py b/comfy/text_encoders/sa3.py
new file mode 100644
index 000000000..0a1c73ec1
--- /dev/null
+++ b/comfy/text_encoders/sa3.py
@@ -0,0 +1,207 @@
+import torch
+import torch.nn as nn
+from comfy import sd1_clip
+from comfy.text_encoders.llama import Attention as LlamaAttention, RMSNorm, MLP, precompute_freqs_cis, apply_rope, _make_scaled_embedding
+from comfy.text_encoders.spiece_tokenizer import SPieceTokenizer
+
+
+class T5GemmaEncoderConfig:
+    """Hard-coded hyperparameters of the T5Gemma encoder built in this module (12 layers, hidden size 768)."""
+    def __init__(self):
+        self.vocab_size = 256000
+        self.hidden_size = 768
+        self.intermediate_size = 2048
+        self.num_hidden_layers = 12
+        self.num_attention_heads = 12
+        self.num_key_value_heads = 12
+        self.head_dim = 64
+        self.rms_norm_eps = 1e-6
+        self.rms_norm_add = True  # single source of truth; an earlier contradictory `= False` was dead code
+        self.rope_theta = 10000.0
+        self.attn_logit_softcapping = 50.0
+        self.query_pre_attn_scalar = 64
+        self.sliding_window = 4096
+        self.mlp_activation = "gelu_pytorch_tanh"
+        self.layer_types = ["sliding_attention", "full_attention"] * 6  # one entry per layer, alternating
+        self.qkv_bias = False
+        self.q_norm = None
+        self.k_norm = None
+
+
+class T5GemmaAttention(LlamaAttention):
+    """Reuses LlamaAttention projection setup; overrides forward for softcap attention.
+
+    T5Gemma applies tanh(QK^T * scale / cap) * cap between the matmul and softmax.
+    This nonlinearity is incompatible with fused SDPA kernels, so attention is
+    computed manually. Projections come from LlamaAttention.__init__, which is
+    extended here only to record the query scale and the softcap value.
+    """
+
+    def __init__(self, config, device=None, dtype=None, ops=None):
+        super().__init__(config, device=device, dtype=dtype, ops=ops)
+        self.scale = config.query_pre_attn_scalar ** -0.5  # 1 / sqrt(query_pre_attn_scalar), applied to the queries
+        self.softcap = config.attn_logit_softcapping
+
+    def forward(self, hidden_states, attention_mask=None, freqs_cis=None, **kwargs):  # extra kwargs are accepted and ignored
+        B, S, _ = hidden_states.shape
+        xq = self.q_proj(hidden_states).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)  # (B, num_heads, S, head_dim)
+        xk = self.k_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)  # (B, num_kv_heads, S, head_dim)
+        xv = self.v_proj(hidden_states).view(B, S, self.num_kv_heads, self.head_dim).transpose(1, 2)  # (B, num_kv_heads, S, head_dim)
+        xq, xk = apply_rope(xq, xk, freqs_cis)
+        xk = xk.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)  # repeat K heads up to num_heads
+        xv = xv.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)  # same for V
+        attn = torch.matmul(xq * self.scale, xk.transpose(-2, -1))  # scaled attention logits
+        attn = torch.tanh(attn / self.softcap) * self.softcap  # soft cap: squashes logits into (-softcap, softcap)
+        if attention_mask is not None:
+            attn = attn + attention_mask  # additive mask, applied after the soft cap
+        attn = torch.nn.functional.softmax(attn.float(), dim=-1).to(xq.dtype)  # softmax in float32, then back to the query dtype
+        out = torch.matmul(attn, xv).transpose(1, 2).reshape(B, S, self.inner_size)
+        return self.o_proj(out), None  # second element is always None here
+
+
+class T5GemmaBlock(nn.Module):
+    def __init__(self, config, layer_type, device=None, dtype=None, ops=None):
+        super().__init__()
+        self.self_attn = T5GemmaAttention(config, device=device, dtype=dtype, ops=ops)
+        self.mlp = MLP(config, device=device, dtype=dtype, ops=ops)
+        # Attribute names double as checkpoint key segments: model.encoder.layers.X.{name}.weight
+        self.pre_self_attn_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype)
+        self.post_self_attn_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype)
+        self.pre_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype)
+        self.post_feedforward_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype)
+        self.is_sliding = (layer_type == "sliding_attention")  # any other layer_type value disables the window mask
+        self.sliding_window = config.sliding_window
+
+    def forward(self, x, attention_mask=None, freqs_cis=None):
+        attn_mask = attention_mask
+        if self.is_sliding and x.shape[1] > self.sliding_window:  # shorter sequences never exceed the window
+            S = x.shape[1]
+            pos = torch.arange(S, device=x.device)
+            dist = (pos.unsqueeze(0) - pos.unsqueeze(1)).abs()  # |i - j|: the window is symmetric, not causal
+            sw_mask = torch.zeros(S, S, dtype=x.dtype, device=x.device)
+            sw_mask.masked_fill_(dist > self.sliding_window, -torch.finfo(x.dtype).max)  # block pairs farther apart than the window
+            sw_mask = sw_mask.unsqueeze(0).unsqueeze(0)  # (1, 1, S, S) so it broadcasts against the attention logits
+            attn_mask = (attention_mask + sw_mask) if attention_mask is not None else sw_mask
+        residual = x
+        x = self.pre_self_attn_layernorm(x)
+        x, _ = self.self_attn(x, attention_mask=attn_mask, freqs_cis=freqs_cis)
+        x = self.post_self_attn_layernorm(x)  # the attention output is normed before the residual add
+        x = residual + x
+        residual = x
+        x = self.pre_feedforward_layernorm(x)
+        x = self.mlp(x)
+        x = self.post_feedforward_layernorm(x)  # same pre/post norm sandwich around the MLP
+        x = residual + x
+        return x
+
+
+class T5GemmaEncoder(nn.Module):
+    """Encoder stack: embed_tokens, layers, norm.
+    Keys: embed_tokens.*, layers.X.*, norm.*"""
+
+    def __init__(self, config, device, dtype, ops):
+        super().__init__()
+        self.config = config
+        # Gemma-style scaled embedding: output *= sqrt(hidden_size)
+        self.embed_tokens = _make_scaled_embedding(
+            ops, config.vocab_size, config.hidden_size, config.hidden_size ** 0.5, device, dtype)
+        self.layers = nn.ModuleList([
+            T5GemmaBlock(config, config.layer_types[i], device=device, dtype=dtype, ops=ops)  # layer_types[i] selects sliding vs full attention
+            for i in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps, add=True, device=device, dtype=dtype)
+
+    def forward(self, input_ids, attention_mask=None, embeds=None, intermediate_output=None,
+                final_layer_norm_intermediate=True, dtype=None, num_layers=None):  # num_layers is accepted but never read
+        x = embeds if embeds is not None else self.embed_tokens(input_ids, out_dtype=dtype or torch.float32)  # caller-supplied embeds bypass embed_tokens (and its scaling)
+        seq_len = x.shape[1]
+        position_ids = torch.arange(seq_len, device=x.device).unsqueeze(0)  # positions 0..seq_len-1, one row
+        freqs_cis = precompute_freqs_cis(self.config.head_dim, position_ids, self.config.rope_theta, device=x.device)
+        mask = None
+        if attention_mask is not None:
+            mask = 1.0 - attention_mask.to(x.dtype).reshape(  # invert: nonzero now marks positions to hide
+                (attention_mask.shape[0], 1, -1, attention_mask.shape[-1])
+            ).expand(attention_mask.shape[0], 1, seq_len, attention_mask.shape[-1])
+            mask = mask.masked_fill(mask.to(torch.bool), -torch.finfo(x.dtype).max)  # additive form: 0 to keep, -finfo.max to hide
+        intermediate = None
+        for i, layer in enumerate(self.layers):
+            x = layer(x, attention_mask=mask, freqs_cis=freqs_cis)
+            if i == intermediate_output:  # zero-based index of the layer whose output is captured
+                intermediate = x.clone()
+        x = self.norm(x)
+        if intermediate is not None and final_layer_norm_intermediate:
+            intermediate = self.norm(intermediate)  # the captured state goes through the same final norm
+        return x, intermediate
+
+
+class T5GemmaBody(nn.Module):
+    """Thin holder for the 'encoder' sub-module, so its
+    parameter keys come out as encoder.*"""
+
+    def __init__(self, config, device, dtype, ops):
+        super().__init__()
+        self.encoder = T5GemmaEncoder(config=config, device=device, dtype=dtype, ops=ops)
+
+
+class T5GemmaModel(nn.Module):
+    """Model class handed to SDClipModel as model_class.
+    The self.model.encoder.* module tree lines up with checkpoint keys model.encoder.*"""
+
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        # config_dict is part of the model_class call signature but is not read;
+        # the architecture comes from T5GemmaEncoderConfig.
+        cfg = T5GemmaEncoderConfig()
+        self.num_layers = cfg.num_hidden_layers
+        self.dtype = dtype
+        self.model = T5GemmaBody(cfg, device, dtype, operations)
+
+    def get_input_embeddings(self):
+        return self.model.encoder.embed_tokens
+
+    def set_input_embeddings(self, embeddings):
+        self.model.encoder.embed_tokens = embeddings
+
+    def forward(self, input_ids, attention_mask=None, embeds=None, num_tokens=None,
+                intermediate_output=None, final_layer_norm_intermediate=True, dtype=None, **kwargs):
+        layer_index = intermediate_output
+        if layer_index is not None and layer_index < 0:
+            layer_index += self.num_layers  # negative values count back from the last layer
+        return self.model.encoder(input_ids, attention_mask=attention_mask, embeds=embeds, intermediate_output=layer_index,
+                                  final_layer_norm_intermediate=final_layer_norm_intermediate, dtype=dtype, num_layers=self.num_layers)
+
+
+class T5GemmaSDClipModel(sd1_clip.SDClipModel):
+    """SDClipModel around T5GemmaModel with attention masks enabled and zero_out_masked set."""
+    def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None, model_options={}):
+        super().__init__(
+            device=device, dtype=dtype, layer=layer, layer_idx=layer_idx,
+            model_class=T5GemmaModel, textmodel_json_config={}, special_tokens={"pad": 0},
+            enable_attention_masks=True, zero_out_masked=True, model_options=model_options,
+        )
+
+
+class T5GemmaSDTokenizer(sd1_clip.SDTokenizer):
+    """SPieceTokenizer-backed tokenizer whose model is taken from tokenizer_data["spiece_model"]."""
+    def __init__(self, embedding_directory=None, tokenizer_data={}):
+        super().__init__(
+            tokenizer_data.get("spiece_model", None), tokenizer_class=SPieceTokenizer,
+            tokenizer_args={"add_bos": False, "add_eos": False}, tokenizer_data=tokenizer_data,
+            embedding_key="t5gemma", embedding_size=768, pad_token=0, pad_with_end=False,
+            has_start_token=False, has_end_token=False, pad_to_max_length=False, max_length=99999999, min_length=1)
+
+    def state_dict(self):
+        # Stored under the same key that __init__ reads the model from.
+        return {"spiece_model": self.tokenizer.serialize_model()}
+
+
+class SAT5GemmaTokenizer(sd1_clip.SD1Tokenizer):
+    def __init__(self, embedding_directory=None, tokenizer_data={}):  # SD1Tokenizer wrapper exposing T5GemmaSDTokenizer as "t5gemma"
+        super().__init__(clip_name="t5gemma", tokenizer=T5GemmaSDTokenizer,
+                         embedding_directory=embedding_directory, tokenizer_data=tokenizer_data)
+
+
+class SAT5GemmaModel(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):  # extra kwargs pass straight through to SD1ClipModel
+        super().__init__(name="t5gemma", clip_model=T5GemmaSDClipModel,
+                         device=device, dtype=dtype, model_options=model_options, **kwargs)
diff --git a/comfy/text_encoders/sam3_clip.py b/comfy/text_encoders/sam3_clip.py
new file mode 100644
index 000000000..11cb7d9db
--- /dev/null
+++ b/comfy/text_encoders/sam3_clip.py
@@ -0,0 +1,97 @@
+import re
+from comfy import sd1_clip
+
+SAM3_CLIP_CONFIG = {
+ "architectures": ["CLIPTextModel"],
+ "hidden_act": "quick_gelu",
+ "hidden_size": 1024,
+ "intermediate_size": 4096,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "max_position_embeddings": 32,
+ "projection_dim": 512,
+ "vocab_size": 49408,
+ "layer_norm_eps": 1e-5,
+ "eos_token_id": 49407,
+}
+
+
+class SAM3ClipModel(sd1_clip.SDClipModel):
+ def __init__(self, device="cpu", dtype=None, model_options={}):
+ super().__init__(device=device, dtype=dtype, max_length=32, layer="last", textmodel_json_config=SAM3_CLIP_CONFIG, special_tokens={"start": 49406, "end": 49407, "pad": 0}, return_projected_pooled=False, return_attention_masks=True, enable_attention_masks=True, model_options=model_options)
+
+
+class SAM3Tokenizer(sd1_clip.SDTokenizer):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
+ super().__init__(max_length=32, pad_with_end=False, pad_token=0, embedding_directory=embedding_directory, embedding_size=1024, embedding_key="sam3_clip", tokenizer_data=tokenizer_data)
+ self.disable_weights = True
+
+
+def _parse_prompts(text):
+    """Split comma-separated prompts into (text, max_detections) pairs; an optional ':N' suffix sets N (rounded, min 1, default 1)."""
+    text = text.replace("(", "").replace(")", "")
+    parts = [p.strip() for p in text.split(",") if p.strip()]
+    result = []
+    for part in parts:
+        m = re.match(r'^(.+?)\s*:\s*(\d+(?:\.\d*)?|\.\d+)\s*$', part)  # only well-formed numbers match, so float() below cannot raise ValueError
+        if m:
+            text_part = m.group(1).strip()
+            val = m.group(2)
+            max_det = max(1, round(float(val)))
+            result.append((text_part, max_det))
+        else:
+            result.append((part, 1))
+    return result
+
+
+class SAM3TokenizerWrapper(sd1_clip.SD1Tokenizer):
+ def __init__(self, embedding_directory=None, tokenizer_data={}):
+ super().__init__(embedding_directory=embedding_directory, tokenizer_data=tokenizer_data, clip_name="l", tokenizer=SAM3Tokenizer, name="sam3_clip")
+
+ def tokenize_with_weights(self, text: str, return_word_ids=False, **kwargs):
+ parsed = _parse_prompts(text)
+ if len(parsed) <= 1 and (not parsed or parsed[0][1] == 1):
+ return super().tokenize_with_weights(text, return_word_ids, **kwargs)
+ # Tokenize each prompt part separately, store per-part batches and metadata
+ inner = getattr(self, self.clip)
+ per_prompt = []
+ for prompt_text, max_det in parsed:
+ batches = inner.tokenize_with_weights(prompt_text, return_word_ids, **kwargs)
+ per_prompt.append((batches, max_det))
+ # Main output uses first prompt's tokens (for compatibility)
+ out = {self.clip_name: per_prompt[0][0], "sam3_per_prompt": per_prompt}
+ return out
+
+
+class SAM3ClipModelWrapper(sd1_clip.SD1ClipModel):
+    def __init__(self, device="cpu", dtype=None, model_options={}, **kwargs):
+        super().__init__(device=device, dtype=dtype, model_options=model_options, clip_name="l", clip_model=SAM3ClipModel, name="sam3_clip")
+
+    def encode_token_weights(self, token_weight_pairs):
+        per_prompt = token_weight_pairs.get("sam3_per_prompt", None)  # read, don't pop: the caller's dict may be encoded again
+        if per_prompt is None:
+            return super().encode_token_weights(token_weight_pairs)
+
+        # Encode each prompt separately, pack into extra dict
+        inner = getattr(self, self.clip)
+        multi_cond = []
+        first_pooled = None
+        for batches, max_det in per_prompt:
+            out = inner.encode_token_weights(batches)
+            cond, pooled = out[0], out[1]
+            extra = out[2] if len(out) > 2 else {}
+            if first_pooled is None:
+                first_pooled = pooled
+            multi_cond.append({
+                "cond": cond,
+                "attention_mask": extra.get("attention_mask"),
+                "max_detections": max_det,
+            })
+
+        # Return first prompt as main (for non-SAM3 consumers), all prompts in metadata
+        main = multi_cond[0]
+        main_extra = {}
+        if main["attention_mask"] is not None:
+            main_extra["attention_mask"] = main["attention_mask"]
+        main_extra["sam3_multi_cond"] = multi_cond
+        return (main["cond"], first_pooled, main_extra)
diff --git a/comfy/utils.py b/comfy/utils.py
index 78c491b98..49ae12b06 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -86,6 +86,7 @@ def load_safetensors(ckpt):
import comfy_aimdo.model_mmap
f = open(ckpt, "rb", buffering=0)
+ file_lock = threading.Lock()
model_mmap = comfy_aimdo.model_mmap.ModelMMAP(ckpt)
file_size = os.path.getsize(ckpt)
mv = memoryview((ctypes.c_uint8 * file_size).from_address(model_mmap.get()))
@@ -111,9 +112,8 @@ def load_safetensors(ckpt):
storage = tensor.untyped_storage()
setattr(storage,
"_comfy_tensor_file_slice",
- comfy.memory_management.TensorFileSlice(f, threading.get_ident(), data_base_offset + start, end - start))
+ comfy.memory_management.TensorFileSlice(f, file_lock, data_base_offset + start, end - start))
setattr(storage, "_comfy_tensor_mmap_refs", (model_mmap, mv))
- setattr(storage, "_comfy_tensor_mmap_touched", False)
sd[name] = tensor
return sd, header.get("__metadata__", {}),
@@ -1020,10 +1020,11 @@ def bislerp(samples, width, height):
def lanczos(samples, width, height):
#the below API is strict and expects grayscale to be squeezed
- samples = samples.squeeze(1) if samples.shape[1] == 1 else samples.movedim(1, -1)
+ if samples.ndim == 4:
+ samples = samples.squeeze(1) if samples.shape[1] == 1 else samples.movedim(1, -1)
images = [Image.fromarray(np.clip(255. * image.cpu().numpy(), 0, 255).astype(np.uint8)) for image in samples]
images = [image.resize((width, height), resample=Image.Resampling.LANCZOS) for image in images]
- images = [torch.from_numpy(np.array(image).astype(np.float32) / 255.0).movedim(-1, 0) for image in images]
+ images = [torch.from_numpy(t).movedim(-1, 0) if (t := np.array(image).astype(np.float32) / 255.0).ndim == 3 else torch.from_numpy(t) for image in images]
result = torch.stack(images)
return result.to(samples.device, samples.dtype)
@@ -1164,12 +1165,18 @@ def tiled_scale_multidim(samples, function, tile=(64, 64), overlap=8, upscale_am
o = out
o_d = out_div
+ ps_view = ps
+ mask_view = mask
for d in range(dims):
- o = o.narrow(d + 2, upscaled[d], mask.shape[d + 2])
- o_d = o_d.narrow(d + 2, upscaled[d], mask.shape[d + 2])
+ l = min(ps_view.shape[d + 2], o.shape[d + 2] - upscaled[d])
+ o = o.narrow(d + 2, upscaled[d], l)
+ o_d = o_d.narrow(d + 2, upscaled[d], l)
+ if l < ps_view.shape[d + 2]:
+ ps_view = ps_view.narrow(d + 2, 0, l)
+ mask_view = mask_view.narrow(d + 2, 0, l)
- o.add_(ps * mask)
- o_d.add_(mask)
+ o.add_(ps_view * mask_view)
+ o_d.add_(mask_view)
if pbar is not None:
pbar.update(1)
@@ -1196,7 +1203,7 @@ def model_trange(*args, **kwargs):
pbar.i1_time = time.time()
pbar.set_postfix_str(" Model Initialization complete! ")
elif pbar._i == 2:
- #bring forward the effective start time based the the diff between first and second iteration
+ #bring forward the effective start time based on the diff between first and second iteration
#to attempt to remove load overhead from the final step rate estimate.
pbar.start_t = pbar.i1_time - (time.time() - pbar.i1_time)
pbar.set_postfix_str("")
@@ -1390,7 +1397,7 @@ def convert_old_quants(state_dict, model_prefix="", metadata={}):
k_out = "{}.weight_scale".format(layer)
if layer is not None:
- layer_conf = {"format": "float8_e4m3fn"} # TODO: check if anyone did some non e4m3fn scaled checkpoints
+ layer_conf = {"format": "float8_e4m3fn"}
if full_precision_matrix_mult:
layer_conf["full_precision_matrix_mult"] = full_precision_matrix_mult
layers[layer] = layer_conf
@@ -1445,11 +1452,3 @@ def deepcopy_list_dict(obj, memo=None):
memo[obj_id] = res
return res
-
-def normalize_image_embeddings(embeds, embeds_info, scale_factor):
- """Normalize image embeddings to match text embedding scale"""
- for info in embeds_info:
- if info.get("type") == "image":
- start_idx = info["index"]
- end_idx = start_idx + info["size"]
- embeds[:, start_idx:end_idx, :] /= scale_factor
diff --git a/comfy/windows.py b/comfy/windows.py
deleted file mode 100644
index 213dc481d..000000000
--- a/comfy/windows.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import ctypes
-import logging
-import psutil
-from ctypes import wintypes
-
-import comfy_aimdo.control
-
-psapi = ctypes.WinDLL("psapi")
-kernel32 = ctypes.WinDLL("kernel32")
-
-class PERFORMANCE_INFORMATION(ctypes.Structure):
- _fields_ = [
- ("cb", wintypes.DWORD),
- ("CommitTotal", ctypes.c_size_t),
- ("CommitLimit", ctypes.c_size_t),
- ("CommitPeak", ctypes.c_size_t),
- ("PhysicalTotal", ctypes.c_size_t),
- ("PhysicalAvailable", ctypes.c_size_t),
- ("SystemCache", ctypes.c_size_t),
- ("KernelTotal", ctypes.c_size_t),
- ("KernelPaged", ctypes.c_size_t),
- ("KernelNonpaged", ctypes.c_size_t),
- ("PageSize", ctypes.c_size_t),
- ("HandleCount", wintypes.DWORD),
- ("ProcessCount", wintypes.DWORD),
- ("ThreadCount", wintypes.DWORD),
- ]
-
-def get_free_ram():
- #Windows is way too conservative and chalks recently used uncommitted model RAM
- #as "in-use". So, calculate free RAM for the sake of general use as the greater of:
- #
- #1: What psutil says
- #2: Total Memory - (Committed Memory - VRAM in use)
- #
- #We have to subtract VRAM in use from the comitted memory as WDDM creates a naked
- #commit charge for all VRAM used just incase it wants to page it all out. This just
- #isn't realistic so "overcommit" on our calculations by just subtracting it off.
-
- pi = PERFORMANCE_INFORMATION()
- pi.cb = ctypes.sizeof(pi)
-
- if not psapi.GetPerformanceInfo(ctypes.byref(pi), pi.cb):
- logging.warning("WARNING: Failed to query windows performance info. RAM usage may be sub optimal")
- return psutil.virtual_memory().available
-
- committed = pi.CommitTotal * pi.PageSize
- total = pi.PhysicalTotal * pi.PageSize
-
- return max(psutil.virtual_memory().available,
- total - (committed - comfy_aimdo.control.get_total_vram_usage()))
-
diff --git a/comfy_api/feature_flags.py b/comfy_api/feature_flags.py
index 9f6918315..adb5a3144 100644
--- a/comfy_api/feature_flags.py
+++ b/comfy_api/feature_flags.py
@@ -5,12 +5,95 @@ This module handles capability negotiation between frontend and backend,
allowing graceful protocol evolution while maintaining backward compatibility.
"""
-from typing import Any
+import logging
+from typing import Any, TypedDict
from comfy.cli_args import args
+
+class FeatureFlagInfo(TypedDict):
+ type: str
+ default: Any
+ description: str
+
+
+# Registry of known CLI-settable feature flags.
+# Launchers can query this via --list-feature-flags to discover valid flags.
+CLI_FEATURE_FLAG_REGISTRY: dict[str, FeatureFlagInfo] = {
+ "show_signin_button": {
+ "type": "bool",
+ "default": False,
+ "description": "Show the sign-in button in the frontend even when not signed in",
+ },
+}
+
+
+def _coerce_bool(v: str) -> bool:
+ """Strict bool coercion: only 'true'/'false' (case-insensitive).
+
+ Anything else raises ValueError so the caller can warn and drop the flag,
+ rather than silently treating typos like 'ture' or 'yes' as False.
+ """
+ lower = v.lower()
+ if lower == "true":
+ return True
+ if lower == "false":
+ return False
+ raise ValueError(f"expected 'true' or 'false', got {v!r}")
+
+
+_COERCE_FNS: dict[str, Any] = {
+ "bool": _coerce_bool,
+ "int": lambda v: int(v),
+ "float": lambda v: float(v),
+}
+
+
+def _coerce_flag_value(key: str, raw_value: str) -> Any:
+ """Coerce a raw string value using the registry type, or keep as string.
+
+ Returns the raw string if the key is unregistered or the type is unknown.
+ Raises ValueError/TypeError if the key is registered with a known type but
+ the value cannot be coerced; callers are expected to warn and drop the flag.
+ """
+ info = CLI_FEATURE_FLAG_REGISTRY.get(key)
+ if info is None:
+ return raw_value
+ coerce = _COERCE_FNS.get(info["type"])
+ if coerce is None:
+ return raw_value
+ return coerce(raw_value)
+
+
+def _parse_cli_feature_flags() -> dict[str, Any]:
+    """Parse --feature-flag key=value pairs from CLI args into a dict.
+
+    Items without '=' default to the value 'true' (bare flag form).
+    Flags whose value cannot be coerced to the registered type are dropped
+    with a warning, so a typo like '--feature-flag some_bool=ture' does not
+    silently take effect as the wrong value.
+    """
+    result: dict[str, Any] = {}
+    for item in getattr(args, "feature_flag", None) or []:  # tolerate a missing or None attribute (option never passed)
+        key, sep, raw_value = item.partition("=")
+        key = key.strip()
+        if not key:
+            continue
+        if not sep:
+            raw_value = "true"
+        try:
+            result[key] = _coerce_flag_value(key, raw_value.strip())
+        except (ValueError, TypeError) as e:
+            info = CLI_FEATURE_FLAG_REGISTRY.get(key, {})
+            logging.warning(
+                "Could not coerce --feature-flag %s=%r to %s (%s); dropping flag.",
+                key, raw_value.strip(), info.get("type", "?"), e,
+            )
+    return result
+
+
# Default server capabilities
-SERVER_FEATURE_FLAGS: dict[str, Any] = {
+_CORE_FEATURE_FLAGS: dict[str, Any] = {
"supports_preview_metadata": True,
"max_upload_size": args.max_upload_size * 1024 * 1024, # Convert MB to bytes
"extension": {"manager": {"supports_v4": True}},
@@ -18,6 +101,11 @@ SERVER_FEATURE_FLAGS: dict[str, Any] = {
"assets": args.enable_assets,
}
+# CLI-provided flags cannot overwrite core flags
+_cli_flags = {k: v for k, v in _parse_cli_feature_flags().items() if k not in _CORE_FEATURE_FLAGS}
+
+SERVER_FEATURE_FLAGS: dict[str, Any] = {**_CORE_FEATURE_FLAGS, **_cli_flags}
+
def get_connection_feature(
sockets_metadata: dict[str, dict[str, Any]],
diff --git a/comfy_api/input/__init__.py b/comfy_api/input/__init__.py
index 16d4acfd1..dc33533cc 100644
--- a/comfy_api/input/__init__.py
+++ b/comfy_api/input/__init__.py
@@ -9,6 +9,7 @@ from comfy_api.latest._input import (
CurveInput,
MonotoneCubicCurve,
LinearCurve,
+ RangeInput,
)
__all__ = [
@@ -21,4 +22,5 @@ __all__ = [
"CurveInput",
"MonotoneCubicCurve",
"LinearCurve",
+ "RangeInput",
]
diff --git a/comfy_api/latest/__init__.py b/comfy_api/latest/__init__.py
index 04973fea0..e0a585b10 100644
--- a/comfy_api/latest/__init__.py
+++ b/comfy_api/latest/__init__.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
from comfy_api.internal import ComfyAPIBase
diff --git a/comfy_api/latest/_input/__init__.py b/comfy_api/latest/_input/__init__.py
index 05cd3d40a..f0229717e 100644
--- a/comfy_api/latest/_input/__init__.py
+++ b/comfy_api/latest/_input/__init__.py
@@ -1,5 +1,6 @@
from .basic_types import ImageInput, AudioInput, MaskInput, LatentInput
from .curve_types import CurvePoint, CurveInput, MonotoneCubicCurve, LinearCurve
+from .range_types import RangeInput
from .video_types import VideoInput
__all__ = [
@@ -12,4 +13,5 @@ __all__ = [
"CurveInput",
"MonotoneCubicCurve",
"LinearCurve",
+ "RangeInput",
]
diff --git a/comfy_api/latest/_input/range_types.py b/comfy_api/latest/_input/range_types.py
new file mode 100644
index 000000000..f4c5cb290
--- /dev/null
+++ b/comfy_api/latest/_input/range_types.py
@@ -0,0 +1,70 @@
+from __future__ import annotations
+
+import logging
+import math
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+
+class RangeInput:
+ """Represents a levels/range adjustment: input range [min, max] with
+ optional midpoint (gamma control).
+
+ Generates a 1D LUT identical to GIMP's levels mapping:
+ 1. Normalize input to [0, 1] using [min, max]
+ 2. Apply gamma correction: pow(value, 1/gamma)
+ 3. Clamp to [0, 1]
+
+ The midpoint field is a position in [0, 1] representing where the
+ midtone falls within [min, max]. It maps to gamma via:
+ gamma = -log2(midpoint)
+ So midpoint=0.5 → gamma=1.0 (linear).
+ """
+
+ def __init__(self, min_val: float, max_val: float, midpoint: float | None = None):
+ self.min_val = min_val
+ self.max_val = max_val
+ self.midpoint = midpoint
+
+ @staticmethod
+ def from_raw(data) -> RangeInput:
+ if isinstance(data, RangeInput):
+ return data
+ if isinstance(data, dict):
+ return RangeInput(
+ min_val=float(data.get("min", 0.0)),
+ max_val=float(data.get("max", 1.0)),
+ midpoint=float(data["midpoint"]) if data.get("midpoint") is not None else None,
+ )
+ raise TypeError(f"Cannot convert {type(data)} to RangeInput")
+
+ def to_lut(self, size: int = 256) -> np.ndarray:
+ """Generate a float64 lookup table mapping [0, 1] input through this
+ levels adjustment.
+
+ The LUT maps normalized input values (0..1) to output values (0..1),
+ matching the GIMP levels formula.
+ """
+ xs = np.linspace(0.0, 1.0, size, dtype=np.float64)
+
+ in_range = self.max_val - self.min_val
+ if abs(in_range) < 1e-10:
+ return np.where(xs >= self.min_val, 1.0, 0.0).astype(np.float64)
+
+ # Normalize: map [min, max] → [0, 1]
+ result = (xs - self.min_val) / in_range
+ result = np.clip(result, 0.0, 1.0)
+
+ # Gamma correction from midpoint
+ if self.midpoint is not None and self.midpoint > 0 and self.midpoint != 0.5:
+ gamma = max(-math.log2(self.midpoint), 0.001)
+ inv_gamma = 1.0 / gamma
+ mask = result > 0
+ result[mask] = np.power(result[mask], inv_gamma)
+
+ return result
+
+ def __repr__(self) -> str:
+ mid = f", midpoint={self.midpoint}" if self.midpoint is not None else ""
+ return f"RangeInput(min={self.min_val}, max={self.max_val}{mid})"
diff --git a/comfy_api/latest/_input_impl/video_types.py b/comfy_api/latest/_input_impl/video_types.py
index 1b4993aa7..99e67d363 100644
--- a/comfy_api/latest/_input_impl/video_types.py
+++ b/comfy_api/latest/_input_impl/video_types.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
from av.container import InputContainer
from av.subtitles.stream import SubtitleStream
from fractions import Fraction
@@ -12,6 +11,7 @@ import numpy as np
import math
import torch
from .._util import VideoContainer, VideoCodec, VideoComponents
+import logging
def container_to_output_format(container_format: str | None) -> str | None:
@@ -238,64 +238,125 @@ class VideoFromFile(VideoInput):
start_time = max(self._get_raw_duration() + self.__start_time, 0)
else:
start_time = self.__start_time
+
# Get video frames
frames = []
+ audio_frames = []
+ alphas = None
start_pts = int(start_time / video_stream.time_base)
end_pts = int((start_time + self.__duration) / video_stream.time_base)
- container.seek(start_pts, stream=video_stream)
- for frame in container.decode(video_stream):
- if frame.pts < start_pts:
- continue
- if self.__duration and frame.pts >= end_pts:
- break
- img = frame.to_ndarray(format='rgb24') # shape: (H, W, 3)
- img = torch.from_numpy(img) / 255.0 # shape: (H, W, 3)
- frames.append(img)
- images = torch.stack(frames) if len(frames) > 0 else torch.zeros(0, 3, 0, 0)
+ if start_pts != 0:
+ container.seek(start_pts, stream=video_stream)
+
+ image_format = 'gbrpf32le'
+ process_image_format = lambda a: a
+ audio = None
+
+ streams = [video_stream]
+ has_first_audio_frame = False
+ checked_alpha = False
+
+ # Default to False so we decode until EOF if duration is 0
+ video_done = False
+ audio_done = True
+
+ if len(container.streams.audio):
+ audio_stream = container.streams.audio[-1]
+ streams += [audio_stream]
+ resampler = av.audio.resampler.AudioResampler(format='fltp')
+ audio_done = False
+
+ for packet in container.demux(*streams):
+ if video_done and audio_done:
+ break
+
+ if packet.stream.type == "video":
+ if video_done:
+ continue
+ try:
+ for frame in packet.decode():
+ if frame.pts < start_pts:
+ continue
+ if self.__duration and frame.pts >= end_pts:
+ video_done = True
+ break
+
+ if not checked_alpha:
+ alpha_channel = False
+ for comp in frame.format.components:
+ if comp.is_alpha or frame.format.name == "pal8":
+ alphas = []
+ alpha_channel = True
+ break
+ if frame.format.name in ("yuvj420p", "yuvj422p", "yuvj444p", "rgb24", "rgba", "pal8"):
+ process_image_format = lambda a: a.float() / 255.0
+ if alpha_channel:
+ image_format = 'rgba'
+ else:
+ image_format = 'rgb24'
+ else:
+ process_image_format = lambda a: a
+ if alpha_channel:
+ image_format = 'gbrapf32le'
+ else:
+ image_format = 'gbrpf32le'
+
+ checked_alpha = True
+
+ img = frame.to_ndarray(format=image_format) # shape: (H, W, 4)
+ if frame.rotation != 0:
+ k = int(round(frame.rotation // 90))
+ img = np.rot90(img, k=k, axes=(0, 1)).copy()
+ if alphas is None:
+ frames.append(torch.from_numpy(img))
+ else:
+ frames.append(torch.from_numpy(img[..., :-1]))
+ alphas.append(torch.from_numpy(img[..., -1:]))
+ except av.error.InvalidDataError:
+ logging.info("pyav decode error")
+
+ elif packet.stream.type == "audio":
+ if audio_done:
+ continue
+
+ aframes = itertools.chain.from_iterable(
+ map(resampler.resample, packet.decode())
+ )
+ for frame in aframes:
+ if self.__duration and frame.time > start_time + self.__duration:
+ audio_done = True
+ break
+
+ if not has_first_audio_frame:
+ offset_seconds = start_time - frame.pts * audio_stream.time_base
+ to_skip = max(0, int(offset_seconds * audio_stream.sample_rate))
+ if to_skip < frame.samples:
+ has_first_audio_frame = True
+ audio_frames.append(frame.to_ndarray()[..., to_skip:])
+ else:
+ audio_frames.append(frame.to_ndarray())
+
+ images = process_image_format(torch.stack(frames)) if len(frames) > 0 else torch.zeros(0, 0, 0, 3)
+ if alphas is not None:
+ alphas = process_image_format(torch.stack(alphas)) if len(alphas) > 0 else torch.zeros(0, 0, 0, 1)
# Get frame rate
frame_rate = Fraction(video_stream.average_rate) if video_stream.average_rate else Fraction(1)
- # Get audio if available
- audio = None
- container.seek(start_pts, stream=video_stream)
- # Use last stream for consistency
- if len(container.streams.audio):
- audio_stream = container.streams.audio[-1]
- audio_frames = []
- resample = av.audio.resampler.AudioResampler(format='fltp').resample
- frames = itertools.chain.from_iterable(
- map(resample, container.decode(audio_stream))
- )
+ if len(audio_frames) > 0:
+ audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples)
+ if self.__duration:
+ audio_data = audio_data[..., :int(self.__duration * audio_stream.sample_rate)]
- has_first_frame = False
- for frame in frames:
- offset_seconds = start_time - frame.pts * audio_stream.time_base
- to_skip = max(0, int(offset_seconds * audio_stream.sample_rate))
- if to_skip < frame.samples:
- has_first_frame = True
- break
- if has_first_frame:
- audio_frames.append(frame.to_ndarray()[..., to_skip:])
-
- for frame in frames:
- if self.__duration and frame.time > start_time + self.__duration:
- break
- audio_frames.append(frame.to_ndarray()) # shape: (channels, samples)
- if len(audio_frames) > 0:
- audio_data = np.concatenate(audio_frames, axis=1) # shape: (channels, total_samples)
- if self.__duration:
- audio_data = audio_data[..., :int(self.__duration * audio_stream.sample_rate)]
-
- audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples)
- audio = AudioInput({
- "waveform": audio_tensor,
- "sample_rate": int(audio_stream.sample_rate) if audio_stream.sample_rate else 1,
- })
+ audio_tensor = torch.from_numpy(audio_data).unsqueeze(0) # shape: (1, channels, total_samples)
+ audio = AudioInput({
+ "waveform": audio_tensor,
+ "sample_rate": int(audio_stream.sample_rate) if audio_stream.sample_rate else 1,
+ })
metadata = container.metadata
- return VideoComponents(images=images, audio=audio, frame_rate=frame_rate, metadata=metadata)
+ return VideoComponents(images=images, alpha=alphas, audio=audio, frame_rate=frame_rate, metadata=metadata)
def get_components(self) -> VideoComponents:
if isinstance(self.__file, io.BytesIO):
diff --git a/comfy_api/latest/_io.py b/comfy_api/latest/_io.py
index fdeffea2d..e430c0ecf 100644
--- a/comfy_api/latest/_io.py
+++ b/comfy_api/latest/_io.py
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
from spandrel import ImageModelDescriptor
from comfy.clip_vision import ClipVisionModel
from comfy.clip_vision import Output as ClipVisionOutput_
+ from comfy.bg_removal_model import BackgroundRemovalModel
from comfy.controlnet import ControlNet
from comfy.hooks import HookGroup, HookKeyframeGroup
from comfy.model_patcher import ModelPatcher
@@ -395,7 +396,6 @@ class Combo(ComfyTypeIO):
@comfytype(io_type="COMBO")
class MultiCombo(ComfyTypeI):
'''Multiselect Combo input (dropdown for selecting potentially more than one value).'''
- # TODO: something is wrong with the serialization, frontend does not recognize it as multiselect
Type = list[str]
class Input(Combo.Input):
def __init__(self, id: str, options: list[str], display_name: str=None, optional=False, tooltip: str=None, lazy: bool=None,
@@ -408,12 +408,14 @@ class MultiCombo(ComfyTypeI):
self.default: list[str]
def as_dict(self):
- to_return = super().as_dict() | prune_dict({
- "multi_select": self.multiselect,
- "placeholder": self.placeholder,
- "chip": self.chip,
+ # Frontend expects `multi_select` to be an object config (not a boolean).
+ # Keep top-level `multiselect` from Combo.Input for backwards compatibility.
+ return super().as_dict() | prune_dict({
+ "multi_select": prune_dict({
+ "placeholder": self.placeholder,
+ "chip": self.chip,
+ }),
})
- return to_return
@comfytype(io_type="IMAGE")
class Image(ComfyTypeIO):
@@ -613,6 +615,11 @@ class Model(ComfyTypeIO):
if TYPE_CHECKING:
Type = ModelPatcher
+@comfytype(io_type="BACKGROUND_REMOVAL")
+class BackgroundRemoval(ComfyTypeIO):
+ if TYPE_CHECKING:
+ Type = BackgroundRemovalModel
+
@comfytype(io_type="CLIP_VISION")
class ClipVision(ComfyTypeIO):
if TYPE_CHECKING:
@@ -759,6 +766,13 @@ class Load3DCamera(ComfyTypeIO):
target: dict[str, float | int]
zoom: int
cameraType: str
+ quaternion: NotRequired[dict[str, float | int]]
+ rotation: NotRequired[dict[str, float | int | str]]
+ fov: NotRequired[float | int]
+ aspect: NotRequired[float | int]
+ near: NotRequired[float | int]
+ far: NotRequired[float | int]
+ frustum: NotRequired[dict[str, float | int]]
Type = CameraInfo
@@ -1266,6 +1280,43 @@ class Histogram(ComfyTypeIO):
Type = list[int]
+@comfytype(io_type="RANGE")
+class Range(ComfyTypeIO):
+ from comfy_api.input import RangeInput
+ if TYPE_CHECKING:
+ Type = RangeInput
+
+ class Input(WidgetInput):
+ def __init__(self, id: str, display_name: str=None, optional=False, tooltip: str=None,
+ socketless: bool=True, default: dict=None,
+ display: str=None,
+ gradient_stops: list=None,
+ show_midpoint: bool=None,
+ midpoint_scale: str=None,
+ value_min: float=None,
+ value_max: float=None,
+ advanced: bool=None):
+ super().__init__(id, display_name, optional, tooltip, None, default, socketless, None, None, None, None, advanced)
+ if default is None:
+ self.default = {"min": 0.0, "max": 1.0}
+ self.display = display
+ self.gradient_stops = gradient_stops
+ self.show_midpoint = show_midpoint
+ self.midpoint_scale = midpoint_scale
+ self.value_min = value_min
+ self.value_max = value_max
+
+ def as_dict(self):
+ return super().as_dict() | prune_dict({
+ "display": self.display,
+ "gradient_stops": self.gradient_stops,
+ "show_midpoint": self.show_midpoint,
+ "midpoint_scale": self.midpoint_scale,
+ "value_min": self.value_min,
+ "value_max": self.value_max,
+ })
+
+
DYNAMIC_INPUT_LOOKUP: dict[str, Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]] = {}
def register_dynamic_input_func(io_type: str, func: Callable[[dict[str, Any], dict[str, Any], tuple[str, dict[str, Any]], str, list[str] | None], None]):
DYNAMIC_INPUT_LOOKUP[io_type] = func
@@ -2219,6 +2270,7 @@ __all__ = [
"ModelPatch",
"ClipVision",
"ClipVisionOutput",
+ "BackgroundRemoval",
"AudioEncoder",
"AudioEncoderOutput",
"StyleModel",
@@ -2276,5 +2328,6 @@ __all__ = [
"BoundingBox",
"Curve",
"Histogram",
+ "Range",
"NodeReplace",
]
diff --git a/comfy_api/latest/_util/geometry_types.py b/comfy_api/latest/_util/geometry_types.py
index b586fceb3..cdde60b10 100644
--- a/comfy_api/latest/_util/geometry_types.py
+++ b/comfy_api/latest/_util/geometry_types.py
@@ -12,9 +12,24 @@ class VOXEL:
class MESH:
- def __init__(self, vertices: torch.Tensor, faces: torch.Tensor):
- self.vertices = vertices
- self.faces = faces
+ def __init__(self, vertices: torch.Tensor, faces: torch.Tensor,
+ uvs: torch.Tensor | None = None,
+ vertex_colors: torch.Tensor | None = None,
+ texture: torch.Tensor | None = None,
+ vertex_counts: torch.Tensor | None = None,
+ face_counts: torch.Tensor | None = None):
+
+ assert (vertex_counts is None) == (face_counts is None), \
+ "vertex_counts and face_counts must be provided together (both or neither)"
+ self.vertices = vertices # vertices: (B, N, 3)
+ self.faces = faces # faces: (B, M, 3)
+ self.uvs = uvs # uvs: (B, N, 2)
+ self.vertex_colors = vertex_colors # vertex_colors: (B, N, 3 or 4)
+ self.texture = texture # texture: (B, H, W, 3)
+ # When vertices/faces are zero-padded to a common N/M across the batch (variable-size mesh batch),
+ # these hold the real per-item lengths (B,). None means rows are uniform and no slicing is needed.
+ self.vertex_counts = vertex_counts
+ self.face_counts = face_counts
class File3D:
diff --git a/comfy_api/latest/_util/video_types.py b/comfy_api/latest/_util/video_types.py
index fd3b5a510..6c9d6a526 100644
--- a/comfy_api/latest/_util/video_types.py
+++ b/comfy_api/latest/_util/video_types.py
@@ -1,9 +1,8 @@
-from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
from fractions import Fraction
from typing import Optional
-from .._input import ImageInput, AudioInput
+from .._input import ImageInput, AudioInput, MaskInput
class VideoCodec(str, Enum):
AUTO = "auto"
@@ -48,5 +47,4 @@ class VideoComponents:
frame_rate: Fraction
audio: Optional[AudioInput] = None
metadata: Optional[dict] = None
-
-
+ alpha: Optional[MaskInput] = None
diff --git a/comfy_api_nodes/apis/__init__.py b/comfy_api_nodes/apis/__init__.py
index 46a583b5e..9c4cfb9b6 100644
--- a/comfy_api_nodes/apis/__init__.py
+++ b/comfy_api_nodes/apis/__init__.py
@@ -3,7 +3,6 @@
# timestamp: 2025-07-30T08:54:00+00:00
# pylint: disable
-from __future__ import annotations
from datetime import date, datetime
from enum import Enum
diff --git a/comfy_api_nodes/apis/anthropic.py b/comfy_api_nodes/apis/anthropic.py
new file mode 100644
index 000000000..46a5bb428
--- /dev/null
+++ b/comfy_api_nodes/apis/anthropic.py
@@ -0,0 +1,98 @@
+from enum import Enum
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class AnthropicRole(str, Enum):
+ user = "user"
+ assistant = "assistant"
+
+
+class AnthropicTextContent(BaseModel):
+ type: Literal["text"] = "text"
+ text: str = Field(...)
+
+
+class AnthropicImageSourceBase64(BaseModel):
+ type: Literal["base64"] = "base64"
+ media_type: str = Field(..., description="MIME type of the image, e.g. image/png, image/jpeg")
+ data: str = Field(..., description="Base64-encoded image data")
+
+
+class AnthropicImageSourceUrl(BaseModel):
+ type: Literal["url"] = "url"
+ url: str = Field(...)
+
+
+class AnthropicImageContent(BaseModel):
+ type: Literal["image"] = "image"
+ source: AnthropicImageSourceBase64 | AnthropicImageSourceUrl = Field(...)
+
+
+class AnthropicMessage(BaseModel):
+ role: AnthropicRole = Field(...)
+ content: list[AnthropicTextContent | AnthropicImageContent] = Field(...)
+
+
+class AnthropicThinkingConfig(BaseModel):
+ type: Literal["enabled", "disabled", "adaptive"] = Field(...)
+ budget_tokens: int | None = Field(
+ None, ge=1024,
+ description="Reasoning budget in tokens. Used when type is 'enabled'. Must be less than max_tokens.",
+ )
+
+
+class AnthropicOutputConfig(BaseModel):
+ """Used with `thinking.type='adaptive'` on models like Opus 4.7."""
+ effort: Literal["low", "medium", "high"] | None = Field(None)
+
+
+class AnthropicMessagesRequest(BaseModel):
+ model: str = Field(...)
+ messages: list[AnthropicMessage] = Field(...)
+ max_tokens: int = Field(..., ge=1)
+ system: str | None = Field(None, description="Top-level system prompt")
+ temperature: float | None = Field(None, ge=0.0, le=1.0)
+ top_p: float | None = Field(None, ge=0.0, le=1.0)
+ top_k: int | None = Field(None, ge=0)
+ stop_sequences: list[str] | None = Field(None)
+ thinking: AnthropicThinkingConfig | None = Field(None)
+ output_config: AnthropicOutputConfig | None = Field(None)
+
+
+class AnthropicResponseTextBlock(BaseModel):
+ type: Literal["text"] = "text"
+ text: str = Field(...)
+
+
+class AnthropicResponseThinkingBlock(BaseModel):
+ type: Literal["thinking"] = "thinking"
+ thinking: str = Field(...)
+
+
+AnthropicResponseBlock = AnthropicResponseTextBlock | AnthropicResponseThinkingBlock
+
+
+class AnthropicCacheCreationUsage(BaseModel):
+ ephemeral_5m_input_tokens: int | None = Field(None)
+ ephemeral_1h_input_tokens: int | None = Field(None)
+
+
+class AnthropicMessagesUsage(BaseModel):
+ input_tokens: int | None = Field(None)
+ output_tokens: int | None = Field(None)
+ cache_creation_input_tokens: int | None = Field(None)
+ cache_read_input_tokens: int | None = Field(None)
+ cache_creation: AnthropicCacheCreationUsage | None = Field(None)
+
+
+class AnthropicMessagesResponse(BaseModel):
+ id: str | None = Field(None)
+ type: str | None = Field(None)
+ role: str | None = Field(None)
+ model: str | None = Field(None)
+ content: list[AnthropicResponseBlock] | None = Field(None)
+ stop_reason: str | None = Field(None)
+ stop_sequence: str | None = Field(None)
+ usage: AnthropicMessagesUsage | None = Field(None)
diff --git a/comfy_api_nodes/apis/beeble.py b/comfy_api_nodes/apis/beeble.py
new file mode 100644
index 000000000..90175b214
--- /dev/null
+++ b/comfy_api_nodes/apis/beeble.py
@@ -0,0 +1,35 @@
+from pydantic import BaseModel, Field
+
+
+class CreateSwitchXRequest(BaseModel):
+    """SwitchX creation request: three required strings plus optional settings."""
+    generation_type: str
+    source_uri: str
+    alpha_mode: str
+    prompt: str | None = Field(default=None, max_length=2000)
+    reference_image_uri: str | None = None
+    alpha_uri: str | None = None
+    max_resolution: int = 1080
+    callback_url: str | None = None
+    idempotency_key: str | None = Field(default=None, min_length=1, max_length=256)
+
+
+class SwitchXOutputUrls(BaseModel):
+    """Optional `render`, `source` and `alpha` strings, each defaulting to None."""
+    render: str | None = None
+    source: str | None = None
+    alpha: str | None = None
+
+
+class SwitchXStatusResponse(BaseModel):
+    """SwitchX status payload: `id` and `status` are required, the rest optional."""
+    id: str
+    status: str
+    progress: int | None = None
+    generation_type: str | None = None
+    alpha_mode: str | None = None
+    output: SwitchXOutputUrls | None = None
+    error: str | None = None
+    created_at: str | None = None
+    modified_at: str | None = None
+    completed_at: str | None = None
diff --git a/comfy_api_nodes/apis/bfl.py b/comfy_api_nodes/apis/bfl.py
index d8d3557b3..f0665fa09 100644
--- a/comfy_api_nodes/apis/bfl.py
+++ b/comfy_api_nodes/apis/bfl.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
from enum import Enum
from typing import Any, Dict, Optional
diff --git a/comfy_api_nodes/apis/bria.py b/comfy_api_nodes/apis/bria.py
index 8c496b56c..e08a519a8 100644
--- a/comfy_api_nodes/apis/bria.py
+++ b/comfy_api_nodes/apis/bria.py
@@ -23,7 +23,7 @@ class BriaEditImageRequest(BaseModel):
None,
description="Mask image (black and white). Black areas will be preserved, white areas will be edited. "
"If omitted, the edit applies to the entire image. "
- "The input image and the the input mask must be of the same size.",
+ "The input image and the input mask must be of the same size.",
)
negative_prompt: str | None = Field(None)
guidance_scale: float = Field(...)
diff --git a/comfy_api_nodes/apis/bytedance.py b/comfy_api_nodes/apis/bytedance.py
index 18455396d..47f24586c 100644
--- a/comfy_api_nodes/apis/bytedance.py
+++ b/comfy_api_nodes/apis/bytedance.py
@@ -52,6 +52,26 @@ class TaskImageContent(BaseModel):
role: Literal["first_frame", "last_frame", "reference_image"] | None = Field(None)
+class TaskVideoContentUrl(BaseModel):  # wrapper around a required `url`
+    url: str
+
+
+class TaskVideoContent(BaseModel):  # defaults: type="video_url", role="reference_video"
+    type: str = "video_url"
+    video_url: TaskVideoContentUrl
+    role: str = "reference_video"
+
+
+class TaskAudioContentUrl(BaseModel):  # wrapper around a required `url`
+    url: str
+
+
+class TaskAudioContent(BaseModel):  # defaults: type="audio_url", role="reference_audio"
+    type: str = "audio_url"
+    audio_url: TaskAudioContentUrl
+    role: str = "reference_audio"
+
+
class Text2VideoTaskCreationRequest(BaseModel):
model: str = Field(...)
content: list[TaskTextContent] = Field(..., min_length=1)
@@ -64,6 +84,17 @@ class Image2VideoTaskCreationRequest(BaseModel):
generate_audio: bool | None = Field(...)
+class Seedance2TaskCreationRequest(BaseModel):  # required: `model` and a non-empty `content`
+    model: str
+    content: list[TaskTextContent | TaskImageContent | TaskVideoContent | TaskAudioContent] = Field(..., min_length=1)
+    generate_audio: bool | None = None
+    resolution: str | None = None
+    ratio: str | None = None
+    duration: int | None = Field(default=None, ge=4, le=15)
+    seed: int | None = Field(default=None, ge=0, le=2147483647)
+    watermark: bool | None = None
+
+
class TaskCreationResponse(BaseModel):
id: str = Field(...)
@@ -77,12 +108,68 @@ class TaskStatusResult(BaseModel):
video_url: str = Field(...)
+class TaskStatusUsage(BaseModel):  # both counters default to 0
+    completion_tokens: int = 0
+    total_tokens: int = 0
+
+
class TaskStatusResponse(BaseModel):
id: str = Field(...)
model: str = Field(...)
status: Literal["queued", "running", "cancelled", "succeeded", "failed"] = Field(...)
error: TaskStatusError | None = Field(None)
content: TaskStatusResult | None = Field(None)
+ usage: TaskStatusUsage | None = Field(None)
+
+
+class GetAssetResponse(BaseModel):  # `name`, `url` and `error` may be absent
+    id: str
+    name: str | None = None
+    url: str | None = None
+    asset_type: str
+    group_id: str
+    status: str
+    error: TaskStatusError | None = None
+
+
+class SeedanceCreateVisualValidateSessionResponse(BaseModel):  # both fields are required
+    session_id: str
+    h5_link: str
+
+
+class SeedanceGetVisualValidateSessionResponse(BaseModel):  # required: `session_id`, `status`
+    session_id: str
+    status: str
+    group_id: str | None = None
+    error_code: str | None = None
+    error_message: str | None = None
+
+
+class SeedanceCreateAssetRequest(BaseModel):  # `name` is capped at 64 characters
+    group_id: str
+    url: str
+    asset_type: str
+    name: str | None = Field(default=None, max_length=64)
+    project_name: str | None = None
+
+
+class SeedanceCreateAssetResponse(BaseModel):  # carries the required `asset_id`
+    asset_id: str
+
+
+class SeedanceVirtualLibraryCreateAssetRequest(BaseModel):  # required: `url`, `hash`
+    url: str = Field(..., description="Publicly accessible URL of the asset to upload.")
+    hash: str = Field(..., description="Dedup key. Re-submitting the same hash returns the existing asset id.")
+    asset_type: str | None = Field(default=None, description="BytePlus asset type. Defaults to Image server-side when omitted.")
+
+
+# Dollars per 1K tokens, keyed by (model_id, has_video_input).
+SEEDANCE2_PRICE_PER_1K_TOKENS = {
+ ("dreamina-seedance-2-0-260128", False): 0.007,  # no video input
+ ("dreamina-seedance-2-0-260128", True): 0.0043,  # with video input
+ ("dreamina-seedance-2-0-fast-260128", False): 0.0056,  # no video input
+ ("dreamina-seedance-2-0-fast-260128", True): 0.0033,  # with video input
+}
RECOMMENDED_PRESETS = [
@@ -112,6 +199,75 @@ RECOMMENDED_PRESETS_SEEDREAM_4 = [
("Custom", None, None),
]
+_PRESETS_SEEDREAM_1K = [  # entries read as (label, width, height); the label repeats the size and ratio
+ ("(1K) 1024x1024 (1:1)", 1024, 1024),
+ ("(1K) 864x1152 (3:4)", 864, 1152),
+ ("(1K) 1152x864 (4:3)", 1152, 864),
+ ("(1K) 1312x736 (16:9)", 1312, 736),
+ ("(1K) 736x1312 (9:16)", 736, 1312),
+ ("(1K) 832x1248 (2:3)", 832, 1248),
+ ("(1K) 1248x832 (3:2)", 1248, 832),
+ ("(1K) 1568x672 (21:9)", 1568, 672),
+]
+
+_PRESETS_SEEDREAM_2K = [  # same eight aspect ratios as the 1K table
+ ("(2K) 2048x2048 (1:1)", 2048, 2048),
+ ("(2K) 1728x2304 (3:4)", 1728, 2304),
+ ("(2K) 2304x1728 (4:3)", 2304, 1728),
+ ("(2K) 2848x1600 (16:9)", 2848, 1600),
+ ("(2K) 1600x2848 (9:16)", 1600, 2848),
+ ("(2K) 1664x2496 (2:3)", 1664, 2496),
+ ("(2K) 2496x1664 (3:2)", 2496, 1664),
+ ("(2K) 3136x1344 (21:9)", 3136, 1344),
+]
+
+_PRESETS_SEEDREAM_3K = [  # same eight aspect ratios as the 1K table
+ ("(3K) 3072x3072 (1:1)", 3072, 3072),
+ ("(3K) 2592x3456 (3:4)", 2592, 3456),
+ ("(3K) 3456x2592 (4:3)", 3456, 2592),
+ ("(3K) 4096x2304 (16:9)", 4096, 2304),
+ ("(3K) 2304x4096 (9:16)", 2304, 4096),
+ ("(3K) 2496x3744 (2:3)", 2496, 3744),
+ ("(3K) 3744x2496 (3:2)", 3744, 2496),
+ ("(3K) 4704x2016 (21:9)", 4704, 2016),
+]
+
+_PRESETS_SEEDREAM_4K = [  # same eight aspect ratios as the 1K table
+ ("(4K) 4096x4096 (1:1)", 4096, 4096),
+ ("(4K) 3520x4704 (3:4)", 3520, 4704),
+ ("(4K) 4704x3520 (4:3)", 4704, 3520),
+ ("(4K) 5504x3040 (16:9)", 5504, 3040),
+ ("(4K) 3040x5504 (9:16)", 3040, 5504),
+ ("(4K) 3328x4992 (2:3)", 3328, 4992),
+ ("(4K) 4992x3328 (3:2)", 4992, 3328),
+ ("(4K) 6240x2656 (21:9)", 6240, 2656),
+]
+
+_CUSTOM_PRESET = [("Custom", None, None)]  # appended last to every combined list below
+
+RECOMMENDED_PRESETS_SEEDREAM_5_LITE = (  # 2K + 3K + 4K + Custom
+ _PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_3K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
+)
+RECOMMENDED_PRESETS_SEEDREAM_4_5 = (  # 2K + 4K + Custom
+ _PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
+)
+RECOMMENDED_PRESETS_SEEDREAM_4_0 = (  # 1K + 2K + 4K + Custom
+ _PRESETS_SEEDREAM_1K + _PRESETS_SEEDREAM_2K + _PRESETS_SEEDREAM_4K + _CUSTOM_PRESET
+)
+
+# Seedance 2.0 reference video pixel count limits per model and output resolution.
+SEEDANCE2_REF_VIDEO_PIXEL_LIMITS = {
+ "dreamina-seedance-2-0-260128": {
+ "480p": {"min": 409_600, "max": 927_408},
+ "720p": {"min": 409_600, "max": 927_408},
+ "1080p": {"min": 409_600, "max": 2_073_600},
+ },
+ "dreamina-seedance-2-0-fast-260128": {  # this table has no "1080p" entry for the fast model
+ "480p": {"min": 409_600, "max": 927_408},
+ "720p": {"min": 409_600, "max": 927_408},
+ },
+}
+
# The time in this dictionary are given for 10 seconds duration.
VIDEO_TASKS_EXECUTION_TIME = {
"seedance-1-0-lite-t2v-250428": {
diff --git a/comfy_api_nodes/apis/bytedance_llm.py b/comfy_api_nodes/apis/bytedance_llm.py
new file mode 100644
index 000000000..654c875fc
--- /dev/null
+++ b/comfy_api_nodes/apis/bytedance_llm.py
@@ -0,0 +1,115 @@
+"""Pydantic models for BytePlus ModelArk Responses API.
+
+See: https://docs.byteplus.com/en/docs/ModelArk/1585128 (request)
+ https://docs.byteplus.com/en/docs/ModelArk/1783703 (response)
+"""
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class BytePlusInputText(BaseModel):
+    """`input_text` content part with a required `text` string."""
+    type: Literal["input_text"] = "input_text"
+    text: str
+
+
+class BytePlusInputImage(BaseModel):
+    """`input_image` content part; `detail` defaults to "auto"."""
+    type: Literal["input_image"] = "input_image"
+    image_url: str = Field(..., description="Image URL or `data:image/...;base64,...` payload")
+    detail: str = Field(default="auto", description="One of high, low, auto")
+
+
+class BytePlusInputVideo(BaseModel):
+    """`input_video` content part with an optional `fps` limited to 0.2-5.0."""
+    type: Literal["input_video"] = "input_video"
+    video_url: str = Field(..., description="Video URL or `data:video/...;base64,...` payload")
+    fps: float | None = Field(default=None, ge=0.2, le=5.0)
+
+
+BytePlusMessageContent = BytePlusInputText | BytePlusInputImage | BytePlusInputVideo
+
+
+class BytePlusInputMessage(BaseModel):
+    """Input `message` item: a `role` plus a list of text/image/video content parts."""
+    type: Literal["message"] = "message"
+    role: str = Field(..., description="One of user, system, assistant, developer")
+    content: list[BytePlusMessageContent]
+
+
+class BytePlusResponseCreateRequest(BaseModel):
+    """Create-response request; `model` and `input` are required."""
+    model: str
+    input: list[BytePlusInputMessage]
+    instructions: str | None = None
+    max_output_tokens: int | None = Field(default=None, ge=1)
+    temperature: float | None = Field(default=None, ge=0.0, le=2.0)
+    store: bool | None = False
+    stream: bool | None = False
+
+
+class BytePlusOutputText(BaseModel):
+    """`output_text` content part with a required `text` string."""
+    type: Literal["output_text"] = "output_text"
+    text: str
+
+
+class BytePlusOutputRefusal(BaseModel):
+    """`refusal` content part with a required `refusal` string."""
+    type: Literal["refusal"] = "refusal"
+    refusal: str
+
+
+class BytePlusOutputContent(BaseModel):
+    """Loosely typed output part: free-form `type` with optional `text` and `refusal`."""
+    type: str
+    text: str | None = None
+    refusal: str | None = None
+
+
+class BytePlusOutputMessage(BaseModel):
+    """Output item; only `type` is required."""
+    type: str
+    id: str | None = None
+    role: str | None = None
+    status: str | None = None
+    content: list[BytePlusOutputContent] | None = None
+
+
+class BytePlusInputTokensDetails(BaseModel):
+    """Input-token breakdown holding an optional `cached_tokens` count."""
+    cached_tokens: int | None = None
+
+
+class BytePlusOutputTokensDetails(BaseModel):
+    """Output-token breakdown holding an optional `reasoning_tokens` count."""
+    reasoning_tokens: int | None = None
+
+
+class BytePlusResponseUsage(BaseModel):
+    """Token usage counters; every field is optional."""
+    input_tokens: int | None = None
+    output_tokens: int | None = None
+    total_tokens: int | None = None
+    input_tokens_details: BytePlusInputTokensDetails | None = None
+    output_tokens_details: BytePlusOutputTokensDetails | None = None
+
+
+class BytePlusResponseError(BaseModel):
+    """Error with required `code` and `message` strings."""
+    code: str
+    message: str
+
+
+class BytePlusResponseObject(BaseModel):
+    """Top-level response object; every field is optional."""
+    id: str | None = None
+    object: str | None = None
+    created_at: int | None = None
+    model: str | None = None
+    status: str | None = None
+    error: BytePlusResponseError | None = None
+    output: list[BytePlusOutputMessage] | None = None
+    usage: BytePlusResponseUsage | None = None
diff --git a/comfy_api_nodes/apis/krea.py b/comfy_api_nodes/apis/krea.py
new file mode 100644
index 000000000..6e294a3b7
--- /dev/null
+++ b/comfy_api_nodes/apis/krea.py
@@ -0,0 +1,52 @@
+"""Pydantic models for the Krea image-generation API."""
+
+from pydantic import BaseModel, Field
+
+
+class KreaMoodboard(BaseModel):
+    """Moodboard reference: required `id`, `strength` in [-0.5, 1.5] (default 0.35)."""
+    id: str
+    strength: float = Field(default=0.35, ge=-0.5, le=1.5)
+
+
+class KreaImageStyleReference(BaseModel):
+    """Style reference: required `strength` in [-2.0, 2.0] and an optional `url`."""
+    strength: float = Field(..., ge=-2.0, le=2.0)
+    url: str | None = None
+
+
+class KreaGenerateImageRequest(BaseModel):
+    """Generation request; `prompt`, `aspect_ratio` and `resolution` are required."""
+    prompt: str
+    aspect_ratio: str
+    resolution: str
+    seed: int | None = None
+    creativity: str = "medium"
+    moodboards: list[KreaMoodboard] | None = None
+    image_style_references: list[KreaImageStyleReference] | None = None
+
+
+class KreaJobResult(BaseModel):
+    """Job result with an optional `urls` list and an optional `style_id`."""
+    urls: list[str] | None = None
+    style_id: str | None = None
+
+
+class KreaJob(BaseModel):
+    """Job record; `job_id`, `status` and `created_at` are required."""
+    job_id: str
+    status: str
+    created_at: str
+    completed_at: str | None = None
+    result: KreaJobResult | None = None
+
+
+class KreaAssetResponse(BaseModel):
+    """Asset record; `id`, `image_url` and `uploaded_at` are required."""
+    id: str
+    image_url: str
+    uploaded_at: str
+    width: float | None = None
+    height: float | None = None
+    size_bytes: float | None = None
+    mime_type: str | None = None
diff --git a/comfy_api_nodes/apis/luma.py b/comfy_api_nodes/apis/luma.py
index 632c4ab96..8c6db2022 100644
--- a/comfy_api_nodes/apis/luma.py
+++ b/comfy_api_nodes/apis/luma.py
@@ -1,15 +1,12 @@
from __future__ import annotations
-
-import torch
-
from enum import Enum
from typing import Optional, Union
+import torch
from pydantic import BaseModel, Field, confloat
-
class LumaIO:
LUMA_REF = "LUMA_REF"
LUMA_CONCEPTS = "LUMA_CONCEPTS"
@@ -183,13 +180,13 @@ class LumaAssets(BaseModel):
class LumaImageRef(BaseModel):
- '''Used for image gen'''
+ """Used for image gen"""
url: str = Field(..., description='The URL of the image reference')
weight: confloat(ge=0.0, le=1.0) = Field(..., description='The weight of the image reference')
class LumaImageReference(BaseModel):
- '''Used for video gen'''
+ """Used for video gen"""
type: Optional[str] = Field('image', description='Input type, defaults to image')
url: str = Field(..., description='The URL of the image')
@@ -251,3 +248,32 @@ class LumaGeneration(BaseModel):
assets: Optional[LumaAssets] = Field(None, description='The assets of the generation')
model: str = Field(..., description='The model used for the generation')
request: Union[LumaGenerationRequest, LumaImageGenerationRequest] = Field(..., description="The request used for the generation")
+
+
+class Luma2ImageRef(BaseModel):  # all three fields are optional strings defaulting to None
+ url: str | None = None
+ data: str | None = None
+ media_type: str | None = None
+
+
+class Luma2GenerationRequest(BaseModel):  # only `prompt` is required
+ prompt: str = Field(..., min_length=1, max_length=6000)  # 1 to 6000 characters
+ model: str | None = None
+ type: str | None = None
+ aspect_ratio: str | None = None
+ style: str | None = None
+ output_format: str | None = None
+ web_search: bool | None = None
+ image_ref: list[Luma2ImageRef] | None = None  # zero or more image references
+ source: Luma2ImageRef | None = None  # single reference, same shape as an `image_ref` entry
+
+
+class Luma2Generation(BaseModel):  # every field is optional and defaults to None
+ id: str | None = None
+ type: str | None = None
+ state: str | None = None
+ model: str | None = None
+ created_at: str | None = None
+ output: list[LumaImageReference] | None = None  # NOTE(review): reuses LumaImageReference ("Used for video gen") — confirm intended
+ failure_reason: str | None = None
+ failure_code: str | None = None
diff --git a/comfy_api_nodes/apis/moonvalley.py b/comfy_api_nodes/apis/moonvalley.py
deleted file mode 100644
index 7ec7a4ade..000000000
--- a/comfy_api_nodes/apis/moonvalley.py
+++ /dev/null
@@ -1,152 +0,0 @@
-from enum import Enum
-from typing import Optional, Dict, Any
-
-from pydantic import BaseModel, Field, StrictBytes
-
-
-class MoonvalleyPromptResponse(BaseModel):
- error: Optional[Dict[str, Any]] = None
- frame_conditioning: Optional[Dict[str, Any]] = None
- id: Optional[str] = None
- inference_params: Optional[Dict[str, Any]] = None
- meta: Optional[Dict[str, Any]] = None
- model_params: Optional[Dict[str, Any]] = None
- output_url: Optional[str] = None
- prompt_text: Optional[str] = None
- status: Optional[str] = None
-
-
-class MoonvalleyTextToVideoInferenceParams(BaseModel):
- add_quality_guidance: Optional[bool] = Field(
- True, description='Whether to add quality guidance'
- )
- caching_coefficient: Optional[float] = Field(
- 0.3, description='Caching coefficient for optimization'
- )
- caching_cooldown: Optional[int] = Field(
- 3, description='Number of caching cooldown steps'
- )
- caching_warmup: Optional[int] = Field(
- 3, description='Number of caching warmup steps'
- )
- clip_value: Optional[float] = Field(
- 3, description='CLIP value for generation control'
- )
- conditioning_frame_index: Optional[int] = Field(
- 0, description='Index of the conditioning frame'
- )
- cooldown_steps: Optional[int] = Field(
- 75, description='Number of cooldown steps (calculated based on num_frames)'
- )
- fps: Optional[int] = Field(
- 24, description='Frames per second of the generated video'
- )
- guidance_scale: Optional[float] = Field(
- 10, description='Guidance scale for generation control'
- )
- height: Optional[int] = Field(
- 1080, description='Height of the generated video in pixels'
- )
- negative_prompt: Optional[str] = Field(None, description='Negative prompt text')
- num_frames: Optional[int] = Field(64, description='Number of frames to generate')
- seed: Optional[int] = Field(
- None, description='Random seed for generation (default: random)'
- )
- shift_value: Optional[float] = Field(
- 3, description='Shift value for generation control'
- )
- steps: Optional[int] = Field(80, description='Number of denoising steps')
- use_guidance_schedule: Optional[bool] = Field(
- True, description='Whether to use guidance scheduling'
- )
- use_negative_prompts: Optional[bool] = Field(
- False, description='Whether to use negative prompts'
- )
- use_timestep_transform: Optional[bool] = Field(
- True, description='Whether to use timestep transformation'
- )
- warmup_steps: Optional[int] = Field(
- 0, description='Number of warmup steps (calculated based on num_frames)'
- )
- width: Optional[int] = Field(
- 1920, description='Width of the generated video in pixels'
- )
-
-
-class MoonvalleyTextToVideoRequest(BaseModel):
- image_url: Optional[str] = None
- inference_params: Optional[MoonvalleyTextToVideoInferenceParams] = None
- prompt_text: Optional[str] = None
- webhook_url: Optional[str] = None
-
-
-class MoonvalleyUploadFileRequest(BaseModel):
- file: Optional[StrictBytes] = None
-
-
-class MoonvalleyUploadFileResponse(BaseModel):
- access_url: Optional[str] = None
-
-
-class MoonvalleyVideoToVideoInferenceParams(BaseModel):
- add_quality_guidance: Optional[bool] = Field(
- True, description='Whether to add quality guidance'
- )
- caching_coefficient: Optional[float] = Field(
- 0.3, description='Caching coefficient for optimization'
- )
- caching_cooldown: Optional[int] = Field(
- 3, description='Number of caching cooldown steps'
- )
- caching_warmup: Optional[int] = Field(
- 3, description='Number of caching warmup steps'
- )
- clip_value: Optional[float] = Field(
- 3, description='CLIP value for generation control'
- )
- conditioning_frame_index: Optional[int] = Field(
- 0, description='Index of the conditioning frame'
- )
- cooldown_steps: Optional[int] = Field(
- 36, description='Number of cooldown steps (calculated based on num_frames)'
- )
- guidance_scale: Optional[float] = Field(
- 15, description='Guidance scale for generation control'
- )
- negative_prompt: Optional[str] = Field(None, description='Negative prompt text')
- seed: Optional[int] = Field(
- None, description='Random seed for generation (default: random)'
- )
- shift_value: Optional[float] = Field(
- 3, description='Shift value for generation control'
- )
- steps: Optional[int] = Field(80, description='Number of denoising steps')
- use_guidance_schedule: Optional[bool] = Field(
- True, description='Whether to use guidance scheduling'
- )
- use_negative_prompts: Optional[bool] = Field(
- False, description='Whether to use negative prompts'
- )
- use_timestep_transform: Optional[bool] = Field(
- True, description='Whether to use timestep transformation'
- )
- warmup_steps: Optional[int] = Field(
- 24, description='Number of warmup steps (calculated based on num_frames)'
- )
-
-
-class ControlType(str, Enum):
- motion_control = 'motion_control'
- pose_control = 'pose_control'
-
-
-class MoonvalleyVideoToVideoRequest(BaseModel):
- control_type: ControlType = Field(
- ..., description='Supported types for video control'
- )
- inference_params: Optional[MoonvalleyVideoToVideoInferenceParams] = None
- prompt_text: str = Field(..., description='Describes the video to generate')
- video_url: str = Field(..., description='Url to control video')
- webhook_url: Optional[str] = Field(
- None, description='Optional webhook URL for notifications'
- )
diff --git a/comfy_api_nodes/apis/openai.py b/comfy_api_nodes/apis/openai.py
index b85ef252b..bee75d639 100644
--- a/comfy_api_nodes/apis/openai.py
+++ b/comfy_api_nodes/apis/openai.py
@@ -56,14 +56,14 @@ class ModelResponseProperties(BaseModel):
instructions: str | None = Field(None)
max_output_tokens: int | None = Field(None)
model: str | None = Field(None)
- temperature: float | None = Field(1, description="Controls randomness in the response", ge=0.0, le=2.0)
+ temperature: float | None = Field(None, description="Controls randomness in the response", ge=0.0, le=2.0)
top_p: float | None = Field(
- 1,
+ None,
description="Controls diversity of the response via nucleus sampling",
ge=0.0,
le=1.0,
)
- truncation: str | None = Field("disabled", description="Allowed values: 'auto' or 'disabled'")
+ truncation: str | None = Field(None, description="Allowed values: 'auto' or 'disabled'")
class ResponseProperties(BaseModel):
diff --git a/comfy_api_nodes/apis/openrouter.py b/comfy_api_nodes/apis/openrouter.py
new file mode 100644
index 000000000..e30d9bcfb
--- /dev/null
+++ b/comfy_api_nodes/apis/openrouter.py
@@ -0,0 +1,107 @@
+"""Pydantic models for the OpenRouter chat completions API.
+
+See: https://openrouter.ai/docs/api/api-reference/chat/send-chat-completion-request
+"""
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class OpenRouterTextContent(BaseModel):
+    """`text` content block with a required `text` string."""
+    type: Literal["text"] = "text"
+    text: str
+
+
+class OpenRouterImageUrl(BaseModel):
+    """Wrapper around a required image `url`."""
+    url: str
+
+
+class OpenRouterImageContent(BaseModel):
+    """`image_url` content block wrapping an `OpenRouterImageUrl`."""
+    type: Literal["image_url"] = "image_url"
+    image_url: OpenRouterImageUrl
+
+
+class OpenRouterVideoUrl(BaseModel):
+    """Wrapper around a required video `url`."""
+    url: str
+
+
+class OpenRouterVideoContent(BaseModel):
+    """`video_url` content block wrapping an `OpenRouterVideoUrl`."""
+    type: Literal["video_url"] = "video_url"
+    video_url: OpenRouterVideoUrl
+
+
+OpenRouterContentBlock = OpenRouterTextContent | OpenRouterImageContent | OpenRouterVideoContent
+
+
+class OpenRouterMessage(BaseModel):
+    """Chat message: `role` is system/user/assistant; `content` is a string or block list."""
+    role: Literal["system", "user", "assistant"]
+    content: str | list[OpenRouterContentBlock]
+
+
+class OpenRouterReasoningConfig(BaseModel):
+    """Optional reasoning settings: `effort` and `exclude`."""
+    effort: str | None = None
+    exclude: bool | None = Field(default=None, description="If true, model reasons but reasoning is excluded from response.")
+
+
+class OpenRouterWebSearchOptions(BaseModel):
+    """Web search settings holding an optional `search_context_size`."""
+    search_context_size: str | None = None
+
+
+class OpenRouterChatRequest(BaseModel):
+    """Chat request; `model` and `messages` are required, `stream` defaults to False."""
+    model: str
+    messages: list[OpenRouterMessage]
+    seed: int | None = None
+    reasoning: OpenRouterReasoningConfig | None = None
+    web_search_options: OpenRouterWebSearchOptions | None = None
+    stream: bool = False
+
+
+class OpenRouterUsage(BaseModel):
+    """Token counters and `cost`; every field is optional."""
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+    total_tokens: int | None = None
+    cost: float | None = Field(default=None, description="Server-side authoritative USD cost of the call.")
+
+
+class OpenRouterResponseMessage(BaseModel):
+    """Response message with optional `role`, `content`, `reasoning` and `refusal`."""
+    role: str | None = None
+    content: str | None = None
+    reasoning: str | None = None
+    refusal: str | None = None
+
+
+class OpenRouterChoice(BaseModel):
+    """One choice: optional `index`, `message` and `finish_reason`."""
+    index: int | None = None
+    message: OpenRouterResponseMessage | None = None
+    finish_reason: str | None = None
+
+
+class OpenRouterError(BaseModel):
+    """Error payload; `code` may be an int or a string."""
+    code: int | str | None = None
+    message: str | None = None
+    metadata: dict | None = None
+
+
+class OpenRouterChatResponse(BaseModel):
+    """Top-level chat response; every field, including `error`, is optional."""
+    id: str | None = None
+    model: str | None = None
+    object: str | None = None
+    provider: str | None = None
+    choices: list[OpenRouterChoice] | None = None
+    usage: OpenRouterUsage | None = None
+    error: OpenRouterError | None = None
diff --git a/comfy_api_nodes/apis/rodin.py b/comfy_api_nodes/apis/rodin.py
index fc26a6e73..24524d642 100644
--- a/comfy_api_nodes/apis/rodin.py
+++ b/comfy_api_nodes/apis/rodin.py
@@ -1,7 +1,5 @@
-from __future__ import annotations
-
from enum import Enum
-from typing import Optional, List
+
from pydantic import BaseModel, Field
@@ -11,44 +9,76 @@ class Rodin3DGenerateRequest(BaseModel):
material: str = Field(..., description="The material type.")
quality_override: int = Field(..., description="The poly count of the mesh.")
mesh_mode: str = Field(..., description="It controls the type of faces of generated models.")
- TAPose: Optional[bool] = Field(None, description="")
+ TAPose: bool | None = Field(None, description="")
+
+
+class Rodin3DGen25Request(BaseModel):
+ """Gen-2.5 request fields; only `tier` is required, every other field defaults to None."""
+ tier: str = Field(..., description="Gen-2.5 tier (e.g. Gen-2.5-High).")
+ prompt: str | None = Field(None, description="Required for Text-to-3D; ignored otherwise.")
+ seed: int | None = Field(None, description="0-65535.")  # range is stated in the description only; no ge/le constraint here
+ material: str | None = Field(None, description="PBR | Shaded | All | None.")
+ geometry_file_format: str | None = Field(None, description="glb | usdz | fbx | obj | stl.")
+ texture_mode: str | None = Field(None, description="legacy | extreme-low | low | medium | high.")
+ mesh_mode: str | None = Field(None, description="Raw (triangular) | Quad.")
+ quality_override: int | None = Field(None, description="Mesh face count override.")
+ geometry_instruct_mode: str | None = Field(None, description="faithful | creative.")
+ bbox_condition: list[int] | None = Field(None, description="Bounding box [Width(Y), Height(Z), Length(X)] in cm.")
+ height: int | None = Field(None, description="Approximate model height in cm.")
+ TAPose: bool | None = Field(None, description="T/A pose for human-like models.")
+ hd_texture: bool | None = Field(None, description="Enhanced texture quality.")
+ texture_delight: bool | None = Field(None, description="Remove baked lighting from textures.")
+ is_micro: bool | None = Field(None, description="Micro detail (Extreme-High only).")
+ use_original_alpha: bool | None = Field(None, description="Preserve image transparency.")
+ preview_render: bool | None = Field(None, description="Generate high-quality preview render.")
+ addons: list[str] | None = Field(None, description='Optional addons, e.g. ["HighPack"].')
+
class GenerateJobsData(BaseModel):
- uuids: List[str] = Field(..., description="str LIST")
+ uuids: list[str] = Field(..., description="str LIST")
subscription_key: str = Field(..., description="subscription key")
+
class Rodin3DGenerateResponse(BaseModel):
- message: Optional[str] = Field(None, description="Return message.")
- prompt: Optional[str] = Field(None, description="Generated Prompt from image.")
- submit_time: Optional[str] = Field(None, description="Submit Time")
- uuid: Optional[str] = Field(None, description="Task str")
- jobs: Optional[GenerateJobsData] = Field(None, description="Details of jobs")
+ message: str | None = Field(None, description="Return message.")
+ prompt: str | None = Field(None, description="Generated Prompt from image.")
+ submit_time: str | None = Field(None, description="Submit Time")
+ uuid: str | None = Field(None, description="Task str")
+ jobs: GenerateJobsData | None = Field(None, description="Details of jobs")
+
class JobStatus(str, Enum):
"""
Status for jobs
"""
+
Done = "Done"
Failed = "Failed"
Generating = "Generating"
Waiting = "Waiting"
+
class Rodin3DCheckStatusRequest(BaseModel):
subscription_key: str = Field(..., description="subscription from generate endpoint")
+
class JobItem(BaseModel):
uuid: str = Field(..., description="uuid")
- status: JobStatus = Field(...,description="Status Currently")
+ status: JobStatus = Field(..., description="Status Currently")
+
class Rodin3DCheckStatusResponse(BaseModel):
- jobs: List[JobItem] = Field(..., description="Job status List")
+ jobs: list[JobItem] = Field(..., description="Job status List")
+
class Rodin3DDownloadRequest(BaseModel):
task_uuid: str = Field(..., description="Task str")
+
class RodinResourceItem(BaseModel):
url: str = Field(..., description="Download Url")
name: str = Field(..., description="File name with ext")
+
class Rodin3DDownloadResponse(BaseModel):
- list: List[RodinResourceItem] = Field(..., description="Source List")
+ items: list[RodinResourceItem] = Field(..., alias="list", description="Source List")
diff --git a/comfy_api_nodes/apis/stability.py b/comfy_api_nodes/apis/stability.py
index 718360187..5b9b5ac7d 100644
--- a/comfy_api_nodes/apis/stability.py
+++ b/comfy_api_nodes/apis/stability.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
from enum import Enum
from typing import Optional
diff --git a/comfy_api_nodes/apis/topaz.py b/comfy_api_nodes/apis/topaz.py
index a9e6235a7..f91980e3d 100644
--- a/comfy_api_nodes/apis/topaz.py
+++ b/comfy_api_nodes/apis/topaz.py
@@ -1,4 +1,4 @@
-from typing import Optional, Union
+from typing import Optional
from pydantic import BaseModel, Field
@@ -72,8 +72,11 @@ class VideoEnhancementFilter(BaseModel):
grain: Optional[float] = Field(None, description="Grain after AI model processing")
grainSize: Optional[float] = Field(None, description="Size of generated grain")
recoverOriginalDetailValue: Optional[float] = Field(None, description="Source details into the output video")
- creativity: Optional[str] = Field(None, description="Creativity level(high, low) for slc-1 only")
+ creativity: float | str | None = Field(None, description="slc-1/slp-2.5: enum (low/middle/high). ast-2: decimal 0.0-1.0.")
isOptimizedMode: Optional[bool] = Field(None, description="Set to true for Starlight Creative (slc-1) only")
+ prompt: str | None = Field(None, description="Descriptive scene prompt (ast-2 only)")
+ sharp: float | None = Field(None, description="ast-2 pre-enhance sharpness")
+ realism: float | None = Field(None, description="ast-2 realism control")
class OutputInformationVideo(BaseModel):
@@ -90,7 +93,7 @@ class Overrides(BaseModel):
class CreateVideoRequest(BaseModel):
source: CreateVideoRequestSource = Field(...)
- filters: list[Union[VideoFrameInterpolationFilter, VideoEnhancementFilter]] = Field(...)
+ filters: list[VideoFrameInterpolationFilter | VideoEnhancementFilter] = Field(...)
output: OutputInformationVideo = Field(...)
overrides: Overrides = Field(Overrides(isPaidDiffusion=True))
diff --git a/comfy_api_nodes/apis/tripo.py b/comfy_api_nodes/apis/tripo.py
index ffaaa7dc1..bce6b0e89 100644
--- a/comfy_api_nodes/apis/tripo.py
+++ b/comfy_api_nodes/apis/tripo.py
@@ -1,10 +1,11 @@
-from __future__ import annotations
from enum import Enum
-from typing import Optional, List, Dict, Any, Union
+from typing import Optional, Any
from pydantic import BaseModel, Field, RootModel
+
class TripoModelVersion(str, Enum):
+ v3_1_20260211 = 'v3.1-20260211'
v3_0_20250812 = 'v3.0-20250812'
v2_5_20250123 = 'v2.5-20250123'
v2_0_20240919 = 'v2.0-20240919'
@@ -142,7 +143,7 @@ class TripoFileEmptyReference(BaseModel):
pass
class TripoFileReference(RootModel):
- root: Union[TripoFileTokenReference, TripoUrlReference, TripoObjectReference, TripoFileEmptyReference]
+ root: TripoFileTokenReference | TripoUrlReference | TripoObjectReference | TripoFileEmptyReference
class TripoGetStsTokenRequest(BaseModel):
format: str = Field(..., description='The format of the image')
@@ -183,7 +184,7 @@ class TripoImageToModelRequest(BaseModel):
class TripoMultiviewToModelRequest(BaseModel):
type: TripoTaskType = TripoTaskType.MULTIVIEW_TO_MODEL
- files: List[TripoFileReference] = Field(..., description='The file references to convert to a model')
+ files: list[TripoFileReference] = Field(..., description='The file references to convert to a model')
model_version: Optional[TripoModelVersion] = Field(None, description='The model version to use for generation')
orthographic_projection: Optional[bool] = Field(False, description='Whether to use orthographic projection')
face_limit: Optional[int] = Field(None, description='The number of faces to limit the generation to')
@@ -251,27 +252,13 @@ class TripoConvertModelRequest(BaseModel):
with_animation: Optional[bool] = Field(None, description='Whether to include animations')
pack_uv: Optional[bool] = Field(None, description='Whether to pack the UVs')
bake: Optional[bool] = Field(None, description='Whether to bake the model')
- part_names: Optional[List[str]] = Field(None, description='The names of the parts to include')
+ part_names: Optional[list[str]] = Field(None, description='The names of the parts to include')
fbx_preset: Optional[TripoFbxPreset] = Field(None, description='The preset for the FBX export')
export_vertex_colors: Optional[bool] = Field(None, description='Whether to export the vertex colors')
export_orientation: Optional[TripoOrientation] = Field(None, description='The orientation for the export')
animate_in_place: Optional[bool] = Field(None, description='Whether to animate in place')
-class TripoTaskRequest(RootModel):
- root: Union[
- TripoTextToModelRequest,
- TripoImageToModelRequest,
- TripoMultiviewToModelRequest,
- TripoTextureModelRequest,
- TripoRefineModelRequest,
- TripoAnimatePrerigcheckRequest,
- TripoAnimateRigRequest,
- TripoAnimateRetargetRequest,
- TripoStylizeModelRequest,
- TripoConvertModelRequest
- ]
-
class TripoTaskOutput(BaseModel):
model: Optional[str] = Field(None, description='URL to the model')
base_model: Optional[str] = Field(None, description='URL to the base model')
@@ -283,12 +270,13 @@ class TripoTask(BaseModel):
task_id: str = Field(..., description='The task ID')
type: Optional[str] = Field(None, description='The type of task')
status: Optional[TripoTaskStatus] = Field(None, description='The status of the task')
- input: Optional[Dict[str, Any]] = Field(None, description='The input parameters for the task')
+ input: Optional[dict[str, Any]] = Field(None, description='The input parameters for the task')
output: Optional[TripoTaskOutput] = Field(None, description='The output of the task')
progress: Optional[int] = Field(None, description='The progress of the task', ge=0, le=100)
create_time: Optional[int] = Field(None, description='The creation time of the task')
running_left_time: Optional[int] = Field(None, description='The estimated time left for the task')
queue_position: Optional[int] = Field(None, description='The position in the queue')
+ consumed_credit: int | None = Field(None)
class TripoTaskResponse(BaseModel):
code: int = Field(0, description='The response code')
@@ -296,7 +284,7 @@ class TripoTaskResponse(BaseModel):
class TripoGeneralResponse(BaseModel):
code: int = Field(0, description='The response code')
- data: Dict[str, str] = Field(..., description='The task ID data')
+ data: dict[str, str] = Field(..., description='The task ID data')
class TripoBalanceData(BaseModel):
balance: float = Field(..., description='The account balance')
diff --git a/comfy_api_nodes/apis/wan.py b/comfy_api_nodes/apis/wan.py
index 44b65e4f6..c64acae97 100644
--- a/comfy_api_nodes/apis/wan.py
+++ b/comfy_api_nodes/apis/wan.py
@@ -118,7 +118,7 @@ class Wan27ReferenceVideoInputField(BaseModel):
class Wan27ReferenceVideoParametersField(BaseModel):
resolution: str = Field(...)
ratio: str | None = Field(None)
- duration: int = Field(5, ge=2, le=10)
+ duration: int = Field(5, ge=2, le=15)
watermark: bool = Field(False)
seed: int = Field(..., ge=0, le=2147483647)
@@ -157,7 +157,7 @@ class Wan27VideoEditInputField(BaseModel):
class Wan27VideoEditParametersField(BaseModel):
resolution: str = Field(...)
ratio: str | None = Field(None)
- duration: int = Field(0)
+ duration: int | None = Field(0)
audio_setting: str = Field("auto")
watermark: bool = Field(False)
seed: int = Field(..., ge=0, le=2147483647)
diff --git a/comfy_api_nodes/nodes_anthropic.py b/comfy_api_nodes/nodes_anthropic.py
new file mode 100644
index 000000000..7805c96ce
--- /dev/null
+++ b/comfy_api_nodes/nodes_anthropic.py
@@ -0,0 +1,306 @@
+"""API Nodes for Anthropic Claude (Messages API). See: https://docs.anthropic.com/en/api/messages"""
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.anthropic import (
+ AnthropicImageContent,
+ AnthropicImageSourceUrl,
+ AnthropicMessage,
+ AnthropicMessagesRequest,
+ AnthropicMessagesResponse,
+ AnthropicOutputConfig,
+ AnthropicResponseTextBlock,
+ AnthropicRole,
+ AnthropicTextContent,
+ AnthropicThinkingConfig,
+)
+from comfy_api_nodes.util import (
+ ApiEndpoint,
+ get_number_of_images,
+ sync_op,
+ upload_images_to_comfyapi,
+ validate_string,
+)
+
+ANTHROPIC_MESSAGES_ENDPOINT = "/proxy/anthropic/v1/messages"
+ANTHROPIC_IMAGE_MAX_PIXELS = 1568 * 1568
+CLAUDE_MAX_IMAGES = 20
+
+CLAUDE_MODELS: dict[str, str] = {
+ "Opus 4.7": "claude-opus-4-7",
+ "Opus 4.6": "claude-opus-4-6",
+ "Sonnet 4.6": "claude-sonnet-4-6",
+ "Sonnet 4.5": "claude-sonnet-4-5-20250929",
+ "Haiku 4.5": "claude-haiku-4-5-20251001",
+}
+
+_THINKING_UNSUPPORTED = {"Haiku 4.5"}
+# Models that use the newer "adaptive" thinking mode (Opus 4.7 requires it; older models keep the explicit budget API).
+# Anthropic decides the actual budget when adaptive is used, based on the `output_config.effort` hint.
+_ADAPTIVE_THINKING_MODELS = {"Opus 4.7", "Opus 4.6", "Sonnet 4.6"}
+
+# Budget mode (Sonnet 4.5): effort -> reasoning budget in tokens. Must be < max_tokens.
+# Sized so even the "high" budget fits comfortably under the default max_tokens=32768.
+_REASONING_BUDGET: dict[str, int] = {
+ "low": 2048,
+ "medium": 8192,
+ "high": 16384,
+}
+_REASONING_EFFORTS = ["off", "low", "medium", "high"]
+
+
+def _claude_model_inputs(model_label: str):
+ inputs: list = [
+ IO.Int.Input(
+ "max_tokens",
+ default=32768,
+ min=4096,
+ max=64000,
+ tooltip="Maximum number of tokens to generate (includes reasoning tokens when enabled).",
+ advanced=True,
+ ),
+ IO.Float.Input(
+ "temperature",
+ default=1.0,
+ min=0.0,
+ max=1.0,
+ step=0.01,
+ tooltip=(
+ "Controls randomness. 0.0 is deterministic, 1.0 is most random. "
+ "Ignored for Opus 4.7 and any model when reasoning_effort is set."
+ ),
+ advanced=True,
+ ),
+ ]
+ if model_label not in _THINKING_UNSUPPORTED:
+ inputs.append(
+ IO.Combo.Input(
+ "reasoning_effort",
+ options=_REASONING_EFFORTS,
+ default="off",
+ tooltip="Extended thinking effort. 'off' disables reasoning.",
+ advanced=True,
+ )
+ )
+ return inputs
+
+
+def _model_price_per_million(model: str) -> tuple[float, float] | None:
+ """Return (input_per_1M, output_per_1M) USD for a Claude model, or None if unknown."""
+ if "opus-4-7" in model or "opus-4-6" in model or "opus-4-5" in model:
+ return 5.0, 25.0
+ if "sonnet-4" in model:
+ return 3.0, 15.0
+ if "haiku-4-5" in model:
+ return 1.0, 5.0
+ return None
+
+
+def calculate_tokens_price(response: AnthropicMessagesResponse) -> float | None:
+ """Compute approximate USD price from response usage. Server-side billing is authoritative."""
+ if not response.usage or not response.model:
+ return None
+ rates = _model_price_per_million(response.model)
+ if rates is None:
+ return None
+ input_rate, output_rate = rates
+ input_tokens = response.usage.input_tokens or 0
+ output_tokens = response.usage.output_tokens or 0
+ cache_read = response.usage.cache_read_input_tokens or 0
+ cache_5m = 0
+ cache_1h = 0
+ if response.usage.cache_creation:
+ cache_5m = response.usage.cache_creation.ephemeral_5m_input_tokens or 0
+ cache_1h = response.usage.cache_creation.ephemeral_1h_input_tokens or 0
+ total = (
+ input_tokens * input_rate
+ + output_tokens * output_rate
+ + cache_read * input_rate * 0.1
+ + cache_5m * input_rate * 1.25
+ + cache_1h * input_rate * 2.0
+ )
+ return total / 1_000_000.0
+
+
+def _get_text_from_response(response: AnthropicMessagesResponse) -> str:
+ if not response.content:
+ return ""
+ # Thinking blocks are silently dropped — we never want reasoning in the output.
+ return "\n".join(
+ block.text for block in response.content
+ if isinstance(block, AnthropicResponseTextBlock) and block.text
+ )
+
+
+async def _build_image_content_blocks(
+ cls: type[IO.ComfyNode],
+ image_tensors: list[Input.Image],
+) -> list[AnthropicImageContent]:
+ urls = await upload_images_to_comfyapi(
+ cls,
+ image_tensors,
+ max_images=CLAUDE_MAX_IMAGES,
+ total_pixels=ANTHROPIC_IMAGE_MAX_PIXELS,
+ wait_label="Uploading reference images",
+ )
+ return [AnthropicImageContent(source=AnthropicImageSourceUrl(url=url)) for url in urls]
+
+
+class ClaudeNode(IO.ComfyNode):
+ """Generate text responses from an Anthropic Claude model."""
+
+ @classmethod
+ def define_schema(cls):
+ return IO.Schema(
+ node_id="ClaudeNode",
+ display_name="Anthropic Claude",
+ category="text/partner/Anthropic",
+ essentials_category="Text Generation",
+ description="Generate text responses with Anthropic's Claude models. "
+ "Provide a text prompt and optionally one or more images for multimodal context.",
+ inputs=[
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="Text input to the model.",
+ ),
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(label, _claude_model_inputs(label))
+ for label in CLAUDE_MODELS
+ ],
+ tooltip="The Claude model used to generate the response.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ ),
+ IO.Autogrow.Input(
+ "images",
+ template=IO.Autogrow.TemplateNames(
+ IO.Image.Input("image"),
+ names=[f"image_{i}" for i in range(1, CLAUDE_MAX_IMAGES + 1)],
+ min=0,
+ ),
+ tooltip=f"Optional image(s) to use as context for the model. Up to {CLAUDE_MAX_IMAGES} images.",
+ ),
+ IO.String.Input(
+ "system_prompt",
+ multiline=True,
+ default="",
+ optional=True,
+ advanced=True,
+ tooltip="Foundational instructions that dictate the model's behavior.",
+ ),
+ ],
+ outputs=[IO.String.Output()],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+ expr="""
+ (
+ $m := widgets.model;
+ $contains($m, "opus") ? {
+ "type": "list_usd",
+ "usd": [0.005, 0.025],
+ "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+ }
+ : $contains($m, "sonnet") ? {
+ "type": "list_usd",
+ "usd": [0.003, 0.015],
+ "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+ }
+ : $contains($m, "haiku") ? {
+ "type": "list_usd",
+ "usd": [0.001, 0.005],
+ "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+ }
+ : {"type":"text", "text":"Token-based"}
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ prompt: str,
+ model: dict,
+ seed: int,
+ images: dict | None = None,
+ system_prompt: str = "",
+ ) -> IO.NodeOutput:
+ validate_string(prompt, strip_whitespace=True, min_length=1)
+ model_label = model["model"]
+ max_tokens = model.get("max_tokens", 32768)
+ reasoning_effort = model.get("reasoning_effort", "off")
+ thinking_enabled = reasoning_effort not in ("off", None) and model_label not in _THINKING_UNSUPPORTED
+
+ # Anthropic requires temperature to be unset (defaults to 1.0) when thinking is enabled.
+ # Opus 4.7 also rejects user-supplied temperature.
+ if thinking_enabled or model_label == "Opus 4.7":
+ temperature = None
+ else:
+ temperature = model.get("temperature", 1.0)
+
+ thinking_cfg: AnthropicThinkingConfig | None = None
+ output_cfg: AnthropicOutputConfig | None = None
+ if thinking_enabled:
+ if model_label in _ADAPTIVE_THINKING_MODELS:
+ # Adaptive mode - Anthropic chooses the budget based on effort hint
+ thinking_cfg = AnthropicThinkingConfig(type="adaptive")
+ output_cfg = AnthropicOutputConfig(effort=reasoning_effort)
+ else:
+ # Budget mode (Sonnet 4.5). Leave at least 1024 tokens for the actual response
+ budget = _REASONING_BUDGET[reasoning_effort]
+ budget = min(budget, max(1024, max_tokens - 1024))
+ thinking_cfg = AnthropicThinkingConfig(type="enabled", budget_tokens=budget)
+
+ image_tensors: list[Input.Image] = [t for t in (images or {}).values() if t is not None]
+ if sum(get_number_of_images(t) for t in image_tensors) > CLAUDE_MAX_IMAGES:
+ raise ValueError(f"Up to {CLAUDE_MAX_IMAGES} images are supported per request.")
+
+ content: list[AnthropicTextContent | AnthropicImageContent] = []
+ if image_tensors:
+ content.extend(await _build_image_content_blocks(cls, image_tensors))
+ content.append(AnthropicTextContent(text=prompt))
+
+ response = await sync_op(
+ cls,
+ ApiEndpoint(path=ANTHROPIC_MESSAGES_ENDPOINT, method="POST"),
+ response_model=AnthropicMessagesResponse,
+ data=AnthropicMessagesRequest(
+ model=CLAUDE_MODELS[model_label],
+ max_tokens=max_tokens,
+ messages=[AnthropicMessage(role=AnthropicRole.user, content=content)],
+ system=system_prompt or None,
+ temperature=temperature,
+ thinking=thinking_cfg,
+ output_config=output_cfg,
+ ),
+ price_extractor=calculate_tokens_price,
+ )
+ return IO.NodeOutput(_get_text_from_response(response) or "Empty response from Claude model.")
+
+
+class AnthropicExtension(ComfyExtension):
+ @override
+ async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+ return [ClaudeNode]
+
+
+async def comfy_entrypoint() -> AnthropicExtension:
+ return AnthropicExtension()
diff --git a/comfy_api_nodes/nodes_beeble.py b/comfy_api_nodes/nodes_beeble.py
new file mode 100644
index 000000000..6c0a8f050
--- /dev/null
+++ b/comfy_api_nodes/nodes_beeble.py
@@ -0,0 +1,404 @@
+from fractions import Fraction
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input, InputImpl, Types
+from comfy_api_nodes.apis.beeble import (
+ CreateSwitchXRequest,
+ SwitchXStatusResponse,
+)
+from comfy_api_nodes.util import (
+ ApiEndpoint,
+ bytesio_to_image_tensor,
+ convert_mask_to_image,
+ download_url_as_bytesio,
+ download_url_to_image_tensor,
+ download_url_to_video_output,
+ downscale_image_tensor,
+ downscale_video_to_max_pixels,
+ poll_op,
+ sync_op,
+ upload_image_to_comfyapi,
+ upload_video_to_comfyapi,
+ validate_string,
+ validate_video_frame_count,
+)
+
+_MAX_PIXELS = 2_770_000
+_MAX_FRAMES = 240
+_MAX_PROMPT_LEN = 2000
+
+
+def _validate_inputs(prompt: str | None, reference_image: Input.Image | None) -> str | None:
+ """Beeble requires at least one of prompt or reference_image. Returns the cleaned prompt."""
+ cleaned = prompt.strip() if prompt else ""
+ if not cleaned and reference_image is None:
+ raise ValueError("At least one of 'prompt' or 'reference_image' must be provided.")
+ if cleaned:
+ validate_string(cleaned, strip_whitespace=False, max_length=_MAX_PROMPT_LEN)
+ return cleaned or None
+
+
+async def _upload_mask_as_image(
+ cls: type[IO.ComfyNode],
+ mask: Input.Image,
+ *,
+ wait_label: str,
+) -> str:
+ """Encode a single-frame MASK (H, W) or (1, H, W) as a PNG and upload."""
+ if mask.dim() == 2:
+ mask = mask.unsqueeze(0)
+ image = convert_mask_to_image(mask[:1])
+ return await upload_image_to_comfyapi(
+ cls,
+ image,
+ mime_type="image/png",
+ wait_label=wait_label,
+ total_pixels=_MAX_PIXELS,
+ )
+
+
+async def _upload_mask_batch_as_video(
+ cls: type[IO.ComfyNode],
+ mask: Input.Image,
+ *,
+ frame_rate: Fraction,
+ source_frame_count: int,
+ wait_label: str,
+) -> str:
+ """Encode a MASK batch (N, H, W) as a grayscale H.264 MP4 at frame_rate and upload.
+
+ The matte is always downscaled to the pixel budget so it stays within Beeble's limit and
+ keeps the same dimensions as the (similarly downscaled) source — both use the same algorithm
+ from the same starting dimensions, and downscaling is a no-op when already within budget.
+ """
+ if mask.dim() == 2:
+ mask = mask.unsqueeze(0)
+ if mask.shape[0] != source_frame_count:
+ raise ValueError(
+ f"Custom alpha video frame count ({mask.shape[0]}) does not match the "
+ f"source video frame count ({source_frame_count}). The Beeble API requires "
+ "one mask per source frame."
+ )
+ images = downscale_image_tensor(convert_mask_to_image(mask), _MAX_PIXELS)
+ alpha_video = InputImpl.VideoFromComponents(Types.VideoComponents(images=images, audio=None, frame_rate=frame_rate))
+ return await upload_video_to_comfyapi(cls, alpha_video, wait_label=wait_label)
+
+
+def _alpha_mode_input(*, video: bool) -> IO.DynamicCombo.Input:
+ """Build the alpha_mode DynamicCombo with mode-specific extra inputs."""
+ select_keyframe_tooltip = (
+ "First-frame keyframe mask. Beeble propagates this across the video." if video else "Grayscale keyframe mask."
+ )
+ custom_tooltip = (
+ "Per-frame grayscale mask covering the entire video. "
+ "Must have the same frame count as the source. "
+ "Connect a MASK output from SAM3_TrackToMask or similar."
+ if video
+ else "Grayscale mask to apply."
+ )
+ return IO.DynamicCombo.Input(
+ "alpha_mode",
+ tooltip=(
+ "Controls how SwitchX decides what to keep vs. regenerate. "
+ "'auto' isolates the main subject automatically. "
+ "'fill' regenerates the entire frame while preserving geometry. "
+ "'select' propagates a first-frame keyframe across the clip. "
+ "'custom' uses a per-frame alpha matte you provide."
+ ),
+ options=[
+ IO.DynamicCombo.Option("auto", []),
+ IO.DynamicCombo.Option("fill", []),
+ IO.DynamicCombo.Option(
+ "select",
+ [IO.Mask.Input("alpha_keyframe", tooltip=select_keyframe_tooltip)],
+ ),
+ IO.DynamicCombo.Option(
+ "custom",
+ [IO.Mask.Input("alpha_mask", tooltip=custom_tooltip)],
+ ),
+ ],
+ )
+
+
+def _common_inputs(*, source: IO.Input, video: bool) -> list[IO.Input]:
+ return [
+ source,
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip=(
+ "Text description of the desired output (max 2000 chars). "
+ "At least one of 'prompt' or 'reference_image' is required."
+ ),
+ ),
+ IO.Image.Input(
+ "reference_image",
+ optional=True,
+ tooltip=(
+ "Reference image whose look (background, lighting, costume) the result "
+ "should adopt. At least one of 'reference_image' or 'prompt' is required."
+ ),
+ ),
+ _alpha_mode_input(video=video),
+ IO.Combo.Input(
+ "max_resolution",
+ options=["1080p", "720p"],
+ default="1080p",
+ tooltip="Maximum output resolution.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ control_after_generate=True,
+ tooltip=(
+ "Seed controls whether the node should re-run; " "results are non-deterministic regardless of seed."
+ ),
+ ),
+ ]
+
+
+async def _submit_and_poll(
+ cls: type[IO.ComfyNode],
+ request: CreateSwitchXRequest,
+) -> SwitchXStatusResponse:
+ initial = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/beeble/v1/switchx/generations", method="POST"),
+ response_model=SwitchXStatusResponse,
+ data=request,
+ )
+ return await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/beeble/v1/switchx/generations/{initial.id}"),
+ response_model=SwitchXStatusResponse,
+ status_extractor=lambda r: r.status,
+ progress_extractor=lambda r: r.progress,
+ )
+
+
+def _require_output_url(response: SwitchXStatusResponse, name: str) -> str:
+ if response.output is None or getattr(response.output, name) is None:
+ raise RuntimeError(f"Beeble job {response.id} completed without a {name!r} output URL.")
+ return getattr(response.output, name)
+
+
+def _alpha_url(response: SwitchXStatusResponse, mode: str) -> str | None:
+ """URL of the alpha matte, or None when the mode produces no separate matte.
+
+ 'fill' selects the whole frame, so Beeble writes no alpha asset even though the status
+ response still returns a (dangling) signed URL for it — fetching it 403s with S3
+ AccessDenied. The other three modes ('auto', 'custom', 'select') all produce a real,
+ downloadable matte.
+ """
+ if mode == "fill" or response.output is None:
+ return None
+ return response.output.alpha
+
+
+class BeebleSwitchXVideoEdit(IO.ComfyNode):
+
+ @classmethod
+ def define_schema(cls) -> IO.Schema:
+ return IO.Schema(
+ node_id="BeebleSwitchXVideoEdit",
+ display_name="Beeble SwitchX Video Edit",
+ category="api node/video/Beeble",
+ description=(
+ "Edit a video with Beeble SwitchX. Switches anything in the scene (background, "
+ "lighting, costume) while preserving the original subject's pixels and motion. "
+ "Provide a reference image and/or text prompt to describe the new look. "
+ "Max 240 frames, max ~2.77MP per frame."
+ ),
+ inputs=_common_inputs(source=IO.Video.Input("video"), video=True),
+ outputs=[
+ IO.Video.Output(display_name="video"),
+ IO.Video.Output(
+ display_name="alpha",
+ tooltip="The alpha matte Beeble used. Empty for 'fill' mode, which has no separate matte.",
+ ),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["max_resolution"]),
+ expr="""
+ (
+ $rate := widgets.max_resolution = "1080p" ? 0.429 : 0.143;
+ {"type":"usd","usd": $rate, "format":{"suffix":"/30 frames"}}
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ video: Input.Video,
+ prompt: str,
+ alpha_mode: dict,
+ max_resolution: str,
+ seed: int,
+ reference_image: Input.Image | None = None,
+ ) -> IO.NodeOutput:
+ cleaned_prompt = _validate_inputs(prompt, reference_image)
+
+ validate_video_frame_count(video, max_frame_count=_MAX_FRAMES)
+ video = downscale_video_to_max_pixels(video, _MAX_PIXELS)
+
+ mode = alpha_mode["alpha_mode"]
+ alpha_uri: str | None = None
+ if mode == "select":
+ alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_keyframe"], wait_label="Uploading keyframe")
+ elif mode == "custom":
+ alpha_uri = await _upload_mask_batch_as_video(
+ cls,
+ alpha_mode["alpha_mask"],
+ frame_rate=video.get_frame_rate(),
+ source_frame_count=video.get_frame_count(),
+ wait_label="Uploading alpha video",
+ )
+
+ source_uri = await upload_video_to_comfyapi(cls, video, wait_label="Uploading source")
+ reference_uri: str | None = None
+ if reference_image is not None:
+ reference_uri = await upload_image_to_comfyapi(
+ cls,
+ reference_image,
+ mime_type="image/png",
+ wait_label="Uploading reference",
+ total_pixels=_MAX_PIXELS,
+ )
+
+ request = CreateSwitchXRequest(
+ generation_type="video",
+ source_uri=source_uri,
+ alpha_mode=mode,
+ prompt=cleaned_prompt,
+ reference_image_uri=reference_uri,
+ alpha_uri=alpha_uri,
+ max_resolution=1080 if max_resolution == "1080p" else 720,
+ )
+ response = await _submit_and_poll(cls, request)
+
+ render = await download_url_to_video_output(_require_output_url(response, "render"))
+ alpha = None
+ if (alpha_url := _alpha_url(response, mode)) is not None:
+ alpha = await download_url_to_video_output(alpha_url)
+ return IO.NodeOutput(render, alpha)
+
+
+class BeebleSwitchXImageEdit(IO.ComfyNode):
+
+ @classmethod
+ def define_schema(cls) -> IO.Schema:
+ return IO.Schema(
+ node_id="BeebleSwitchXImageEdit",
+ display_name="Beeble SwitchX Image Edit",
+ category="api node/image/Beeble",
+ description=(
+ "Edit a single image with Beeble SwitchX. Switches anything in the scene "
+ "(background, lighting, costume) while preserving the original subject's pixels. "
+ "Provide a reference image and/or text prompt to describe the new look. "
+ "Max ~2.77MP."
+ ),
+ inputs=_common_inputs(source=IO.Image.Input("image"), video=False),
+ outputs=[
+ IO.Image.Output(display_name="image"),
+ IO.Mask.Output(
+ display_name="alpha",
+ tooltip="The alpha matte Beeble used. Empty for 'fill' mode, which has no separate matte.",
+ ),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["max_resolution"]),
+ expr="""
+ (
+ $rate := widgets.max_resolution = "1080p" ? 0.429 : 0.143;
+ {"type":"usd","usd": $rate}
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ image: Input.Image,
+ prompt: str,
+ alpha_mode: dict,
+ max_resolution: str,
+ seed: int,
+ reference_image: Input.Image | None = None,
+ ) -> IO.NodeOutput:
+ cleaned_prompt = _validate_inputs(prompt, reference_image)
+
+ image = downscale_image_tensor(image, _MAX_PIXELS)
+
+ mode = alpha_mode["alpha_mode"]
+ alpha_uri: str | None = None
+ if mode == "select":
+ alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_keyframe"], wait_label="Uploading keyframe")
+ elif mode == "custom":
+ alpha_uri = await _upload_mask_as_image(cls, alpha_mode["alpha_mask"], wait_label="Uploading alpha")
+
+ source_uri = await upload_image_to_comfyapi(
+ cls,
+ image,
+ mime_type="image/png",
+ wait_label="Uploading source",
+ total_pixels=None,
+ )
+ reference_uri: str | None = None
+ if reference_image is not None:
+ reference_uri = await upload_image_to_comfyapi(
+ cls,
+ reference_image,
+ mime_type="image/png",
+ wait_label="Uploading reference",
+ total_pixels=_MAX_PIXELS,
+ )
+
+ request = CreateSwitchXRequest(
+ generation_type="image",
+ source_uri=source_uri,
+ alpha_mode=mode,
+ prompt=cleaned_prompt,
+ reference_image_uri=reference_uri,
+ alpha_uri=alpha_uri,
+ max_resolution=1080 if max_resolution == "1080p" else 720,
+ )
+ response = await _submit_and_poll(cls, request)
+
+ render = await download_url_to_image_tensor(_require_output_url(response, "render"))
+ alpha_mask = None
+ if (alpha_url := _alpha_url(response, mode)) is not None:
+ alpha_image = bytesio_to_image_tensor(await download_url_as_bytesio(alpha_url), mode="L")
+ alpha_mask = alpha_image.squeeze(-1) if alpha_image.dim() == 4 else alpha_image
+ return IO.NodeOutput(render, alpha_mask)
+
+
+class BeebleExtension(ComfyExtension):
+ @override
+ async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+ return [
+ BeebleSwitchXVideoEdit,
+ BeebleSwitchXImageEdit,
+ ]
+
+
+async def comfy_entrypoint() -> BeebleExtension:
+ return BeebleExtension()
diff --git a/comfy_api_nodes/nodes_bfl.py b/comfy_api_nodes/nodes_bfl.py
index 23590bf24..f1a5dc5f0 100644
--- a/comfy_api_nodes/nodes_bfl.py
+++ b/comfy_api_nodes/nodes_bfl.py
@@ -42,7 +42,7 @@ class FluxProUltraImageNode(IO.ComfyNode):
return IO.Schema(
node_id="FluxProUltraImageNode",
display_name="Flux 1.1 [pro] Ultra Image",
- category="api node/image/BFL",
+ category="image/partner/BFL",
description="Generates images using Flux Pro 1.1 Ultra via api based on prompt and resolution.",
inputs=[
IO.String.Input(
@@ -160,7 +160,7 @@ class FluxKontextProImageNode(IO.ComfyNode):
return IO.Schema(
node_id=cls.NODE_ID,
display_name=cls.DISPLAY_NAME,
- category="api node/image/BFL",
+ category="image/partner/BFL",
description="Edits images using Flux.1 Kontext [pro] via api based on prompt and aspect ratio.",
inputs=[
IO.String.Input(
@@ -282,7 +282,7 @@ class FluxProExpandNode(IO.ComfyNode):
return IO.Schema(
node_id="FluxProExpandNode",
display_name="Flux.1 Expand Image",
- category="api node/image/BFL",
+ category="image/partner/BFL",
description="Outpaints image based on prompt.",
inputs=[
IO.Image.Input("image"),
@@ -419,7 +419,7 @@ class FluxProFillNode(IO.ComfyNode):
return IO.Schema(
node_id="FluxProFillNode",
display_name="Flux.1 Fill Image",
- category="api node/image/BFL",
+ category="image/partner/BFL",
description="Inpaints image based on mask and prompt.",
inputs=[
IO.Image.Input("image"),
@@ -545,7 +545,7 @@ class Flux2ProImageNode(IO.ComfyNode):
return IO.Schema(
node_id=cls.NODE_ID,
display_name=cls.DISPLAY_NAME,
- category="api node/image/BFL",
+ category="image/partner/BFL",
description="Generates images synchronously based on prompt and resolution.",
inputs=[
IO.String.Input(
@@ -596,6 +596,7 @@ class Flux2ProImageNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(widgets=["width", "height"], inputs=["images"]),
expr=cls.PRICE_BADGE_EXPR,
),
+ is_deprecated=True,
)
@classmethod
@@ -674,6 +675,175 @@ class Flux2MaxImageNode(Flux2ProImageNode):
"""
+_FLUX2_MODEL_ENDPOINTS = {
+ "Flux.2 [pro]": "/proxy/bfl/flux-2-pro/generate",
+ "Flux.2 [max]": "/proxy/bfl/flux-2-max/generate",
+}
+
+
+def _flux2_model_inputs():
+ return [
+ IO.Int.Input(
+ "width",
+ default=1024,
+ min=256,
+ max=2048,
+ step=32,
+ ),
+ IO.Int.Input(
+ "height",
+ default=768,
+ min=256,
+ max=2048,
+ step=32,
+ ),
+ IO.Autogrow.Input(
+ "images",
+ template=IO.Autogrow.TemplateNames(
+ IO.Image.Input("image"),
+ names=[f"image_{i}" for i in range(1, 9)],
+ min=0,
+ ),
+ tooltip="Optional reference image(s) for image-to-image generation. Up to 8 images.",
+ ),
+ ]
+
+
+class Flux2ImageNode(IO.ComfyNode):
+
+ @classmethod
+ def define_schema(cls) -> IO.Schema:
+ return IO.Schema(
+ node_id="Flux2ImageNode",
+ display_name="Flux.2 Image",
+ category="image/partner/BFL",
+ description="Generate images via Flux.2 [pro] or Flux.2 [max] from a prompt and optional reference images.",
+ inputs=[
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="Prompt for the image generation or edit",
+ ),
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option("Flux.2 [pro]", _flux2_model_inputs()),
+ IO.DynamicCombo.Option("Flux.2 [max]", _flux2_model_inputs()),
+ ],
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=0xFFFFFFFFFFFFFFFF,
+ control_after_generate=True,
+ tooltip="The random seed used for creating the noise.",
+ ),
+ ],
+ outputs=[IO.Image.Output()],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(
+ widgets=["model", "model.width", "model.height"],
+ input_groups=["model.images"],
+ ),
+ expr="""
+ (
+ $isMax := widgets.model = "flux.2 [max]";
+ $MP := 1024 * 1024;
+ $w := $lookup(widgets, "model.width");
+ $h := $lookup(widgets, "model.height");
+ $outMP := $max([1, $floor((($w * $h) + $MP - 1) / $MP)]);
+ $outputCost := $isMax
+ ? (0.07 + 0.03 * ($outMP - 1))
+ : (0.03 + 0.015 * ($outMP - 1));
+ $refMin := $isMax ? 0.03 : 0.015;
+ $refMax := $isMax ? 0.24 : 0.12;
+ $hasRefs := $lookup(inputGroups, "model.images") > 0;
+ $hasRefs
+ ? {
+ "type": "range_usd",
+ "min_usd": $outputCost + $refMin,
+ "max_usd": $outputCost + $refMax,
+ "format": { "approximate": true }
+ }
+ : {"type": "usd", "usd": $outputCost}
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ prompt: str,
+ model: dict,
+ seed: int,
+ ) -> IO.NodeOutput:
+ model_choice = model["model"]
+ endpoint = _FLUX2_MODEL_ENDPOINTS[model_choice]
+ width = model["width"]
+ height = model["height"]
+ images_dict = model.get("images") or {}
+
+ image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
+ n_images = sum(get_number_of_images(t) for t in image_tensors)
+ if n_images > 8:
+ raise ValueError("The current maximum number of supported images is 8.")
+
+ flat_tensors: list[torch.Tensor] = []
+ for tensor in image_tensors:
+ if len(tensor.shape) == 4:
+ flat_tensors.extend(tensor[i] for i in range(tensor.shape[0]))
+ else:
+ flat_tensors.append(tensor)
+
+ reference_images: dict[str, str] = {}
+ for idx, tensor in enumerate(flat_tensors):
+ key_name = f"input_image_{idx + 1}" if idx else "input_image"
+ reference_images[key_name] = tensor_to_base64_string(tensor, total_pixels=2048 * 2048)
+
+ initial_response = await sync_op(
+ cls,
+ ApiEndpoint(path=endpoint, method="POST"),
+ response_model=BFLFluxProGenerateResponse,
+ data=Flux2ProGenerateRequest(
+ prompt=prompt,
+ width=width,
+ height=height,
+ seed=seed,
+ **reference_images,
+ ),
+ )
+
+ def price_extractor(_r: BaseModel) -> float | None:
+ return None if initial_response.cost is None else initial_response.cost / 100
+
+ response = await poll_op(
+ cls,
+ ApiEndpoint(initial_response.polling_url),
+ response_model=BFLFluxStatusResponse,
+ status_extractor=lambda r: r.status,
+ progress_extractor=lambda r: r.progress,
+ price_extractor=price_extractor,
+ completed_statuses=[BFLStatus.ready],
+ failed_statuses=[
+ BFLStatus.request_moderated,
+ BFLStatus.content_moderated,
+ BFLStatus.error,
+ BFLStatus.task_not_found,
+ ],
+ queued_statuses=[],
+ )
+ return IO.NodeOutput(await download_url_to_image_tensor(response.result["sample"]))
+
+
class BFLExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -685,6 +855,7 @@ class BFLExtension(ComfyExtension):
FluxProFillNode,
Flux2ProImageNode,
Flux2MaxImageNode,
+ Flux2ImageNode,
]
diff --git a/comfy_api_nodes/nodes_bria.py b/comfy_api_nodes/nodes_bria.py
index 4044ee3ea..53e763210 100644
--- a/comfy_api_nodes/nodes_bria.py
+++ b/comfy_api_nodes/nodes_bria.py
@@ -31,7 +31,7 @@ class BriaImageEditNode(IO.ComfyNode):
return IO.Schema(
node_id="BriaImageEditNode",
display_name="Bria FIBO Image Edit",
- category="api node/image/Bria",
+ category="image/partner/Bria",
description="Edit images using Bria latest model",
inputs=[
IO.Combo.Input("model", options=["FIBO"]),
@@ -169,7 +169,7 @@ class BriaRemoveImageBackground(IO.ComfyNode):
return IO.Schema(
node_id="BriaRemoveImageBackground",
display_name="Bria Remove Image Background",
- category="api node/image/Bria",
+ category="image/partner/Bria",
description="Remove the background from an image using Bria RMBG 2.0.",
inputs=[
IO.Image.Input("image"),
@@ -245,7 +245,7 @@ class BriaRemoveVideoBackground(IO.ComfyNode):
return IO.Schema(
node_id="BriaRemoveVideoBackground",
display_name="Bria Remove Video Background",
- category="api node/video/Bria",
+ category="video/partner/Bria",
description="Remove the background from a video using Bria. ",
inputs=[
IO.Video.Input("video"),
diff --git a/comfy_api_nodes/nodes_bytedance.py b/comfy_api_nodes/nodes_bytedance.py
index de0c22e70..8ddce2622 100644
--- a/comfy_api_nodes/nodes_bytedance.py
+++ b/comfy_api_nodes/nodes_bytedance.py
@@ -1,23 +1,42 @@
+import hashlib
import logging
import math
+import re
+from io import BytesIO
import torch
from typing_extensions import override
-from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api.latest import IO, ComfyExtension, Input, Types
from comfy_api_nodes.apis.bytedance import (
RECOMMENDED_PRESETS,
RECOMMENDED_PRESETS_SEEDREAM_4,
+ RECOMMENDED_PRESETS_SEEDREAM_4_0,
+ RECOMMENDED_PRESETS_SEEDREAM_4_5,
+ RECOMMENDED_PRESETS_SEEDREAM_5_LITE,
+ SEEDANCE2_PRICE_PER_1K_TOKENS,
+ SEEDANCE2_REF_VIDEO_PIXEL_LIMITS,
VIDEO_TASKS_EXECUTION_TIME,
+ GetAssetResponse,
Image2VideoTaskCreationRequest,
ImageTaskCreationResponse,
+ Seedance2TaskCreationRequest,
+ SeedanceCreateAssetRequest,
+ SeedanceCreateAssetResponse,
+ SeedanceCreateVisualValidateSessionResponse,
+ SeedanceGetVisualValidateSessionResponse,
+ SeedanceVirtualLibraryCreateAssetRequest,
Seedream4Options,
Seedream4TaskCreationRequest,
+ TaskAudioContent,
+ TaskAudioContentUrl,
TaskCreationResponse,
TaskImageContent,
TaskImageContentUrl,
TaskStatusResponse,
TaskTextContent,
+ TaskVideoContent,
+ TaskVideoContentUrl,
Text2ImageTaskCreationRequest,
Text2VideoTaskCreationRequest,
)
@@ -25,33 +44,305 @@ from comfy_api_nodes.util import (
ApiEndpoint,
download_url_to_image_tensor,
download_url_to_video_output,
+ downscale_video_to_max_pixels,
get_number_of_images,
image_tensor_pair_to_batch,
poll_op,
sync_op,
+ upload_audio_to_comfyapi,
+ upload_image_to_comfyapi,
upload_images_to_comfyapi,
+ upload_video_to_comfyapi,
+ upscale_video_to_min_pixels,
validate_image_aspect_ratio,
validate_image_dimensions,
validate_string,
+ validate_video_dimensions,
+ validate_video_duration,
)
+from server import PromptServer
BYTEPLUS_IMAGE_ENDPOINT = "/proxy/byteplus/api/v3/images/generations"
+_VERIFICATION_POLL_TIMEOUT_SEC = 120
+_VERIFICATION_POLL_INTERVAL_SEC = 3
+
SEEDREAM_MODELS = {
"seedream 5.0 lite": "seedream-5-0-260128",
"seedream-4-5-251128": "seedream-4-5-251128",
"seedream-4-0-250828": "seedream-4-0-250828",
}
+SEEDREAM_PRESETS = {
+ "seedream-5-0-260128": RECOMMENDED_PRESETS_SEEDREAM_5_LITE,
+ "seedream-4-5-251128": RECOMMENDED_PRESETS_SEEDREAM_4_5,
+ "seedream-4-0-250828": RECOMMENDED_PRESETS_SEEDREAM_4_0,
+}
+
# Long-running tasks endpoints(e.g., video)
BYTEPLUS_TASK_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks"
BYTEPLUS_TASK_STATUS_ENDPOINT = "/proxy/byteplus/api/v3/contents/generations/tasks" # + /{task_id}
+BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT = "/proxy/byteplus-seedance2/api/v3/contents/generations/tasks" # + /{task_id}
+
+SEEDANCE_MODELS = {
+ "Seedance 2.0": "dreamina-seedance-2-0-260128",
+ "Seedance 2.0 Fast": "dreamina-seedance-2-0-fast-260128",
+}
DEPRECATED_MODELS = {"seedance-1-0-lite-t2v-250428", "seedance-1-0-lite-i2v-250428"}
+
logger = logging.getLogger(__name__)
+def _validate_ref_video_pixels(video: Input.Video, model_id: str, resolution: str, index: int) -> None:
+ """Validate reference video pixel count against Seedance 2.0 model limits for the selected resolution."""
+ model_limits = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id)
+ if not model_limits:
+ return
+ limits = model_limits.get(resolution)
+ if not limits:
+ return
+ try:
+ w, h = video.get_dimensions()
+ except Exception:
+ return
+ pixels = w * h
+ min_px = limits.get("min")
+ max_px = limits.get("max")
+ if min_px and pixels < min_px:
+ raise ValueError(
+ f"Reference video {index} is too small: {w}x{h} = {pixels:,} total pixels. "
+ f"Minimum for this model is {min_px:,} total pixels."
+ )
+ if max_px and pixels > max_px:
+ raise ValueError(
+ f"Reference video {index} is too large: {w}x{h} = {pixels:,} total pixels. "
+ f"Maximum for this model is {max_px:,} total pixels. Try downscaling the video."
+ )
+
+
+async def _resolve_reference_assets(
+ cls: type[IO.ComfyNode],
+ asset_ids: list[str],
+) -> tuple[dict[str, str], dict[str, str], dict[str, str]]:
+ """Look up each asset, validate Active status, group by asset_type.
+
+ Returns (image_assets, video_assets, audio_assets), each mapping asset_id -> "asset://".
+ """
+ image_assets: dict[str, str] = {}
+ video_assets: dict[str, str] = {}
+ audio_assets: dict[str, str] = {}
+ for i, raw_id in enumerate(asset_ids, 1):
+ asset_id = (raw_id or "").strip()
+ if not asset_id:
+ continue
+ result = await sync_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/seedance/assets/{asset_id}"),
+ response_model=GetAssetResponse,
+ )
+ if result.status != "Active":
+ extra = f" {result.error.code}: {result.error.message}" if result.error else ""
+ raise ValueError(f"Reference asset {i} (Id={asset_id}) is not Active (Status={result.status}).{extra}")
+ asset_uri = f"asset://{asset_id}"
+ if result.asset_type == "Image":
+ image_assets[asset_id] = asset_uri
+ elif result.asset_type == "Video":
+ video_assets[asset_id] = asset_uri
+ elif result.asset_type == "Audio":
+ audio_assets[asset_id] = asset_uri
+ return image_assets, video_assets, audio_assets
+
+
+_ASSET_REF_RE = re.compile(r"\basset ?(\d{1,2})\b", re.IGNORECASE)
+
+
+def _build_asset_labels(
+ reference_assets: dict[str, str],
+ image_asset_uris: dict[str, str],
+ video_asset_uris: dict[str, str],
+ audio_asset_uris: dict[str, str],
+ n_reference_images: int,
+ n_reference_videos: int,
+ n_reference_audios: int,
+) -> dict[int, str]:
+ """Map asset slot number (from 'asset_N' keys) to its positional label.
+
+ Asset entries are appended to `content` after the reference_images/videos/audios,
+ so their 1-indexed labels continue from the count of existing same-type refs:
+ one reference_images entry + one Image-type asset -> asset labelled "Image 2".
+ """
+ image_n = n_reference_images
+ video_n = n_reference_videos
+ audio_n = n_reference_audios
+ labels: dict[int, str] = {}
+ for slot_key, raw_id in reference_assets.items():
+ asset_id = (raw_id or "").strip()
+ if not asset_id:
+ continue
+ try:
+ slot_num = int(slot_key.rsplit("_", 1)[-1])
+ except ValueError:
+ continue
+ if asset_id in image_asset_uris:
+ image_n += 1
+ labels[slot_num] = f"Image {image_n}"
+ elif asset_id in video_asset_uris:
+ video_n += 1
+ labels[slot_num] = f"Video {video_n}"
+ elif asset_id in audio_asset_uris:
+ audio_n += 1
+ labels[slot_num] = f"Audio {audio_n}"
+ return labels
+
+
+def _rewrite_asset_refs(prompt: str, labels: dict[int, str]) -> str:
+ """Case-insensitively replace 'assetNN' (1-2 digit) tokens with their labels."""
+ if not labels:
+ return prompt
+
+ def _sub(m: "re.Match[str]") -> str:
+ return labels.get(int(m.group(1)), m.group(0))
+
+ return _ASSET_REF_RE.sub(_sub, prompt)
+
+
+async def _obtain_group_id_via_h5_auth(cls: type[IO.ComfyNode]) -> str:
+ session = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/seedance/visual-validate/sessions", method="POST"),
+ response_model=SeedanceCreateVisualValidateSessionResponse,
+ )
+ logger.warning("Seedance authentication required. Open link: %s", session.h5_link)
+
+ h5_text = f"Open this link in your browser and complete face verification:\n\n{session.h5_link}"
+
+ result = await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/seedance/visual-validate/sessions/{session.session_id}"),
+ response_model=SeedanceGetVisualValidateSessionResponse,
+ status_extractor=lambda r: r.status,
+ completed_statuses=["completed"],
+ failed_statuses=["failed"],
+ poll_interval=_VERIFICATION_POLL_INTERVAL_SEC,
+ max_poll_attempts=(_VERIFICATION_POLL_TIMEOUT_SEC // _VERIFICATION_POLL_INTERVAL_SEC) - 1,
+ estimated_duration=_VERIFICATION_POLL_TIMEOUT_SEC - 1,
+ extra_text=h5_text,
+ )
+
+ if not result.group_id:
+ raise RuntimeError(f"Seedance session {session.session_id} completed without a group_id")
+
+ logger.warning("Seedance authentication complete. New GroupId: %s", result.group_id)
+ PromptServer.instance.send_progress_text(
+ f"Authentication complete. New GroupId: {result.group_id}", cls.hidden.unique_id
+ )
+ return result.group_id
+
+
+async def _resolve_group_id(cls: type[IO.ComfyNode], group_id: str) -> str:
+ if group_id and group_id.strip():
+ return group_id.strip()
+ return await _obtain_group_id_via_h5_auth(cls)
+
+
+async def _create_seedance_asset(
+ cls: type[IO.ComfyNode],
+ *,
+ group_id: str,
+ url: str,
+ name: str,
+ asset_type: str,
+) -> str:
+ req = SeedanceCreateAssetRequest(
+ group_id=group_id,
+ url=url,
+ asset_type=asset_type,
+ name=name or None,
+ )
+ result = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/seedance/assets", method="POST"),
+ response_model=SeedanceCreateAssetResponse,
+ data=req,
+ )
+ return result.asset_id
+
+
+async def _wait_for_asset_active(cls: type[IO.ComfyNode], asset_id: str, group_id: str) -> GetAssetResponse:
+ """Poll the newly created asset until its status becomes Active."""
+ return await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/seedance/assets/{asset_id}"),
+ response_model=GetAssetResponse,
+ status_extractor=lambda r: r.status,
+ completed_statuses=["Active"],
+ failed_statuses=["Failed"],
+ poll_interval=5,
+ max_poll_attempts=1200,
+ extra_text=f"Waiting for asset pre-processing...\n\nasset_id: {asset_id}\n\ngroup_id: {group_id}",
+ )
+
+
+async def _seedance_virtual_library_upload_image_asset(
+ cls: type[IO.ComfyNode],
+ image: torch.Tensor,
+ *,
+ wait_label: str = "Uploading image",
+) -> str:
+ """Upload an image into the caller's per-customer Seedance virtual library."""
+ public_url = await upload_image_to_comfyapi(cls, image, wait_label=wait_label)
+ normalized = image.detach().cpu().contiguous().to(torch.float32)
+ digest = hashlib.sha256()
+ digest.update(str(tuple(normalized.shape)).encode("utf-8"))
+ digest.update(b"\0")
+ digest.update(normalized.numpy().tobytes())
+ image_hash = digest.hexdigest()
+ create_resp = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/seedance/virtual-library/assets", method="POST"),
+ response_model=SeedanceCreateAssetResponse,
+ data=SeedanceVirtualLibraryCreateAssetRequest(url=public_url, hash=image_hash),
+ )
+ await _wait_for_asset_active(cls, create_resp.asset_id, group_id="virtual-library")
+ return f"asset://{create_resp.asset_id}"
+
+
+async def _seedance_virtual_library_upload_video_asset(
+ cls: type[IO.ComfyNode],
+ video: Input.Video,
+ *,
+ wait_label: str = "Uploading video",
+) -> str:
+ buf = BytesIO()
+ video.save_to(buf, format=Types.VideoContainer.MP4, codec=Types.VideoCodec.H264)
+ video_hash = hashlib.sha256(buf.getbuffer()).hexdigest()
+ public_url = await upload_video_to_comfyapi(cls, video, wait_label=wait_label)
+ create_resp = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/seedance/virtual-library/assets", method="POST"),
+ response_model=SeedanceCreateAssetResponse,
+ data=SeedanceVirtualLibraryCreateAssetRequest(url=public_url, hash=video_hash, asset_type="Video"),
+ )
+ await _wait_for_asset_active(cls, create_resp.asset_id, group_id="virtual-library")
+ return f"asset://{create_resp.asset_id}"
+
+
+def _seedance2_price_extractor(model_id: str, has_video_input: bool):
+ """Returns a price_extractor closure for Seedance 2.0 poll_op."""
+ rate = SEEDANCE2_PRICE_PER_1K_TOKENS.get((model_id, has_video_input))
+ if rate is None:
+ return None
+
+ def extractor(response: TaskStatusResponse) -> float | None:
+ if response.usage is None:
+ return None
+ return response.usage.total_tokens * 1.43 * rate / 1_000.0
+
+ return extractor
+
+
def get_image_url_from_response(response: ImageTaskCreationResponse) -> str:
if response.error:
error_msg = f"ByteDance request failed. Code: {response.error['code']}, message: {response.error['message']}"
@@ -68,7 +359,7 @@ class ByteDanceImageNode(IO.ComfyNode):
return IO.Schema(
node_id="ByteDanceImageNode",
display_name="ByteDance Image",
- category="api node/image/ByteDance",
+ category="image/partner/ByteDance",
description="Generate images using ByteDance models via api based on prompt",
inputs=[
IO.Combo.Input("model", options=["seedream-3-0-t2i-250415"]),
@@ -192,7 +483,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
return IO.Schema(
node_id="ByteDanceSeedreamNode",
display_name="ByteDance Seedream 4.5 & 5.0",
- category="api node/image/ByteDance",
+ category="image/partner/ByteDance",
description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.",
inputs=[
IO.Combo.Input(
@@ -303,6 +594,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
)
""",
),
+ is_deprecated=True,
)
@classmethod
@@ -335,8 +627,7 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
mp_provided = out_num_pixels / 1_000_000.0
if ("seedream-4-5" in model or "seedream-5-0" in model) and out_num_pixels < 3686400:
raise ValueError(
- f"Minimum image resolution for the selected model is 3.68MP, "
- f"but {mp_provided:.2f}MP provided."
+ f"Minimum image resolution for the selected model is 3.68MP, " f"but {mp_provided:.2f}MP provided."
)
if "seedream-4-0" in model and out_num_pixels < 921600:
raise ValueError(
@@ -393,6 +684,226 @@ class ByteDanceSeedreamNode(IO.ComfyNode):
return IO.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls]))
+def _seedream_model_inputs(*, max_ref_images: int, presets: list):
+ return [
+ IO.Combo.Input(
+ "size_preset",
+ options=[label for label, _, _ in presets],
+ tooltip="Pick a recommended size. Select Custom to use the width and height below.",
+ ),
+ IO.Int.Input(
+ "width",
+ default=2048,
+ min=1024,
+ max=6240,
+ step=2,
+ tooltip="Custom width for image. Value is working only if `size_preset` is set to `Custom`",
+ ),
+ IO.Int.Input(
+ "height",
+ default=2048,
+ min=1024,
+ max=4992,
+ step=2,
+ tooltip="Custom height for image. Value is working only if `size_preset` is set to `Custom`",
+ ),
+ IO.Int.Input(
+ "max_images",
+ default=1,
+ min=1,
+ max=max_ref_images,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ tooltip="Maximum number of images to generate. With 1, exactly one image is produced. "
+ "With >1, the model generates between 1 and max_images related images "
+ "(e.g., story scenes, character variations). "
+ "Total images (input + generated) cannot exceed 15.",
+ ),
+ IO.Autogrow.Input(
+ "images",
+ template=IO.Autogrow.TemplateNames(
+ IO.Image.Input("image"),
+ names=[f"image_{i}" for i in range(1, max_ref_images + 1)],
+ min=0,
+ ),
+ tooltip=f"Optional reference image(s) for image-to-image or multi-reference generation. "
+ f"Up to {max_ref_images} images.",
+ ),
+ IO.Boolean.Input(
+ "fail_on_partial",
+ default=False,
+ tooltip="If enabled, abort execution if any requested images are missing or return an error.",
+ advanced=True,
+ ),
+ ]
+
+
+class ByteDanceSeedreamNodeV2(IO.ComfyNode):
+
+ @classmethod
+ def define_schema(cls):
+ return IO.Schema(
+ node_id="ByteDanceSeedreamNodeV2",
+ display_name="ByteDance Seedream 4.5 & 5.0",
+ category="image/partner/ByteDance",
+ description="Unified text-to-image generation and precise single-sentence editing at up to 4K resolution.",
+ inputs=[
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="Text prompt for creating or editing an image.",
+ ),
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "seedream 5.0 lite",
+ _seedream_model_inputs(max_ref_images=14, presets=RECOMMENDED_PRESETS_SEEDREAM_5_LITE),
+ ),
+ IO.DynamicCombo.Option(
+ "seedream-4-5-251128",
+ _seedream_model_inputs(max_ref_images=10, presets=RECOMMENDED_PRESETS_SEEDREAM_4_5),
+ ),
+ IO.DynamicCombo.Option(
+ "seedream-4-0-250828",
+ _seedream_model_inputs(max_ref_images=10, presets=RECOMMENDED_PRESETS_SEEDREAM_4_0),
+ ),
+ ],
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed to use for generation.",
+ ),
+ IO.Boolean.Input(
+ "watermark",
+ default=False,
+ tooltip='Whether to add an "AI generated" watermark to the image.',
+ advanced=True,
+ ),
+ ],
+ outputs=[
+ IO.Image.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+ expr="""
+ (
+ $price := $contains(widgets.model, "5.0 lite") ? 0.035 :
+ $contains(widgets.model, "4-5") ? 0.04 : 0.03;
+ {
+ "type":"usd",
+ "usd": $price,
+ "format": { "suffix":" x images/Run", "approximate": true }
+ }
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ prompt: str,
+ model: dict,
+ seed: int = 0,
+ watermark: bool = False,
+ ) -> IO.NodeOutput:
+ validate_string(prompt, strip_whitespace=True, min_length=1)
+ model_id = SEEDREAM_MODELS[model["model"]]
+ presets = SEEDREAM_PRESETS[model_id]
+
+ size_preset = model.get("size_preset", presets[0][0])
+ width = model.get("width", 2048)
+ height = model.get("height", 2048)
+ max_images = model.get("max_images", 1)
+ sequential_image_generation = "disabled" if max_images == 1 else "auto"
+ images_dict = model.get("images") or {}
+ fail_on_partial = model.get("fail_on_partial", False)
+
+ w = h = None
+ for label, tw, th in presets:
+ if label == size_preset:
+ w, h = tw, th
+ break
+ if w is None or h is None:
+ w, h = width, height
+
+ out_num_pixels = w * h
+ mp_provided = out_num_pixels / 1_000_000.0
+ if ("seedream-4-5" in model_id or "seedream-5-0" in model_id) and out_num_pixels < 3686400:
+ raise ValueError(
+ f"Minimum image resolution for the selected model is 3.68MP, but {mp_provided:.2f}MP provided."
+ )
+ if "seedream-4-0" in model_id and out_num_pixels < 921600:
+ raise ValueError(
+ f"Minimum image resolution that the selected model can generate is 0.92MP, "
+ f"but {mp_provided:.2f}MP provided."
+ )
+ if out_num_pixels > 16_777_216:
+ raise ValueError(
+ f"Maximum image resolution for the selected model is 16.78MP, but {mp_provided:.2f}MP provided."
+ )
+
+ image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
+ n_input_images = sum(get_number_of_images(t) for t in image_tensors)
+ max_num_of_images = 14 if model_id == "seedream-5-0-260128" else 10
+ if n_input_images > max_num_of_images:
+ raise ValueError(
+ f"Maximum of {max_num_of_images} reference images are supported, but {n_input_images} received."
+ )
+ if sequential_image_generation == "auto" and n_input_images + max_images > 15:
+ raise ValueError(
+ "The maximum number of generated images plus the number of reference images cannot exceed 15."
+ )
+
+ reference_images_urls: list[str] = []
+ if image_tensors:
+ for tensor in image_tensors:
+ validate_image_aspect_ratio(tensor, (1, 3), (3, 1))
+ reference_images_urls = await upload_images_to_comfyapi(
+ cls,
+ image_tensors,
+ max_images=n_input_images,
+ mime_type="image/png",
+ wait_label="Uploading reference images",
+ )
+
+ response = await sync_op(
+ cls,
+ ApiEndpoint(path=BYTEPLUS_IMAGE_ENDPOINT, method="POST"),
+ response_model=ImageTaskCreationResponse,
+ data=Seedream4TaskCreationRequest(
+ model=model_id,
+ prompt=prompt,
+ image=reference_images_urls,
+ size=f"{w}x{h}",
+ seed=seed,
+ sequential_image_generation=sequential_image_generation,
+ sequential_image_generation_options=Seedream4Options(max_images=max_images),
+ watermark=watermark,
+ ),
+ )
+ if len(response.data) == 1:
+ return IO.NodeOutput(await download_url_to_image_tensor(get_image_url_from_response(response)))
+ urls = [str(d["url"]) for d in response.data if isinstance(d, dict) and "url" in d]
+ if fail_on_partial and len(urls) < len(response.data):
+ raise RuntimeError(f"Only {len(urls)} of {len(response.data)} images were generated before error.")
+ return IO.NodeOutput(torch.cat([await download_url_to_image_tensor(i) for i in urls]))
+
+
class ByteDanceTextToVideoNode(IO.ComfyNode):
@classmethod
@@ -400,7 +911,7 @@ class ByteDanceTextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="ByteDanceTextToVideoNode",
display_name="ByteDance Text to Video",
- category="api node/video/ByteDance",
+ category="video/partner/ByteDance",
description="Generate video using ByteDance models via api based on prompt",
inputs=[
IO.Combo.Input(
@@ -528,7 +1039,7 @@ class ByteDanceImageToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="ByteDanceImageToVideoNode",
display_name="ByteDance Image to Video",
- category="api node/video/ByteDance",
+ category="video/partner/ByteDance",
description="Generate video using ByteDance models via api based on image and prompt",
inputs=[
IO.Combo.Input(
@@ -665,7 +1176,7 @@ class ByteDanceFirstLastFrameNode(IO.ComfyNode):
return IO.Schema(
node_id="ByteDanceFirstLastFrameNode",
display_name="ByteDance First-Last-Frame to Video",
- category="api node/video/ByteDance",
+ category="video/partner/ByteDance",
description="Generate video using prompt and first and last frames.",
inputs=[
IO.Combo.Input(
@@ -813,7 +1324,7 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
return IO.Schema(
node_id="ByteDanceImageReferenceNode",
display_name="ByteDance Reference Images to Video",
- category="api node/video/ByteDance",
+ category="video/partner/ByteDance",
description="Generate video using prompt and reference images.",
inputs=[
IO.Combo.Input(
@@ -952,33 +1463,6 @@ class ByteDanceImageReferenceNode(IO.ComfyNode):
)
-async def process_video_task(
- cls: type[IO.ComfyNode],
- payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest,
- estimated_duration: int | None,
-) -> IO.NodeOutput:
- if payload.model in DEPRECATED_MODELS:
- logger.warning(
- "Model '%s' is deprecated and will be deactivated on May 13, 2026. "
- "Please switch to a newer model. Recommended: seedance-1-0-pro-fast-251015.",
- payload.model,
- )
- initial_response = await sync_op(
- cls,
- ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
- data=payload,
- response_model=TaskCreationResponse,
- )
- response = await poll_op(
- cls,
- ApiEndpoint(path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
- status_extractor=lambda r: r.status,
- estimated_duration=estimated_duration,
- response_model=TaskStatusResponse,
- )
- return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
-
-
def raise_if_text_params(prompt: str, text_params: list[str]) -> None:
for i in text_params:
if f"--{i} " in prompt:
@@ -1040,16 +1524,865 @@ PRICE_BADGE_VIDEO = IO.PriceBadge(
)
+def _seedance2_text_inputs(resolutions: list[str], default_ratio: str = "16:9"):
+ return [
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="Text prompt for video generation.",
+ ),
+ IO.Combo.Input(
+ "resolution",
+ options=resolutions,
+ tooltip="Resolution of the output video.",
+ ),
+ IO.Combo.Input(
+ "ratio",
+ options=["16:9", "4:3", "1:1", "3:4", "9:16", "21:9", "adaptive"],
+ default=default_ratio,
+ tooltip="Aspect ratio of the output video.",
+ ),
+ IO.Int.Input(
+ "duration",
+ default=7,
+ min=4,
+ max=15,
+ step=1,
+ tooltip="Duration of the output video in seconds (4-15).",
+ display_mode=IO.NumberDisplay.slider,
+ ),
+ IO.Boolean.Input(
+ "generate_audio",
+ default=True,
+ tooltip="Enable audio generation for the output video.",
+ ),
+ ]
+
+
+class ByteDance2TextToVideoNode(IO.ComfyNode):
+
+ @classmethod
+ def define_schema(cls):
+ return IO.Schema(
+ node_id="ByteDance2TextToVideoNode",
+ display_name="ByteDance Seedance 2.0 Text to Video",
+ category="video/partner/ByteDance",
+ description="Generate video using Seedance 2.0 models based on a text prompt.",
+ inputs=[
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option("Seedance 2.0", _seedance2_text_inputs(["480p", "720p", "1080p"])),
+ IO.DynamicCombo.Option("Seedance 2.0 Fast", _seedance2_text_inputs(["480p", "720p"])),
+ ],
+ tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ ),
+ IO.Boolean.Input(
+ "watermark",
+ default=False,
+ tooltip="Whether to add a watermark to the video.",
+ advanced=True,
+ ),
+ ],
+ outputs=[
+ IO.Video.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
+ expr="""
+ (
+ $rate480 := 10044;
+ $rate720 := 21600;
+ $rate1080 := 48800;
+ $m := widgets.model;
+ $pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
+ $res := $lookup(widgets, "model.resolution");
+ $dur := $lookup(widgets, "model.duration");
+ $rate := $res = "1080p" ? $rate1080 :
+ $res = "720p" ? $rate720 :
+ $rate480;
+ $cost := $dur * $rate * $pricePer1K / 1000;
+ {"type": "usd", "usd": $cost, "format": {"approximate": true}}
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ model: dict,
+ seed: int,
+ watermark: bool,
+ ) -> IO.NodeOutput:
+ validate_string(model["prompt"], strip_whitespace=True, min_length=1)
+ model_id = SEEDANCE_MODELS[model["model"]]
+ initial_response = await sync_op(
+ cls,
+ ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
+ data=Seedance2TaskCreationRequest(
+ model=model_id,
+ content=[TaskTextContent(text=model["prompt"])],
+ generate_audio=model["generate_audio"],
+ resolution=model["resolution"],
+ ratio=model["ratio"],
+ duration=model["duration"],
+ seed=seed,
+ watermark=watermark,
+ ),
+ response_model=TaskCreationResponse,
+ )
+ response = await poll_op(
+ cls,
+ ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
+ response_model=TaskStatusResponse,
+ status_extractor=lambda r: r.status,
+ price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
+ poll_interval=9,
+ )
+ return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
+
+
+class ByteDance2FirstLastFrameNode(IO.ComfyNode):
+
+ @classmethod
+ def define_schema(cls):
+ return IO.Schema(
+ node_id="ByteDance2FirstLastFrameNode",
+ display_name="ByteDance Seedance 2.0 First-Last-Frame to Video",
+ category="video/partner/ByteDance",
+ description="Generate video using Seedance 2.0 from a first frame image and optional last frame image.",
+ inputs=[
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "Seedance 2.0",
+ _seedance2_text_inputs(["480p", "720p", "1080p"], default_ratio="adaptive"),
+ ),
+ IO.DynamicCombo.Option(
+ "Seedance 2.0 Fast",
+ _seedance2_text_inputs(["480p", "720p"], default_ratio="adaptive"),
+ ),
+ ],
+ tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
+ ),
+ IO.Image.Input(
+ "first_frame",
+ tooltip="First frame image for the video.",
+ optional=True,
+ ),
+ IO.Image.Input(
+ "last_frame",
+ tooltip="Last frame image for the video.",
+ optional=True,
+ ),
+ IO.String.Input(
+ "first_frame_asset_id",
+ default="",
+ tooltip="Seedance asset_id to use as the first frame. "
+ "Mutually exclusive with the first_frame image input.",
+ optional=True,
+ ),
+ IO.String.Input(
+ "last_frame_asset_id",
+ default="",
+ tooltip="Seedance asset_id to use as the last frame. "
+ "Mutually exclusive with the last_frame image input.",
+ optional=True,
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed controls whether the node should re-run; "
+ "results are non-deterministic regardless of seed.",
+ ),
+ IO.Boolean.Input(
+ "watermark",
+ default=False,
+ tooltip="Whether to add a watermark to the video.",
+ advanced=True,
+ ),
+ ],
+ outputs=[
+ IO.Video.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),
+ expr="""
+ (
+ $rate480 := 10044;
+ $rate720 := 21600;
+ $rate1080 := 48800;
+ $m := widgets.model;
+ $pricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
+ $res := $lookup(widgets, "model.resolution");
+ $dur := $lookup(widgets, "model.duration");
+ $rate := $res = "1080p" ? $rate1080 :
+ $res = "720p" ? $rate720 :
+ $rate480;
+ $cost := $dur * $rate * $pricePer1K / 1000;
+ {"type": "usd", "usd": $cost, "format": {"approximate": true}}
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(
+ cls,
+ model: dict,
+ seed: int,
+ watermark: bool,
+ first_frame: Input.Image | None = None,
+ last_frame: Input.Image | None = None,
+ first_frame_asset_id: str = "",
+ last_frame_asset_id: str = "",
+ ) -> IO.NodeOutput:
+ validate_string(model["prompt"], strip_whitespace=True, min_length=1)
+ model_id = SEEDANCE_MODELS[model["model"]]
+
+ first_frame_asset_id = first_frame_asset_id.strip()
+ last_frame_asset_id = last_frame_asset_id.strip()
+
+ if first_frame is not None and first_frame_asset_id:
+ raise ValueError("Provide only one of first_frame or first_frame_asset_id, not both.")
+ if first_frame is None and not first_frame_asset_id:
+ raise ValueError("Either first_frame or first_frame_asset_id is required.")
+ if last_frame is not None and last_frame_asset_id:
+ raise ValueError("Provide only one of last_frame or last_frame_asset_id, not both.")
+
+ asset_ids_to_resolve = [a for a in (first_frame_asset_id, last_frame_asset_id) if a]
+ image_assets: dict[str, str] = {}
+ if asset_ids_to_resolve:
+ image_assets, _, _ = await _resolve_reference_assets(cls, asset_ids_to_resolve)
+ for aid in asset_ids_to_resolve:
+ if aid not in image_assets:
+ raise ValueError(f"Asset {aid} is not an Image asset.")
+
+ if first_frame_asset_id:
+ first_frame_url = image_assets[first_frame_asset_id]
+ else:
+ first_frame_url = await _seedance_virtual_library_upload_image_asset(
+ cls, first_frame, wait_label="Uploading first frame."
+ )
+
+ content: list[TaskTextContent | TaskImageContent] = [
+ TaskTextContent(text=model["prompt"]),
+ TaskImageContent(
+ image_url=TaskImageContentUrl(url=first_frame_url),
+ role="first_frame",
+ ),
+ ]
+ if last_frame_asset_id:
+ content.append(
+ TaskImageContent(
+ image_url=TaskImageContentUrl(url=image_assets[last_frame_asset_id]),
+ role="last_frame",
+ ),
+ )
+ elif last_frame is not None:
+ content.append(
+ TaskImageContent(
+ image_url=TaskImageContentUrl(
+ url=await _seedance_virtual_library_upload_image_asset(
+ cls, last_frame, wait_label="Uploading last frame."
+ )
+ ),
+ role="last_frame",
+ ),
+ )
+
+ initial_response = await sync_op(
+ cls,
+ ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
+ data=Seedance2TaskCreationRequest(
+ model=model_id,
+ content=content,
+ generate_audio=model["generate_audio"],
+ resolution=model["resolution"],
+ ratio=model["ratio"],
+ duration=model["duration"],
+ seed=seed,
+ watermark=watermark,
+ ),
+ response_model=TaskCreationResponse,
+ )
+ response = await poll_op(
+ cls,
+ ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
+ response_model=TaskStatusResponse,
+ status_extractor=lambda r: r.status,
+ price_extractor=_seedance2_price_extractor(model_id, has_video_input=False),
+ poll_interval=9,
+ )
+ return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
+
+
+def _seedance2_reference_inputs(resolutions: list[str], default_ratio: str = "16:9"):
+    """Assemble the nested inputs for one Seedance 2.0 reference-to-video model option.
+
+    The result starts with everything `_seedance2_text_inputs` returns for the given
+    resolutions and default ratio, followed by autogrow groups for reference images
+    (9 slots), videos (3 slots) and audios (3 slots), the `auto_downscale` and
+    `auto_upscale` toggles, and finally an autogrow group of reference asset strings
+    (9 slots).
+    """
+    text_inputs = _seedance2_text_inputs(resolutions, default_ratio=default_ratio)
+
+    image_refs = IO.Autogrow.Input(
+        "reference_images",
+        template=IO.Autogrow.TemplateNames(
+            IO.Image.Input("reference_image"),
+            names=[f"image_{slot}" for slot in range(1, 10)],
+            min=0,
+        ),
+    )
+    video_refs = IO.Autogrow.Input(
+        "reference_videos",
+        template=IO.Autogrow.TemplateNames(
+            IO.Video.Input("reference_video"),
+            names=[f"video_{slot}" for slot in range(1, 4)],
+            min=0,
+        ),
+    )
+    audio_refs = IO.Autogrow.Input(
+        "reference_audios",
+        template=IO.Autogrow.TemplateNames(
+            IO.Audio.Input("reference_audio"),
+            names=[f"audio_{slot}" for slot in range(1, 4)],
+            min=0,
+        ),
+    )
+    downscale_toggle = IO.Boolean.Input(
+        "auto_downscale",
+        default=False,
+        optional=True,
+        tooltip="Automatically downscale reference videos that exceed the model's pixel budget "
+        "for the selected resolution. Aspect ratio is preserved; videos already within limits are untouched.",
+    )
+    upscale_toggle = IO.Boolean.Input(
+        "auto_upscale",
+        default=False,
+        advanced=True,
+        optional=True,
+        tooltip="Automatically upscale reference videos that are below the model's minimum pixel count "
+        "for the selected resolution. Aspect ratio is preserved; videos already meeting the minimum are "
+        "untouched. Note: upscaling a low-resolution source does not add real detail and may produce "
+        "lower-quality generations.",
+    )
+    asset_refs = IO.Autogrow.Input(
+        "reference_assets",
+        template=IO.Autogrow.TemplateNames(
+            IO.String.Input("reference_asset"),
+            names=[f"asset_{slot}" for slot in range(1, 10)],
+            min=0,
+        ),
+    )
+
+    return [
+        *text_inputs,
+        image_refs,
+        video_refs,
+        audio_refs,
+        downscale_toggle,
+        upscale_toggle,
+        asset_refs,
+    ]
+
+
+class ByteDance2ReferenceNode(IO.ComfyNode):
+    """Seedance 2.0 reference-to-video node.
+
+    Accepts reference images, videos and audios either as direct inputs or as asset
+    strings, validates them, submits a generation task and returns the resulting video.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node's inputs, video output, hidden fields and price badge."""
+        return IO.Schema(
+            node_id="ByteDance2ReferenceNode",
+            display_name="ByteDance Seedance 2.0 Reference to Video",
+            category="video/partner/ByteDance",
+            description="Generate, edit, or extend video using Seedance 2.0 with reference images, "
+            "videos, and audio. Supports multimodal reference, video editing, and video extension.",
+            inputs=[
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option(
+                            "Seedance 2.0",
+                            _seedance2_reference_inputs(["480p", "720p", "1080p"], default_ratio="adaptive"),
+                        ),
+                        IO.DynamicCombo.Option(
+                            "Seedance 2.0 Fast",
+                            _seedance2_reference_inputs(["480p", "720p"], default_ratio="adaptive"),
+                        ),
+                    ],
+                    tooltip="Seedance 2.0 for maximum quality; Seedance 2.0 Fast for speed optimization.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+                IO.Boolean.Input(
+                    "watermark",
+                    default=False,
+                    tooltip="Whether to add a watermark to the video.",
+                    advanced=True,
+                ),
+            ],
+            outputs=[
+                IO.Video.Output(),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            # The badge shows a single price without reference videos and a min/max range with them.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(
+                    widgets=["model", "model.resolution", "model.duration"],
+                    input_groups=["model.reference_videos"],
+                ),
+                expr="""
+ (
+ $rate480 := 10044;
+ $rate720 := 21600;
+ $rate1080 := 48800;
+ $m := widgets.model;
+ $hasVideo := $lookup(inputGroups, "model.reference_videos") > 0;
+ $noVideoPricePer1K := $contains($m, "fast") ? 0.008008 : 0.01001;
+ $videoPricePer1K := $contains($m, "fast") ? 0.004719 : 0.006149;
+ $res := $lookup(widgets, "model.resolution");
+ $dur := $lookup(widgets, "model.duration");
+ $rate := $res = "1080p" ? $rate1080 :
+ $res = "720p" ? $rate720 :
+ $rate480;
+ $noVideoCost := $dur * $rate * $noVideoPricePer1K / 1000;
+ $minVideoFactor := $ceil($dur * 5 / 3);
+ $minVideoCost := $minVideoFactor * $rate * $videoPricePer1K / 1000;
+ $maxVideoCost := (15 + $dur) * $rate * $videoPricePer1K / 1000;
+ $hasVideo
+ ? {
+ "type": "range_usd",
+ "min_usd": $minVideoCost,
+ "max_usd": $maxVideoCost,
+ "format": {"approximate": true}
+ }
+ : {
+ "type": "usd",
+ "usd": $noVideoCost,
+ "format": {"approximate": true}
+ }
+ )
+ """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        model: dict,
+        seed: int,
+        watermark: bool,
+    ) -> IO.NodeOutput:
+        """Validate the references, create the generation task and return the video.
+
+        Args:
+            model: Selected model option; read here for "model", "prompt", "resolution",
+                "ratio", "duration", "generate_audio", the optional reference groups and
+                the optional "auto_downscale" / "auto_upscale" flags.
+            seed: Forwarded unchanged in the task creation request.
+            watermark: Forwarded unchanged in the task creation request.
+
+        Raises:
+            ValueError: Raised directly here when no image/video reference (input or
+                asset) is supplied, when more than 9 images, 3 videos or 3 audios are
+                supplied, when a reference video or audio is shorter than 1.8 seconds,
+                or when the summed video or audio duration exceeds 15.1 seconds.
+        """
+        validate_string(model["prompt"], strip_whitespace=True, min_length=1)
+
+        reference_images = model.get("reference_images", {})
+        reference_videos = model.get("reference_videos", {})
+        reference_audios = model.get("reference_audios", {})
+        reference_assets = model.get("reference_assets", {})
+
+        # Asset strings are resolved into three mappings, one per media type.
+        reference_image_assets, reference_video_assets, reference_audio_assets = await _resolve_reference_assets(
+            cls, list(reference_assets.values())
+        )
+
+        # Audio alone is not sufficient: at least one image or video reference must be present.
+        if not reference_images and not reference_videos and not reference_image_assets and not reference_video_assets:
+            raise ValueError("At least one reference image or video or asset is required.")
+
+        # Limits apply to direct inputs and resolved assets combined.
+        total_images = len(reference_images) + len(reference_image_assets)
+        if total_images > 9:
+            raise ValueError(
+                f"Too many reference images: {total_images} "
+                f"(images={len(reference_images)}, image assets={len(reference_image_assets)}). Maximum is 9."
+            )
+        total_videos = len(reference_videos) + len(reference_video_assets)
+        if total_videos > 3:
+            raise ValueError(
+                f"Too many reference videos: {total_videos} "
+                f"(videos={len(reference_videos)}, video assets={len(reference_video_assets)}). Maximum is 3."
+            )
+        total_audios = len(reference_audios) + len(reference_audio_assets)
+        if total_audios > 3:
+            raise ValueError(
+                f"Too many reference audios: {total_audios} "
+                f"(audios={len(reference_audios)}, audio assets={len(reference_audio_assets)}). Maximum is 3."
+            )
+
+        model_id = SEEDANCE_MODELS[model["model"]]
+        # Passed to the price extractor below; true for direct video inputs and video assets alike.
+        has_video_input = total_videos > 0
+
+        # Optional rescaling only touches directly supplied videos, not video assets.
+        if model.get("auto_downscale") and reference_videos:
+            max_px = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}).get(model["resolution"], {}).get("max")
+            if max_px:
+                for key in reference_videos:
+                    reference_videos[key] = downscale_video_to_max_pixels(reference_videos[key], max_px)
+
+        if model.get("auto_upscale") and reference_videos:
+            min_px = SEEDANCE2_REF_VIDEO_PIXEL_LIMITS.get(model_id, {}).get(model["resolution"], {}).get("min")
+            if min_px:
+                for key in reference_videos:
+                    reference_videos[key] = upscale_video_to_min_pixels(reference_videos[key], min_px)
+
+        total_video_duration = 0.0
+        for i, key in enumerate(reference_videos, 1):
+            video = reference_videos[key]
+            _validate_ref_video_pixels(video, model_id, model["resolution"], i)
+            try:
+                dur = video.get_duration()
+                if dur < 1.8:
+                    raise ValueError(f"Reference video {i} is too short: {dur:.1f}s. Minimum duration is 1.8 seconds.")
+                total_video_duration += dur
+            except ValueError:
+                # Validation failures (ours or the video object's) must reach the user.
+                raise
+            except Exception as exc:
+                # Best effort: a video whose duration cannot be read is still submitted, but the
+                # skipped duration checks are logged instead of being dropped silently.
+                logger.warning("Could not determine duration of reference video %d: %s", i, exc)
+        if total_video_duration > 15.1:
+            raise ValueError(f"Total reference video duration is {total_video_duration:.1f}s. Maximum is 15.1 seconds.")
+
+        total_audio_duration = 0.0
+        for i, key in enumerate(reference_audios, 1):
+            audio = reference_audios[key]
+            # Duration in seconds = size of the waveform's last dimension / sample rate.
+            dur = int(audio["waveform"].shape[-1]) / int(audio["sample_rate"])
+            if dur < 1.8:
+                raise ValueError(f"Reference audio {i} is too short: {dur:.1f}s. Minimum duration is 1.8 seconds.")
+            total_audio_duration += dur
+        if total_audio_duration > 15.1:
+            raise ValueError(f"Total reference audio duration is {total_audio_duration:.1f}s. Maximum is 15.1 seconds.")
+
+        asset_labels = _build_asset_labels(
+            reference_assets,
+            reference_image_assets,
+            reference_video_assets,
+            reference_audio_assets,
+            len(reference_images),
+            len(reference_videos),
+            len(reference_audios),
+        )
+        prompt_text = _rewrite_asset_refs(model["prompt"], asset_labels)
+
+        # Content order: prompt text, direct images, direct videos, direct audios,
+        # then image, video and audio assets.
+        content: list[TaskTextContent | TaskImageContent | TaskVideoContent | TaskAudioContent] = [
+            TaskTextContent(text=prompt_text),
+        ]
+        for i, key in enumerate(reference_images, 1):
+            content.append(
+                TaskImageContent(
+                    image_url=TaskImageContentUrl(
+                        url=await _seedance_virtual_library_upload_image_asset(
+                            cls,
+                            reference_images[key],
+                            wait_label=f"Uploading image {i}",
+                        ),
+                    ),
+                    role="reference_image",
+                ),
+            )
+        for i, key in enumerate(reference_videos, 1):
+            content.append(
+                TaskVideoContent(
+                    video_url=TaskVideoContentUrl(
+                        url=await _seedance_virtual_library_upload_video_asset(
+                            cls,
+                            reference_videos[key],
+                            wait_label=f"Uploading video {i}",
+                        ),
+                    ),
+                ),
+            )
+        for key in reference_audios:
+            content.append(
+                TaskAudioContent(
+                    audio_url=TaskAudioContentUrl(
+                        url=await upload_audio_to_comfyapi(
+                            cls,
+                            reference_audios[key],
+                            container_format="mp3",
+                            codec_name="libmp3lame",
+                            mime_type="audio/mpeg",
+                        ),
+                    ),
+                ),
+            )
+        for url in reference_image_assets.values():
+            content.append(
+                TaskImageContent(
+                    image_url=TaskImageContentUrl(url=url),
+                    role="reference_image",
+                ),
+            )
+        for url in reference_video_assets.values():
+            content.append(
+                TaskVideoContent(video_url=TaskVideoContentUrl(url=url)),
+            )
+        for url in reference_audio_assets.values():
+            content.append(
+                TaskAudioContent(audio_url=TaskAudioContentUrl(url=url)),
+            )
+        initial_response = await sync_op(
+            cls,
+            ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST"),
+            data=Seedance2TaskCreationRequest(
+                model=model_id,
+                content=content,
+                generate_audio=model["generate_audio"],
+                resolution=model["resolution"],
+                ratio=model["ratio"],
+                duration=model["duration"],
+                seed=seed,
+                watermark=watermark,
+            ),
+            response_model=TaskCreationResponse,
+        )
+        response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"{BYTEPLUS_SEEDANCE2_TASK_STATUS_ENDPOINT}/{initial_response.id}"),
+            response_model=TaskStatusResponse,
+            status_extractor=lambda r: r.status,
+            price_extractor=_seedance2_price_extractor(model_id, has_video_input=has_video_input),
+            poll_interval=9,
+        )
+        return IO.NodeOutput(await download_url_to_video_output(response.content.video_url))
+
+
+async def process_video_task(
+    cls: type[IO.ComfyNode],
+    payload: Text2VideoTaskCreationRequest | Image2VideoTaskCreationRequest,
+    estimated_duration: int | None,
+) -> IO.NodeOutput:
+    """Create a video task from `payload`, poll its status and return the video.
+
+    A deprecation warning is logged first when `payload.model` is listed in
+    `DEPRECATED_MODELS`; the request is sent regardless.
+    """
+    uses_deprecated_model = payload.model in DEPRECATED_MODELS
+    if uses_deprecated_model:
+        logger.warning(
+            "Model '%s' is deprecated and will be deactivated on May 13, 2026. "
+            "Please switch to a newer model. Recommended: seedance-1-0-pro-fast-251015.",
+            payload.model,
+        )
+
+    create_endpoint = ApiEndpoint(path=BYTEPLUS_TASK_ENDPOINT, method="POST")
+    created_task = await sync_op(
+        cls,
+        create_endpoint,
+        data=payload,
+        response_model=TaskCreationResponse,
+    )
+
+    # The status endpoint is addressed by the ID returned from task creation.
+    status_endpoint = ApiEndpoint(path=f"{BYTEPLUS_TASK_STATUS_ENDPOINT}/{created_task.id}")
+    finished_task = await poll_op(
+        cls,
+        status_endpoint,
+        status_extractor=lambda r: r.status,
+        estimated_duration=estimated_duration,
+        response_model=TaskStatusResponse,
+    )
+
+    video_output = await download_url_to_video_output(finished_task.content.video_url)
+    return IO.NodeOutput(video_output)
+
+
+class ByteDanceCreateImageAsset(IO.ComfyNode):
+    """Node that uploads an image and registers it as a Seedance image asset."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node: an image and an optional group_id in, asset_id and group_id out."""
+        image_input = IO.Image.Input("image", tooltip="Image to register as a personal asset.")
+        group_id_input = IO.String.Input(
+            "group_id",
+            default="",
+            tooltip="Reuse an existing Seedance asset group ID to skip repeated human verification for the "
+            "same person. Leave empty to run real-person authentication in the browser and create a new group.",
+        )
+        return IO.Schema(
+            node_id="ByteDanceCreateImageAsset",
+            display_name="ByteDance Create Image Asset",
+            category="image/partner/ByteDance",
+            description=(
+                "Create a Seedance 2.0 personal image asset. Uploads the input image and "
+                "registers it in the given asset group. If group_id is empty, runs a real-person "
+                "H5 authentication flow to create a new group before adding the asset."
+            ),
+            inputs=[
+                image_input,
+                group_id_input,
+                # IO.String.Input(
+                #     "name",
+                #     default="",
+                #     tooltip="Asset name (up to 64 characters).",
+                # ),
+            ],
+            outputs=[
+                IO.String.Output(display_name="asset_id"),
+                IO.String.Output(display_name="group_id"),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            # is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        image: Input.Image,
+        group_id: str = "",
+        # name: str = "",
+    ) -> IO.NodeOutput:
+        """Validate and upload the image, create the asset and return (asset_id, group_id).
+
+        The image is checked for 300-6000 px width/height and an aspect ratio between
+        0.4:1 and 2.5:1 before anything is uploaded. The asset is created with an
+        empty name, and both IDs are also sent to the UI as progress text.
+        """
+        # if len(name) > 64:
+        #     raise ValueError("Name of asset can not be greater then 64 symbols")
+        validate_image_dimensions(image, min_width=300, max_width=6000, min_height=300, max_height=6000)
+        validate_image_aspect_ratio(image, min_ratio=(0.4, 1), max_ratio=(2.5, 1))
+
+        resolved_group = await _resolve_group_id(cls, group_id)
+        uploaded_url = await upload_image_to_comfyapi(cls, image)
+        asset_id = await _create_seedance_asset(
+            cls,
+            group_id=resolved_group,
+            url=uploaded_url,
+            name="",
+            asset_type="Image",
+        )
+        await _wait_for_asset_active(cls, asset_id, resolved_group)
+
+        reuse_notice = (
+            f"Please save the asset_id and group_id for reuse.\n\nasset_id: {asset_id}\n\n"
+            f"group_id: {resolved_group}"
+        )
+        PromptServer.instance.send_progress_text(reuse_notice, cls.hidden.unique_id)
+        return IO.NodeOutput(asset_id, resolved_group)
+
+
+class ByteDanceCreateVideoAsset(IO.ComfyNode):
+    """Node that uploads a video and registers it as a Seedance video asset."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node: a video and an optional group_id in, asset_id and group_id out."""
+        video_input = IO.Video.Input("video", tooltip="Video to register as a personal asset.")
+        group_id_input = IO.String.Input(
+            "group_id",
+            default="",
+            tooltip="Reuse an existing Seedance asset group ID to skip repeated human verification for the "
+            "same person. Leave empty to run real-person authentication in the browser and create a new group.",
+        )
+        return IO.Schema(
+            node_id="ByteDanceCreateVideoAsset",
+            display_name="ByteDance Create Video Asset",
+            category="video/partner/ByteDance",
+            description=(
+                "Create a Seedance 2.0 personal video asset. Uploads the input video and "
+                "registers it in the given asset group. If group_id is empty, runs a real-person "
+                "H5 authentication flow to create a new group before adding the asset."
+            ),
+            inputs=[
+                video_input,
+                group_id_input,
+                # IO.String.Input(
+                #     "name",
+                #     default="",
+                #     tooltip="Asset name (up to 64 characters).",
+                # ),
+            ],
+            outputs=[
+                IO.String.Output(display_name="asset_id"),
+                IO.String.Output(display_name="group_id"),
+            ],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            # is_api_node=True,
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        group_id: str = "",
+        # name: str = "",
+    ) -> IO.NodeOutput:
+        """Validate and upload the video, create the asset and return (asset_id, group_id).
+
+        Raises:
+            ValueError: Raised directly here when the W/H aspect ratio is outside
+                [0.4, 2.5], the pixel count is outside [409600, 927408], or the frame
+                rate is outside [24, 60]. Duration (2-15) and dimensions (300-6000)
+                are checked by the shared validators first.
+        """
+        # if len(name) > 64:
+        #     raise ValueError("Name of asset can not be greater then 64 symbols")
+        validate_video_duration(video, min_duration=2, max_duration=15)
+        validate_video_dimensions(video, min_width=300, max_width=6000, min_height=300, max_height=6000)
+
+        w, h = video.get_dimensions()
+        # The ratio check is skipped for a zero height to avoid dividing by zero.
+        if h > 0:
+            ratio = w / h
+            ratio_ok = 0.4 <= ratio <= 2.5
+            if not ratio_ok:
+                raise ValueError(f"Asset video aspect ratio (W/H) must be in [0.4, 2.5], got {ratio:.3f} ({w}x{h}).")
+        pixels = w * h
+        pixels_ok = 409_600 <= pixels <= 927_408
+        if not pixels_ok:
+            raise ValueError(
+                f"Asset video total pixels (W×H) must be in [409600, 927408], got {pixels:,} ({w}x{h})."
+            )
+
+        fps = float(video.get_frame_rate())
+        fps_ok = 24 <= fps <= 60
+        if not fps_ok:
+            raise ValueError(f"Asset video FPS must be in [24, 60], got {fps:.2f}.")
+
+        resolved_group = await _resolve_group_id(cls, group_id)
+        uploaded_url = await upload_video_to_comfyapi(cls, video)
+        asset_id = await _create_seedance_asset(
+            cls,
+            group_id=resolved_group,
+            url=uploaded_url,
+            name="",
+            asset_type="Video",
+        )
+        await _wait_for_asset_active(cls, asset_id, resolved_group)
+
+        reuse_notice = (
+            f"Please save the asset_id and group_id for reuse.\n\nasset_id: {asset_id}\n\n"
+            f"group_id: {resolved_group}"
+        )
+        PromptServer.instance.send_progress_text(reuse_notice, cls.hidden.unique_id)
+        return IO.NodeOutput(asset_id, resolved_group)
+
+
class ByteDanceExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+ """Return the ByteDance node classes provided by this extension."""
return [
ByteDanceImageNode,
ByteDanceSeedreamNode,
+ ByteDanceSeedreamNodeV2,
ByteDanceTextToVideoNode,
ByteDanceImageToVideoNode,
ByteDanceFirstLastFrameNode,
ByteDanceImageReferenceNode,
+ ByteDance2TextToVideoNode,
+ ByteDance2FirstLastFrameNode,
+ ByteDance2ReferenceNode,
+ ByteDanceCreateImageAsset,
+ ByteDanceCreateVideoAsset,
]
diff --git a/comfy_api_nodes/nodes_bytedance_llm.py b/comfy_api_nodes/nodes_bytedance_llm.py
new file mode 100644
index 000000000..007cac45f
--- /dev/null
+++ b/comfy_api_nodes/nodes_bytedance_llm.py
@@ -0,0 +1,271 @@
+"""API Nodes for ByteDance Seed LLM via the BytePlus ModelArk Responses API.
+
+See: https://docs.byteplus.com/en/docs/ModelArk/1585128
+"""
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.bytedance_llm import (
+ BytePlusInputImage,
+ BytePlusInputMessage,
+ BytePlusInputText,
+ BytePlusInputVideo,
+ BytePlusMessageContent,
+ BytePlusResponseCreateRequest,
+ BytePlusResponseObject,
+)
+from comfy_api_nodes.util import (
+ ApiEndpoint,
+ get_number_of_images,
+ sync_op,
+ upload_images_to_comfyapi,
+ upload_video_to_comfyapi,
+ validate_string,
+)
+
+# Path handed to ApiEndpoint for the POST issued in ByteDanceSeedNode.execute.
+BYTEPLUS_RESPONSES_ENDPOINT = "/proxy/byteplus/api/v3/responses"
+# Per-request attachment limits; they size the autogrow slots and are enforced in ByteDanceSeedNode.execute.
+SEED_MAX_IMAGES = 20
+SEED_MAX_VIDEOS = 4
+
+# Label shown in the node's model combo -> model ID sent in the request.
+SEED_MODELS: dict[str, str] = {
+ "Seed 2.0 Pro": "seed-2-0-pro-260328",
+ "Seed 2.0 Lite": "seed-2-0-lite-260228",
+ "Seed 2.0 Mini": "seed-2-0-mini-260215",
+}
+
+# USD per 1M tokens: (input, cache_hit_input, output)
+# Keyed by the model IDs in SEED_MODELS and consumed by _calculate_price.
+# NOTE(review): the price badge in ByteDanceSeedNode.define_schema quotes per-1K figures for
+# "mini" (0.00025 / 0.0009) and "lite" (0.0003 input) that are not these rates divided by
+# 1000 - confirm which set of numbers is the intended one.
+_SEED_PRICES_PER_MILLION: dict[str, tuple[float, float, float]] = {
+ "seed-2-0-pro-260328": (0.50, 0.10, 3.00),
+ "seed-2-0-lite-260228": (0.25, 0.05, 2.00),
+ "seed-2-0-mini-260215": (0.10, 0.02, 0.40),
+}
+
+
+def _seed_model_inputs(max_images: int = SEED_MAX_IMAGES, max_videos: int = SEED_MAX_VIDEOS):
+    """Build the nested inputs shared by every Seed model option.
+
+    Returns an autogrow group of up to `max_images` images, an autogrow group of up
+    to `max_videos` videos, and an advanced `temperature` float in [0.0, 2.0].
+    """
+    image_slot_names = [f"image_{i}" for i in range(1, max_images + 1)]
+    video_slot_names = [f"video_{i}" for i in range(1, max_videos + 1)]
+
+    images_input = IO.Autogrow.Input(
+        "images",
+        template=IO.Autogrow.TemplateNames(
+            IO.Image.Input("image"),
+            names=image_slot_names,
+            min=0,
+        ),
+        tooltip=f"Optional image(s) to use as context for the model. Up to {max_images} images.",
+    )
+    videos_input = IO.Autogrow.Input(
+        "videos",
+        template=IO.Autogrow.TemplateNames(
+            IO.Video.Input("video"),
+            names=video_slot_names,
+            min=0,
+        ),
+        tooltip=f"Optional video(s) to use as context for the model. Up to {max_videos} videos.",
+    )
+    temperature_input = IO.Float.Input(
+        "temperature",
+        default=1.0,
+        min=0.0,
+        max=2.0,
+        step=0.01,
+        tooltip="Controls randomness. 0.0 is deterministic, higher values are more random.",
+        advanced=True,
+    )
+    return [images_input, videos_input, temperature_input]
+
+
+def _calculate_price(model_id: str, response: BytePlusResponseObject) -> float | None:
+    """Estimate the USD cost of a response from its reported token usage.
+
+    Returns None when the response carries no usage data or `model_id` has no entry
+    in `_SEED_PRICES_PER_MILLION`. Cached input tokens are charged at the cache-hit
+    rate; the remaining input tokens (never fewer than zero) at the regular input rate.
+    """
+    usage = response.usage
+    if not usage:
+        return None
+    try:
+        input_rate, cache_hit_rate, output_rate = _SEED_PRICES_PER_MILLION[model_id]
+    except KeyError:
+        return None
+
+    token_details = usage.input_tokens_details
+    cached_tokens = (token_details.cached_tokens or 0) if token_details else 0
+    prompt_tokens = usage.input_tokens or 0
+    completion_tokens = usage.output_tokens or 0
+    uncached_tokens = max(0, prompt_tokens - cached_tokens)
+
+    # Rates are per one million tokens, hence the final division.
+    weighted_total = uncached_tokens * input_rate + cached_tokens * cache_hit_rate + completion_tokens * output_rate
+    return weighted_total / 1_000_000.0
+
+
+def _get_text_from_response(response: BytePlusResponseObject) -> str:
+    """Join the text of every `output_text` block found in "message" output items.
+
+    Returns an empty string when the response has no output. Non-empty text blocks are
+    joined with newlines in the order they appear.
+
+    Raises:
+        ValueError: As soon as a block of type "refusal" with a non-empty refusal
+            text is encountered.
+    """
+    if not response.output:
+        return ""
+    collected: list[str] = []
+    message_items = (item for item in response.output if item.type == "message" and item.content)
+    for message in message_items:
+        for part in message.content:
+            if part.type == "refusal" and part.refusal:
+                raise ValueError(f"Model refused to respond: {part.refusal}")
+            if part.type == "output_text" and part.text:
+                collected.append(part.text)
+    return "\n".join(collected)
+
+
+async def _build_image_content_blocks(
+    cls: type[IO.ComfyNode],
+    image_tensors: list[Input.Image],
+) -> list[BytePlusInputImage]:
+    """Upload the images and wrap each returned URL in a `BytePlusInputImage`.
+
+    The upload is capped at `SEED_MAX_IMAGES` images.
+    """
+    uploaded_urls = await upload_images_to_comfyapi(
+        cls,
+        image_tensors,
+        max_images=SEED_MAX_IMAGES,
+        wait_label="Uploading reference images",
+    )
+    image_blocks: list[BytePlusInputImage] = []
+    for uploaded_url in uploaded_urls:
+        image_blocks.append(BytePlusInputImage(image_url=uploaded_url))
+    return image_blocks
+
+
+async def _build_video_content_blocks(
+    cls: type[IO.ComfyNode],
+    videos: list[Input.Video],
+) -> list[BytePlusInputVideo]:
+    """Upload the videos one after another and wrap each URL in a `BytePlusInputVideo`.
+
+    When more than one video is given, the wait label carries a "(position/total)" suffix.
+    """
+    video_count = len(videos)
+    video_blocks: list[BytePlusInputVideo] = []
+    for position, video in enumerate(videos, start=1):
+        if video_count > 1:
+            wait_label = f"Uploading reference video ({position}/{video_count})"
+        else:
+            wait_label = "Uploading reference video"
+        uploaded_url = await upload_video_to_comfyapi(cls, video, wait_label=wait_label)
+        video_blocks.append(BytePlusInputVideo(video_url=uploaded_url))
+    return video_blocks
+
+
+class ByteDanceSeedNode(IO.ComfyNode):
+    """Generate text responses from a ByteDance Seed 2.0 model."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the prompt, model, seed and system-prompt inputs plus the price badge."""
+        # Every model label gets its own copy of the same nested inputs.
+        model_options = [IO.DynamicCombo.Option(label, _seed_model_inputs()) for label in SEED_MODELS]
+        return IO.Schema(
+            node_id="ByteDanceSeedNode",
+            display_name="ByteDance Seed",
+            category="text/partner/ByteDance",
+            essentials_category="Text Generation",
+            description="Generate text responses with ByteDance's Seed 2.0 models. "
+            "Provide a text prompt and optionally one or more images or videos for multimodal context.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text input to the model.",
+                ),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=model_options,
+                    tooltip="The Seed model used to generate the response.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+                IO.String.Input(
+                    "system_prompt",
+                    multiline=True,
+                    default="",
+                    optional=True,
+                    advanced=True,
+                    tooltip="Foundational instructions that dictate the model's behavior.",
+                ),
+            ],
+            outputs=[IO.String.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+                expr="""
+ (
+ $m := widgets.model;
+ $contains($m, "mini") ? {
+ "type": "list_usd",
+ "usd": [0.00025, 0.0009],
+ "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+ }
+ : $contains($m, "lite") ? {
+ "type": "list_usd",
+ "usd": [0.0003, 0.002],
+ "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+ }
+ : $contains($m, "pro") ? {
+ "type": "list_usd",
+ "usd": [0.0005, 0.003],
+ "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+ }
+ : {"type":"text", "text":"Token-based"}
+ )
+ """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: dict,
+        seed: int,
+        system_prompt: str = "",
+    ) -> IO.NodeOutput:
+        """Send the prompt (plus optional images/videos) to the model and return its text.
+
+        Attachments are placed before the prompt text in the user message: images first,
+        then videos. `seed` is not forwarded in the request.
+
+        Raises:
+            ValueError: Raised directly here when more than `SEED_MAX_IMAGES` images or
+                `SEED_MAX_VIDEOS` videos are supplied, when the response carries an
+                error, or when no text could be extracted from it.
+        """
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        selected_label = model["model"]
+        temperature = model["temperature"]
+        model_id = SEED_MODELS[selected_label]
+
+        # Unconnected slots may be None (or the whole group missing); drop them before counting.
+        image_group = model.get("images") or {}
+        image_tensors: list[Input.Image] = [tensor for tensor in image_group.values() if tensor is not None]
+        image_count = sum(get_number_of_images(tensor) for tensor in image_tensors)
+        if image_count > SEED_MAX_IMAGES:
+            raise ValueError(f"Up to {SEED_MAX_IMAGES} images are supported per request.")
+
+        video_group = model.get("videos") or {}
+        video_inputs: list[Input.Video] = [clip for clip in video_group.values() if clip is not None]
+        if len(video_inputs) > SEED_MAX_VIDEOS:
+            raise ValueError(f"Up to {SEED_MAX_VIDEOS} videos are supported per request.")
+
+        message_parts: list[BytePlusMessageContent] = []
+        if image_tensors:
+            message_parts.extend(await _build_image_content_blocks(cls, image_tensors))
+        if video_inputs:
+            message_parts.extend(await _build_video_content_blocks(cls, video_inputs))
+        message_parts.append(BytePlusInputText(text=prompt))
+
+        request = BytePlusResponseCreateRequest(
+            model=model_id,
+            input=[BytePlusInputMessage(role="user", content=message_parts)],
+            # An empty system prompt is sent as None rather than "".
+            instructions=system_prompt or None,
+            temperature=temperature,
+            store=False,
+            stream=False,
+        )
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path=BYTEPLUS_RESPONSES_ENDPOINT, method="POST"),
+            response_model=BytePlusResponseObject,
+            data=request,
+            price_extractor=lambda r: _calculate_price(model_id, r),
+        )
+        if response.error:
+            raise ValueError(f"Seed API error ({response.error.code}): {response.error.message}")
+
+        answer = _get_text_from_response(response)
+        if not answer:
+            raise ValueError("Empty response from Seed model.")
+        return IO.NodeOutput(answer)
+
+
+class ByteDanceLLMExtension(ComfyExtension):
+    """Extension exposing the ByteDance Seed LLM node."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        node_classes: list[type[IO.ComfyNode]] = [ByteDanceSeedNode]
+        return node_classes
+
+
+async def comfy_entrypoint() -> ByteDanceLLMExtension:
+    """Create and return a new `ByteDanceLLMExtension` instance."""
+    extension = ByteDanceLLMExtension()
+    return extension
diff --git a/comfy_api_nodes/nodes_elevenlabs.py b/comfy_api_nodes/nodes_elevenlabs.py
index e452daf77..37eeb2601 100644
--- a/comfy_api_nodes/nodes_elevenlabs.py
+++ b/comfy_api_nodes/nodes_elevenlabs.py
@@ -69,7 +69,7 @@ class ElevenLabsSpeechToText(IO.ComfyNode):
return IO.Schema(
node_id="ElevenLabsSpeechToText",
display_name="ElevenLabs Speech to Text",
- category="api node/audio/ElevenLabs",
+ category="audio/partner/ElevenLabs",
description="Transcribe audio to text. "
"Supports automatic language detection, speaker diarization, and audio event tagging.",
inputs=[
@@ -210,7 +210,7 @@ class ElevenLabsVoiceSelector(IO.ComfyNode):
return IO.Schema(
node_id="ElevenLabsVoiceSelector",
display_name="ElevenLabs Voice Selector",
- category="api node/audio/ElevenLabs",
+ category="audio/partner/ElevenLabs",
description="Select a predefined ElevenLabs voice for text-to-speech generation.",
inputs=[
IO.Combo.Input(
@@ -239,7 +239,7 @@ class ElevenLabsTextToSpeech(IO.ComfyNode):
return IO.Schema(
node_id="ElevenLabsTextToSpeech",
display_name="ElevenLabs Text to Speech",
- category="api node/audio/ElevenLabs",
+ category="audio/partner/ElevenLabs",
description="Convert text to speech.",
inputs=[
IO.Custom(ELEVENLABS_VOICE).Input(
@@ -414,7 +414,7 @@ class ElevenLabsAudioIsolation(IO.ComfyNode):
return IO.Schema(
node_id="ElevenLabsAudioIsolation",
display_name="ElevenLabs Voice Isolation",
- category="api node/audio/ElevenLabs",
+ category="audio/partner/ElevenLabs",
description="Remove background noise from audio, isolating vocals or speech.",
inputs=[
IO.Audio.Input(
@@ -459,7 +459,7 @@ class ElevenLabsTextToSoundEffects(IO.ComfyNode):
return IO.Schema(
node_id="ElevenLabsTextToSoundEffects",
display_name="ElevenLabs Text to Sound Effects",
- category="api node/audio/ElevenLabs",
+ category="audio/partner/ElevenLabs",
description="Generate sound effects from text descriptions.",
inputs=[
IO.String.Input(
@@ -555,7 +555,7 @@ class ElevenLabsInstantVoiceClone(IO.ComfyNode):
return IO.Schema(
node_id="ElevenLabsInstantVoiceClone",
display_name="ElevenLabs Instant Voice Clone",
- category="api node/audio/ElevenLabs",
+ category="audio/partner/ElevenLabs",
description="Create a cloned voice from audio samples. "
"Provide 1-8 audio recordings of the voice to clone.",
inputs=[
@@ -658,7 +658,7 @@ class ElevenLabsSpeechToSpeech(IO.ComfyNode):
return IO.Schema(
node_id="ElevenLabsSpeechToSpeech",
display_name="ElevenLabs Speech to Speech",
- category="api node/audio/ElevenLabs",
+ category="audio/partner/ElevenLabs",
description="Transform speech from one voice to another while preserving the original content and emotion.",
inputs=[
IO.Custom(ELEVENLABS_VOICE).Input(
@@ -793,7 +793,7 @@ class ElevenLabsTextToDialogue(IO.ComfyNode):
return IO.Schema(
node_id="ElevenLabsTextToDialogue",
display_name="ElevenLabs Text to Dialogue",
- category="api node/audio/ElevenLabs",
+ category="audio/partner/ElevenLabs",
description="Generate multi-speaker dialogue from text. Each dialogue entry has its own text and voice.",
inputs=[
IO.Float.Input(
diff --git a/comfy_api_nodes/nodes_gemini.py b/comfy_api_nodes/nodes_gemini.py
index 2b77a022e..3cfd541b2 100644
--- a/comfy_api_nodes/nodes_gemini.py
+++ b/comfy_api_nodes/nodes_gemini.py
@@ -83,13 +83,16 @@ class GeminiImageModel(str, Enum):
async def create_image_parts(
cls: type[IO.ComfyNode],
- images: Input.Image,
+ images: Input.Image | list[Input.Image],
image_limit: int = 0,
) -> list[GeminiPart]:
image_parts: list[GeminiPart] = []
if image_limit < 0:
raise ValueError("image_limit must be greater than or equal to 0 when creating Gemini image parts.")
- total_images = get_number_of_images(images)
+
+ # Accept either a single (possibly-batched) tensor or a list of them; share URL budget across all.
+ images_list: list[Input.Image] = images if isinstance(images, list) else [images]
+ total_images = sum(get_number_of_images(img) for img in images_list)
if total_images <= 0:
raise ValueError("No images provided to create_image_parts; at least one image is required.")
@@ -98,10 +101,18 @@ async def create_image_parts(
# Number of images we'll send as URLs (fileData)
num_url_images = min(effective_max, 10) # Vertex API max number of image links
+ upload_kwargs: dict = {"wait_label": "Uploading reference images"}
+ if effective_max > num_url_images:
+ # Split path (e.g. 11+ images): suppress per-image counter to avoid a confusing dual-fraction label.
+ upload_kwargs = {
+ "wait_label": f"Uploading reference images ({num_url_images}+)",
+ "show_batch_index": False,
+ }
reference_images_urls = await upload_images_to_comfyapi(
cls,
- images,
+ images_list,
max_images=num_url_images,
+ **upload_kwargs,
)
for reference_image_url in reference_images_urls:
image_parts.append(
@@ -112,15 +123,22 @@ async def create_image_parts(
)
)
)
- for idx in range(num_url_images, effective_max):
- image_parts.append(
- GeminiPart(
- inlineData=GeminiInlineData(
- mimeType=GeminiMimeType.image_png,
- data=tensor_to_base64_string(images[idx]),
+ if effective_max > num_url_images:
+ flat: list[torch.Tensor] = []
+ for tensor in images_list:
+ if len(tensor.shape) == 4:
+ flat.extend(tensor[i] for i in range(tensor.shape[0]))
+ else:
+ flat.append(tensor)
+ for idx in range(num_url_images, effective_max):
+ image_parts.append(
+ GeminiPart(
+ inlineData=GeminiInlineData(
+ mimeType=GeminiMimeType.image_png,
+ data=tensor_to_base64_string(flat[idx]),
+ )
)
)
- )
return image_parts
@@ -282,7 +300,7 @@ class GeminiNode(IO.ComfyNode):
return IO.Schema(
node_id="GeminiNode",
display_name="Google Gemini",
- category="api node/text/Gemini",
+ category="text/partner/Gemini",
description="Generate text responses with Google's Gemini AI model. "
"You can provide multiple types of inputs (text, images, audio, video) "
"as context for generating more relevant and meaningful responses.",
@@ -523,7 +541,7 @@ class GeminiInputFiles(IO.ComfyNode):
return IO.Schema(
node_id="GeminiInputFiles",
display_name="Gemini Input Files",
- category="api node/text/Gemini",
+ category="text/partner/Gemini",
description="Loads and prepares input files to include as inputs for Gemini LLM nodes. "
"The files will be read by the Gemini model when generating a response. "
"The contents of the text file count toward the token limit. "
@@ -580,7 +598,7 @@ class GeminiImage(IO.ComfyNode):
return IO.Schema(
node_id="GeminiImageNode",
display_name="Nano Banana (Google Gemini Image)",
- category="api node/image/Gemini",
+ category="image/partner/Gemini",
description="Edit images synchronously via Google API.",
inputs=[
IO.String.Input(
@@ -713,7 +731,7 @@ class GeminiImage2(IO.ComfyNode):
return IO.Schema(
node_id="GeminiImage2Node",
display_name="Nano Banana Pro (Google Gemini Image)",
- category="api node/image/Gemini",
+ category="image/partner/Gemini",
description="Generate or edit images synchronously via Google Vertex API.",
inputs=[
IO.String.Input(
@@ -851,7 +869,7 @@ class GeminiNanoBanana2(IO.ComfyNode):
return IO.Schema(
node_id="GeminiNanoBanana2",
display_name="Nano Banana 2",
- category="api node/image/Gemini",
+ category="image/partner/Gemini",
description="Generate or edit images synchronously via Google Vertex API.",
inputs=[
IO.String.Input(
@@ -891,10 +909,6 @@ class GeminiNanoBanana2(IO.ComfyNode):
"9:16",
"16:9",
"21:9",
- # "1:4",
- # "4:1",
- # "8:1",
- # "1:8",
],
default="auto",
tooltip="If set to 'auto', matches your input image's aspect ratio; "
@@ -902,12 +916,7 @@ class GeminiNanoBanana2(IO.ComfyNode):
),
IO.Combo.Input(
"resolution",
- options=[
- # "512px",
- "1K",
- "2K",
- "4K",
- ],
+ options=["1K", "2K", "4K"],
tooltip="Target output resolution. For 2K/4K the native Gemini upscaler is used.",
),
IO.Combo.Input(
@@ -956,6 +965,7 @@ class GeminiNanoBanana2(IO.ComfyNode):
],
is_api_node=True,
price_badge=GEMINI_IMAGE_2_PRICE_BADGE,
+ is_deprecated=True,
)
@classmethod
@@ -1016,6 +1026,197 @@ class GeminiNanoBanana2(IO.ComfyNode):
)
+def _nano_banana_2_v2_model_inputs():  # Returns the widget list nested under the single "model" DynamicCombo option of GeminiNanoBanana2V2.
+ return [
+ IO.Combo.Input(
+ "aspect_ratio",
+ options=[
+ "auto",  # execute() leaves GeminiImageConfig.aspectRatio unset for this value
+ "1:1",
+ "2:3",
+ "3:2",
+ "3:4",
+ "4:3",
+ "4:5",
+ "5:4",
+ "9:16",
+ "16:9",
+ "21:9",
+ "1:4",  # this and the three ratios below are not offered by the GeminiNanoBanana2 schema, whose list ends at "21:9"
+ "4:1",
+ "8:1",
+ "1:8",
+ ],
+ default="auto",
+ tooltip="If set to 'auto', matches your input image's aspect ratio; "
+ "if no image is provided, a 16:9 square is usually generated.",  # NOTE(review): "16:9 square" is self-contradictory; confirm the intended wording before release
+ ),
+ IO.Combo.Input(
+ "resolution",
+ options=["1K", "2K", "4K"],  # passed unchanged to GeminiImageConfig(imageSize=...) in execute()
+ tooltip="Target output resolution. For 2K/4K the native Gemini upscaler is used.",
+ ),
+ IO.Combo.Input(
+ "thinking_level",
+ options=["MINIMAL", "HIGH"],  # forwarded as GeminiThinkingConfig(thinkingLevel=...) in execute()
+ ),
+ IO.Autogrow.Input(
+ "images",
+ template=IO.Autogrow.TemplateNames(
+ IO.Image.Input("image"),
+ names=[f"image_{i}" for i in range(1, 15)],  # image_1 .. image_14
+ min=0,  # presumably allows the node to run with no image connected; confirm Autogrow semantics
+ ),
+ tooltip="Optional reference image(s). Up to 14 images total.",
+ ),
+ IO.Custom("GEMINI_INPUT_FILES").Input(
+ "files",
+ optional=True,
+ tooltip="Optional file(s) to use as context for the model. "
+ "Accepts inputs from the Gemini Generate Content Input Files node.",
+ ),
+ ]
+
+
+class GeminiNanoBanana2V2(IO.ComfyNode):  # API node that generates or edits images via the Vertex Gemini proxy; per-model widgets are nested under the "model" DynamicCombo.
+
+ @classmethod
+ def define_schema(cls):  # Declares the node's inputs, outputs, hidden auth fields and price badge.
+ return IO.Schema(
+ node_id="GeminiNanoBanana2V2",
+ display_name="Nano Banana 2",  # same display name as GeminiNanoBanana2, which this patch marks is_deprecated=True
+ category="image/partner/Gemini",
+ description="Generate or edit images synchronously via Google Vertex API.",
+ inputs=[
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ tooltip="Text prompt describing the image to generate or the edits to apply. "
+ "Include any constraints, styles, or details the model should follow.",
+ default="",
+ ),
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "Nano Banana 2 (Gemini 3.1 Flash Image)",  # label that execute() maps to the backend model id
+ _nano_banana_2_v2_model_inputs(),  # nested widgets; execute() reads them from the `model` dict
+ ),
+ ],
+ ),
+ IO.Int.Input(
+ "seed",
+ default=42,
+ min=0,
+ max=0xFFFFFFFFFFFFFFFF,  # 2**64 - 1
+ control_after_generate=True,
+ tooltip="When the seed is fixed to a specific value, the model makes a best effort to provide "
+ "the same response for repeated requests. Deterministic output isn't guaranteed. "
+ "Also, changing the model or parameter settings, such as the temperature, "
+ "can cause variations in the response even when you use the same seed value. "
+ "By default, a random seed value is used.",
+ ),
+ IO.Combo.Input(
+ "response_modalities",
+ options=["IMAGE", "IMAGE+TEXT"],
+ advanced=True,
+ ),
+ IO.String.Input(
+ "system_prompt",
+ multiline=True,
+ default=GEMINI_IMAGE_SYS_PROMPT,
+ optional=True,
+ tooltip="Foundational instructions that dictate an AI's behavior.",
+ advanced=True,
+ ),
+ ],
+ outputs=[
+ IO.Image.Output(),
+ IO.String.Output(),
+ IO.Image.Output(
+ display_name="thought_image",
+ tooltip="First image from the model's thinking process. "
+ "Only available with thinking_level HIGH and IMAGE+TEXT modality.",
+ ),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution"]),  # nested widget is addressed by its dotted path
+ expr="""
+ (
+ $r := $lookup(widgets, "model.resolution");
+ $prices := {"1k": 0.0696, "2k": 0.1014, "4k": 0.154};
+ {"type":"usd","usd": $lookup($prices, $r), "format":{"suffix":"/Image","approximate":true}}
+ )
+ """,  # NOTE(review): price keys are lowercase ("1k") while the combo options are "1K"/"2K"/"4K"; presumably the badge evaluator lowercases widget values — confirm
+ ),
+ )
+
+ @classmethod
+ async def execute(  # Builds one generate-content request from the inputs and returns (image, text, thought_image).
+ cls,
+ prompt: str,
+ model: dict,  # DynamicCombo value: "model" holds the selected label, sibling keys hold the nested widget values
+ seed: int,  # NOTE(review): never referenced in the body, so it is not sent with the request — confirm against the seed tooltip
+ response_modalities: str,
+ system_prompt: str = "",
+ ) -> IO.NodeOutput:
+ validate_string(prompt, strip_whitespace=True, min_length=1)  # presumably rejects an empty or whitespace-only prompt
+ model_choice = model["model"]
+ if model_choice == "Nano Banana 2 (Gemini 3.1 Flash Image)":
+ model_id = "gemini-3.1-flash-image-preview"
+ else:
+ model_id = model_choice  # fallback: use the label itself as the model id; not hit by the single option the schema defines
+
+ images = model.get("images") or {}  # missing or None is treated as "no images"
+ parts: list[GeminiPart] = [GeminiPart(text=prompt)]  # the prompt text is always the first part
+ if images:
+ image_tensors: list[Input.Image] = [t for t in images.values() if t is not None]  # drop unconnected slots
+ if image_tensors:
+ if sum(get_number_of_images(t) for t in image_tensors) > 14:  # limit applies to the total across all slots
+ raise ValueError("The current maximum number of supported images is 14.")
+ parts.extend(await create_image_parts(cls, image_tensors))
+ files = model.get("files")
+ if files is not None:
+ parts.extend(files)  # assumes `files` is an iterable of request parts — TODO confirm against GeminiInputFiles output
+
+ image_config = GeminiImageConfig(imageSize=model["resolution"])
+ if model["aspect_ratio"] != "auto":  # "auto" leaves aspectRatio unset
+ image_config.aspectRatio = model["aspect_ratio"]
+
+ gemini_system_prompt = None
+ if system_prompt:  # an empty string sends no systemInstruction
+ gemini_system_prompt = GeminiSystemInstructionContent(parts=[GeminiTextPart(text=system_prompt)], role=None)
+
+ response = await sync_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/vertexai/gemini/{model_id}", method="POST"),
+ data=GeminiImageGenerateContentRequest(
+ contents=[
+ GeminiContent(role=GeminiRole.user, parts=parts),
+ ],
+ generationConfig=GeminiImageGenerationConfig(
+ responseModalities=(["IMAGE"] if response_modalities == "IMAGE" else ["TEXT", "IMAGE"]),  # any value other than "IMAGE" requests text and image
+ imageConfig=image_config,
+ thinkingConfig=GeminiThinkingConfig(thinkingLevel=model["thinking_level"]),
+ ),
+ systemInstruction=gemini_system_prompt,
+ ),
+ response_model=GeminiGenerateContentResponse,
+ price_extractor=calculate_tokens_price,
+ )
+ return IO.NodeOutput(
+ await get_image_from_response(response),
+ get_text_from_response(response),
+ await get_image_from_response(response, thought=True),  # feeds the "thought_image" output declared in the schema
+ )
+
+
class GeminiExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -1024,6 +1225,7 @@ class GeminiExtension(ComfyExtension):
GeminiImage,
GeminiImage2,
GeminiNanoBanana2,
+ GeminiNanoBanana2V2,
GeminiInputFiles,
]
diff --git a/comfy_api_nodes/nodes_grok.py b/comfy_api_nodes/nodes_grok.py
index f42d84616..43e3cdc26 100644
--- a/comfy_api_nodes/nodes_grok.py
+++ b/comfy_api_nodes/nodes_grok.py
@@ -49,12 +49,17 @@ class GrokImageNode(IO.ComfyNode):
return IO.Schema(
node_id="GrokImageNode",
display_name="Grok Image",
- category="api node/image/Grok",
+ category="image/partner/Grok",
description="Generate images using Grok based on a text prompt",
inputs=[
IO.Combo.Input(
"model",
- options=["grok-imagine-image-pro", "grok-imagine-image", "grok-imagine-image-beta"],
+ options=[
+ "grok-imagine-image-quality",
+ "grok-imagine-image-pro",
+ "grok-imagine-image",
+ "grok-imagine-image-beta",
+ ],
),
IO.String.Input(
"prompt",
@@ -111,10 +116,12 @@ class GrokImageNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["model", "number_of_images"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "number_of_images", "resolution"]),
expr="""
(
- $rate := $contains(widgets.model, "pro") ? 0.07 : 0.02;
+ $rate := widgets.model = "grok-imagine-image-quality"
+ ? (widgets.resolution = "1k" ? 0.05 : 0.07)
+ : ($contains(widgets.model, "pro") ? 0.07 : 0.02);
{"type":"usd","usd": $rate * widgets.number_of_images}
)
""",
@@ -155,6 +162,61 @@ class GrokImageNode(IO.ComfyNode):
)
+_GROK_IMAGE_EDIT_ASPECT_RATIO_OPTIONS = [  # Choices for the optional "aspect_ratio" combo built by _grok_image_edit_model_inputs.
+ "auto",  # GrokImageEditNodeV2.execute() sends aspect_ratio=None for this value
+ "1:1",
+ "2:3",
+ "3:2",
+ "3:4",
+ "4:3",
+ "9:16",
+ "16:9",
+ "9:19.5",
+ "19.5:9",
+ "9:20",
+ "20:9",
+ "1:2",
+ "2:1",
+]
+
+
+def _grok_image_edit_model_inputs(*, max_ref_images: int, with_aspect_ratio: bool):  # Builds the per-model widget list for one DynamicCombo option of GrokImageEditNodeV2; both arguments are keyword-only.
+ inputs = [
+ IO.Autogrow.Input(
+ "images",
+ template=IO.Autogrow.TemplateNames(
+ IO.Image.Input("image"),
+ names=[f"image_{i}" for i in range(1, max_ref_images + 1)],  # image_1 .. image_<max_ref_images>
+ min=1,  # presumably requires at least one connected image; execute() also checks this — confirm Autogrow semantics
+ ),
+ tooltip=(
+ "Reference image to edit."
+ if max_ref_images == 1
+ else f"Reference image(s) to edit. Up to {max_ref_images} images."
+ ),  # singular wording when only one slot exists
+ ),
+ IO.Combo.Input("resolution", options=["1K", "2K"]),  # execute() lowercases this before sending it
+ IO.Int.Input(
+ "number_of_images",
+ default=1,
+ min=1,
+ max=10,
+ step=1,
+ tooltip="Number of edited images to generate",
+ display_mode=IO.NumberDisplay.number,
+ ),
+ ]
+ if with_aspect_ratio:  # the schema passes False for the pro option, which takes a single image
+ inputs.append(
+ IO.Combo.Input(
+ "aspect_ratio",
+ options=_GROK_IMAGE_EDIT_ASPECT_RATIO_OPTIONS,
+ tooltip="Only allowed when multiple images are connected.",
+ )
+ )
+ return inputs
+
+
class GrokImageEditNode(IO.ComfyNode):
@classmethod
@@ -162,12 +224,17 @@ class GrokImageEditNode(IO.ComfyNode):
return IO.Schema(
node_id="GrokImageEditNode",
display_name="Grok Image Edit",
- category="api node/image/Grok",
+ category="image/partner/Grok",
description="Modify an existing image based on a text prompt",
inputs=[
IO.Combo.Input(
"model",
- options=["grok-imagine-image-pro", "grok-imagine-image", "grok-imagine-image-beta"],
+ options=[
+ "grok-imagine-image-quality",
+ "grok-imagine-image-pro",
+ "grok-imagine-image",
+ "grok-imagine-image-beta",
+ ],
),
IO.Image.Input("image", display_name="images"),
IO.String.Input(
@@ -228,14 +295,23 @@ class GrokImageEditNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["model", "number_of_images"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "number_of_images", "resolution"]),
expr="""
(
- $rate := $contains(widgets.model, "pro") ? 0.07 : 0.02;
- {"type":"usd","usd": 0.002 + $rate * widgets.number_of_images}
+ $isQualityModel := widgets.model = "grok-imagine-image-quality";
+ $isPro := $contains(widgets.model, "pro");
+ $rate := $isQualityModel
+ ? (widgets.resolution = "1k" ? 0.05 : 0.07)
+ : ($isPro ? 0.07 : 0.02);
+ $base := $isQualityModel ? 0.01 : 0.002;
+ $output := $rate * widgets.number_of_images;
+ $isPro
+ ? {"type":"usd","usd": $base + $output}
+ : {"type":"range_usd","min_usd": $base + $output, "max_usd": 3 * $base + $output}
)
""",
),
+ is_deprecated=True,
)
@classmethod
@@ -283,6 +359,143 @@ class GrokImageEditNode(IO.ComfyNode):
)
+class GrokImageEditNodeV2(IO.ComfyNode):  # API node that edits one or more input images with a Grok image model; per-model widgets are nested under the "model" DynamicCombo.
+
+ @classmethod
+ def define_schema(cls):  # Declares the node's inputs, output, hidden auth fields and price badge.
+ return IO.Schema(
+ node_id="GrokImageEditNodeV2",
+ display_name="Grok Image Edit",  # same display name as GrokImageEditNode, which this patch marks is_deprecated=True
+ category="image/partner/Grok",
+ description="Modify an existing image based on a text prompt",
+ inputs=[
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="The text prompt used to generate the image",
+ ),
+ IO.DynamicCombo.Input(
+ "model",
+ options=[  # "grok-imagine-image-beta" from the GrokImageEditNode combo is not offered here
+ IO.DynamicCombo.Option(
+ "grok-imagine-image-quality",
+ _grok_image_edit_model_inputs(max_ref_images=3, with_aspect_ratio=True),
+ ),
+ IO.DynamicCombo.Option(
+ "grok-imagine-image-pro",
+ _grok_image_edit_model_inputs(max_ref_images=1, with_aspect_ratio=False),  # single image, no aspect_ratio widget
+ ),
+ IO.DynamicCombo.Option(
+ "grok-imagine-image",
+ _grok_image_edit_model_inputs(max_ref_images=3, with_aspect_ratio=True),
+ ),
+ ],
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,  # 2**31 - 1
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed to determine if node should re-run; "
+ "actual results are nondeterministic regardless of seed.",
+ ),
+ ],
+ outputs=[
+ IO.Image.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(
+ widgets=["model", "model.resolution", "model.number_of_images"],  # nested widgets are addressed by dotted path
+ ),
+ expr="""
+ (
+ $isQualityModel := widgets.model = "grok-imagine-image-quality";
+ $isPro := $contains(widgets.model, "pro");
+ $res := $lookup(widgets, "model.resolution");
+ $n := $lookup(widgets, "model.number_of_images");
+ $rate := $isQualityModel
+ ? ($res = "1k" ? 0.05 : 0.07)
+ : ($isPro ? 0.07 : 0.02);
+ $base := $isQualityModel ? 0.01 : 0.002;
+ $output := $rate * $n;
+ $isPro
+ ? {"type":"usd","usd": $base + $output}
+ : {"type":"range_usd","min_usd": $base + $output, "max_usd": 3 * $base + $output}
+ )
+ """,  # NOTE(review): compares against lowercase "1k" while the combo options are "1K"/"2K"; presumably the badge evaluator lowercases widget values — confirm
+ ),
+ )
+
+ @classmethod
+ async def execute(  # Validates the image count per model, sends the images as base64 data URLs and returns the edited image(s).
+ cls,
+ prompt: str,
+ model: dict,  # DynamicCombo value: "model" holds the selected id, sibling keys hold the nested widget values
+ seed: int,
+ ) -> IO.NodeOutput:
+ validate_string(prompt, strip_whitespace=True, min_length=1)  # presumably rejects an empty or whitespace-only prompt
+ model_id = model["model"]
+ resolution = model["resolution"]
+ number_of_images = model["number_of_images"]
+ images_dict = model.get("images") or {}  # missing or None is treated as "no images"
+ aspect_ratio = model.get("aspect_ratio", "auto")  # key is absent for the pro option, which has no aspect_ratio widget
+
+ image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]  # drop unconnected slots
+ n_images = sum(get_number_of_images(t) for t in image_tensors)  # total across all slots
+ if n_images < 1:
+ raise ValueError("At least one image is required for editing.")
+ if model_id == "grok-imagine-image-pro" and n_images > 1:
+ raise ValueError("The pro model supports only 1 input image.")
+ if model_id != "grok-imagine-image-pro" and n_images > 3:
+ raise ValueError("A maximum of 3 input images is supported.")
+ if aspect_ratio != "auto" and n_images == 1:  # matches the aspect_ratio widget tooltip
+ raise ValueError(
+ "Custom aspect ratio is only allowed when multiple images are connected to the image input."
+ )
+
+ flat_tensors: list[torch.Tensor] = []  # one entry per individual image
+ for tensor in image_tensors:
+ if len(tensor.shape) == 4:  # 4-D input is split along its first dimension
+ flat_tensors.extend(tensor[i] for i in range(tensor.shape[0]))
+ else:
+ flat_tensors.append(tensor)
+
+ response = await sync_op(
+ cls,
+ ApiEndpoint(path="/proxy/xai/v1/images/edits", method="POST"),
+ data=ImageEditRequest(
+ model=model_id,
+ images=[
+ InputUrlObject(url=f"data:image/png;base64,{tensor_to_base64_string(i)}") for i in flat_tensors
+ ],
+ prompt=prompt,
+ resolution=resolution.lower(),  # "1K"/"2K" -> "1k"/"2k"
+ n=number_of_images,
+ seed=seed,
+ aspect_ratio=None if aspect_ratio == "auto" else aspect_ratio,
+ ),
+ response_model=ImageGenerationResponse,
+ price_extractor=_extract_grok_price,
+ )
+ if len(response.data) == 1:
+ return IO.NodeOutput(await download_url_to_image_tensor(response.data[0].url))  # NOTE(review): url is passed as-is here, but str()-wrapped and filtered in the multi-result path below
+ return IO.NodeOutput(
+ torch.cat(
+ [await download_url_to_image_tensor(i) for i in [str(d.url) for d in response.data if d.url]],  # NOTE(review): entries without a url are skipped; if none remain (or data is empty) torch.cat receives an empty list — confirm the API always returns urls
+ )
+ )
+
+
class GrokVideoNode(IO.ComfyNode):
@classmethod
@@ -290,7 +503,7 @@ class GrokVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="GrokVideoNode",
display_name="Grok Video",
- category="api node/video/Grok",
+ category="video/partner/Grok",
description="Generate video from a prompt or an image",
inputs=[
IO.Combo.Input("model", options=["grok-imagine-video", "grok-imagine-video-beta"]),
@@ -402,7 +615,7 @@ class GrokVideoEditNode(IO.ComfyNode):
return IO.Schema(
node_id="GrokVideoEditNode",
display_name="Grok Video Edit",
- category="api node/video/Grok",
+ category="video/partner/Grok",
description="Edit an existing video based on a text prompt.",
inputs=[
IO.Combo.Input("model", options=["grok-imagine-video", "grok-imagine-video-beta"]),
@@ -480,7 +693,7 @@ class GrokVideoReferenceNode(IO.ComfyNode):
return IO.Schema(
node_id="GrokVideoReferenceNode",
display_name="Grok Reference-to-Video",
- category="api node/video/Grok",
+ category="video/partner/Grok",
description="Generate video guided by reference images as style and content references.",
inputs=[
IO.String.Input(
@@ -613,7 +826,7 @@ class GrokVideoExtendNode(IO.ComfyNode):
return IO.Schema(
node_id="GrokVideoExtendNode",
display_name="Grok Video Extend",
- category="api node/video/Grok",
+ category="video/partner/Grok",
description="Extend an existing video with a seamless continuation based on a text prompt.",
inputs=[
IO.String.Input(
@@ -717,6 +930,7 @@ class GrokExtension(ComfyExtension):
return [
GrokImageNode,
GrokImageEditNode,
+ GrokImageEditNodeV2,
GrokVideoNode,
GrokVideoReferenceNode,
GrokVideoEditNode,
diff --git a/comfy_api_nodes/nodes_hitpaw.py b/comfy_api_nodes/nodes_hitpaw.py
index 488080a74..22e679c29 100644
--- a/comfy_api_nodes/nodes_hitpaw.py
+++ b/comfy_api_nodes/nodes_hitpaw.py
@@ -71,7 +71,7 @@ class HitPawGeneralImageEnhance(IO.ComfyNode):
return IO.Schema(
node_id="HitPawGeneralImageEnhance",
display_name="HitPaw General Image Enhance",
- category="api node/image/HitPaw",
+ category="image/partner/HitPaw",
description="Upscale low-resolution images to super-resolution, eliminate artifacts and noise. "
f"Maximum output: {MAX_MP_GENERATIVE} megapixels.",
inputs=[
@@ -178,7 +178,6 @@ class HitPawGeneralImageEnhance(IO.ComfyNode):
status_extractor=lambda x: x.data.status,
price_extractor=lambda x: request_price,
poll_interval=10.0,
- max_poll_attempts=480,
)
return IO.NodeOutput(await download_url_to_image_tensor(final_response.data.res_url))
@@ -202,7 +201,7 @@ class HitPawVideoEnhance(IO.ComfyNode):
return IO.Schema(
node_id="HitPawVideoEnhance",
display_name="HitPaw Video Enhance",
- category="api node/video/HitPaw",
+ category="video/partner/HitPaw",
description="Upscale low-resolution videos to high resolution, eliminate artifacts and noise. "
"Prices shown are per second of video.",
inputs=[
@@ -324,7 +323,6 @@ class HitPawVideoEnhance(IO.ComfyNode):
status_extractor=lambda x: x.data.status,
price_extractor=lambda x: request_price,
poll_interval=10.0,
- max_poll_attempts=320,
)
return IO.NodeOutput(await download_url_to_video_output(final_response.data.res_url))
diff --git a/comfy_api_nodes/nodes_hunyuan3d.py b/comfy_api_nodes/nodes_hunyuan3d.py
index 44c94a98e..826a3bd2d 100644
--- a/comfy_api_nodes/nodes_hunyuan3d.py
+++ b/comfy_api_nodes/nodes_hunyuan3d.py
@@ -123,7 +123,7 @@ class TencentTextToModelNode(IO.ComfyNode):
return IO.Schema(
node_id="TencentTextToModelNode",
display_name="Hunyuan3D: Text to Model",
- category="api node/3d/Tencent",
+ category="3d/partner/Tencent",
essentials_category="3D",
inputs=[
IO.Combo.Input(
@@ -221,14 +221,17 @@ class TencentTextToModelNode(IO.ComfyNode):
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
- obj_result = await download_and_extract_obj_zip(get_file_from_response(result.ResultFile3Ds, "obj").Url)
+ obj_file_response = get_file_from_response(result.ResultFile3Ds, "obj", raise_if_not_found=False)
+ obj_result = None
+ if obj_file_response:
+ obj_result = await download_and_extract_obj_zip(obj_file_response.Url)
return IO.NodeOutput(
f"{task_id}.glb",
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
),
- obj_result.obj,
- obj_result.texture,
+ obj_result.obj if obj_result else None,
+ obj_result.texture if obj_result else None,
)
@@ -239,7 +242,7 @@ class TencentImageToModelNode(IO.ComfyNode):
return IO.Schema(
node_id="TencentImageToModelNode",
display_name="Hunyuan3D: Image(s) to Model",
- category="api node/3d/Tencent",
+ category="3d/partner/Tencent",
essentials_category="3D",
inputs=[
IO.Combo.Input(
@@ -378,17 +381,30 @@ class TencentImageToModelNode(IO.ComfyNode):
response_model=To3DProTaskResultResponse,
status_extractor=lambda r: r.Status,
)
- obj_result = await download_and_extract_obj_zip(get_file_from_response(result.ResultFile3Ds, "obj").Url)
+ obj_file_response = get_file_from_response(result.ResultFile3Ds, "obj", raise_if_not_found=False)
+ if obj_file_response:
+ obj_result = await download_and_extract_obj_zip(obj_file_response.Url)
+ return IO.NodeOutput(
+ f"{task_id}.glb",
+ await download_url_to_file_3d(
+ get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
+ ),
+ obj_result.obj,
+ obj_result.texture,
+ obj_result.metallic if obj_result.metallic is not None else torch.zeros(1, 1, 1, 3),
+ obj_result.normal if obj_result.normal is not None else torch.zeros(1, 1, 1, 3),
+ obj_result.roughness if obj_result.roughness is not None else torch.zeros(1, 1, 1, 3),
+ )
return IO.NodeOutput(
f"{task_id}.glb",
await download_url_to_file_3d(
get_file_from_response(result.ResultFile3Ds, "glb").Url, "glb", task_id=task_id
),
- obj_result.obj,
- obj_result.texture,
- obj_result.metallic if obj_result.metallic is not None else torch.zeros(1, 1, 1, 3),
- obj_result.normal if obj_result.normal is not None else torch.zeros(1, 1, 1, 3),
- obj_result.roughness if obj_result.roughness is not None else torch.zeros(1, 1, 1, 3),
+ None,
+ None,
+ None,
+ None,
+ None,
)
@@ -399,7 +415,7 @@ class TencentModelTo3DUVNode(IO.ComfyNode):
return IO.Schema(
node_id="TencentModelTo3DUVNode",
display_name="Hunyuan3D: Model to UV",
- category="api node/3d/Tencent",
+ category="3d/partner/Tencent",
description="Perform UV unfolding on a 3D model to generate UV texture. "
"Input model must have less than 30000 faces.",
inputs=[
@@ -489,7 +505,7 @@ class Tencent3DTextureEditNode(IO.ComfyNode):
return IO.Schema(
node_id="Tencent3DTextureEditNode",
display_name="Hunyuan3D: 3D Texture Edit",
- category="api node/3d/Tencent",
+ category="3d/partner/Tencent",
description="After inputting the 3D model, perform 3D model texture redrawing.",
inputs=[
IO.MultiType.Input(
@@ -578,7 +594,7 @@ class Tencent3DPartNode(IO.ComfyNode):
return IO.Schema(
node_id="Tencent3DPartNode",
display_name="Hunyuan3D: 3D Part",
- category="api node/3d/Tencent",
+ category="3d/partner/Tencent",
description="Automatically perform component identification and generation based on the model structure.",
inputs=[
IO.MultiType.Input(
@@ -650,7 +666,7 @@ class TencentSmartTopologyNode(IO.ComfyNode):
return IO.Schema(
node_id="TencentSmartTopologyNode",
display_name="Hunyuan3D: Smart Topology",
- category="api node/3d/Tencent",
+ category="3d/partner/Tencent",
description="Perform smart retopology on a 3D model. "
"Supports GLB/OBJ formats; max 200MB; recommended for high-poly models.",
inputs=[
diff --git a/comfy_api_nodes/nodes_ideogram.py b/comfy_api_nodes/nodes_ideogram.py
index 97c3609bd..edd9b9435 100644
--- a/comfy_api_nodes/nodes_ideogram.py
+++ b/comfy_api_nodes/nodes_ideogram.py
@@ -234,7 +234,7 @@ class IdeogramV1(IO.ComfyNode):
return IO.Schema(
node_id="IdeogramV1",
display_name="Ideogram V1",
- category="api node/image/Ideogram",
+ category="image/partner/Ideogram",
description="Generates images using the Ideogram V1 model.",
inputs=[
IO.String.Input(
@@ -360,7 +360,7 @@ class IdeogramV2(IO.ComfyNode):
return IO.Schema(
node_id="IdeogramV2",
display_name="Ideogram V2",
- category="api node/image/Ideogram",
+ category="image/partner/Ideogram",
description="Generates images using the Ideogram V2 model.",
inputs=[
IO.String.Input(
@@ -526,7 +526,7 @@ class IdeogramV3(IO.ComfyNode):
return IO.Schema(
node_id="IdeogramV3",
display_name="Ideogram V3",
- category="api node/image/Ideogram",
+ category="image/partner/Ideogram",
description="Generates images using the Ideogram V3 model. "
"Supports both regular image generation from text prompts and image editing with mask.",
inputs=[
diff --git a/comfy_api_nodes/nodes_kling.py b/comfy_api_nodes/nodes_kling.py
index 9a37ccc53..9925ec548 100644
--- a/comfy_api_nodes/nodes_kling.py
+++ b/comfy_api_nodes/nodes_kling.py
@@ -642,7 +642,7 @@ class KlingCameraControls(IO.ComfyNode):
return IO.Schema(
node_id="KlingCameraControls",
display_name="Kling Camera Controls",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Allows specifying configuration options for Kling Camera Controls and motion control effects.",
inputs=[
IO.Combo.Input("camera_control_type", options=KlingCameraControlType),
@@ -762,7 +762,7 @@ class KlingTextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingTextToVideoNode",
display_name="Kling Text to Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Kling Text to Video Node",
inputs=[
IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"),
@@ -849,7 +849,7 @@ class OmniProTextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingOmniProTextToVideoNode",
display_name="Kling 3.0 Omni Text to Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Use text prompts to generate videos with the latest Kling model.",
inputs=[
IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
@@ -862,7 +862,7 @@ class OmniProTextToVideoNode(IO.ComfyNode):
),
IO.Combo.Input("aspect_ratio", options=["16:9", "9:16", "1:1"]),
IO.Int.Input("duration", default=5, min=3, max=15, display_mode=IO.NumberDisplay.slider),
- IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
+ IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
IO.DynamicCombo.Input(
"storyboards",
options=[
@@ -904,12 +904,13 @@ class OmniProTextToVideoNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
- $mode := (widgets.resolution = "720p") ? "std" : "pro";
+ $res := widgets.resolution;
+ $mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
$isV3 := $contains(widgets.model_name, "v3");
$audio := $isV3 and widgets.generate_audio;
$rates := $audio
- ? {"std": 0.112, "pro": 0.14}
- : {"std": 0.084, "pro": 0.112};
+ ? {"std": 0.112, "pro": 0.14, "4k": 0.42}
+ : {"std": 0.084, "pro": 0.112, "4k": 0.42};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
@@ -934,6 +935,8 @@ class OmniProTextToVideoNode(IO.ComfyNode):
raise ValueError("kling-video-o1 only supports durations of 5 or 10 seconds.")
if generate_audio:
raise ValueError("kling-video-o1 does not support audio generation.")
+ if resolution == "4k":
+ raise ValueError("kling-video-o1 does not support 4k resolution.")
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
if stories_enabled and model_name == "kling-video-o1":
raise ValueError("kling-video-o1 does not support storyboards.")
@@ -963,6 +966,12 @@ class OmniProTextToVideoNode(IO.ComfyNode):
f"must equal the global duration ({duration}s)."
)
+ if resolution == "4k":
+ mode = "4k"
+ elif resolution == "1080p":
+ mode = "pro"
+ else:
+ mode = "std"
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
@@ -972,7 +981,7 @@ class OmniProTextToVideoNode(IO.ComfyNode):
prompt=prompt,
aspect_ratio=aspect_ratio,
duration=str(duration),
- mode="pro" if resolution == "1080p" else "std",
+ mode=mode,
multi_shot=multi_shot,
multi_prompt=multi_prompt_list,
shot_type="customize" if multi_shot else None,
@@ -989,7 +998,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingOmniProFirstLastFrameNode",
display_name="Kling 3.0 Omni First-Last-Frame to Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Use a start frame, an optional end frame, or reference images with the latest Kling model.",
inputs=[
IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
@@ -1014,7 +1023,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
optional=True,
tooltip="Up to 6 additional reference images.",
),
- IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
+ IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
IO.DynamicCombo.Input(
"storyboards",
options=[
@@ -1061,12 +1070,13 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
- $mode := (widgets.resolution = "720p") ? "std" : "pro";
+ $res := widgets.resolution;
+ $mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
$isV3 := $contains(widgets.model_name, "v3");
$audio := $isV3 and widgets.generate_audio;
$rates := $audio
- ? {"std": 0.112, "pro": 0.14}
- : {"std": 0.084, "pro": 0.112};
+ ? {"std": 0.112, "pro": 0.14, "4k": 0.42}
+ : {"std": 0.084, "pro": 0.112, "4k": 0.42};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
@@ -1093,6 +1103,8 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
raise ValueError("kling-video-o1 does not support durations greater than 10 seconds.")
if generate_audio:
raise ValueError("kling-video-o1 does not support audio generation.")
+ if resolution == "4k":
+ raise ValueError("kling-video-o1 does not support 4k resolution.")
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
if stories_enabled and model_name == "kling-video-o1":
raise ValueError("kling-video-o1 does not support storyboards.")
@@ -1161,6 +1173,12 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
validate_image_aspect_ratio(i, (1, 2.5), (2.5, 1))
for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference frame(s)"):
image_list.append(OmniParamImage(image_url=i))
+ if resolution == "4k":
+ mode = "4k"
+ elif resolution == "1080p":
+ mode = "pro"
+ else:
+ mode = "std"
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
@@ -1170,7 +1188,7 @@ class OmniProFirstLastFrameNode(IO.ComfyNode):
prompt=prompt,
duration=str(duration),
image_list=image_list,
- mode="pro" if resolution == "1080p" else "std",
+ mode=mode,
sound="on" if generate_audio else "off",
multi_shot=multi_shot,
multi_prompt=multi_prompt_list,
@@ -1187,7 +1205,7 @@ class OmniProImageToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingOmniProImageToVideoNode",
display_name="Kling 3.0 Omni Image to Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Use up to 7 reference images to generate a video with the latest Kling model.",
inputs=[
IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
@@ -1204,7 +1222,7 @@ class OmniProImageToVideoNode(IO.ComfyNode):
"reference_images",
tooltip="Up to 7 reference images.",
),
- IO.Combo.Input("resolution", options=["1080p", "720p"], optional=True),
+ IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p", optional=True),
IO.DynamicCombo.Input(
"storyboards",
options=[
@@ -1251,12 +1269,13 @@ class OmniProImageToVideoNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(widgets=["duration", "resolution", "model_name", "generate_audio"]),
expr="""
(
- $mode := (widgets.resolution = "720p") ? "std" : "pro";
+ $res := widgets.resolution;
+ $mode := $res = "4k" ? "4k" : ($res = "720p" ? "std" : "pro");
$isV3 := $contains(widgets.model_name, "v3");
$audio := $isV3 and widgets.generate_audio;
$rates := $audio
- ? {"std": 0.112, "pro": 0.14}
- : {"std": 0.084, "pro": 0.112};
+ ? {"std": 0.112, "pro": 0.14, "4k": 0.42}
+ : {"std": 0.084, "pro": 0.112, "4k": 0.42};
{"type":"usd","usd": $lookup($rates, $mode) * widgets.duration}
)
""",
@@ -1282,6 +1301,8 @@ class OmniProImageToVideoNode(IO.ComfyNode):
raise ValueError("kling-video-o1 does not support durations greater than 10 seconds.")
if generate_audio:
raise ValueError("kling-video-o1 does not support audio generation.")
+ if resolution == "4k":
+ raise ValueError("kling-video-o1 does not support 4k resolution.")
stories_enabled = storyboards is not None and storyboards["storyboards"] != "disabled"
if stories_enabled and model_name == "kling-video-o1":
raise ValueError("kling-video-o1 does not support storyboards.")
@@ -1320,6 +1341,12 @@ class OmniProImageToVideoNode(IO.ComfyNode):
image_list: list[OmniParamImage] = []
for i in await upload_images_to_comfyapi(cls, reference_images, wait_label="Uploading reference image"):
image_list.append(OmniParamImage(image_url=i))
+ if resolution == "4k":
+ mode = "4k"
+ elif resolution == "1080p":
+ mode = "pro"
+ else:
+ mode = "std"
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/videos/omni-video", method="POST"),
@@ -1330,7 +1357,7 @@ class OmniProImageToVideoNode(IO.ComfyNode):
aspect_ratio=aspect_ratio,
duration=str(duration),
image_list=image_list,
- mode="pro" if resolution == "1080p" else "std",
+ mode=mode,
sound="on" if generate_audio else "off",
multi_shot=multi_shot,
multi_prompt=multi_prompt_list,
@@ -1347,7 +1374,7 @@ class OmniProVideoToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingOmniProVideoToVideoNode",
display_name="Kling 3.0 Omni Video to Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Use a video and up to 4 reference images to generate a video with the latest Kling model.",
inputs=[
IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-video-o1"]),
@@ -1458,7 +1485,7 @@ class OmniProEditVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingOmniProEditVideoNode",
display_name="Kling 3.0 Omni Edit Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
essentials_category="Video Generation",
description="Edit an existing video with the latest model from Kling.",
inputs=[
@@ -1566,7 +1593,7 @@ class OmniProImageNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingOmniProImageNode",
display_name="Kling 3.0 Omni Image",
- category="api node/image/Kling",
+ category="image/partner/Kling",
description="Create or edit images with the latest model from Kling.",
inputs=[
IO.Combo.Input("model_name", options=["kling-v3-omni", "kling-image-o1"]),
@@ -1694,7 +1721,7 @@ class KlingCameraControlT2VNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingCameraControlT2VNode",
display_name="Kling Text to Video (Camera Control)",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Transform text into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original text.",
inputs=[
IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"),
@@ -1756,7 +1783,7 @@ class KlingImage2VideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingImage2VideoNode",
display_name="Kling Image(First Frame) to Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
inputs=[
IO.Image.Input("start_frame", tooltip="The reference image used to generate the video."),
IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"),
@@ -1855,7 +1882,7 @@ class KlingCameraControlI2VNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingCameraControlI2VNode",
display_name="Kling Image to Video (Camera Control)",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Transform still images into cinematic videos with professional camera movements that simulate real-world cinematography. Control virtual camera actions including zoom, rotation, pan, tilt, and first-person view, while maintaining focus on your original image.",
inputs=[
IO.Image.Input(
@@ -1926,7 +1953,7 @@ class KlingStartEndFrameNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingStartEndFrameNode",
display_name="Kling Start-End Frame to Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Generate a video sequence that transitions between your provided start and end images. The node creates all frames in between, producing a smooth transformation from the first frame to the last.",
inputs=[
IO.Image.Input(
@@ -2020,7 +2047,7 @@ class KlingVideoExtendNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingVideoExtendNode",
display_name="Kling Video Extend",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Kling Video Extend Node. Extend videos made by other Kling nodes. The video_id is created by using other Kling Nodes.",
inputs=[
IO.String.Input(
@@ -2101,7 +2128,7 @@ class KlingDualCharacterVideoEffectNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingDualCharacterVideoEffectNode",
display_name="Kling Dual Character Video Effects",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Achieve different special effects when generating a video based on the effect_scene. First image will be positioned on left side, second on right side of the composite.",
inputs=[
IO.Image.Input("image_left", tooltip="Left side image"),
@@ -2191,7 +2218,7 @@ class KlingSingleImageVideoEffectNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingSingleImageVideoEffectNode",
display_name="Kling Video Effects",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Achieve different special effects when generating a video based on the effect_scene.",
inputs=[
IO.Image.Input(
@@ -2264,7 +2291,7 @@ class KlingLipSyncAudioToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingLipSyncAudioToVideoNode",
display_name="Kling Lip Sync Video with Audio",
- category="api node/video/Kling",
+ category="video/partner/Kling",
essentials_category="Video Generation",
description="Kling Lip Sync Audio to Video Node. Syncs mouth movements in a video file to the audio content of an audio file. When using, ensure that the audio contains clearly distinguishable vocals and that the video contains a distinct face. The audio file should not be larger than 5MB. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.",
inputs=[
@@ -2316,7 +2343,7 @@ class KlingLipSyncTextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingLipSyncTextToVideoNode",
display_name="Kling Lip Sync Video with Text",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Kling Lip Sync Text to Video Node. Syncs mouth movements in a video file to a text prompt. The video file should not be larger than 100MB, should have height/width between 720px and 1920px, and should be between 2s and 10s in length.",
inputs=[
IO.Video.Input("video"),
@@ -2384,7 +2411,7 @@ class KlingVirtualTryOnNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingVirtualTryOnNode",
display_name="Kling Virtual Try On",
- category="api node/image/Kling",
+ category="image/partner/Kling",
description="Kling Virtual Try On Node. Input a human image and a cloth image to try on the cloth on the human. You can merge multiple clothing item pictures into one image with a white background.",
inputs=[
IO.Image.Input("human_image"),
@@ -2451,7 +2478,7 @@ class KlingImageGenerationNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingImageGenerationNode",
display_name="Kling 3.0 Image",
- category="api node/image/Kling",
+ category="image/partner/Kling",
description="Kling Image Generation Node. Generate an image from a text prompt with an optional reference image.",
inputs=[
IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt"),
@@ -2588,7 +2615,7 @@ class TextToVideoWithAudio(IO.ComfyNode):
return IO.Schema(
node_id="KlingTextToVideoWithAudio",
display_name="Kling 2.6 Text to Video with Audio",
- category="api node/video/Kling",
+ category="video/partner/Kling",
inputs=[
IO.Combo.Input("model_name", options=["kling-v2-6"]),
IO.String.Input("prompt", multiline=True, tooltip="Positive text prompt."),
@@ -2656,7 +2683,7 @@ class ImageToVideoWithAudio(IO.ComfyNode):
return IO.Schema(
node_id="KlingImageToVideoWithAudio",
display_name="Kling 2.6 Image(First Frame) to Video with Audio",
- category="api node/video/Kling",
+ category="video/partner/Kling",
inputs=[
IO.Combo.Input("model_name", options=["kling-v2-6"]),
IO.Image.Input("start_frame"),
@@ -2726,7 +2753,7 @@ class MotionControl(IO.ComfyNode):
return IO.Schema(
node_id="KlingMotionControl",
display_name="Kling Motion Control",
- category="api node/video/Kling",
+ category="video/partner/Kling",
inputs=[
IO.String.Input("prompt", multiline=True),
IO.Image.Input("reference_image"),
@@ -2760,11 +2787,15 @@ class MotionControl(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["mode"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["mode", "model"]),
expr="""
(
- $prices := {"std": 0.07, "pro": 0.112};
- {"type":"usd","usd": $lookup($prices, widgets.mode), "format":{"suffix":"/second"}}
+ $prices := {
+ "kling-v3": {"std": 0.126, "pro": 0.168},
+ "kling-v2-6": {"std": 0.07, "pro": 0.112}
+ };
+ $modelPrices := $lookup($prices, widgets.model);
+ {"type":"usd","usd": $lookup($modelPrices, widgets.mode), "format":{"suffix":"/second"}}
)
""",
),
@@ -2823,7 +2854,7 @@ class KlingVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingVideoNode",
display_name="Kling 3.0 Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Generate videos with Kling V3. "
"Supports text-to-video and image-to-video with optional storyboard multi-prompt and audio generation.",
inputs=[
@@ -2860,7 +2891,7 @@ class KlingVideoNode(IO.ComfyNode):
IO.DynamicCombo.Option(
"kling-v3",
[
- IO.Combo.Input("resolution", options=["1080p", "720p"]),
+ IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p"),
IO.Combo.Input(
"aspect_ratio",
options=["16:9", "9:16", "1:1"],
@@ -2913,7 +2944,11 @@ class KlingVideoNode(IO.ComfyNode):
),
expr="""
(
- $rates := {"1080p": {"off": 0.112, "on": 0.168}, "720p": {"off": 0.084, "on": 0.126}};
+ $rates := {
+ "4k": {"off": 0.42, "on": 0.42},
+ "1080p": {"off": 0.112, "on": 0.168},
+ "720p": {"off": 0.084, "on": 0.126}
+ };
$res := $lookup(widgets, "model.resolution");
$audio := widgets.generate_audio ? "on" : "off";
$rate := $lookup($lookup($rates, $res), $audio);
@@ -2943,7 +2978,12 @@ class KlingVideoNode(IO.ComfyNode):
start_frame: Input.Image | None = None,
) -> IO.NodeOutput:
_ = seed
- mode = "pro" if model["resolution"] == "1080p" else "std"
+ if model["resolution"] == "4k":
+ mode = "4k"
+ elif model["resolution"] == "1080p":
+ mode = "pro"
+ else:
+ mode = "std"
custom_multi_shot = False
if multi_shot["multi_shot"] == "disabled":
shot_type = None
@@ -3037,7 +3077,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingFirstLastFrameNode",
display_name="Kling 3.0 First-Last-Frame to Video",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Generate videos with Kling V3 using first and last frames.",
inputs=[
IO.String.Input("prompt", multiline=True, default=""),
@@ -3057,7 +3097,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
IO.DynamicCombo.Option(
"kling-v3",
[
- IO.Combo.Input("resolution", options=["1080p", "720p"]),
+ IO.Combo.Input("resolution", options=["4k", "1080p", "720p"], default="1080p"),
],
),
],
@@ -3089,7 +3129,11 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
),
expr="""
(
- $rates := {"1080p": {"off": 0.112, "on": 0.168}, "720p": {"off": 0.084, "on": 0.126}};
+ $rates := {
+ "4k": {"off": 0.42, "on": 0.42},
+ "1080p": {"off": 0.112, "on": 0.168},
+ "720p": {"off": 0.084, "on": 0.126}
+ };
$res := $lookup(widgets, "model.resolution");
$audio := widgets.generate_audio ? "on" : "off";
$rate := $lookup($lookup($rates, $res), $audio);
@@ -3118,6 +3162,12 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
validate_image_aspect_ratio(end_frame, (1, 2.5), (2.5, 1))
image_url = await upload_image_to_comfyapi(cls, first_frame, wait_label="Uploading first frame")
image_tail_url = await upload_image_to_comfyapi(cls, end_frame, wait_label="Uploading end frame")
+ if model["resolution"] == "4k":
+ mode = "4k"
+ elif model["resolution"] == "1080p":
+ mode = "pro"
+ else:
+ mode = "std"
response = await sync_op(
cls,
ApiEndpoint(path="/proxy/kling/v1/videos/image2video", method="POST"),
@@ -3127,7 +3177,7 @@ class KlingFirstLastFrameNode(IO.ComfyNode):
image=image_url,
image_tail=image_tail_url,
prompt=prompt,
- mode="pro" if model["resolution"] == "1080p" else "std",
+ mode=mode,
duration=str(duration),
sound="on" if generate_audio else "off",
),
@@ -3152,7 +3202,7 @@ class KlingAvatarNode(IO.ComfyNode):
return IO.Schema(
node_id="KlingAvatarNode",
display_name="Kling Avatar 2.0",
- category="api node/video/Kling",
+ category="video/partner/Kling",
description="Generate broadcast-style digital human videos from a single photo and an audio file.",
inputs=[
IO.Image.Input(
diff --git a/comfy_api_nodes/nodes_krea.py b/comfy_api_nodes/nodes_krea.py
new file mode 100644
index 000000000..be04a272b
--- /dev/null
+++ b/comfy_api_nodes/nodes_krea.py
@@ -0,0 +1,290 @@
+"""Krea image-generation nodes."""
+
+import re
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.krea import (
+ KreaAssetResponse,
+ KreaGenerateImageRequest,
+ KreaImageStyleReference,
+ KreaJob,
+ KreaMoodboard,
+)
+from comfy_api_nodes.util import (
+ ApiEndpoint,
+ download_url_to_image_tensor,
+ poll_op,
+ sync_op,
+ tensor_to_bytesio,
+ validate_string,
+)
+
+
+class KreaIO:
+    STYLE_REF = "KREA_STYLE_REF"  # custom IO type name of the style_reference socket used by both Krea 2 nodes
+
+
+async def _upload_image_to_krea_assets(cls: type[IO.ComfyNode], image: Input.Image) -> str:
+    """Send `image` as a PNG to the proxied Krea assets endpoint and return the `image_url` of the response."""
+    png_buffer = tensor_to_bytesio(image, total_pixels=2048 * 2048, mime_type="image/png")
+    asset = await sync_op(
+        cls,
+        endpoint=ApiEndpoint(path="/proxy/krea/assets", method="POST"),
+        response_model=KreaAssetResponse,
+        files=[("file", (png_buffer.name, png_buffer, "image/png"))],
+        content_type="multipart/form-data",
+        max_retries=1,
+        wait_label="Uploading reference",
+    )
+    return asset.image_url
+
+
+_MODEL_MEDIUM = "Krea 2 Medium"
+_MODEL_LARGE = "Krea 2 Large"
+_MODEL_ENDPOINTS: dict[str, str] = {
+ _MODEL_MEDIUM: "/proxy/krea/generate/image/krea/krea-2/medium",
+ _MODEL_LARGE: "/proxy/krea/generate/image/krea/krea-2/large",
+}
+
+_ASPECT_RATIOS = ["1:1", "4:3", "3:2", "16:9", "2.35:1", "4:5", "2:3", "9:16"]
+_RESOLUTIONS = ["1K"]
+_CREATIVITY_LEVELS = ["raw", "low", "medium", "high"]
+_KREA_QUEUED_STATUSES = ["backlogged", "queued", "scheduled"]
+
+_UUID_RE = re.compile(r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$")
+
+
+def _krea_model_inputs() -> list:
+    """Build the per-model inputs nested under both the Krea 2 Medium and Large DynamicCombo options."""
+    # Every call constructs new input objects, so the two model options do not share instances.
+    aspect_ratio = IO.Combo.Input("aspect_ratio", options=_ASPECT_RATIOS, tooltip="Output aspect ratio.")
+    resolution = IO.Combo.Input("resolution", options=_RESOLUTIONS, tooltip="Resolution scale.")
+    creativity = IO.Combo.Input(
+        "creativity",
+        options=_CREATIVITY_LEVELS,
+        default="medium",
+        tooltip="Prompt interpretation strength: raw stays closest to the prompt; high is most creative.",
+    )
+    moodboard_id = IO.String.Input(
+        "moodboard_id",
+        default="",
+        tooltip="Optional Krea moodboard UUID (e.g. from the Krea website). "
+        "Leave empty to disable. Only one moodboard is supported per request.",
+        optional=True,
+    )
+    moodboard_strength = IO.Float.Input(
+        "moodboard_strength",
+        default=0.35,
+        min=-0.5,
+        max=1.5,
+        step=0.05,
+        tooltip="Moodboard influence; ignored when moodboard_id is empty.",
+        optional=True,
+    )
+    style_reference = IO.Custom(KreaIO.STYLE_REF).Input(
+        "style_reference",
+        optional=True,
+        tooltip="Optional chain of style references (max 10) from Krea 2 Style Reference nodes.",
+    )
+    # List order matches the declaration order above.
+    return [
+        aspect_ratio,
+        resolution,
+        creativity,
+        moodboard_id,
+        moodboard_strength,
+        style_reference,
+    ]
+
+
+class Krea2ImageNode(IO.ComfyNode):
+    """API node that submits a Krea 2 image generation job, polls it, and outputs the first result image."""
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="Krea2ImageNode",
+            display_name="Krea 2 Image",
+            category="image/partner/Krea",
+            description=(
+                "Generate images via Krea 2 — pick Medium (expressive illustrations) or "
+                "Large (expressive photorealism). Supports an optional moodboard and up "
+                "to 10 chained image style references."
+            ),
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text prompt for the image.",
+                ),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option(_MODEL_MEDIUM, _krea_model_inputs()),
+                        IO.DynamicCombo.Option(_MODEL_LARGE, _krea_model_inputs()),
+                    ],
+                    tooltip="Krea 2 Medium is best for expressive illustrations; "
+                    "Krea 2 Large is best for expressive photorealism.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="Random seed for reproducibility.",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            # NOTE(review): expr tests "krea 2 large" (lower case) vs option "Krea 2 Large" — confirm values are lower-cased
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(
+                    widgets=["model", "model.moodboard_id"],
+                    inputs=["model.style_reference"],
+                ),
+                expr="""
+                (
+                  $isLarge := widgets.model = "krea 2 large";
+                  $hasMoodboard := $length($lookup(widgets, "model.moodboard_id")) > 0;
+                  $hasStyle := $lookup(inputs, "model.style_reference").connected;
+                  $usd := $hasMoodboard
+                    ? ($isLarge ? 0.07 : 0.04)
+                    : ($hasStyle
+                        ? ($isLarge ? 0.065 : 0.035)
+                        : ($isLarge ? 0.06 : 0.03));
+                  {"type":"usd","usd": $usd}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: dict,
+        seed: int,
+    ) -> IO.NodeOutput:
+        validate_string(prompt, strip_whitespace=False, min_length=1)
+
+        model_choice = model["model"]
+        endpoint_path = _MODEL_ENDPOINTS.get(model_choice)  # Medium and Large use different endpoint paths
+        if endpoint_path is None:
+            raise ValueError(f"Unknown Krea 2 model: {model_choice!r}")
+
+        moodboards: list[KreaMoodboard] | None = None
+        mb_id = (model.get("moodboard_id") or "").strip()  # missing, None or whitespace-only means no moodboard
+        if mb_id:
+            if not _UUID_RE.match(mb_id):
+                raise ValueError(f"moodboard_id must be a UUID (received {mb_id!r}); copy it from the Krea website.")
+            mb_strength = model.get("moodboard_strength")
+            moodboards = [KreaMoodboard(id=mb_id, strength=0.35 if mb_strength is None else float(mb_strength))]
+
+        style_reference = model.get("style_reference")
+        image_style_references: list[KreaImageStyleReference] | None = None
+        if style_reference:  # None or an empty chain leaves image_style_references as None
+            if len(style_reference) > 10:
+                raise ValueError(f"Krea 2 accepts at most 10 image_style_references; received {len(style_reference)}.")
+            image_style_references = [
+                KreaImageStyleReference(url=ref["url"], strength=float(ref["strength"])) for ref in style_reference
+            ]
+        initial = await sync_op(
+            cls,
+            ApiEndpoint(path=endpoint_path, method="POST"),
+            response_model=KreaJob,
+            data=KreaGenerateImageRequest(
+                prompt=prompt,
+                aspect_ratio=model["aspect_ratio"],
+                resolution=model["resolution"],
+                seed=seed,
+                creativity=model["creativity"],
+                moodboards=moodboards,
+                image_style_references=image_style_references,
+            ),
+        )
+        job = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/krea/jobs/{initial.job_id}", method="GET"),
+            response_model=KreaJob,
+            status_extractor=lambda r: r.status,
+            queued_statuses=_KREA_QUEUED_STATUSES,
+        )
+        if not job.result or not job.result.urls:
+            raise RuntimeError(f"Krea 2 job {job.job_id} completed without any image URLs.")
+        image = await download_url_to_image_tensor(job.result.urls[0])  # only the first URL is downloaded
+        return IO.NodeOutput(image)
+
+
+class Krea2StyleReferenceNode(IO.ComfyNode):
+    """Node that uploads one image to Krea assets and appends it to a chain of style references."""
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="Krea2StyleReferenceNode",
+            display_name="Krea 2 Style Reference",
+            category="image/partner/Krea",
+            description=(
+                "Add an image style reference to a Krea 2 generation. Chain multiple Krea 2 "
+                "Style Reference nodes (max 10) and feed the final `style_reference` output "
+                "into Krea 2 Image. Each image is uploaded to Krea asset storage and passed as URL."
+            ),
+            inputs=[
+                IO.Image.Input(
+                    "image",
+                    tooltip="Reference image whose style influences the generation.",
+                ),
+                IO.Float.Input(
+                    "strength",
+                    default=1.0,
+                    min=-2.0,
+                    max=2.0,
+                    step=0.05,
+                    tooltip="Reference strength; negative values invert the style influence.",
+                ),
+                IO.Custom(KreaIO.STYLE_REF).Input(
+                    "style_reference",
+                    optional=True,
+                    tooltip="Optional incoming chain of style references; this node appends one more.",
+                ),
+            ],
+            outputs=[IO.Custom(KreaIO.STYLE_REF).Output(display_name="style_reference")],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],  # NOTE(review): execute() calls the API but is_api_node is not set, unlike Krea2ImageNode — confirm
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        image: Input.Image,
+        strength: float,
+        style_reference: list[dict] | None = None,
+    ) -> IO.NodeOutput:
+        """Return a copy of the incoming chain plus this image; raises ValueError when the chain already holds 10."""
+        chain: list[dict] = list(style_reference) if style_reference else []  # copy: the incoming list is not mutated
+        if len(chain) >= 10:  # checked before the upload so a full chain costs no request
+            raise ValueError("Krea 2 accepts at most 10 image_style_references in one generation.")
+        chain.append({"url": await _upload_image_to_krea_assets(cls, image), "strength": float(strength)})
+        return IO.NodeOutput(chain)
+
+
+class KreaExtension(ComfyExtension):
+    """Extension exposing the Krea 2 node classes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        return [Krea2ImageNode, Krea2StyleReferenceNode]
+
+
+async def comfy_entrypoint() -> KreaExtension:
+    return KreaExtension()  # new instance per call; presumably invoked by the node loader — TODO confirm
diff --git a/comfy_api_nodes/nodes_ltxv.py b/comfy_api_nodes/nodes_ltxv.py
index 0a219af96..01791d354 100644
--- a/comfy_api_nodes/nodes_ltxv.py
+++ b/comfy_api_nodes/nodes_ltxv.py
@@ -50,7 +50,7 @@ class TextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="LtxvApiTextToVideo",
display_name="LTXV Text To Video",
- category="api node/video/LTXV",
+ category="video/partner/LTXV",
description="Professional-quality videos with customizable duration and resolution.",
inputs=[
IO.Combo.Input("model", options=list(MODELS_MAP.keys())),
@@ -127,7 +127,7 @@ class ImageToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="LtxvApiImageToVideo",
display_name="LTXV Image To Video",
- category="api node/video/LTXV",
+ category="video/partner/LTXV",
description="Professional-quality videos with customizable duration and resolution based on start image.",
inputs=[
IO.Image.Input("image", tooltip="First frame to be used for the video."),
diff --git a/comfy_api_nodes/nodes_luma.py b/comfy_api_nodes/nodes_luma.py
index 9ed6cd299..08ae9904c 100644
--- a/comfy_api_nodes/nodes_luma.py
+++ b/comfy_api_nodes/nodes_luma.py
@@ -1,10 +1,11 @@
-from typing import Optional
-
import torch
from typing_extensions import override
-from comfy_api.latest import IO, ComfyExtension
+from comfy_api.latest import IO, ComfyExtension, Input
from comfy_api_nodes.apis.luma import (
+ Luma2Generation,
+ Luma2GenerationRequest,
+ Luma2ImageRef,
LumaAspectRatio,
LumaCharacterRef,
LumaConceptChain,
@@ -30,6 +31,7 @@ from comfy_api_nodes.util import (
download_url_to_video_output,
poll_op,
sync_op,
+ upload_image_to_comfyapi,
upload_images_to_comfyapi,
validate_string,
)
@@ -44,7 +46,7 @@ class LumaReferenceNode(IO.ComfyNode):
return IO.Schema(
node_id="LumaReferenceNode",
display_name="Luma Reference",
- category="api node/image/Luma",
+ category="image/partner/Luma",
description="Holds an image and weight for use with Luma Generate Image node.",
inputs=[
IO.Image.Input(
@@ -83,7 +85,7 @@ class LumaConceptsNode(IO.ComfyNode):
return IO.Schema(
node_id="LumaConceptsNode",
display_name="Luma Concepts",
- category="api node/video/Luma",
+ category="video/partner/Luma",
description="Camera Concepts for use with Luma Text to Video and Luma Image to Video nodes.",
inputs=[
IO.Combo.Input(
@@ -132,7 +134,7 @@ class LumaImageGenerationNode(IO.ComfyNode):
return IO.Schema(
node_id="LumaImageNode",
display_name="Luma Text to Image",
- category="api node/image/Luma",
+ category="image/partner/Luma",
description="Generates images synchronously based on prompt and aspect ratio.",
inputs=[
IO.String.Input(
@@ -212,9 +214,9 @@ class LumaImageGenerationNode(IO.ComfyNode):
aspect_ratio: str,
seed,
style_image_weight: float,
- image_luma_ref: Optional[LumaReferenceChain] = None,
- style_image: Optional[torch.Tensor] = None,
- character_image: Optional[torch.Tensor] = None,
+ image_luma_ref: LumaReferenceChain | None = None,
+ style_image: torch.Tensor | None = None,
+ character_image: torch.Tensor | None = None,
) -> IO.NodeOutput:
validate_string(prompt, strip_whitespace=True, min_length=3)
# handle image_luma_ref
@@ -276,7 +278,7 @@ class LumaImageModifyNode(IO.ComfyNode):
return IO.Schema(
node_id="LumaImageModifyNode",
display_name="Luma Image to Image",
- category="api node/image/Luma",
+ category="image/partner/Luma",
description="Modifies images synchronously based on prompt and aspect ratio.",
inputs=[
IO.Image.Input(
@@ -369,7 +371,7 @@ class LumaTextToVideoGenerationNode(IO.ComfyNode):
return IO.Schema(
node_id="LumaVideoNode",
display_name="Luma Text to Video",
- category="api node/video/Luma",
+ category="video/partner/Luma",
description="Generates videos synchronously based on prompt and output_size.",
inputs=[
IO.String.Input(
@@ -434,7 +436,7 @@ class LumaTextToVideoGenerationNode(IO.ComfyNode):
duration: str,
loop: bool,
seed,
- luma_concepts: Optional[LumaConceptChain] = None,
+ luma_concepts: LumaConceptChain | None = None,
) -> IO.NodeOutput:
validate_string(prompt, strip_whitespace=False, min_length=3)
duration = duration if model != LumaVideoModel.ray_1_6 else None
@@ -470,7 +472,7 @@ class LumaImageToVideoGenerationNode(IO.ComfyNode):
return IO.Schema(
node_id="LumaImageToVideoNode",
display_name="Luma Image to Video",
- category="api node/video/Luma",
+ category="video/partner/Luma",
description="Generates videos synchronously based on prompt, input images, and output_size.",
inputs=[
IO.String.Input(
@@ -533,7 +535,6 @@ class LumaImageToVideoGenerationNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=PRICE_BADGE_VIDEO,
-
)
@classmethod
@@ -644,6 +645,293 @@ PRICE_BADGE_VIDEO = IO.PriceBadge(
)
+def _luma2_uni1_common_inputs(max_image_refs: int) -> list:
+    """Return the style, web_search and image_ref inputs used by the Luma UNI-1 model options.
+
+    `max_image_refs` sets how many autogrow image slots (image_1 .. image_N) are declared.
+    """
+    slot_names = [f"image_{slot}" for slot in range(1, max_image_refs + 1)]
+    style_input = IO.Combo.Input(
+        "style",
+        options=["auto", "manga"],
+        default="auto",
+        tooltip="Style preset. 'auto' picks based on the prompt; "
+        "'manga' applies a manga/anime aesthetic and requires a portrait "
+        "aspect ratio (2:3, 9:16, 1:2, 1:3).",
+    )
+    web_search_input = IO.Boolean.Input(
+        "web_search",
+        default=False,
+        tooltip="Search the web for visual references before generating.",
+    )
+    image_ref_input = IO.Autogrow.Input(
+        "image_ref",
+        template=IO.Autogrow.TemplateNames(IO.Image.Input("image"), names=slot_names, min=0),
+        optional=True,
+        tooltip=f"Up to {max_image_refs} reference images for style/content guidance.",
+    )
+    return [style_input, web_search_input, image_ref_input]
+
+
+async def _luma2_upload_image_refs(
+    cls: type[IO.ComfyNode],
+    refs: dict | None,
+    max_count: int,
+) -> list[Luma2ImageRef] | None:
+    """Upload every image in `refs` and return one Luma2ImageRef per entry, or None when `refs` is empty/None."""
+    if not refs:
+        return None
+    # Check the limit before uploading anything: each entry yields exactly one ref, so an
+    # oversized request is rejected up front instead of after all uploads have completed.
+    if len(refs) > max_count:
+        raise ValueError(f"Maximum {max_count} reference images are allowed.")
+    # `refs` is non-empty here, so the result is never an empty list.
+    return [Luma2ImageRef(url=await upload_image_to_comfyapi(cls, image)) for image in refs.values()]
+
+
+async def _luma2_submit_and_poll(
+    cls: type[IO.ComfyNode],
+    request: Luma2GenerationRequest,
+) -> Input.Image:
+    """Create a Luma 2 generation, poll it, and download the image at the first output URL."""
+    created = await sync_op(
+        cls,
+        ApiEndpoint(path="/proxy/luma_2/generations", method="POST"),
+        response_model=Luma2Generation,
+        data=request,
+    )
+    if not created.id:
+        raise RuntimeError("Luma 2 API did not return a generation id.")
+    finished = await poll_op(
+        cls,
+        ApiEndpoint(path=f"/proxy/luma_2/generations/{created.id}", method="GET"),
+        response_model=Luma2Generation,
+        status_extractor=lambda gen: gen.state,
+        progress_extractor=lambda gen: None,
+    )
+    if not finished.output:
+        raise RuntimeError(f"Luma 2 generation failed: {finished.failure_reason or 'no output returned'}")
+    first_url = finished.output[0].url
+    if not first_url:
+        raise RuntimeError("Luma 2 generation completed without an output URL.")
+    return await download_url_to_image_tensor(first_url)
+
+
+class LumaImageNode(IO.ComfyNode):
+    """API node that generates an image from a text prompt with Luma UNI-1, with optional reference images."""
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="LumaImageNode2",  # "LumaImageNode" is already the node_id of LumaImageGenerationNode
+            display_name="Luma UNI-1 Image",
+            category="image/partner/Luma",
+            description="Generate images from text using the Luma UNI-1 model.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text description of the desired image. 1–6000 characters.",
+                ),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option(
+                            "uni-1",
+                            [
+                                IO.Combo.Input(
+                                    "aspect_ratio",
+                                    options=[
+                                        "auto",
+                                        "3:1",
+                                        "2:1",
+                                        "16:9",
+                                        "3:2",
+                                        "1:1",
+                                        "2:3",
+                                        "9:16",
+                                        "1:2",
+                                        "1:3",
+                                    ],
+                                    default="auto",
+                                    tooltip="Output image aspect ratio. 'auto' lets "
+                                    "the model pick based on the prompt.",
+                                ),
+                                *_luma2_uni1_common_inputs(max_image_refs=9),
+                            ],
+                        ),
+                        IO.DynamicCombo.Option(
+                            "uni-1-max",
+                            [
+                                IO.Combo.Input(
+                                    "aspect_ratio",
+                                    options=[
+                                        "auto",
+                                        "3:1",
+                                        "2:1",
+                                        "16:9",
+                                        "3:2",
+                                        "1:1",
+                                        "2:3",
+                                        "9:16",
+                                        "1:2",
+                                        "1:3",
+                                    ],
+                                    default="auto",
+                                    tooltip="Output image aspect ratio. 'auto' lets "
+                                    "the model pick based on the prompt.",
+                                ),
+                                *_luma2_uni1_common_inputs(max_image_refs=9),
+                            ],
+                        ),
+                    ],
+                    tooltip="Model to use for generation.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model"], input_groups=["model.image_ref"]),
+                expr="""
+                (
+                  $m := widgets.model;
+                  $refs := $lookup(inputGroups, "model.image_ref");
+                  $base := $m = "uni-1-max" ? 0.1 : 0.0404;
+                  {"type":"usd","usd": $round($base + 0.003 * $refs, 4)}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: dict,
+        seed: int,  # not read in this method and not part of the request
+    ) -> IO.NodeOutput:
+        validate_string(prompt, min_length=1, max_length=6000)
+        aspect_ratio = model["aspect_ratio"]
+        style = model["style"]
+        allowed_manga_ratios = {"2:3", "9:16", "1:2", "1:3"}  # the portrait ratios named in the style tooltip
+        if style == "manga" and aspect_ratio != "auto" and aspect_ratio not in allowed_manga_ratios:
+            raise ValueError(
+                f"'manga' style requires a portrait aspect ratio "
+                f"({', '.join(sorted(allowed_manga_ratios))}) or 'auto'; got '{aspect_ratio}'."
+            )
+        request = Luma2GenerationRequest(
+            prompt=prompt,
+            model=model["model"],
+            type="image",
+            aspect_ratio=aspect_ratio if aspect_ratio != "auto" else None,  # "auto" becomes None in the request
+            style=style if style != "auto" else None,  # "auto" becomes None in the request
+            output_format="png",
+            web_search=model["web_search"],
+            image_ref=await _luma2_upload_image_refs(cls, model.get("image_ref"), max_count=9),
+        )
+        return IO.NodeOutput(await _luma2_submit_and_poll(cls, request))
+
+
+class LumaImageEditNode(IO.ComfyNode):
+    """API node that edits a source image from a text prompt with Luma UNI-1, with optional reference images."""
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="LumaImageEditNode2",
+            display_name="Luma UNI-1 Image Edit",
+            category="image/partner/Luma",
+            description="Edit an existing image with a text prompt using the Luma UNI-1 model.",
+            inputs=[
+                IO.Image.Input(
+                    "source",
+                    tooltip="Source image to edit.",
+                ),
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Description of the desired edit. 1–6000 characters.",
+                ),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=[
+                        IO.DynamicCombo.Option(
+                            "uni-1",
+                            _luma2_uni1_common_inputs(max_image_refs=8),
+                        ),
+                        IO.DynamicCombo.Option(
+                            "uni-1-max",
+                            _luma2_uni1_common_inputs(max_image_refs=8),
+                        ),
+                    ],
+                    tooltip="Model to use for editing.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="Seed controls whether the node should re-run; "
+                    "results are non-deterministic regardless of seed.",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model"], input_groups=["model.image_ref"]),
+                expr="""
+                (
+                  $m := widgets.model;
+                  $refs := $lookup(inputGroups, "model.image_ref");
+                  $base := $m = "uni-1-max" ? 0.103 : 0.0434;
+                  {"type":"usd","usd": $round($base + 0.003 * $refs, 4)}
+                )
+                """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        source: Input.Image,
+        prompt: str,
+        model: dict,
+        seed: int,  # not read in this method and not part of the request
+    ) -> IO.NodeOutput:
+        validate_string(prompt, min_length=1, max_length=6000)
+        request = Luma2GenerationRequest(
+            prompt=prompt,
+            model=model["model"],
+            type="image_edit",
+            source=Luma2ImageRef(url=await upload_image_to_comfyapi(cls, source)),
+            style=model["style"] if model["style"] != "auto" else None,  # "auto" becomes None in the request
+            output_format="png",
+            web_search=model["web_search"],
+            image_ref=await _luma2_upload_image_refs(cls, model.get("image_ref"), max_count=8),  # same limit as schema
+        )
+        return IO.NodeOutput(await _luma2_submit_and_poll(cls, request))
+
+
class LumaExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -654,6 +942,8 @@ class LumaExtension(ComfyExtension):
LumaImageToVideoGenerationNode,
LumaReferenceNode,
LumaConceptsNode,
+ LumaImageNode,
+ LumaImageEditNode,
]
diff --git a/comfy_api_nodes/nodes_magnific.py b/comfy_api_nodes/nodes_magnific.py
index 0f53208d4..a6aeb194a 100644
--- a/comfy_api_nodes/nodes_magnific.py
+++ b/comfy_api_nodes/nodes_magnific.py
@@ -61,7 +61,7 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode):
return IO.Schema(
node_id="MagnificImageUpscalerCreativeNode",
display_name="Magnific Image Upscale (Creative)",
- category="api node/image/Magnific",
+ category="image/partner/Magnific",
description="Prompt‑guided enhancement, stylization, and 2x/4x/8x/16x upscaling. "
"Maximum output: 25.3 megapixels.",
inputs=[
@@ -230,7 +230,6 @@ class MagnificImageUpscalerCreativeNode(IO.ComfyNode):
status_extractor=lambda x: x.status,
price_extractor=lambda _: price_usd,
poll_interval=10.0,
- max_poll_attempts=480,
)
return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))
@@ -241,7 +240,7 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode):
return IO.Schema(
node_id="MagnificImageUpscalerPreciseV2Node",
display_name="Magnific Image Upscale (Precise V2)",
- category="api node/image/Magnific",
+ category="image/partner/Magnific",
description="High-fidelity upscaling with fine control over sharpness, grain, and detail. "
"Maximum output: 10060×10060 pixels.",
inputs=[
@@ -391,7 +390,6 @@ class MagnificImageUpscalerPreciseV2Node(IO.ComfyNode):
status_extractor=lambda x: x.status,
price_extractor=lambda _: price_usd,
poll_interval=10.0,
- max_poll_attempts=480,
)
return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))
@@ -402,7 +400,7 @@ class MagnificImageStyleTransferNode(IO.ComfyNode):
return IO.Schema(
node_id="MagnificImageStyleTransferNode",
display_name="Magnific Image Style Transfer",
- category="api node/image/Magnific",
+ category="image/partner/Magnific",
description="Transfer the style from a reference image to your input image.",
inputs=[
IO.Image.Input("image", tooltip="The image to apply style transfer to."),
@@ -541,7 +539,6 @@ class MagnificImageStyleTransferNode(IO.ComfyNode):
response_model=TaskResponse,
status_extractor=lambda x: x.status,
poll_interval=10.0,
- max_poll_attempts=480,
)
return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))
@@ -552,7 +549,7 @@ class MagnificImageRelightNode(IO.ComfyNode):
return IO.Schema(
node_id="MagnificImageRelightNode",
display_name="Magnific Image Relight",
- category="api node/image/Magnific",
+ category="image/partner/Magnific",
description="Relight an image with lighting adjustments and optional reference-based light transfer.",
inputs=[
IO.Image.Input("image", tooltip="The image to relight."),
@@ -782,7 +779,6 @@ class MagnificImageRelightNode(IO.ComfyNode):
response_model=TaskResponse,
status_extractor=lambda x: x.status,
poll_interval=10.0,
- max_poll_attempts=480,
)
return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))
@@ -793,7 +789,7 @@ class MagnificImageSkinEnhancerNode(IO.ComfyNode):
return IO.Schema(
node_id="MagnificImageSkinEnhancerNode",
display_name="Magnific Image Skin Enhancer",
- category="api node/image/Magnific",
+ category="image/partner/Magnific",
description="Skin enhancement for portraits with multiple processing modes.",
inputs=[
IO.Image.Input("image", tooltip="The portrait image to enhance."),
@@ -924,7 +920,6 @@ class MagnificImageSkinEnhancerNode(IO.ComfyNode):
response_model=TaskResponse,
status_extractor=lambda x: x.status,
poll_interval=10.0,
- max_poll_attempts=480,
)
return IO.NodeOutput(await download_url_to_image_tensor(final_response.generated[0]))
diff --git a/comfy_api_nodes/nodes_meshy.py b/comfy_api_nodes/nodes_meshy.py
index 3cf577f4a..4fb670404 100644
--- a/comfy_api_nodes/nodes_meshy.py
+++ b/comfy_api_nodes/nodes_meshy.py
@@ -33,7 +33,7 @@ class MeshyTextToModelNode(IO.ComfyNode):
return IO.Schema(
node_id="MeshyTextToModelNode",
display_name="Meshy: Text to Model",
- category="api node/3d/Meshy",
+ category="3d/partner/Meshy",
inputs=[
IO.Combo.Input("model", options=["latest"]),
IO.String.Input("prompt", multiline=True, default=""),
@@ -145,7 +145,7 @@ class MeshyRefineNode(IO.ComfyNode):
return IO.Schema(
node_id="MeshyRefineNode",
display_name="Meshy: Refine Draft Model",
- category="api node/3d/Meshy",
+ category="3d/partner/Meshy",
description="Refine a previously created draft model.",
inputs=[
IO.Combo.Input("model", options=["latest"]),
@@ -240,7 +240,7 @@ class MeshyImageToModelNode(IO.ComfyNode):
return IO.Schema(
node_id="MeshyImageToModelNode",
display_name="Meshy: Image to Model",
- category="api node/3d/Meshy",
+ category="3d/partner/Meshy",
inputs=[
IO.Combo.Input("model", options=["latest"]),
IO.Image.Input("image"),
@@ -405,7 +405,7 @@ class MeshyMultiImageToModelNode(IO.ComfyNode):
return IO.Schema(
node_id="MeshyMultiImageToModelNode",
display_name="Meshy: Multi-Image to Model",
- category="api node/3d/Meshy",
+ category="3d/partner/Meshy",
inputs=[
IO.Combo.Input("model", options=["latest"]),
IO.Autogrow.Input(
@@ -575,7 +575,7 @@ class MeshyRigModelNode(IO.ComfyNode):
return IO.Schema(
node_id="MeshyRigModelNode",
display_name="Meshy: Rig Model",
- category="api node/3d/Meshy",
+ category="3d/partner/Meshy",
description="Provides a rigged character in standard formats. "
"Auto-rigging is currently not suitable for untextured meshes, non-humanoid assets, "
"or humanoid assets with unclear limb and body structure.",
@@ -656,7 +656,7 @@ class MeshyAnimateModelNode(IO.ComfyNode):
return IO.Schema(
node_id="MeshyAnimateModelNode",
display_name="Meshy: Animate Model",
- category="api node/3d/Meshy",
+ category="3d/partner/Meshy",
description="Apply a specific animation action to a previously rigged character.",
inputs=[
IO.Custom("MESHY_RIGGED_TASK_ID").Input("rig_task_id"),
@@ -722,7 +722,7 @@ class MeshyTextureNode(IO.ComfyNode):
return IO.Schema(
node_id="MeshyTextureNode",
display_name="Meshy: Texture Model",
- category="api node/3d/Meshy",
+ category="3d/partner/Meshy",
inputs=[
IO.Combo.Input("model", options=["latest"]),
IO.Custom("MESHY_TASK_ID").Input("meshy_task_id"),
diff --git a/comfy_api_nodes/nodes_minimax.py b/comfy_api_nodes/nodes_minimax.py
index b5d0b461f..338584148 100644
--- a/comfy_api_nodes/nodes_minimax.py
+++ b/comfy_api_nodes/nodes_minimax.py
@@ -101,7 +101,7 @@ class MinimaxTextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="MinimaxTextToVideoNode",
display_name="MiniMax Text to Video",
- category="api node/video/MiniMax",
+ category="video/partner/MiniMax",
description="Generates videos synchronously based on a prompt, and optional parameters.",
inputs=[
IO.String.Input(
@@ -163,7 +163,7 @@ class MinimaxImageToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="MinimaxImageToVideoNode",
display_name="MiniMax Image to Video",
- category="api node/video/MiniMax",
+ category="video/partner/MiniMax",
description="Generates videos synchronously based on an image and prompt, and optional parameters.",
inputs=[
IO.Image.Input(
@@ -230,7 +230,7 @@ class MinimaxSubjectToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="MinimaxSubjectToVideoNode",
display_name="MiniMax Subject to Video",
- category="api node/video/MiniMax",
+ category="video/partner/MiniMax",
description="Generates videos synchronously based on an image and prompt, and optional parameters.",
inputs=[
IO.Image.Input(
@@ -294,7 +294,7 @@ class MinimaxHailuoVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="MinimaxHailuoVideoNode",
display_name="MiniMax Hailuo Video",
- category="api node/video/MiniMax",
+ category="video/partner/MiniMax",
description="Generates videos from prompt, with optional start frame using the new MiniMax Hailuo-02 model.",
inputs=[
IO.String.Input(
diff --git a/comfy_api_nodes/nodes_moonvalley.py b/comfy_api_nodes/nodes_moonvalley.py
deleted file mode 100644
index 78a230529..000000000
--- a/comfy_api_nodes/nodes_moonvalley.py
+++ /dev/null
@@ -1,534 +0,0 @@
-import logging
-
-from typing_extensions import override
-
-from comfy_api.latest import IO, ComfyExtension, Input
-from comfy_api_nodes.apis.moonvalley import (
- MoonvalleyPromptResponse,
- MoonvalleyTextToVideoInferenceParams,
- MoonvalleyTextToVideoRequest,
- MoonvalleyVideoToVideoInferenceParams,
- MoonvalleyVideoToVideoRequest,
-)
-from comfy_api_nodes.util import (
- ApiEndpoint,
- download_url_to_video_output,
- poll_op,
- sync_op,
- trim_video,
- upload_images_to_comfyapi,
- upload_video_to_comfyapi,
- validate_container_format_is_mp4,
- validate_image_dimensions,
- validate_string,
-)
-
-API_UPLOADS_ENDPOINT = "/proxy/moonvalley/uploads"
-API_PROMPTS_ENDPOINT = "/proxy/moonvalley/prompts"
-API_VIDEO2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/video-to-video"
-API_TXT2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/text-to-video"
-API_IMG2VIDEO_ENDPOINT = "/proxy/moonvalley/prompts/image-to-video"
-
-MIN_WIDTH = 300
-MIN_HEIGHT = 300
-
-MAX_WIDTH = 10000
-MAX_HEIGHT = 10000
-
-MIN_VID_WIDTH = 300
-MIN_VID_HEIGHT = 300
-
-MAX_VID_WIDTH = 10000
-MAX_VID_HEIGHT = 10000
-
-MAX_VIDEO_SIZE = 1024 * 1024 * 1024 # 1 GB max for in-memory video processing
-
-MOONVALLEY_MAREY_MAX_PROMPT_LENGTH = 5000
-
-
-def is_valid_task_creation_response(response: MoonvalleyPromptResponse) -> bool:
- """Verifies that the initial response contains a task ID."""
- return bool(response.id)
-
-
-def validate_task_creation_response(response) -> None:
- if not is_valid_task_creation_response(response):
- error_msg = f"Moonvalley Marey API: Initial request failed. Code: {response.code}, Message: {response.message}, Data: {response}"
- logging.error(error_msg)
- raise RuntimeError(error_msg)
-
-
-def validate_video_to_video_input(video: Input.Video) -> Input.Video:
- """
- Validates and processes video input for Moonvalley Video-to-Video generation.
-
- Args:
- video: Input video to validate
-
- Returns:
- Validated and potentially trimmed video
-
- Raises:
- ValueError: If video doesn't meet requirements
- MoonvalleyApiError: If video duration is too short
- """
- width, height = _get_video_dimensions(video)
- _validate_video_dimensions(width, height)
- validate_container_format_is_mp4(video)
-
- return _validate_and_trim_duration(video)
-
-
-def _get_video_dimensions(video: Input.Video) -> tuple[int, int]:
- """Extracts video dimensions with error handling."""
- try:
- return video.get_dimensions()
- except Exception as e:
- logging.error("Error getting dimensions of video: %s", e)
- raise ValueError(f"Cannot get video dimensions: {e}") from e
-
-
-def _validate_video_dimensions(width: int, height: int) -> None:
- """Validates video dimensions meet Moonvalley V2V requirements."""
- supported_resolutions = {
- (1920, 1080),
- (1080, 1920),
- (1152, 1152),
- (1536, 1152),
- (1152, 1536),
- }
-
- if (width, height) not in supported_resolutions:
- supported_list = ", ".join([f"{w}x{h}" for w, h in sorted(supported_resolutions)])
- raise ValueError(f"Resolution {width}x{height} not supported. Supported: {supported_list}")
-
-
-def _validate_and_trim_duration(video: Input.Video) -> Input.Video:
- """Validates video duration and trims to 5 seconds if needed."""
- duration = video.get_duration()
- _validate_minimum_duration(duration)
- return _trim_if_too_long(video, duration)
-
-
-def _validate_minimum_duration(duration: float) -> None:
- """Ensures video is at least 5 seconds long."""
- if duration < 5:
- raise ValueError("Input video must be at least 5 seconds long.")
-
-
-def _trim_if_too_long(video: Input.Video, duration: float) -> Input.Video:
- """Trims video to 5 seconds if longer."""
- if duration > 5:
- return trim_video(video, 5)
- return video
-
-
-def parse_width_height_from_res(resolution: str):
- # Accepts a string like "16:9 (1920 x 1080)" and returns width, height as a dict
- res_map = {
- "16:9 (1920 x 1080)": {"width": 1920, "height": 1080},
- "9:16 (1080 x 1920)": {"width": 1080, "height": 1920},
- "1:1 (1152 x 1152)": {"width": 1152, "height": 1152},
- "4:3 (1536 x 1152)": {"width": 1536, "height": 1152},
- "3:4 (1152 x 1536)": {"width": 1152, "height": 1536},
- # "21:9 (2560 x 1080)": {"width": 2560, "height": 1080},
- }
- return res_map.get(resolution, {"width": 1920, "height": 1080})
-
-
-def parse_control_parameter(value):
- control_map = {
- "Motion Transfer": "motion_control",
- "Canny": "canny_control",
- "Pose Transfer": "pose_control",
- "Depth": "depth_control",
- }
- return control_map.get(value, control_map["Motion Transfer"])
-
-
-async def get_response(cls: type[IO.ComfyNode], task_id: str) -> MoonvalleyPromptResponse:
- return await poll_op(
- cls,
- ApiEndpoint(path=f"{API_PROMPTS_ENDPOINT}/{task_id}"),
- response_model=MoonvalleyPromptResponse,
- status_extractor=lambda r: (r.status if r and r.status else None),
- poll_interval=16.0,
- max_poll_attempts=240,
- )
-
-
-class MoonvalleyImg2VideoNode(IO.ComfyNode):
-
- @classmethod
- def define_schema(cls) -> IO.Schema:
- return IO.Schema(
- node_id="MoonvalleyImg2VideoNode",
- display_name="Moonvalley Marey Image to Video",
- category="api node/video/Moonvalley Marey",
- description="Moonvalley Marey Image to Video Node",
- inputs=[
- IO.Image.Input(
- "image",
- tooltip="The reference image used to generate the video",
- ),
- IO.String.Input(
- "prompt",
- multiline=True,
- ),
- IO.String.Input(
- "negative_prompt",
- multiline=True,
- default=" gopro, bright, contrast, static, overexposed, vignette, "
- "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, "
- "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, "
- "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, "
- "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, "
- "wobbly, weird, low quality, plastic, stock footage, video camera, boring",
- tooltip="Negative prompt text",
- ),
- IO.Combo.Input(
- "resolution",
- options=[
- "16:9 (1920 x 1080)",
- "9:16 (1080 x 1920)",
- "1:1 (1152 x 1152)",
- "4:3 (1536 x 1152)",
- "3:4 (1152 x 1536)",
- # "21:9 (2560 x 1080)",
- ],
- default="16:9 (1920 x 1080)",
- tooltip="Resolution of the output video",
- ),
- IO.Float.Input(
- "prompt_adherence",
- default=4.5,
- min=1.0,
- max=20.0,
- step=1.0,
- tooltip="Guidance scale for generation control",
- ),
- IO.Int.Input(
- "seed",
- default=9,
- min=0,
- max=4294967295,
- step=1,
- display_mode=IO.NumberDisplay.number,
- tooltip="Random seed value",
- control_after_generate=True,
- ),
- IO.Int.Input(
- "steps",
- default=80,
- min=75, # steps should be greater or equal to cooldown_steps(75) + warmup_steps(0)
- max=100,
- step=1,
- tooltip="Number of denoising steps",
- ),
- ],
- outputs=[IO.Video.Output()],
- hidden=[
- IO.Hidden.auth_token_comfy_org,
- IO.Hidden.api_key_comfy_org,
- IO.Hidden.unique_id,
- ],
- is_api_node=True,
- price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(),
- expr="""{"type":"usd","usd": 1.5}""",
- ),
- )
-
- @classmethod
- async def execute(
- cls,
- image: Input.Image,
- prompt: str,
- negative_prompt: str,
- resolution: str,
- prompt_adherence: float,
- seed: int,
- steps: int,
- ) -> IO.NodeOutput:
- validate_image_dimensions(image, min_width=300, min_height=300, max_height=MAX_HEIGHT, max_width=MAX_WIDTH)
- validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
- validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
- width_height = parse_width_height_from_res(resolution)
-
- inference_params = MoonvalleyTextToVideoInferenceParams(
- negative_prompt=negative_prompt,
- steps=steps,
- seed=seed,
- guidance_scale=prompt_adherence,
- width=width_height["width"],
- height=width_height["height"],
- use_negative_prompts=True,
- )
-
- # Get MIME type from tensor - assuming PNG format for image tensors
- mime_type = "image/png"
- image_url = (await upload_images_to_comfyapi(cls, image, max_images=1, mime_type=mime_type))[0]
- task_creation_response = await sync_op(
- cls,
- endpoint=ApiEndpoint(path=API_IMG2VIDEO_ENDPOINT, method="POST"),
- response_model=MoonvalleyPromptResponse,
- data=MoonvalleyTextToVideoRequest(
- image_url=image_url, prompt_text=prompt, inference_params=inference_params
- ),
- )
- validate_task_creation_response(task_creation_response)
- final_response = await get_response(cls, task_creation_response.id)
- video = await download_url_to_video_output(final_response.output_url)
- return IO.NodeOutput(video)
-
-
-class MoonvalleyVideo2VideoNode(IO.ComfyNode):
-
- @classmethod
- def define_schema(cls) -> IO.Schema:
- return IO.Schema(
- node_id="MoonvalleyVideo2VideoNode",
- display_name="Moonvalley Marey Video to Video",
- category="api node/video/Moonvalley Marey",
- description="",
- inputs=[
- IO.String.Input(
- "prompt",
- multiline=True,
- tooltip="Describes the video to generate",
- ),
- IO.String.Input(
- "negative_prompt",
- multiline=True,
- default=" gopro, bright, contrast, static, overexposed, vignette, "
- "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, "
- "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, "
- "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, "
- "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, "
- "wobbly, weird, low quality, plastic, stock footage, video camera, boring",
- tooltip="Negative prompt text",
- ),
- IO.Int.Input(
- "seed",
- default=9,
- min=0,
- max=4294967295,
- step=1,
- display_mode=IO.NumberDisplay.number,
- tooltip="Random seed value",
- control_after_generate=False,
- ),
- IO.Video.Input(
- "video",
- tooltip="The reference video used to generate the output video. Must be at least 5 seconds long. "
- "Videos longer than 5s will be automatically trimmed. Only MP4 format supported.",
- ),
- IO.Combo.Input(
- "control_type",
- options=["Motion Transfer", "Pose Transfer"],
- default="Motion Transfer",
- optional=True,
- ),
- IO.Int.Input(
- "motion_intensity",
- default=100,
- min=0,
- max=100,
- step=1,
- tooltip="Only used if control_type is 'Motion Transfer'",
- optional=True,
- ),
- IO.Int.Input(
- "steps",
- default=60,
- min=60, # steps should be greater or equal to cooldown_steps(36) + warmup_steps(24)
- max=100,
- step=1,
- display_mode=IO.NumberDisplay.number,
- tooltip="Number of inference steps",
- ),
- ],
- outputs=[IO.Video.Output()],
- hidden=[
- IO.Hidden.auth_token_comfy_org,
- IO.Hidden.api_key_comfy_org,
- IO.Hidden.unique_id,
- ],
- is_api_node=True,
- price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(),
- expr="""{"type":"usd","usd": 2.25}""",
- ),
- )
-
- @classmethod
- async def execute(
- cls,
- prompt: str,
- negative_prompt: str,
- seed: int,
- video: Input.Video | None = None,
- control_type: str = "Motion Transfer",
- motion_intensity: int | None = 100,
- steps=60,
- prompt_adherence=4.5,
- ) -> IO.NodeOutput:
- validated_video = validate_video_to_video_input(video)
- video_url = await upload_video_to_comfyapi(cls, validated_video)
- validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
- validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
-
- # Only include motion_intensity for Motion Transfer
- control_params = {}
- if control_type == "Motion Transfer" and motion_intensity is not None:
- control_params["motion_intensity"] = motion_intensity
-
- inference_params = MoonvalleyVideoToVideoInferenceParams(
- negative_prompt=negative_prompt,
- seed=seed,
- control_params=control_params,
- steps=steps,
- guidance_scale=prompt_adherence,
- )
-
- task_creation_response = await sync_op(
- cls,
- endpoint=ApiEndpoint(path=API_VIDEO2VIDEO_ENDPOINT, method="POST"),
- response_model=MoonvalleyPromptResponse,
- data=MoonvalleyVideoToVideoRequest(
- control_type=parse_control_parameter(control_type),
- video_url=video_url,
- prompt_text=prompt,
- inference_params=inference_params,
- ),
- )
- validate_task_creation_response(task_creation_response)
- final_response = await get_response(cls, task_creation_response.id)
- return IO.NodeOutput(await download_url_to_video_output(final_response.output_url))
-
-
-class MoonvalleyTxt2VideoNode(IO.ComfyNode):
-
- @classmethod
- def define_schema(cls) -> IO.Schema:
- return IO.Schema(
- node_id="MoonvalleyTxt2VideoNode",
- display_name="Moonvalley Marey Text to Video",
- category="api node/video/Moonvalley Marey",
- description="",
- inputs=[
- IO.String.Input(
- "prompt",
- multiline=True,
- ),
- IO.String.Input(
- "negative_prompt",
- multiline=True,
- default=" gopro, bright, contrast, static, overexposed, vignette, "
- "artifacts, still, noise, texture, scanlines, videogame, 360 camera, VR, transition, "
- "flare, saturation, distorted, warped, wide angle, saturated, vibrant, glowing, "
- "cross dissolve, cheesy, ugly hands, mutated hands, mutant, disfigured, extra fingers, "
- "blown out, horrible, blurry, worst quality, bad, dissolve, melt, fade in, fade out, "
- "wobbly, weird, low quality, plastic, stock footage, video camera, boring",
- tooltip="Negative prompt text",
- ),
- IO.Combo.Input(
- "resolution",
- options=[
- "16:9 (1920 x 1080)",
- "9:16 (1080 x 1920)",
- "1:1 (1152 x 1152)",
- "4:3 (1536 x 1152)",
- "3:4 (1152 x 1536)",
- "21:9 (2560 x 1080)",
- ],
- default="16:9 (1920 x 1080)",
- tooltip="Resolution of the output video",
- ),
- IO.Float.Input(
- "prompt_adherence",
- default=4.0,
- min=1.0,
- max=20.0,
- step=1.0,
- tooltip="Guidance scale for generation control",
- ),
- IO.Int.Input(
- "seed",
- default=9,
- min=0,
- max=4294967295,
- step=1,
- display_mode=IO.NumberDisplay.number,
- control_after_generate=True,
- tooltip="Random seed value",
- ),
- IO.Int.Input(
- "steps",
- default=80,
- min=75, # steps should be greater or equal to cooldown_steps(75) + warmup_steps(0)
- max=100,
- step=1,
- tooltip="Inference steps",
- ),
- ],
- outputs=[IO.Video.Output()],
- hidden=[
- IO.Hidden.auth_token_comfy_org,
- IO.Hidden.api_key_comfy_org,
- IO.Hidden.unique_id,
- ],
- is_api_node=True,
- price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(),
- expr="""{"type":"usd","usd": 1.5}""",
- ),
- )
-
- @classmethod
- async def execute(
- cls,
- prompt: str,
- negative_prompt: str,
- resolution: str,
- prompt_adherence: float,
- seed: int,
- steps: int,
- ) -> IO.NodeOutput:
- validate_string(prompt, min_length=1, max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
- validate_string(negative_prompt, field_name="negative_prompt", max_length=MOONVALLEY_MAREY_MAX_PROMPT_LENGTH)
- width_height = parse_width_height_from_res(resolution)
-
- inference_params = MoonvalleyTextToVideoInferenceParams(
- negative_prompt=negative_prompt,
- steps=steps,
- seed=seed,
- guidance_scale=prompt_adherence,
- num_frames=128,
- width=width_height["width"],
- height=width_height["height"],
- )
-
- task_creation_response = await sync_op(
- cls,
- endpoint=ApiEndpoint(path=API_TXT2VIDEO_ENDPOINT, method="POST"),
- response_model=MoonvalleyPromptResponse,
- data=MoonvalleyTextToVideoRequest(prompt_text=prompt, inference_params=inference_params),
- )
- validate_task_creation_response(task_creation_response)
- final_response = await get_response(cls, task_creation_response.id)
- return IO.NodeOutput(await download_url_to_video_output(final_response.output_url))
-
-
-class MoonvalleyExtension(ComfyExtension):
- @override
- async def get_node_list(self) -> list[type[IO.ComfyNode]]:
- return [
- MoonvalleyImg2VideoNode,
- MoonvalleyTxt2VideoNode,
- MoonvalleyVideo2VideoNode,
- ]
-
-
-async def comfy_entrypoint() -> MoonvalleyExtension:
- return MoonvalleyExtension()
diff --git a/comfy_api_nodes/nodes_openai.py b/comfy_api_nodes/nodes_openai.py
index 4ee896fa8..48c739dfe 100644
--- a/comfy_api_nodes/nodes_openai.py
+++ b/comfy_api_nodes/nodes_openai.py
@@ -27,6 +27,7 @@ from comfy_api_nodes.util import (
ApiEndpoint,
download_url_to_bytesio,
downscale_image_tensor,
+ get_number_of_images,
poll_op,
sync_op,
tensor_to_base64_string,
@@ -39,16 +40,18 @@ STARTING_POINT_ID_PATTERN = r""
class SupportedOpenAIModel(str, Enum):
- o4_mini = "o4-mini"
- o1 = "o1"
- o3 = "o3"
- o1_pro = "o1-pro"
- gpt_4_1 = "gpt-4.1"
- gpt_4_1_mini = "gpt-4.1-mini"
- gpt_4_1_nano = "gpt-4.1-nano"
+ gpt_5_5_pro = "gpt-5.5-pro"
+ gpt_5_5 = "gpt-5.5"
gpt_5 = "gpt-5"
gpt_5_mini = "gpt-5-mini"
gpt_5_nano = "gpt-5-nano"
+ gpt_4_1 = "gpt-4.1"
+ gpt_4_1_mini = "gpt-4.1-mini"
+ gpt_4_1_nano = "gpt-4.1-nano"
+ o4_mini = "o4-mini"
+ o3 = "o3"
+ o1_pro = "o1-pro"
+ o1 = "o1"
async def validate_and_cast_response(response, timeout: int = None) -> torch.Tensor:
@@ -96,7 +99,7 @@ class OpenAIDalle2(IO.ComfyNode):
return IO.Schema(
node_id="OpenAIDalle2",
display_name="OpenAI DALL·E 2",
- category="api node/image/OpenAI",
+ category="image/partner/OpenAI",
description="Generates images synchronously via OpenAI's DALL·E 2 endpoint.",
inputs=[
IO.String.Input(
@@ -246,7 +249,7 @@ class OpenAIDalle3(IO.ComfyNode):
return IO.Schema(
node_id="OpenAIDalle3",
display_name="OpenAI DALL·E 3",
- category="api node/image/OpenAI",
+ category="image/partner/OpenAI",
description="Generates images synchronously via OpenAI's DALL·E 3 endpoint.",
inputs=[
IO.String.Input(
@@ -357,15 +360,20 @@ def calculate_tokens_price_image_1_5(response: OpenAIImageGenerationResponse) ->
return ((response.usage.input_tokens * 8.0) + (response.usage.output_tokens * 32.0)) / 1_000_000.0
+def calculate_tokens_price_image_2_0(response: OpenAIImageGenerationResponse) -> float | None:  # price extractor chosen when model == "gpt-image-2"; as written it always returns a float
+ return ((response.usage.input_tokens * 8.0) + (response.usage.output_tokens * 30.0)) / 1_000_000.0  # weights 8.0 (input) and 30.0 (output) per 1,000,000 tokens; presumably USD - TODO confirm
+
+
class OpenAIGPTImage1(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="OpenAIGPTImage1",
- display_name="OpenAI GPT Image 1.5",
- category="api node/image/OpenAI",
+ display_name="OpenAI GPT Image 2",
+ category="image/partner/OpenAI",
description="Generates images synchronously via OpenAI's GPT Image endpoint.",
+ is_deprecated=True,
inputs=[
IO.String.Input(
"prompt",
@@ -401,8 +409,19 @@ class OpenAIGPTImage1(IO.ComfyNode):
IO.Combo.Input(
"size",
default="auto",
- options=["auto", "1024x1024", "1024x1536", "1536x1024"],
- tooltip="Image size",
+ options=[
+ "auto",
+ "1024x1024",
+ "1024x1536",
+ "1536x1024",
+ "2048x2048",
+ "2048x1152",
+ "1152x2048",
+ "3840x2160",
+ "2160x3840",
+ "Custom",
+ ],
+ tooltip="Image size. Select 'Custom' to use the custom width and height (GPT Image 2 only).",
optional=True,
),
IO.Int.Input(
@@ -427,8 +446,26 @@ class OpenAIGPTImage1(IO.ComfyNode):
),
IO.Combo.Input(
"model",
- options=["gpt-image-1", "gpt-image-1.5"],
- default="gpt-image-1.5",
+ options=["gpt-image-1", "gpt-image-1.5", "gpt-image-2"],
+ default="gpt-image-2",
+ optional=True,
+ ),
+ IO.Int.Input(
+ "custom_width",
+ default=1024,
+ min=1024,
+ max=3840,
+ step=16,
+ tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16 (GPT Image 2 only).",
+ optional=True,
+ ),
+ IO.Int.Input(
+ "custom_height",
+ default=1024,
+ min=1024,
+ max=3840,
+ step=16,
+ tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16 (GPT Image 2 only).",
optional=True,
),
],
@@ -442,23 +479,36 @@ class OpenAIGPTImage1(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["quality", "n"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["quality", "n", "model"]),
expr="""
(
$ranges := {
- "low": [0.011, 0.02],
- "medium": [0.046, 0.07],
- "high": [0.167, 0.3]
+ "gpt-image-1": {
+ "low": [0.011, 0.02],
+ "medium": [0.042, 0.07],
+ "high": [0.167, 0.25]
+ },
+ "gpt-image-1.5": {
+ "low": [0.009, 0.02],
+ "medium": [0.034, 0.062],
+ "high": [0.133, 0.22]
+ },
+ "gpt-image-2": {
+ "low": [0.0048, 0.019],
+ "medium": [0.041, 0.168],
+ "high": [0.165, 0.67]
+ }
};
- $range := $lookup($ranges, widgets.quality);
- $n := widgets.n;
+ $range := $lookup($lookup($ranges, widgets.model), widgets.quality);
+ $nRaw := widgets.n;
+ $n := ($nRaw != null and $nRaw != 0) ? $nRaw : 1;
($n = 1)
- ? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1]}
+ ? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1], "format": {"approximate": true}}
: {
"type":"range_usd",
- "min_usd": $range[0],
- "max_usd": $range[1],
- "format": { "suffix": " x " & $string($n) & "/Run" }
+ "min_usd": $range[0] * $n,
+ "max_usd": $range[1] * $n,
+ "format": { "suffix": "/Run", "approximate": true }
}
)
""",
@@ -476,6 +526,8 @@ class OpenAIGPTImage1(IO.ComfyNode):
mask: Input.Image | None = None,
n: int = 1,
size: str = "1024x1024",
+ custom_width: int = 1024,
+ custom_height: int = 1024,
model: str = "gpt-image-1",
) -> IO.NodeOutput:
validate_string(prompt, strip_whitespace=False)
@@ -483,10 +535,36 @@ class OpenAIGPTImage1(IO.ComfyNode):
if mask is not None and image is None:
raise ValueError("Cannot use a mask without an input image")
+ if size == "Custom":
+ if model != "gpt-image-2":
+ raise ValueError("Custom resolution is only supported by GPT Image 2 model")
+ if custom_width % 16 != 0 or custom_height % 16 != 0:
+ raise ValueError(f"Custom width and height must be multiples of 16, got {custom_width}x{custom_height}")
+ if max(custom_width, custom_height) > 3840:
+ raise ValueError(f"Custom resolution max edge must be <= 3840, got {custom_width}x{custom_height}")
+ ratio = max(custom_width, custom_height) / min(custom_width, custom_height)
+ if ratio > 3:
+ raise ValueError(
+ f"Custom resolution aspect ratio must not exceed 3:1, got {custom_width}x{custom_height}"
+ )
+ total_pixels = custom_width * custom_height
+ if not 655_360 <= total_pixels <= 8_294_400:
+ raise ValueError(
+ f"Custom resolution total pixels must be between 655,360 and 8,294,400, got {total_pixels}"
+ )
+ size = f"{custom_width}x{custom_height}"
+ elif model in ("gpt-image-1", "gpt-image-1.5"):
+ if size not in ("auto", "1024x1024", "1024x1536", "1536x1024"):
+ raise ValueError(f"Resolution {size} is only supported by GPT Image 2 model")
+
if model == "gpt-image-1":
price_extractor = calculate_tokens_price_image_1
elif model == "gpt-image-1.5":
price_extractor = calculate_tokens_price_image_1_5
+ elif model == "gpt-image-2":
+ price_extractor = calculate_tokens_price_image_2_0
+ if background == "transparent":
+ raise ValueError("Transparent background is not supported for GPT Image 2 model")
else:
raise ValueError(f"Unknown model: {model}")
@@ -564,6 +642,316 @@ class OpenAIGPTImage1(IO.ComfyNode):
return IO.NodeOutput(await validate_and_cast_response(response))
+def _gpt_image_shared_inputs():
+    """Build the widgets every GPT Image model option shares: quality, reference images and mask."""
+    quality_input = IO.Combo.Input(
+        "quality",
+        default="low",
+        options=["low", "medium", "high"],
+        tooltip="Image quality, affects cost and generation time.",
+    )
+    # The autogrow group exposes sixteen slots named image_1 .. image_16.
+    images_input = IO.Autogrow.Input(
+        "images",
+        template=IO.Autogrow.TemplateNames(
+            IO.Image.Input("image"),
+            names=[f"image_{slot}" for slot in range(1, 17)],
+            min=0,
+        ),
+        tooltip="Optional reference image(s) for image editing. Up to 16 images.",
+    )
+    mask_input = IO.Mask.Input(
+        "mask",
+        optional=True,
+        tooltip="Optional mask for inpainting (white areas will be replaced). "
+        "Requires exactly one reference image.",
+    )
+    return [quality_input, images_input, mask_input]
+
+
+def _gpt_image_legacy_model_inputs():
+    """Widgets for the gpt-image-1 / gpt-image-1.5 options: size and background, then the shared inputs."""
+    size_input = IO.Combo.Input(
+        "size",
+        default="auto",
+        options=["auto", "1024x1024", "1024x1536", "1536x1024"],
+        tooltip="Image size.",
+    )
+    # The legacy models accept a transparent background, so it is offered as an option.
+    background_input = IO.Combo.Input(
+        "background",
+        default="auto",
+        options=["auto", "opaque", "transparent"],
+        tooltip="Return image with or without background.",
+    )
+    legacy_inputs = [size_input, background_input]
+    return [*legacy_inputs, *_gpt_image_shared_inputs()]
+
+
+class OpenAIGPTImageNodeV2(IO.ComfyNode):
+
+    @classmethod
+    def define_schema(cls):
+        """Describe the node: prompt, per-model widget groups, image count, seed and the price badge."""
+        # Widgets revealed when "gpt-image-2" is selected: extra sizes plus a custom resolution.
+        gpt_image_2_inputs = [
+            IO.Combo.Input(
+                "size",
+                default="auto",
+                options=[
+                    "auto",
+                    "1024x1024",
+                    "1024x1536",
+                    "1536x1024",
+                    "2048x2048",
+                    "2048x1152",
+                    "1152x2048",
+                    "3840x2160",
+                    "2160x3840",
+                    "Custom",
+                ],
+                tooltip="Image size. Select 'Custom' to use the custom width and height.",
+            ),
+            IO.Int.Input(
+                "custom_width",
+                default=1024,
+                min=1024,
+                max=3840,
+                step=16,
+                tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16.",
+            ),
+            IO.Int.Input(
+                "custom_height",
+                default=1024,
+                min=1024,
+                max=3840,
+                step=16,
+                tooltip="Used only when `size` is 'Custom'. Must be a multiple of 16.",
+            ),
+            IO.Combo.Input(
+                "background",
+                default="auto",
+                options=["auto", "opaque"],
+                tooltip="Return image with or without background.",
+            ),
+            *_gpt_image_shared_inputs(),
+        ]
+        model_options = [
+            IO.DynamicCombo.Option("gpt-image-2", gpt_image_2_inputs),
+            IO.DynamicCombo.Option("gpt-image-1.5", _gpt_image_legacy_model_inputs()),
+            IO.DynamicCombo.Option("gpt-image-1", _gpt_image_legacy_model_inputs()),
+        ]
+        # JSONata badge: USD range looked up by model and quality, multiplied by `n` when n != 1.
+        price_expr = """
+            (
+              $ranges := {
+                "gpt-image-1": {
+                  "low": [0.011, 0.02],
+                  "medium": [0.042, 0.07],
+                  "high": [0.167, 0.25]
+                },
+                "gpt-image-1.5": {
+                  "low": [0.009, 0.02],
+                  "medium": [0.034, 0.062],
+                  "high": [0.133, 0.22]
+                },
+                "gpt-image-2": {
+                  "low": [0.0048, 0.019],
+                  "medium": [0.041, 0.168],
+                  "high": [0.165, 0.67]
+                }
+              };
+              $range := $lookup($lookup($ranges, widgets.model), $lookup(widgets, "model.quality"));
+              $nRaw := widgets.n;
+              $n := ($nRaw != null and $nRaw != 0) ? $nRaw : 1;
+              ($n = 1)
+                ? {"type":"range_usd","min_usd": $range[0], "max_usd": $range[1], "format": {"approximate": true}}
+                : {
+                    "type":"range_usd",
+                    "min_usd": $range[0] * $n,
+                    "max_usd": $range[1] * $n,
+                    "format": { "suffix": "/Run", "approximate": true }
+                  }
+            )
+        """
+        return IO.Schema(
+            node_id="OpenAIGPTImageNodeV2",
+            display_name="OpenAI GPT Image 2",
+            category="image/partner/OpenAI",
+            description="Generates images via OpenAI's GPT Image endpoint.",
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    default="",
+                    multiline=True,
+                    tooltip="Text prompt for GPT Image",
+                ),
+                IO.DynamicCombo.Input("model", options=model_options),
+                IO.Int.Input(
+                    "n",
+                    default=1,
+                    min=1,
+                    max=8,
+                    step=1,
+                    tooltip="How many images to generate",
+                    display_mode=IO.NumberDisplay.number,
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    step=1,
+                    display_mode=IO.NumberDisplay.number,
+                    control_after_generate=True,
+                    tooltip="not implemented yet in backend",
+                ),
+            ],
+            outputs=[IO.Image.Output()],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["model", "model.quality", "n"]),
+                expr=price_expr,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: dict,
+        n: int,
+        seed: int,
+    ) -> IO.NodeOutput:
+        """Generate images, or edit the supplied reference images, with the selected GPT Image model.
+
+        `model` is the DynamicCombo payload: the model id under "model" plus that option's widgets
+        ("size", "background", "quality" and optionally "custom_width", "custom_height", "images",
+        "mask"). `seed` is accepted but not forwarded to the API.
+        Returns the IO.NodeOutput built from `validate_and_cast_response`.
+
+        Raises:
+            ValueError: unknown model, invalid custom resolution, more than 16 reference images,
+                or a mask that is not paired with exactly one reference image of the same size.
+        """
+        validate_string(prompt, strip_whitespace=False)
+
+        model_id = model["model"]
+        size = model["size"]
+        background = model["background"]
+        quality = model["quality"]
+        mask = model.get("mask")
+
+        images_dict = model.get("images") or {}
+        image_tensors: list[Input.Image] = [t for t in images_dict.values() if t is not None]
+        if mask is not None and sum(get_number_of_images(t) for t in image_tensors) == 0:
+            raise ValueError("Cannot use a mask without an input image")
+
+        if size == "Custom":
+            custom_width = model.get("custom_width", 1024)
+            custom_height = model.get("custom_height", 1024)
+            dims = f"{custom_width}x{custom_height}"
+            if custom_width % 16 != 0 or custom_height % 16 != 0:
+                raise ValueError(f"Custom width and height must be multiples of 16, got {dims}")
+            if max(custom_width, custom_height) > 3840:
+                raise ValueError(f"Custom resolution max edge must be <= 3840, got {dims}")
+            short_edge = min(custom_width, custom_height)
+            # A non-positive edge is rejected here instead of dividing by zero.
+            if short_edge <= 0 or max(custom_width, custom_height) / short_edge > 3:
+                raise ValueError(f"Custom resolution aspect ratio must not exceed 3:1, got {dims}")
+            total_pixels = custom_width * custom_height
+            if not 655_360 <= total_pixels <= 8_294_400:
+                raise ValueError(
+                    f"Custom resolution total pixels must be between 655,360 and 8,294,400, got {total_pixels}"
+                )
+            size = dims
+
+        if model_id == "gpt-image-1":
+            price_extractor = calculate_tokens_price_image_1
+        elif model_id == "gpt-image-1.5":
+            price_extractor = calculate_tokens_price_image_1_5
+        elif model_id == "gpt-image-2":
+            price_extractor = calculate_tokens_price_image_2_0
+        else:
+            raise ValueError(f"Unknown model: {model_id}")
+
+        def png_buffer(tensor: torch.Tensor) -> BytesIO:
+            """Downscale to at most 2048*2048 pixels and encode the tensor as an in-memory PNG."""
+            scaled = downscale_image_tensor(tensor, total_pixels=2048 * 2048).squeeze()
+            buffer = BytesIO()
+            # .cpu() keeps the NumPy conversion valid if the tensor lives on an accelerator.
+            Image.fromarray((scaled.cpu().numpy() * 255).astype(np.uint8)).save(buffer, format="PNG")
+            buffer.seek(0)
+            return buffer
+
+        if image_tensors:
+            # Split batched tensors so every frame is uploaded as its own file.
+            flat: list[torch.Tensor] = []
+            for tensor in image_tensors:
+                if len(tensor.shape) == 4:
+                    flat.extend(tensor[i : i + 1] for i in range(tensor.shape[0]))
+                else:
+                    flat.append(tensor.unsqueeze(0))
+            # Validate before encoding so bad input fails without doing any PNG work.
+            if len(flat) > 16:
+                raise ValueError(f"Up to 16 reference images are supported, got {len(flat)}")
+            if mask is not None:
+                if len(flat) != 1:
+                    raise ValueError("Cannot use a mask with multiple image")
+                if mask.shape[1:] != flat[0].shape[1:-1]:
+                    raise ValueError("Mask and Image must be the same size")
+
+            # A single image is sent in the "image" field, several in "image[]".
+            field = "image" if len(flat) == 1 else "image[]"
+            files = [(field, (f"image_{i}.png", png_buffer(img), "image/png")) for i, img in enumerate(flat)]
+            if mask is not None:
+                _, height, width = mask.shape
+                # Alpha is the inverted mask, so masked (white) areas become transparent.
+                rgba_mask = torch.zeros(height, width, 4, device="cpu")
+                rgba_mask[:, :, 3] = 1 - mask.squeeze().cpu()
+                files.append(("mask", ("mask.png", png_buffer(rgba_mask.unsqueeze(0)), "image/png")))
+
+            response = await sync_op(
+                cls,
+                ApiEndpoint(path="/proxy/openai/images/edits", method="POST"),
+                response_model=OpenAIImageGenerationResponse,
+                data=OpenAIImageEditRequest(
+                    model=model_id,
+                    prompt=prompt,
+                    quality=quality,
+                    background=background,
+                    n=n,
+                    size=size,
+                    moderation="low",
+                ),
+                content_type="multipart/form-data",
+                files=files,
+                price_extractor=price_extractor,
+            )
+        else:
+            response = await sync_op(
+                cls,
+                ApiEndpoint(path="/proxy/openai/images/generations", method="POST"),
+                response_model=OpenAIImageGenerationResponse,
+                data=OpenAIImageGenerationRequest(
+                    model=model_id,
+                    prompt=prompt,
+                    quality=quality,
+                    background=background,
+                    n=n,
+                    size=size,
+                    moderation="low",
+                ),
+                price_extractor=price_extractor,
+            )
+        return IO.NodeOutput(await validate_and_cast_response(response))
+
+
class OpenAIChatNode(IO.ComfyNode):
"""
Node to generate text responses from an OpenAI model.
@@ -574,7 +962,7 @@ class OpenAIChatNode(IO.ComfyNode):
return IO.Schema(
node_id="OpenAIChatNode",
display_name="OpenAI ChatGPT",
- category="api node/text/OpenAI",
+ category="text/partner/OpenAI",
essentials_category="Text Generation",
description="Generate text responses from an OpenAI model.",
inputs=[
@@ -665,6 +1053,16 @@ class OpenAIChatNode(IO.ComfyNode):
"usd": [0.002, 0.008],
"format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
}
+ : $contains($m, "gpt-5.5-pro") ? {
+ "type": "list_usd",
+ "usd": [0.03, 0.18],
+ "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+ }
+ : $contains($m, "gpt-5.5") ? {
+ "type": "list_usd",
+ "usd": [0.005, 0.03],
+ "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }
+ }
: $contains($m, "gpt-5-nano") ? {
"type": "list_usd",
"usd": [0.00005, 0.0004],
@@ -803,7 +1201,7 @@ class OpenAIInputFiles(IO.ComfyNode):
return IO.Schema(
node_id="OpenAIInputFiles",
display_name="OpenAI ChatGPT Input Files",
- category="api node/text/OpenAI",
+ category="text/partner/OpenAI",
description="Loads and prepares input files (text, pdf, etc.) to include as inputs for the OpenAI Chat Node. The files will be read by the OpenAI model when generating a response. 🛈 TIP: Can be chained together with other OpenAI Input File nodes.",
inputs=[
IO.Combo.Input(
@@ -850,7 +1248,7 @@ class OpenAIChatConfig(IO.ComfyNode):
return IO.Schema(
node_id="OpenAIChatConfig",
display_name="OpenAI ChatGPT Advanced Options",
- category="api node/text/OpenAI",
+ category="text/partner/OpenAI",
description="Allows specifying advanced configuration options for the OpenAI Chat Nodes.",
inputs=[
IO.Combo.Input(
@@ -913,6 +1311,7 @@ class OpenAIExtension(ComfyExtension):
OpenAIDalle2,
OpenAIDalle3,
OpenAIGPTImage1,
+ OpenAIGPTImageNodeV2,
OpenAIChatNode,
OpenAIInputFiles,
OpenAIChatConfig,
diff --git a/comfy_api_nodes/nodes_openrouter.py b/comfy_api_nodes/nodes_openrouter.py
new file mode 100644
index 000000000..d2ebbef0d
--- /dev/null
+++ b/comfy_api_nodes/nodes_openrouter.py
@@ -0,0 +1,374 @@
+"""API Nodes for OpenRouter LLM chat completions."""
+
+from dataclasses import dataclass
+from typing import Literal
+
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.apis.openrouter import (
+ OpenRouterChatRequest,
+ OpenRouterChatResponse,
+ OpenRouterContentBlock,
+ OpenRouterImageContent,
+ OpenRouterImageUrl,
+ OpenRouterMessage,
+ OpenRouterReasoningConfig,
+ OpenRouterTextContent,
+ OpenRouterVideoContent,
+ OpenRouterVideoUrl,
+ OpenRouterWebSearchOptions,
+)
+from comfy_api_nodes.util import (
+ ApiEndpoint,
+ get_number_of_images,
+ sync_op,
+ upload_images_to_comfyapi,
+ upload_video_to_comfyapi,
+ validate_string,
+)
+
+OPENROUTER_CHAT_ENDPOINT = "/proxy/openrouter/api/v1/chat/completions"
+
+# Widget profile of a model; `_profile_inputs` maps each value to its extra inputs.
+Profile = Literal["standard", "reasoning", "frontier_reasoning", "perplexity", "perplexity_reasoning"]
+
+
+@dataclass(frozen=True)
+class _ModelSpec:  # one immutable catalogue entry per model offered by the node
+    slug: str  # exact OpenRouter model id
+    profile: Profile  # selects the extra widgets built by _profile_inputs
+    price_in: float  # USD per token (prompt)
+    price_out: float  # USD per token (completion)
+    max_images: int = 0  # 0 = no image input; otherwise max URL-passed images supported
+    max_videos: int = 0  # 0 = no video input; otherwise max URL-passed videos supported
+
+
+MODELS: list[_ModelSpec] = [  # positional fields: slug, profile, price_in, price_out
+    _ModelSpec("anthropic/claude-opus-4.7", "frontier_reasoning", 0.000005, 0.000025, max_images=20),
+    _ModelSpec("openai/gpt-5.5-pro", "frontier_reasoning", 0.00003, 0.00018, max_images=20),
+    _ModelSpec("openai/gpt-5.5", "frontier_reasoning", 0.000005, 0.00003, max_images=20),
+    _ModelSpec("google/gemini-3.5-flash", "reasoning", 0.0000015, 0.000009, max_images=20, max_videos=4),
+    _ModelSpec("x-ai/grok-4.20", "reasoning", 0.00000125, 0.0000025, max_images=20),
+    _ModelSpec("x-ai/grok-4.3", "reasoning", 0.00000125, 0.0000025, max_images=20),
+    _ModelSpec("deepseek/deepseek-v4-pro", "reasoning", 0.000000435, 0.00000087),
+    _ModelSpec("deepseek/deepseek-v4-flash", "reasoning", 0.000000112, 0.000000224),
+    _ModelSpec("deepseek/deepseek-v3.2", "reasoning", 0.000000252, 0.000000378),
+    _ModelSpec("qwen/qwen3.6-max-preview", "reasoning", 0.00000104, 0.00000624),
+    _ModelSpec("qwen/qwen3.6-plus", "reasoning", 0.000000325, 0.00000195, max_images=10, max_videos=4),
+    _ModelSpec("qwen/qwen3.6-flash", "reasoning", 0.0000001875, 0.000001125, max_images=10, max_videos=4),
+    _ModelSpec("mistralai/mistral-large-2512", "standard", 0.0000005, 0.0000015, max_images=8),
+    _ModelSpec("mistralai/mistral-medium-3-5", "reasoning", 0.0000015, 0.0000075, max_images=8),
+    _ModelSpec("z-ai/glm-4.6", "reasoning", 0.00000043, 0.00000174),
+    _ModelSpec("z-ai/glm-5", "reasoning", 0.0000006, 0.00000192),
+    _ModelSpec("moonshotai/kimi-k2.6", "reasoning", 0.00000073, 0.00000349, max_images=10),
+    _ModelSpec("moonshotai/kimi-k2-thinking", "reasoning", 0.0000006, 0.0000025),
+    _ModelSpec("perplexity/sonar-pro", "perplexity", 0.000003, 0.000015),
+    _ModelSpec("perplexity/sonar-reasoning-pro", "perplexity_reasoning", 0.000002, 0.000008),
+    _ModelSpec("perplexity/sonar-deep-research", "perplexity_reasoning", 0.000002, 0.000008),
+]
+# Slug -> spec lookup used by execute(), plus the option lists for the profile combos.
+_MODELS_BY_SLUG: dict[str, _ModelSpec] = {m.slug: m for m in MODELS}
+_REASONING_EFFORTS = ["off", "low", "medium", "high"]
+_SEARCH_CONTEXT_SIZES = ["low", "medium", "high"]
+
+
+def _reasoning_extra_inputs() -> list:
+    """Return the single advanced combo that selects the reasoning effort (default "off")."""
+    effort_input = IO.Combo.Input(
+        "reasoning_effort",
+        options=_REASONING_EFFORTS,
+        default="off",
+        tooltip="Reasoning effort. 'off' disables reasoning entirely.",
+        advanced=True,
+    )
+    return [effort_input]
+
+
+def _perplexity_extra_inputs() -> list:
+    """Return the single advanced combo that selects the web search context size (default "medium")."""
+    context_input = IO.Combo.Input(
+        "search_context_size",
+        options=_SEARCH_CONTEXT_SIZES,
+        default="medium",
+        tooltip="How much web search context to retrieve. Larger = more grounded but slower/pricier.",
+        advanced=True,
+    )
+    return [context_input]
+
+
+def _profile_inputs(profile: Profile) -> list:
+    """Extra widgets for a profile: search context first, then reasoning effort; ValueError if unknown."""
+    if profile not in ("standard", "reasoning", "frontier_reasoning", "perplexity", "perplexity_reasoning"):
+        raise ValueError(f"Unknown profile: {profile}")
+    extras: list = []
+    if profile.startswith("perplexity"):
+        extras += _perplexity_extra_inputs()
+    if profile.endswith("reasoning"):
+        extras += _reasoning_extra_inputs()
+    return extras
+
+
+def _media_inputs(spec: _ModelSpec) -> list:
+    """Return the autogrow "images" / "videos" groups the model accepts (none when its limit is 0)."""
+    groups: list = []
+    if spec.max_images > 0:
+        # Slots are named image_1 .. image_<max_images>.
+        image_slots = IO.Autogrow.TemplateNames(
+            IO.Image.Input("image"),
+            names=[f"image_{i}" for i in range(1, spec.max_images + 1)],
+            min=0,
+        )
+        image_tip = f"Optional reference image(s) — up to {spec.max_images}. Sent as URLs."
+        groups.append(
+            IO.Autogrow.Input("images", template=image_slots, tooltip=image_tip)
+        )
+    if spec.max_videos > 0:
+        # Slots are named video_1 .. video_<max_videos>.
+        video_slots = IO.Autogrow.TemplateNames(
+            IO.Video.Input("video"),
+            names=[f"video_{i}" for i in range(1, spec.max_videos + 1)],
+            min=0,
+        )
+        video_tip = f"Optional reference video(s) — up to {spec.max_videos}. Sent as URLs."
+        groups.append(
+            IO.Autogrow.Input("videos", template=video_slots, tooltip=video_tip)
+        )
+
+    return groups
+
+
+def _inputs_for_model(spec: _ModelSpec) -> list:
+    return [*_profile_inputs(spec.profile), *_media_inputs(spec)]  # profile widgets, then media groups
+
+
+def _build_model_options() -> list[IO.DynamicCombo.Option]:  # one option per MODELS entry, in table order
+    return [IO.DynamicCombo.Option(entry.slug, _inputs_for_model(entry)) for entry in MODELS]
+
+
+def _calculate_price(response: OpenRouterChatResponse) -> float | None:
+    """Return the cost reported in `response.usage` as a float, or None when it is absent."""
+    usage = response.usage
+    return float(usage.cost) if usage and usage.cost is not None else None
+
+
+def _price_badge_jsonata() -> str:
+    """Build the JSONata badge expression: a slug -> [prompt, completion] USD-per-1K-token lookup."""
+    per_1k = [(spec.slug, spec.price_in * 1000, spec.price_out * 1000) for spec in MODELS]
+    rates_block = ",\n".join(
+        f' "{slug}": [{prompt_rate:.8g}, {completion_rate:.8g}]' for slug, prompt_rate, completion_rate in per_1k
+    )
+    # Slugs missing from $rates fall through to the plain "Token-based" text badge.
+    return (
+        "(\n"
+        " $rates := {\n"
+        f"{rates_block}\n"
+        " };\n"
+        " $r := $lookup($rates, widgets.model);\n"
+        " $r ? {\n"
+        ' "type": "list_usd",\n'
+        ' "usd": $r,\n'
+        ' "format": { "approximate": true, "separator": "-", "suffix": " per 1K tokens" }\n'
+        ' } : {"type": "text", "text": "Token-based"}\n'
+        ")"
+    )
+
+
+async def _build_image_blocks(
+    cls: type[IO.ComfyNode], spec: _ModelSpec, images: list[Input.Image]
+) -> list[OpenRouterImageContent]:
+    """Upload the images with an image/png MIME type and wrap each returned URL in an image block."""
+    upload_options = {
+        "max_images": spec.max_images,
+        "total_pixels": 2048 * 2048,
+        "mime_type": "image/png",
+        "wait_label": "Uploading reference images",
+    }
+    uploaded_urls = await upload_images_to_comfyapi(cls, images, **upload_options)
+    return [OpenRouterImageContent(image_url=OpenRouterImageUrl(url=link)) for link in uploaded_urls]
+
+
+async def _build_video_blocks(cls: type[IO.ComfyNode], videos: list[Input.Video]) -> list[OpenRouterVideoContent]:
+    """Upload the videos one at a time (label shows i/N when several) and wrap each URL in a video block."""
+    count = len(videos)
+    video_blocks: list[OpenRouterVideoContent] = []
+    for position, clip in enumerate(videos, start=1):
+        base = "Uploading reference video"
+        progress = f"{base} ({position}/{count})" if count > 1 else base
+        link = await upload_video_to_comfyapi(cls, clip, wait_label=progress)
+        video_blocks.append(OpenRouterVideoContent(video_url=OpenRouterVideoUrl(url=link)))
+    return video_blocks
+
+
+def _user_message(prompt: str, media_blocks: list[OpenRouterContentBlock]) -> OpenRouterMessage:
+    """User message: the bare prompt string, or the media blocks followed by the prompt as a text block."""
+    content: str | list[OpenRouterContentBlock] = prompt
+    if media_blocks:
+        content = [*media_blocks, OpenRouterTextContent(text=prompt)]
+    return OpenRouterMessage(role="user", content=content)
+
+
+def _build_messages(
+    system_prompt: str, prompt: str, media_blocks: list[OpenRouterContentBlock]
+) -> list[OpenRouterMessage]:
+    """Conversation for the request: an optional system message, then the user message."""
+    user_turn = _user_message(prompt, media_blocks)
+    if not system_prompt:
+        return [user_turn]
+    return [OpenRouterMessage(role="system", content=system_prompt), user_turn]
+
+
+def _build_request(
+    slug: str,
+    system_prompt: str,
+    prompt: str,
+    media_blocks: list[OpenRouterContentBlock],
+    *,
+    seed: int,
+    reasoning_effort: str | None,
+    search_context_size: str | None,
+) -> OpenRouterChatRequest:
+    """Assemble the chat request; a non-positive seed, "off"/empty effort and empty search size become None."""
+    wants_reasoning = bool(reasoning_effort) and reasoning_effort != "off"
+    return OpenRouterChatRequest(
+        model=slug,
+        messages=_build_messages(system_prompt, prompt, media_blocks),
+        seed=seed if seed > 0 else None,
+        # exclude=True asks providers to reason internally but not return the trace
+        reasoning=OpenRouterReasoningConfig(effort=reasoning_effort, exclude=True) if wants_reasoning else None,
+        web_search_options=(
+            OpenRouterWebSearchOptions(search_context_size=search_context_size)
+            if search_context_size
+            else None
+        ),
+    )
+
+
+def _extract_text(response: OpenRouterChatResponse) -> str:
+    """Return the first choice's text ("" when empty); ValueError on API error, no choice/message, or refusal."""
+    if error := response.error:
+        code = "unknown" if error.code is None else error.code
+        raise ValueError(f"OpenRouter error ({code}): {error.message or 'no message'}")
+    if not response.choices:
+        raise ValueError("Empty response from OpenRouter (no choices).")
+    if not (reply := response.choices[0].message):
+        raise ValueError("Empty response from OpenRouter (no message).")
+    if reply.refusal:
+        raise ValueError(f"Model refused to respond: {reply.refusal}")
+    return reply.content or ""
+
+
+class OpenRouterLLMNode(IO.ComfyNode):
+    """Text-generation API node that sends one chat completion request through the OpenRouter proxy."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the prompt, per-model widget groups, seed, system prompt and the price badge."""
+        return IO.Schema(
+            node_id="OpenRouterLLMNode",
+            display_name="OpenRouter LLM",
+            category="text/partner/OpenRouter",
+            essentials_category="Text Generation",
+            description=(
+                "Generate text responses through OpenRouter. Routes to a curated set of popular "
+                "models from Anthropic, OpenAI, Google, xAI, DeepSeek, Qwen, Mistral, Z.AI (GLM), "
+                "Moonshot (Kimi), and Perplexity Sonar."
+            ),
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text input to the model.",
+                ),
+                IO.DynamicCombo.Input(
+                    "model",
+                    options=_build_model_options(),
+                    tooltip="The OpenRouter model used to generate the response.",
+                ),
+                IO.Int.Input(
+                    "seed",
+                    default=0,
+                    min=0,
+                    max=2147483647,
+                    control_after_generate=True,
+                    tooltip="Seed for sampling. Set to 0 to omit. Most models treat this as a hint only.",
+                ),
+                IO.String.Input(
+                    "system_prompt",
+                    multiline=True,
+                    default="",
+                    optional=True,
+                    advanced=True,
+                    tooltip="Foundational instructions that dictate the model's behavior.",
+                ),
+            ],
+            outputs=[IO.String.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(depends_on=IO.PriceBadgeDepends(widgets=["model"]), expr=_price_badge_jsonata()),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        model: dict,
+        seed: int,
+        system_prompt: str = "",
+    ) -> IO.NodeOutput:
+        """Validate the inputs, upload any reference media, call OpenRouter and return the reply text.
+
+        `model` is the DynamicCombo payload: the slug under "model" plus that model's optional
+        "reasoning_effort", "search_context_size", "images" and "videos" widgets.
+        """
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        slug: str = model["model"]
+        spec = _MODELS_BY_SLUG.get(slug)
+        if spec is None:
+            raise ValueError(f"Unknown OpenRouter model: {slug}")
+
+        reasoning_effort: str | None = model.get("reasoning_effort")
+        search_context_size: str | None = model.get("search_context_size")
+
+        image_tensors: list[Input.Image] = [t for t in (model.get("images") or {}).values() if t is not None]
+        if image_tensors and sum(get_number_of_images(t) for t in image_tensors) > spec.max_images:
+            raise ValueError(f"Up to {spec.max_images} images are supported for {slug}.")
+        video_inputs: list[Input.Video] = [v for v in (model.get("videos") or {}).values() if v is not None]
+        if video_inputs and len(video_inputs) > spec.max_videos:
+            raise ValueError(f"Up to {spec.max_videos} videos are supported for {slug}.")
+
+        media_blocks: list[OpenRouterContentBlock] = []
+        if image_tensors:
+            media_blocks.extend(await _build_image_blocks(cls, spec, image_tensors))
+        if video_inputs:
+            media_blocks.extend(await _build_video_blocks(cls, video_inputs))
+
+        request = _build_request(
+            slug,
+            system_prompt,
+            prompt,
+            media_blocks,
+            seed=seed,
+            reasoning_effort=reasoning_effort,
+            search_context_size=search_context_size,
+        )
+
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path=OPENROUTER_CHAT_ENDPOINT, method="POST"),
+            response_model=OpenRouterChatResponse,
+            data=request,
+            price_extractor=_calculate_price,
+        )
+        return IO.NodeOutput(_extract_text(response))
+
+
+class OpenRouterExtension(ComfyExtension):  # extension exposing the node defined in this module
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:  # only node: OpenRouterLLMNode
+        return [OpenRouterLLMNode]
+
+
+async def comfy_entrypoint() -> OpenRouterExtension:  # NOTE(review): presumably the loader's hook; confirm
+    return OpenRouterExtension()
diff --git a/comfy_api_nodes/nodes_pixverse.py b/comfy_api_nodes/nodes_pixverse.py
index e17a24ae7..3861cfedd 100644
--- a/comfy_api_nodes/nodes_pixverse.py
+++ b/comfy_api_nodes/nodes_pixverse.py
@@ -53,7 +53,7 @@ class PixverseTemplateNode(IO.ComfyNode):
return IO.Schema(
node_id="PixverseTemplateNode",
display_name="PixVerse Template",
- category="api node/video/PixVerse",
+ category="video/partner/PixVerse",
inputs=[
IO.Combo.Input("template", options=list(pixverse_templates.keys())),
],
@@ -74,7 +74,7 @@ class PixverseTextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="PixverseTextToVideoNode",
display_name="PixVerse Text to Video",
- category="api node/video/PixVerse",
+ category="video/partner/PixVerse",
description="Generates videos based on prompt and output_size.",
inputs=[
IO.String.Input(
@@ -192,7 +192,7 @@ class PixverseImageToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="PixverseImageToVideoNode",
display_name="PixVerse Image to Video",
- category="api node/video/PixVerse",
+ category="video/partner/PixVerse",
description="Generates videos based on prompt and output_size.",
inputs=[
IO.Image.Input("image"),
@@ -310,7 +310,7 @@ class PixverseTransitionVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="PixverseTransitionVideoNode",
display_name="PixVerse Transition Video",
- category="api node/video/PixVerse",
+ category="video/partner/PixVerse",
description="Generates videos based on prompt and output_size.",
inputs=[
IO.Image.Input("first_frame"),
diff --git a/comfy_api_nodes/nodes_quiver.py b/comfy_api_nodes/nodes_quiver.py
index 61533263f..ad045a7ef 100644
--- a/comfy_api_nodes/nodes_quiver.py
+++ b/comfy_api_nodes/nodes_quiver.py
@@ -17,6 +17,44 @@ from comfy_api_nodes.util import (
)
from comfy_extras.nodes_images import SVG
+_ARROW_MODELS = ["arrow-1.1", "arrow-1.1-max", "arrow-preview"]
+
+
+def _arrow_sampling_inputs():
+    """Return the temperature / top_p / presence_penalty sliders offered for every Arrow model."""
+    # All three controls are advanced float widgets rendered as sliders.
+    temperature_input = IO.Float.Input(
+        "temperature",
+        default=1.0,
+        min=0.0,
+        max=2.0,
+        step=0.1,
+        display_mode=IO.NumberDisplay.slider,
+        tooltip="Randomness control. Higher values increase randomness.",
+        advanced=True,
+    )
+    top_p_input = IO.Float.Input(
+        "top_p",
+        default=1.0,
+        min=0.05,
+        max=1.0,
+        step=0.05,
+        display_mode=IO.NumberDisplay.slider,
+        tooltip="Nucleus sampling parameter.",
+        advanced=True,
+    )
+    presence_penalty_input = IO.Float.Input(
+        "presence_penalty",
+        default=0.0,
+        min=-2.0,
+        max=2.0,
+        step=0.1,
+        display_mode=IO.NumberDisplay.slider,
+        tooltip="Token presence penalty.",
+        advanced=True,
+    )
+    return [temperature_input, top_p_input, presence_penalty_input]
+
class QuiverTextToSVGNode(IO.ComfyNode):
@classmethod
@@ -24,7 +62,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
return IO.Schema(
node_id="QuiverTextToSVGNode",
display_name="Quiver Text to SVG",
- category="api node/image/Quiver",
+ category="image/partner/Quiver",
description="Generate an SVG from a text prompt using Quiver AI.",
inputs=[
IO.String.Input(
@@ -39,6 +77,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
default="",
tooltip="Additional style or formatting guidance.",
optional=True,
+ advanced=True,
),
IO.Autogrow.Input(
"reference_images",
@@ -53,43 +92,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
),
IO.DynamicCombo.Input(
"model",
- options=[
- IO.DynamicCombo.Option(
- "arrow-preview",
- [
- IO.Float.Input(
- "temperature",
- default=1.0,
- min=0.0,
- max=2.0,
- step=0.1,
- display_mode=IO.NumberDisplay.slider,
- tooltip="Randomness control. Higher values increase randomness.",
- advanced=True,
- ),
- IO.Float.Input(
- "top_p",
- default=1.0,
- min=0.05,
- max=1.0,
- step=0.05,
- display_mode=IO.NumberDisplay.slider,
- tooltip="Nucleus sampling parameter.",
- advanced=True,
- ),
- IO.Float.Input(
- "presence_penalty",
- default=0.0,
- min=-2.0,
- max=2.0,
- step=0.1,
- display_mode=IO.NumberDisplay.slider,
- tooltip="Token presence penalty.",
- advanced=True,
- ),
- ],
- ),
- ],
+ options=[IO.DynamicCombo.Option(m, _arrow_sampling_inputs()) for m in _ARROW_MODELS],
tooltip="Model to use for SVG generation.",
),
IO.Int.Input(
@@ -112,7 +115,16 @@ class QuiverTextToSVGNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.429}""",
+ depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+ expr="""
+ (
+ $contains(widgets.model, "max")
+ ? {"type":"usd","usd":0.3575}
+ : $contains(widgets.model, "preview")
+ ? {"type":"usd","usd":0.429}
+ : {"type":"usd","usd":0.286}
+ )
+ """,
),
)
@@ -131,7 +143,7 @@ class QuiverTextToSVGNode(IO.ComfyNode):
if reference_images:
references = []
for key in reference_images:
- url = await upload_image_to_comfyapi(cls, reference_images[key])
+ url = await upload_image_to_comfyapi(cls, reference_images[key], mime_type="image/png")
references.append(QuiverImageObject(url=url))
if len(references) > 4:
raise ValueError("Maximum 4 reference images are allowed.")
@@ -165,7 +177,7 @@ class QuiverImageToSVGNode(IO.ComfyNode):
return IO.Schema(
node_id="QuiverImageToSVGNode",
display_name="Quiver Image to SVG",
- category="api node/image/Quiver",
+ category="image/partner/Quiver",
description="Vectorize a raster image into SVG using Quiver AI.",
inputs=[
IO.Image.Input(
@@ -176,12 +188,13 @@ class QuiverImageToSVGNode(IO.ComfyNode):
"auto_crop",
default=False,
tooltip="Automatically crop to the dominant subject.",
+ advanced=True,
),
IO.DynamicCombo.Input(
"model",
options=[
IO.DynamicCombo.Option(
- "arrow-preview",
+ m,
[
IO.Int.Input(
"target_size",
@@ -189,39 +202,12 @@ class QuiverImageToSVGNode(IO.ComfyNode):
min=128,
max=4096,
tooltip="Square resize target in pixels.",
- ),
- IO.Float.Input(
- "temperature",
- default=1.0,
- min=0.0,
- max=2.0,
- step=0.1,
- display_mode=IO.NumberDisplay.slider,
- tooltip="Randomness control. Higher values increase randomness.",
- advanced=True,
- ),
- IO.Float.Input(
- "top_p",
- default=1.0,
- min=0.05,
- max=1.0,
- step=0.05,
- display_mode=IO.NumberDisplay.slider,
- tooltip="Nucleus sampling parameter.",
- advanced=True,
- ),
- IO.Float.Input(
- "presence_penalty",
- default=0.0,
- min=-2.0,
- max=2.0,
- step=0.1,
- display_mode=IO.NumberDisplay.slider,
- tooltip="Token presence penalty.",
advanced=True,
),
+ *_arrow_sampling_inputs(),
],
- ),
+ )
+ for m in _ARROW_MODELS
],
tooltip="Model to use for SVG vectorization.",
),
@@ -245,7 +231,16 @@ class QuiverImageToSVGNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.429}""",
+ depends_on=IO.PriceBadgeDepends(widgets=["model"]),
+ expr="""
+ (
+ $contains(widgets.model, "max")
+ ? {"type":"usd","usd":0.3575}
+ : $contains(widgets.model, "preview")
+ ? {"type":"usd","usd":0.429}
+ : {"type":"usd","usd":0.286}
+ )
+ """,
),
)
@@ -257,7 +252,7 @@ class QuiverImageToSVGNode(IO.ComfyNode):
model: dict,
seed: int,
) -> IO.NodeOutput:
- image_url = await upload_image_to_comfyapi(cls, image)
+ image_url = await upload_image_to_comfyapi(cls, image, mime_type="image/png")
response = await sync_op(
cls,
diff --git a/comfy_api_nodes/nodes_recraft.py b/comfy_api_nodes/nodes_recraft.py
index c60cfbc4a..07387821d 100644
--- a/comfy_api_nodes/nodes_recraft.py
+++ b/comfy_api_nodes/nodes_recraft.py
@@ -178,7 +178,7 @@ class RecraftColorRGBNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftColorRGB",
display_name="Recraft Color RGB",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Create Recraft Color by choosing specific RGB values.",
inputs=[
IO.Int.Input("r", default=0, min=0, max=255, tooltip="Red value of color."),
@@ -204,7 +204,7 @@ class RecraftControlsNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftControls",
display_name="Recraft Controls",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Create Recraft Controls for customizing Recraft generation.",
inputs=[
IO.Custom(RecraftIO.COLOR).Input("colors", optional=True),
@@ -228,7 +228,7 @@ class RecraftStyleV3RealisticImageNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftStyleV3RealisticImage",
display_name="Recraft Style - Realistic Image",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Select realistic_image style and optional substyle.",
inputs=[
IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)),
@@ -253,7 +253,7 @@ class RecraftStyleV3DigitalIllustrationNode(RecraftStyleV3RealisticImageNode):
return IO.Schema(
node_id="RecraftStyleV3DigitalIllustration",
display_name="Recraft Style - Digital Illustration",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Select realistic_image style and optional substyle.",
inputs=[
IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)),
@@ -272,7 +272,7 @@ class RecraftStyleV3VectorIllustrationNode(RecraftStyleV3RealisticImageNode):
return IO.Schema(
node_id="RecraftStyleV3VectorIllustrationNode",
display_name="Recraft Style - Realistic Image",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Select realistic_image style and optional substyle.",
inputs=[
IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE)),
@@ -291,7 +291,7 @@ class RecraftStyleV3LogoRasterNode(RecraftStyleV3RealisticImageNode):
return IO.Schema(
node_id="RecraftStyleV3LogoRaster",
display_name="Recraft Style - Logo Raster",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Select realistic_image style and optional substyle.",
inputs=[
IO.Combo.Input("substyle", options=get_v3_substyles(cls.RECRAFT_STYLE, include_none=False)),
@@ -308,7 +308,7 @@ class RecraftStyleInfiniteStyleLibrary(IO.ComfyNode):
return IO.Schema(
node_id="RecraftStyleV3InfiniteStyleLibrary",
display_name="Recraft Style - Infinite Style Library",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Choose style based on preexisting UUID from Recraft's Infinite Style Library.",
inputs=[
IO.String.Input("style_id", default="", tooltip="UUID of style from Infinite Style Library."),
@@ -331,7 +331,7 @@ class RecraftCreateStyleNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftCreateStyleNode",
display_name="Recraft Create Style",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Create a custom style from reference images. "
"Upload 1-5 images to use as style references. "
"Total size of all images is limited to 5 MB.",
@@ -400,7 +400,7 @@ class RecraftTextToImageNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftTextToImageNode",
display_name="Recraft Text to Image",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Generates images synchronously based on prompt and resolution.",
inputs=[
IO.String.Input("prompt", multiline=True, default="", tooltip="Prompt for the image generation."),
@@ -512,7 +512,7 @@ class RecraftImageToImageNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftImageToImageNode",
display_name="Recraft Image to Image",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Modify image based on prompt and strength.",
inputs=[
IO.Image.Input("image"),
@@ -630,7 +630,7 @@ class RecraftImageInpaintingNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftImageInpaintingNode",
display_name="Recraft Image Inpainting",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Modify image based on prompt and mask.",
inputs=[
IO.Image.Input("image"),
@@ -732,7 +732,7 @@ class RecraftTextToVectorNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftTextToVectorNode",
display_name="Recraft Text to Vector",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Generates SVG synchronously based on prompt and resolution.",
inputs=[
IO.String.Input("prompt", default="", tooltip="Prompt for the image generation.", multiline=True),
@@ -832,7 +832,7 @@ class RecraftVectorizeImageNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftVectorizeImageNode",
display_name="Recraft Vectorize Image",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
essentials_category="Image Tools",
description="Generates SVG synchronously from an input image.",
inputs=[
@@ -876,7 +876,7 @@ class RecraftReplaceBackgroundNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftReplaceBackgroundNode",
display_name="Recraft Replace Background",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Replace background on image, based on provided prompt.",
inputs=[
IO.Image.Input("image"),
@@ -963,7 +963,7 @@ class RecraftRemoveBackgroundNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftRemoveBackgroundNode",
display_name="Recraft Remove Background",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
essentials_category="Image Tools",
description="Remove background from image, and return processed image and mask.",
inputs=[
@@ -1012,7 +1012,7 @@ class RecraftCrispUpscaleNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftCrispUpscaleNode",
display_name="Recraft Crisp Upscale Image",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Upscale image synchronously.\n"
"Enhances a given raster image using ‘crisp upscale’ tool, "
"increasing image resolution, making the image sharper and cleaner.",
@@ -1058,7 +1058,7 @@ class RecraftCreativeUpscaleNode(RecraftCrispUpscaleNode):
return IO.Schema(
node_id="RecraftCreativeUpscaleNode",
display_name="Recraft Creative Upscale Image",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Upscale image synchronously.\n"
"Enhances a given raster image using ‘creative upscale’ tool, "
"boosting resolution with a focus on refining small details and faces.",
@@ -1086,7 +1086,7 @@ class RecraftV4TextToImageNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftV4TextToImageNode",
display_name="Recraft V4 Text to Image",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Generates images using Recraft V4 or V4 Pro models.",
inputs=[
IO.String.Input(
@@ -1210,7 +1210,7 @@ class RecraftV4TextToVectorNode(IO.ComfyNode):
return IO.Schema(
node_id="RecraftV4TextToVectorNode",
display_name="Recraft V4 Text to Vector",
- category="api node/image/Recraft",
+ category="image/partner/Recraft",
description="Generates SVG using Recraft V4 or V4 Pro models.",
inputs=[
IO.String.Input(
diff --git a/comfy_api_nodes/nodes_reve.py b/comfy_api_nodes/nodes_reve.py
index a87395394..2b15eadd7 100644
--- a/comfy_api_nodes/nodes_reve.py
+++ b/comfy_api_nodes/nodes_reve.py
@@ -109,7 +109,7 @@ class ReveImageCreateNode(IO.ComfyNode):
return IO.Schema(
node_id="ReveImageCreateNode",
display_name="Reve Image Create",
- category="api node/image/Reve",
+ category="image/partner/Reve",
description="Generate images from text descriptions using Reve.",
inputs=[
IO.String.Input(
@@ -200,7 +200,7 @@ class ReveImageEditNode(IO.ComfyNode):
return IO.Schema(
node_id="ReveImageEditNode",
display_name="Reve Image Edit",
- category="api node/image/Reve",
+ category="image/partner/Reve",
description="Edit images using natural language instructions with Reve.",
inputs=[
IO.Image.Input("image", tooltip="The image to edit."),
@@ -300,7 +300,7 @@ class ReveImageRemixNode(IO.ComfyNode):
return IO.Schema(
node_id="ReveImageRemixNode",
display_name="Reve Image Remix",
- category="api node/image/Reve",
+ category="image/partner/Reve",
description="Combine reference images with text prompts to create new images using Reve.",
inputs=[
IO.Autogrow.Input(
diff --git a/comfy_api_nodes/nodes_rodin.py b/comfy_api_nodes/nodes_rodin.py
index 2b829b8db..e14955661 100644
--- a/comfy_api_nodes/nodes_rodin.py
+++ b/comfy_api_nodes/nodes_rodin.py
@@ -5,32 +5,37 @@ Rodin API docs: https://developer.hyper3d.ai/
"""
-from inspect import cleandoc
-import folder_paths as comfy_paths
-import os
import logging
import math
+import os
+from inspect import cleandoc
from io import BytesIO
-from typing_extensions import override
+from typing import Any
+
+import aiohttp
from PIL import Image
+from typing_extensions import override
+
+import folder_paths as comfy_paths
+from comfy_api.latest import IO, ComfyExtension, Types
from comfy_api_nodes.apis.rodin import (
- Rodin3DGenerateRequest,
- Rodin3DGenerateResponse,
+ JobStatus,
Rodin3DCheckStatusRequest,
Rodin3DCheckStatusResponse,
Rodin3DDownloadRequest,
Rodin3DDownloadResponse,
- JobStatus,
+ Rodin3DGen25Request,
+ Rodin3DGenerateRequest,
+ Rodin3DGenerateResponse,
)
from comfy_api_nodes.util import (
- sync_op,
- poll_op,
ApiEndpoint,
download_url_to_bytesio,
download_url_to_file_3d,
+ poll_op,
+ sync_op,
+ validate_string,
)
-from comfy_api.latest import ComfyExtension, IO, Types
-
COMMON_PARAMETERS = [
IO.Int.Input(
@@ -51,40 +56,30 @@ COMMON_PARAMETERS = [
]
-def get_quality_mode(poly_count):
- polycount = poly_count.split("-")
- poly = polycount[1]
- count = polycount[0]
- if poly == "Triangle":
- mesh_mode = "Raw"
- elif poly == "Quad":
- mesh_mode = "Quad"
- else:
- mesh_mode = "Quad"
-
- if count == "4K":
- quality_override = 4000
- elif count == "8K":
- quality_override = 8000
- elif count == "18K":
- quality_override = 18000
- elif count == "50K":
- quality_override = 50000
- elif count == "2K":
- quality_override = 2000
- elif count == "20K":
- quality_override = 20000
- elif count == "150K":
- quality_override = 150000
- elif count == "500K":
- quality_override = 500000
- else:
- quality_override = 18000
-
- return mesh_mode, quality_override
+# Polygon-count preset label -> (mesh_mode, quality_override face count).
+# "-Quad" labels map to mesh mode "Quad"; "-Triangle" labels map to mesh mode "Raw".
+_QUALITY_MESH_OPTIONS: dict[str, tuple[str, int]] = {
+    "4K-Quad": ("Quad", 4000),
+    "8K-Quad": ("Quad", 8000),
+    "18K-Quad": ("Quad", 18000),
+    "50K-Quad": ("Quad", 50000),
+    "200K-Quad": ("Quad", 200000),
+    "2K-Triangle": ("Raw", 2000),
+    "20K-Triangle": ("Raw", 20000),
+    "150K-Triangle": ("Raw", 150000),
+    "200K-Triangle": ("Raw", 200000),
+    "500K-Triangle": ("Raw", 500000),
+    "1M-Triangle": ("Raw", 1000000),
+}
-def tensor_to_filelike(tensor, max_pixels: int = 2048*2048):
+def get_quality_mode(poly_count: str) -> tuple[str, int]:
+    """Resolve a polygon-count preset label to its (mesh_mode, quality_override) pair.
+
+    Labels that are not present in the preset table resolve to ('Quad', 18000).
+    """
+    if poly_count in _QUALITY_MESH_OPTIONS:
+        return _QUALITY_MESH_OPTIONS[poly_count]
+    return "Quad", 18000
+
+
+def tensor_to_filelike(tensor, max_pixels: int = 2048 * 2048):
"""
Converts a PyTorch tensor to a file-like object.
@@ -96,8 +91,8 @@ def tensor_to_filelike(tensor, max_pixels: int = 2048*2048):
- io.BytesIO: A file-like object containing the image data.
"""
array = tensor.cpu().numpy()
- array = (array * 255).astype('uint8')
- image = Image.fromarray(array, 'RGB')
+ array = (array * 255).astype("uint8")
+ image = Image.fromarray(array, "RGB")
original_width, original_height = image.size
original_pixels = original_width * original_height
@@ -112,7 +107,7 @@ def tensor_to_filelike(tensor, max_pixels: int = 2048*2048):
image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
img_byte_arr = BytesIO()
- image.save(img_byte_arr, format='PNG') # PNG is used for lossless compression
+ image.save(img_byte_arr, format="PNG") # PNG is used for lossless compression
img_byte_arr.seek(0)
return img_byte_arr
@@ -145,11 +140,9 @@ async def create_generate_task(
TAPose=ta_pose,
),
files=[
- (
- "images",
- open(image, "rb") if isinstance(image, str) else tensor_to_filelike(image)
- )
- for image in images if image is not None
+ ("images", open(image, "rb") if isinstance(image, str) else tensor_to_filelike(image))
+ for image in images
+ if image is not None
],
content_type="multipart/form-data",
)
@@ -177,6 +170,7 @@ def check_rodin_status(response: Rodin3DCheckStatusResponse) -> str:
return "DONE"
return "Generating"
+
def extract_progress(response: Rodin3DCheckStatusResponse) -> int | None:
if not response.jobs:
return None
@@ -214,7 +208,7 @@ async def download_files(url_list, task_uuid: str) -> tuple[str | None, Types.Fi
model_file_path = None
file_3d = None
- for i in url_list.list:
+ for i in url_list.items:
file_path = os.path.join(save_path, i.name)
if i.name.lower().endswith(".glb"):
model_file_path = os.path.join(result_folder_name, i.name)
@@ -236,7 +230,7 @@ class Rodin3D_Regular(IO.ComfyNode):
return IO.Schema(
node_id="Rodin3D_Regular",
display_name="Rodin 3D Generate - Regular Generate",
- category="api node/3d/Rodin",
+ category="3d/partner/Rodin",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Image.Input("Images"),
@@ -295,7 +289,7 @@ class Rodin3D_Detail(IO.ComfyNode):
return IO.Schema(
node_id="Rodin3D_Detail",
display_name="Rodin 3D Generate - Detail Generate",
- category="api node/3d/Rodin",
+ category="3d/partner/Rodin",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Image.Input("Images"),
@@ -354,7 +348,7 @@ class Rodin3D_Smooth(IO.ComfyNode):
return IO.Schema(
node_id="Rodin3D_Smooth",
display_name="Rodin 3D Generate - Smooth Generate",
- category="api node/3d/Rodin",
+ category="3d/partner/Rodin",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Image.Input("Images"),
@@ -412,7 +406,7 @@ class Rodin3D_Sketch(IO.ComfyNode):
return IO.Schema(
node_id="Rodin3D_Sketch",
display_name="Rodin 3D Generate - Sketch Generate",
- category="api node/3d/Rodin",
+ category="3d/partner/Rodin",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Image.Input("Images"),
@@ -474,7 +468,7 @@ class Rodin3D_Gen2(IO.ComfyNode):
return IO.Schema(
node_id="Rodin3D_Gen2",
display_name="Rodin 3D Generate - Gen-2 Generate",
- category="api node/3d/Rodin",
+ category="3d/partner/Rodin",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Image.Input("Images"),
@@ -489,7 +483,16 @@ class Rodin3D_Gen2(IO.ComfyNode):
IO.Combo.Input("Material_Type", options=["PBR", "Shaded"], default="PBR", optional=True),
IO.Combo.Input(
"Polygon_count",
- options=["4K-Quad", "8K-Quad", "18K-Quad", "50K-Quad", "2K-Triangle", "20K-Triangle", "150K-Triangle", "500K-Triangle"],
+ options=[
+ "4K-Quad",
+ "8K-Quad",
+ "18K-Quad",
+ "50K-Quad",
+ "2K-Triangle",
+ "20K-Triangle",
+ "150K-Triangle",
+ "500K-Triangle",
+ ],
default="500K-Triangle",
optional=True,
),
@@ -542,6 +545,566 @@ class Rodin3D_Gen2(IO.ComfyNode):
return IO.NodeOutput(model_path, file_3d)
+def _rodin_multipart_parser(data: dict[str, Any]) -> aiohttp.FormData:
+    """Build the multipart form for a Rodin request from its field dict.
+
+    Serialization rules:
+      * ``None`` values are omitted.
+      * Booleans are sent as the lowercase strings "true"/"false".
+      * Lists are expanded into one field per element (each element passed through ``str``).
+      * ``bytes``/``bytearray`` values are added unchanged.
+      * Anything else is passed through ``str``.
+    """
+    form = aiohttp.FormData(default_to_multipart=True)
+    for name, raw in data.items():
+        if raw is None:
+            continue
+        # bool is tested before the generic branch because str(True) would yield "True".
+        if isinstance(raw, bool):
+            parts = ["true" if raw else "false"]
+        elif isinstance(raw, list):
+            parts = [str(element) for element in raw]
+        elif isinstance(raw, (bytes, bytearray)):
+            parts = [raw]
+        else:
+            parts = [str(raw)]
+        for part in parts:
+            form.add_field(name, part)
+    return form
+
+
+async def _create_gen25_task(
+    cls: type[IO.ComfyNode],
+    request: Rodin3DGen25Request,
+    images: list | None,
+) -> tuple[str, str]:
+    """Submit a Gen-2.5 generate job; returns (task_uuid, subscription_key).
+
+    Args:
+        cls: Node class forwarded to ``sync_op``.
+        request: Request model sent as the multipart form body.
+        images: Optional inputs. ``str`` entries are opened as file paths, any other entry is
+            converted with ``tensor_to_filelike``; ``None`` entries are skipped.
+
+    Raises:
+        ValueError: If more than 5 images are supplied.
+        RuntimeError: If the response lacks a uuid or a subscription key.
+    """
+    from contextlib import ExitStack
+
+    if images is not None and len(images) > 5:
+        raise ValueError("Rodin Gen-2.5 supports at most 5 input images.")
+
+    # Files opened from paths are registered on the stack so they are closed as soon as the
+    # request has finished (or failed) instead of being left open until garbage collection.
+    with ExitStack() as opened_files:
+        files = None
+        if images:
+            files = [
+                (
+                    "images",
+                    (
+                        opened_files.enter_context(open(image, "rb"))
+                        if isinstance(image, str)
+                        else tensor_to_filelike(image)
+                    ),
+                )
+                for image in images
+                if image is not None
+            ]
+
+        response = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/rodin/api/v2/rodin", method="POST"),
+            response_model=Rodin3DGenerateResponse,
+            data=request,
+            files=files,
+            content_type="multipart/form-data",
+            multipart_parser=_rodin_multipart_parser,
+        )
+
+    if not response.uuid or not response.jobs or not response.jobs.subscription_key:
+        raise RuntimeError(f"Rodin Gen-2.5 submit failed: message={response.message!r}")
+    return response.uuid, response.jobs.subscription_key
+
+
+# File extensions that may be returned to the graph as a File3D.
+_PREVIEWABLE_3D_EXTS = {".glb", ".obj", ".fbx", ".stl", ".gltf"}
+
+
+async def _download_gen25_files(
+    download_list: Rodin3DDownloadResponse,
+    task_uuid: str,
+    geometry_file_format: str,
+) -> Types.File3D | None:
+    """Download every file in the list and return one of them as a File3D.
+
+    All files are saved under ``<output dir>/Rodin3D_Gen25_<task_uuid>/``. The returned File3D
+    is the first item whose extension matches ``geometry_file_format``; when no item matches,
+    the first item with any extension in ``_PREVIEWABLE_3D_EXTS`` is used instead.
+
+    Returns:
+        The selected File3D, or ``None`` when the list contains no previewable model file.
+    """
+
+    folder_name = f"Rodin3D_Gen25_{task_uuid}"
+    save_dir = os.path.join(comfy_paths.get_output_directory(), folder_name)
+    os.makedirs(save_dir, exist_ok=True)
+
+    target_ext = f".{geometry_file_format.lower().lstrip('.')}"
+
+    def _ext(item) -> str:
+        # Lower-cased extension of the item's name, including the leading dot.
+        return os.path.splitext(item.name.lower())[1]
+
+    # Pick the File3D item up front (chosen format first, any model file otherwise) so that
+    # it is fetched exactly once instead of being downloaded again in a fallback pass.
+    previewable = [item for item in download_list.items if _ext(item) in _PREVIEWABLE_3D_EXTS]
+    chosen = next((item for item in previewable if _ext(item) == target_ext), None)
+    if chosen is None and previewable:
+        chosen = previewable[0]
+
+    file_3d: Types.File3D | None = None
+    for item in download_list.items:
+        # The name comes from the remote response: keep only its final path component so a
+        # name containing separators or ".." cannot make us write outside save_dir.
+        file_path = os.path.join(save_dir, os.path.basename(item.name))
+        if item is chosen:
+            file_3d = await download_url_to_file_3d(item.url, _ext(item).lstrip("."))
+            with open(file_path, "wb") as f:
+                f.write(file_3d.get_bytes())
+            continue
+        await download_url_to_bytesio(item.url, file_path)
+    return file_3d
+
+
+# Keys of the options offered by the `mode` DynamicCombo built in _build_mode_input;
+# _resolve_mode_params branches on the same values.
+_MODE_REGULAR = "Regular"
+_MODE_FAST = "Fast"
+_MODE_EXTREME_HIGH = "Extreme-High"
+
+# Polygon presets selectable in Regular mode. "Default" is a sentinel: _resolve_mode_params
+# sends neither mesh_mode nor quality_override when it is selected.
+_REGULAR_POLY_OPTIONS = [
+    "Default",
+    "4K-Quad",
+    "8K-Quad",
+    "18K-Quad",
+    "50K-Quad",
+    "2K-Triangle",
+    "20K-Triangle",
+    "150K-Triangle",
+    "500K-Triangle",
+    "1M-Triangle",
+]
+
+# Combo choices used by _build_common_inputs. For texture_mode, "Default" is translated to
+# None by _build_request.
+_TEXTURE_MODE_OPTIONS = ["Default", "legacy", "extreme-low", "low", "medium", "high"]
+_GEOMETRY_FORMAT_OPTIONS = ["glb", "fbx", "obj", "stl"]
+_MATERIAL_OPTIONS = ["PBR", "Shaded", "All", "None"]
+
+
+def _build_mode_input(name: str = "mode") -> IO.DynamicCombo.Input:
+    """Build the generation-mode DynamicCombo input used by the Gen-2.5 nodes.
+
+    Each option carries its own sub-inputs:
+      * Regular: ``tier``, ``polygon_count`` preset, ``creative``.
+      * Fast: ``tier``, ``mesh_faces`` (1000-20000).
+      * Extreme-High: ``mesh_mode``, ``mesh_faces`` (20000-2000000), ``is_micro``, ``creative``.
+
+    Args:
+        name: Input name given to the DynamicCombo.
+    """
+    return IO.DynamicCombo.Input(
+        name,
+        options=[
+            IO.DynamicCombo.Option(
+                _MODE_REGULAR,
+                [
+                    IO.Combo.Input(
+                        "tier",
+                        options=["Gen-2.5-Low", "Gen-2.5-Medium", "Gen-2.5-High"],
+                        default="Gen-2.5-High",
+                        tooltip="Quality tier. Higher tiers produce higher-fidelity geometry.",
+                    ),
+                    IO.Combo.Input(
+                        "polygon_count",
+                        options=_REGULAR_POLY_OPTIONS,
+                        default="Default",
+                        tooltip="Preset face count. 'Default' uses the server's default for the selected tier.",
+                    ),
+                    IO.Boolean.Input(
+                        "creative",
+                        default=False,
+                        tooltip="Creative mode (Medium/High only). Enhances generative robustness.",
+                    ),
+                ],
+            ),
+            IO.DynamicCombo.Option(
+                _MODE_FAST,
+                [
+                    IO.Combo.Input(
+                        "tier",
+                        options=[
+                            "Gen-2.5-Extreme-Low",
+                            "Gen-2.5-Low",
+                            "Gen-2.5-Medium",
+                            "Gen-2.5-High",
+                        ],
+                        default="Gen-2.5-Low",
+                    ),
+                    IO.Int.Input(
+                        "mesh_faces",
+                        default=20000,
+                        min=1000,
+                        max=20000,
+                        display_mode=IO.NumberDisplay.number,
+                        tooltip="Mesh face count (1K-20K in Fast mode).",
+                    ),
+                ],
+            ),
+            IO.DynamicCombo.Option(
+                _MODE_EXTREME_HIGH,
+                [
+                    IO.Combo.Input("mesh_mode", options=["Raw", "Quad"], default="Raw"),
+                    IO.Int.Input(
+                        "mesh_faces",
+                        default=1000000,
+                        min=20000,
+                        max=2000000,
+                        display_mode=IO.NumberDisplay.number,
+                        tooltip=(
+                            "Mesh face count. Raw mode: 20K-2M. "
+                            "Quad mode: keep under 200K (upstream may reject higher values)."
+                        ),
+                    ),
+                    IO.Boolean.Input(
+                        "is_micro",
+                        default=False,
+                        tooltip="Enable micro detail (Extreme-High only).",
+                    ),
+                    IO.Boolean.Input(
+                        "creative",
+                        default=False,
+                        tooltip="Creative mode. Enhances generative robustness.",
+                    ),
+                ],
+            ),
+        ],
+        tooltip=(
+            "Generation mode. Regular = balanced. Fast = 1K-20K faces for rapid prototyping. "
+            "Extreme-High = 20K-2M faces with optional micro details."
+        ),
+    )
+
+
+def _build_common_inputs(*, include_image_only: bool) -> list:
+    """Return the list of inputs that follow the mode selector on the Gen-2.5 nodes.
+
+    Args:
+        include_image_only: When True, the ``use_original_alpha`` toggle is inserted after
+            ``texture_delight`` and before ``addon_highpack``.
+
+    Returns:
+        Input definitions in display order: material, geometry_file_format, texture_mode, seed,
+        TAPose, hd_texture, texture_delight, [use_original_alpha], addon_highpack,
+        bbox_width, bbox_height, bbox_length, height_cm.
+    """
+    inputs: list = [
+        IO.Combo.Input("material", options=_MATERIAL_OPTIONS, default="Shaded"),
+        IO.Combo.Input("geometry_file_format", options=_GEOMETRY_FORMAT_OPTIONS, default="glb"),
+        IO.Combo.Input(
+            "texture_mode",
+            options=_TEXTURE_MODE_OPTIONS,
+            default="Default",
+            optional=True,
+            tooltip="Texture quality preset. 'Default' uses the server's default for the selected tier.",
+        ),
+        IO.Int.Input(
+            "seed",
+            default=0,
+            min=0,
+            max=65535,
+            display_mode=IO.NumberDisplay.number,
+            control_after_generate=True,
+            optional=True,
+        ),
+        IO.Boolean.Input(
+            "TAPose", default=False, optional=True, advanced=True, tooltip="T/A pose for human-like models."
+        ),
+        IO.Boolean.Input(
+            "hd_texture", default=False, optional=True, advanced=True, tooltip="High-quality texture enhancement."
+        ),
+        IO.Boolean.Input(
+            "texture_delight",
+            default=False,
+            optional=True,
+            advanced=True,
+            tooltip="Remove baked lighting from textures.",
+        ),
+    ]
+    if include_image_only:
+        inputs.append(
+            IO.Boolean.Input(
+                "use_original_alpha",
+                default=False,
+                optional=True,
+                advanced=True,
+                tooltip="Preserve image transparency.",
+            )
+        )
+    inputs.extend(
+        [
+            IO.Boolean.Input(
+                "addon_highpack",
+                default=False,
+                optional=True,
+                advanced=True,
+                tooltip="HighPack addon: 4K textures and ~16x faces in Quad mode.",
+            ),
+            IO.Int.Input(
+                "bbox_width",
+                default=0,
+                min=0,
+                max=300,
+                display_mode=IO.NumberDisplay.number,
+                optional=True,
+                advanced=True,
+                tooltip="Bounding-box width (Y axis). Set to 0 with the others to skip bbox.",
+            ),
+            IO.Int.Input(
+                "bbox_height",
+                default=0,
+                min=0,
+                max=300,
+                display_mode=IO.NumberDisplay.number,
+                optional=True,
+                advanced=True,
+                tooltip="Bounding-box height (Z axis).",
+            ),
+            IO.Int.Input(
+                "bbox_length",
+                default=0,
+                min=0,
+                max=300,
+                display_mode=IO.NumberDisplay.number,
+                optional=True,
+                advanced=True,
+                tooltip="Bounding-box length (X axis).",
+            ),
+            IO.Int.Input(
+                "height_cm",
+                default=0,
+                min=0,
+                max=10000,
+                display_mode=IO.NumberDisplay.number,
+                optional=True,
+                advanced=True,
+                tooltip="Approximate model height in centimeters (0 to skip).",
+            ),
+        ]
+    )
+    return inputs
+
+
+# Price-badge expression used by both Gen-2.5 nodes. It reads `widgets.mode` and
+# `widgets.addon_highpack` and yields {"type": "usd", "usd": <total>}.
+# NOTE(review): the mode is compared against lowercase "extreme-high" while the option key is
+# "Extreme-High" (_MODE_EXTREME_HIGH) — confirm the badge evaluator lowercases widget values,
+# otherwise the 1.0 base-credit branch is never taken.
+_PRICE_EXPR = """
+(
+  $baseCredits := widgets.mode = "extreme-high" ? 1.0 : 0.5;
+  $addonCredits := widgets.addon_highpack ? 1.0 : 0.0;
+  $total := ($baseCredits * 1.5) + ($addonCredits * 0.8);
+  {"type":"usd","usd": $total}
+)
+"""
+
+
+def _resolve_mode_params(mode_input: dict) -> dict:
+    """Translate the DynamicCombo `mode` payload into Gen-2.5 request fields.
+
+    Args:
+        mode_input: Payload whose ``"mode"`` key holds the selected option and whose other
+            keys hold that option's sub-inputs.
+
+    Returns:
+        A dict that always contains ``tier`` and may contain ``quality_override``,
+        ``mesh_mode``, ``geometry_instruct_mode`` and ``is_micro``. Missing keys mean
+        "do not send" (so we don't override server defaults).
+
+    Raises:
+        ValueError: If the selected mode is not one of the known options. Without this check
+            an empty dict would be returned and the failure would only surface later as an
+            opaque ``KeyError: 'tier'``.
+    """
+    selected = mode_input["mode"]
+    out: dict = {}
+
+    if selected == _MODE_REGULAR:
+        out["tier"] = mode_input["tier"]
+        polygon = mode_input.get("polygon_count", "Default")
+        # "Default" leaves mesh_mode/quality_override unset.
+        if polygon != "Default":
+            mesh_mode, faces = get_quality_mode(polygon)
+            out["mesh_mode"] = mesh_mode
+            out["quality_override"] = faces
+        if mode_input.get("creative"):
+            out["geometry_instruct_mode"] = "creative"
+
+    elif selected == _MODE_FAST:
+        out["tier"] = mode_input["tier"]
+        out["mesh_mode"] = "Raw"
+        out["quality_override"] = int(mode_input["mesh_faces"])
+
+    elif selected == _MODE_EXTREME_HIGH:
+        out["tier"] = "Gen-2.5-Extreme-High"
+        out["mesh_mode"] = mode_input["mesh_mode"]
+        out["quality_override"] = int(mode_input["mesh_faces"])
+        if mode_input.get("is_micro"):
+            out["is_micro"] = True
+        if mode_input.get("creative"):
+            out["geometry_instruct_mode"] = "creative"
+
+    else:
+        raise ValueError(f"Unsupported Rodin Gen-2.5 mode: {selected!r}")
+    return out
+
+
+def _build_request(
+    *,
+    mode_input: dict,
+    material: str,
+    geometry_file_format: str,
+    texture_mode: str,
+    seed: int,
+    TAPose: bool,
+    hd_texture: bool,
+    texture_delight: bool,
+    addon_highpack: bool,
+    bbox_width: int,
+    bbox_height: int,
+    bbox_length: int,
+    height_cm: int,
+    prompt: str | None = None,
+    use_original_alpha: bool = False,
+) -> Rodin3DGen25Request:
+    """Assemble the Gen-2.5 request model from node widget values.
+
+    Falsy toggles, a zero ``height_cm``, an empty ``prompt`` and the "Default" texture mode
+    are passed as ``None``. Mode-dependent fields come from ``_resolve_mode_params``.
+    """
+    resolved = _resolve_mode_params(mode_input)
+
+    # A bounding box is only sent when all three dimensions are non-zero.
+    has_bbox = all((bbox_width, bbox_height, bbox_length))
+
+    return Rodin3DGen25Request(
+        tier=resolved["tier"],
+        prompt=prompt if prompt else None,
+        seed=seed,
+        material=material,
+        geometry_file_format=geometry_file_format,
+        texture_mode=texture_mode if texture_mode != "Default" else None,
+        mesh_mode=resolved.get("mesh_mode"),
+        quality_override=resolved.get("quality_override"),
+        geometry_instruct_mode=resolved.get("geometry_instruct_mode"),
+        bbox_condition=[bbox_width, bbox_height, bbox_length] if has_bbox else None,
+        height=height_cm if height_cm else None,
+        TAPose=TAPose if TAPose else None,
+        hd_texture=hd_texture if hd_texture else None,
+        texture_delight=texture_delight if texture_delight else None,
+        is_micro=resolved.get("is_micro"),
+        use_original_alpha=use_original_alpha if use_original_alpha else None,
+        addons=["HighPack"] if addon_highpack else None,
+    )
+
+
+class Rodin3D_Gen25_Image(IO.ComfyNode):
+    """Image-to-3D node for Rodin Gen-2.5 (1-5 reference images)."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="Rodin3D_Gen25_Image",
+            display_name="Rodin 3D Gen-2.5 - Image to 3D",
+            category="3d/partner/Rodin",
+            description=(
+                "Generate a 3D model from 1-5 reference images via Rodin Gen-2.5. "
+                "Pick a mode (Fast / Regular / Extreme-High) to tune quality vs. cost."
+            ),
+            inputs=[
+                IO.Autogrow.Input(
+                    "images",
+                    template=IO.Autogrow.TemplatePrefix(IO.Image.Input("image"), prefix="image", min=1, max=5),
+                    tooltip="1-5 images. The first image is used for materials when multi-view.",
+                ),
+                _build_mode_input(),
+                *_build_common_inputs(include_image_only=True),
+            ],
+            outputs=[IO.File3DAny.Output(display_name="model_file")],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["mode", "addon_highpack"]),
+                expr=_PRICE_EXPR,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        images: IO.Autogrow.Type,
+        mode: dict,
+        material: str,
+        geometry_file_format: str,
+        texture_mode: str = "Default",
+        seed: int = 0,
+        TAPose: bool = False,
+        hd_texture: bool = False,
+        texture_delight: bool = False,
+        use_original_alpha: bool = False,
+        addon_highpack: bool = False,
+        bbox_width: int = 0,
+        bbox_height: int = 0,
+        bbox_length: int = 0,
+        height_cm: int = 0,
+    ) -> IO.NodeOutput:
+        """Submit the images, wait for the job and return the downloaded model.
+
+        Every input declared ``optional=True`` in the schema carries the schema's default
+        here, so a call that omits one does not fail with a missing-argument ``TypeError``.
+
+        Raises:
+            ValueError: If no image is supplied, or more than 5 frames remain after batched
+                tensors are split.
+        """
+        image_tensors = [img for img in images.values() if img is not None]
+        if not image_tensors:
+            raise ValueError("Rodin Gen-2.5 Image-to-3D requires at least one image.")
+
+        # Flatten multi-image tensors into individual frames; the API accepts each as a separate part.
+        flat_images: list = []
+        for tensor in image_tensors:
+            if hasattr(tensor, "shape") and len(tensor.shape) == 4:
+                flat_images.extend(tensor[i] for i in range(tensor.shape[0]))
+            else:
+                flat_images.append(tensor)
+
+        if len(flat_images) > 5:
+            raise ValueError(f"Rodin Gen-2.5 accepts at most 5 images; received {len(flat_images)}.")
+
+        request = _build_request(
+            mode_input=mode,
+            material=material,
+            geometry_file_format=geometry_file_format,
+            texture_mode=texture_mode,
+            seed=seed,
+            TAPose=TAPose,
+            hd_texture=hd_texture,
+            texture_delight=texture_delight,
+            addon_highpack=addon_highpack,
+            bbox_width=bbox_width,
+            bbox_height=bbox_height,
+            bbox_length=bbox_length,
+            height_cm=height_cm,
+            prompt=None,
+            use_original_alpha=use_original_alpha,
+        )
+
+        task_uuid, subscription_key = await _create_gen25_task(cls, request, flat_images)
+        await poll_for_task_status(subscription_key, cls)
+        download_list = await get_rodin_download_list(task_uuid, cls)
+        file_3d = await _download_gen25_files(download_list, task_uuid, geometry_file_format)
+        return IO.NodeOutput(file_3d)
+
+
+class Rodin3D_Gen25_Text(IO.ComfyNode):
+    """Text-to-3D node for Rodin Gen-2.5."""
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        return IO.Schema(
+            node_id="Rodin3D_Gen25_Text",
+            display_name="Rodin 3D Gen-2.5 - Text to 3D",
+            category="3d/partner/Rodin",
+            description=(
+                "Generate a 3D model from a text prompt via Rodin Gen-2.5. "
+                "Pick a mode (Fast / Regular / Extreme-High) to tune quality vs. cost."
+            ),
+            inputs=[
+                IO.String.Input(
+                    "prompt",
+                    multiline=True,
+                    default="",
+                    tooltip="Text prompt for the 3D model.",
+                ),
+                _build_mode_input(),
+                *_build_common_inputs(include_image_only=False),
+            ],
+            outputs=[IO.File3DAny.Output(display_name="model_file")],
+            hidden=[
+                IO.Hidden.auth_token_comfy_org,
+                IO.Hidden.api_key_comfy_org,
+                IO.Hidden.unique_id,
+            ],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["mode", "addon_highpack"]),
+                expr=_PRICE_EXPR,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        prompt: str,
+        mode: dict,
+        material: str,
+        geometry_file_format: str,
+        texture_mode: str = "Default",
+        seed: int = 0,
+        TAPose: bool = False,
+        hd_texture: bool = False,
+        texture_delight: bool = False,
+        addon_highpack: bool = False,
+        bbox_width: int = 0,
+        bbox_height: int = 0,
+        bbox_length: int = 0,
+        height_cm: int = 0,
+    ) -> IO.NodeOutput:
+        """Submit the prompt, wait for the job and return the downloaded model.
+
+        Every input declared ``optional=True`` in the schema carries the schema's default
+        here, so a call that omits one does not fail with a missing-argument ``TypeError``.
+        The prompt is validated with ``validate_string`` (min_length=1, max_length=2500)
+        before anything is sent.
+        """
+        validate_string(prompt, field_name="prompt", min_length=1, max_length=2500)
+        request = _build_request(
+            mode_input=mode,
+            material=material,
+            geometry_file_format=geometry_file_format,
+            texture_mode=texture_mode,
+            seed=seed,
+            TAPose=TAPose,
+            hd_texture=hd_texture,
+            texture_delight=texture_delight,
+            addon_highpack=addon_highpack,
+            bbox_width=bbox_width,
+            bbox_height=bbox_height,
+            bbox_length=bbox_length,
+            height_cm=height_cm,
+            prompt=prompt,
+        )
+        task_uuid, subscription_key = await _create_gen25_task(cls, request, images=None)
+        await poll_for_task_status(subscription_key, cls)
+        download_list = await get_rodin_download_list(task_uuid, cls)
+        file_3d = await _download_gen25_files(download_list, task_uuid, geometry_file_format)
+        return IO.NodeOutput(file_3d)
+
+
class Rodin3DExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -551,6 +1114,8 @@ class Rodin3DExtension(ComfyExtension):
Rodin3D_Smooth,
Rodin3D_Sketch,
Rodin3D_Gen2,
+ Rodin3D_Gen25_Image,
+ Rodin3D_Gen25_Text,
]
diff --git a/comfy_api_nodes/nodes_runway.py b/comfy_api_nodes/nodes_runway.py
index 573170ba2..7357c733e 100644
--- a/comfy_api_nodes/nodes_runway.py
+++ b/comfy_api_nodes/nodes_runway.py
@@ -140,7 +140,7 @@ class RunwayImageToVideoNodeGen3a(IO.ComfyNode):
return IO.Schema(
node_id="RunwayImageToVideoNodeGen3a",
display_name="Runway Image to Video (Gen3a Turbo)",
- category="api node/video/Runway",
+ category="video/partner/Runway",
description="Generate a video from a single starting frame using Gen3a Turbo model. "
"Before diving in, review these best practices to ensure that "
"your input selections will set your generation up for success: "
@@ -234,7 +234,7 @@ class RunwayImageToVideoNodeGen4(IO.ComfyNode):
return IO.Schema(
node_id="RunwayImageToVideoNodeGen4",
display_name="Runway Image to Video (Gen4 Turbo)",
- category="api node/video/Runway",
+ category="video/partner/Runway",
description="Generate a video from a single starting frame using Gen4 Turbo model. "
"Before diving in, review these best practices to ensure that "
"your input selections will set your generation up for success: "
@@ -329,7 +329,7 @@ class RunwayFirstLastFrameNode(IO.ComfyNode):
return IO.Schema(
node_id="RunwayFirstLastFrameNode",
display_name="Runway First-Last-Frame to Video",
- category="api node/video/Runway",
+ category="video/partner/Runway",
description="Upload first and last keyframes, draft a prompt, and generate a video. "
"More complex transitions, such as cases where the Last frame is completely different "
"from the First frame, may benefit from the longer 10s duration. "
@@ -440,7 +440,7 @@ class RunwayTextToImageNode(IO.ComfyNode):
return IO.Schema(
node_id="RunwayTextToImageNode",
display_name="Runway Text to Image",
- category="api node/image/Runway",
+ category="image/partner/Runway",
description="Generate an image from a text prompt using Runway's Gen 4 model. "
"You can also include reference image to guide the generation.",
inputs=[
diff --git a/comfy_api_nodes/nodes_sonilo.py b/comfy_api_nodes/nodes_sonilo.py
new file mode 100644
index 000000000..bc31a0074
--- /dev/null
+++ b/comfy_api_nodes/nodes_sonilo.py
@@ -0,0 +1,287 @@
+import base64
+import json
+import logging
+import time
+from urllib.parse import urljoin
+
+import aiohttp
+from typing_extensions import override
+
+from comfy_api.latest import IO, ComfyExtension, Input
+from comfy_api_nodes.util import (
+ ApiEndpoint,
+ audio_bytes_to_audio_input,
+ upload_video_to_comfyapi,
+ validate_string,
+)
+from comfy_api_nodes.util._helpers import (
+ default_base_url,
+ get_auth_header,
+ get_node_id,
+ is_processing_interrupted,
+)
+from comfy_api_nodes.util.common_exceptions import ProcessingInterrupted
+from server import PromptServer
+
+logger = logging.getLogger(__name__)
+
+
+class SoniloVideoToMusic(IO.ComfyNode):
+    """API node that turns a video into music via Sonilo.
+
+    The clip is uploaded, its URL is posted to the Sonilo video-to-music proxy endpoint,
+    and the streamed result is returned as a single audio output.
+    """
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node: a video input, an optional prompt, a seed widget and one audio output."""
+        video_input = IO.Video.Input(
+            "video",
+            tooltip="Input video to generate music from. Maximum duration: 6 minutes.",
+        )
+        prompt_input = IO.String.Input(
+            "prompt",
+            default="",
+            multiline=True,
+            tooltip="Optional text prompt to guide music generation. "
+            "Leave empty for best quality - the model will fully analyze the video content.",
+        )
+        # The seed widget exists for the graph only; execute() never forwards it.
+        seed_input = IO.Int.Input(
+            "seed",
+            default=0,
+            min=0,
+            max=0xFFFFFFFFFFFFFFFF,
+            control_after_generate=True,
+            tooltip="Seed for reproducibility. Currently ignored by the Sonilo "
+            "service but kept for graph consistency.",
+        )
+        return IO.Schema(
+            node_id="SoniloVideoToMusic",
+            display_name="Sonilo Video to Music",
+            category="audio/partner/Sonilo",
+            description="Generate music from video content using Sonilo's AI model. "
+            "Analyzes the video and creates matching music.",
+            inputs=[video_input, prompt_input, seed_input],
+            outputs=[IO.Audio.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(expr='{"type":"usd","usd":0.009,"format":{"suffix":"/second"}}'),
+        )
+
+    @classmethod
+    async def execute(cls, video: Input.Video, prompt: str = "", seed: int = 0) -> IO.NodeOutput:
+        """Upload ``video``, request music for it from Sonilo and return the audio as the node output.
+
+        Args:
+            video: Source clip; uploaded with ``max_duration=360``.
+            prompt: Optional guidance; sent only when non-blank, with surrounding whitespace removed.
+            seed: Accepted to match the schema but not used by this method.
+        """
+        # Upload first: the request carries the resulting URL, not the video itself.
+        video_url = await upload_video_to_comfyapi(cls, video, max_duration=360)
+        cleaned_prompt = prompt.strip()
+        form = aiohttp.FormData()
+        form.add_field("video_url", video_url)
+        if cleaned_prompt:
+            form.add_field("prompt", cleaned_prompt)
+        endpoint = ApiEndpoint(path="/proxy/sonilo/v2m/generate", method="POST")
+        audio_bytes = await _stream_sonilo_music(cls, endpoint, form)
+        return IO.NodeOutput(audio_bytes_to_audio_input(audio_bytes))
+
+
+class SoniloTextToMusic(IO.ComfyNode):
+    """API node that turns a text prompt into music via Sonilo.
+
+    The prompt and an optional target duration are posted to the Sonilo text-to-music
+    proxy endpoint, and the streamed result is returned as a single audio output.
+    """
+
+    @classmethod
+    def define_schema(cls) -> IO.Schema:
+        """Describe the node: prompt, optional duration, a seed widget and one audio output."""
+        prompt_input = IO.String.Input(
+            "prompt",
+            default="",
+            multiline=True,
+            tooltip="Text prompt describing the music to generate.",
+        )
+        duration_input = IO.Int.Input(
+            "duration",
+            default=0,
+            min=0,
+            max=360,
+            tooltip="Target duration in seconds. Set to 0 to let the model "
+            "infer the duration from the prompt. Maximum: 6 minutes.",
+        )
+        seed_input = IO.Int.Input(
+            "seed",
+            default=0,
+            min=0,
+            max=0xFFFFFFFFFFFFFFFF,
+            control_after_generate=True,
+            tooltip="Seed for reproducibility. Currently ignored by the Sonilo "
+            "service but kept for graph consistency.",
+        )
+        return IO.Schema(
+            node_id="SoniloTextToMusic",
+            display_name="Sonilo Text to Music",
+            category="audio/partner/Sonilo",
+            description="Generate music from a text prompt using Sonilo's AI model. "
+            "Leave duration at 0 to let the model infer it from the prompt.",
+            inputs=[prompt_input, duration_input, seed_input],
+            outputs=[IO.Audio.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+            # Badge: a fixed total when a duration is chosen, otherwise a per-second rate.
+            price_badge=IO.PriceBadge(
+                depends_on=IO.PriceBadgeDepends(widgets=["duration"]),
+                expr="""
+ (
+ widgets.duration > 0
+ ? {"type":"usd","usd": 0.005 * widgets.duration}
+ : {"type":"usd","usd": 0.005, "format":{"suffix":"/second"}}
+ )
+ """,
+            ),
+        )
+
+    @classmethod
+    async def execute(cls, prompt: str, duration: int = 0, seed: int = 0) -> IO.NodeOutput:
+        """Validate ``prompt``, request music for it from Sonilo and return the audio as the node output.
+
+        Args:
+            prompt: Required text, checked by ``validate_string``; sent exactly as given (not stripped).
+            duration: Target length in seconds; sent only when greater than 0.
+            seed: Accepted to match the schema but not used by this method.
+        """
+        validate_string(prompt, strip_whitespace=True, min_length=1)
+        form = aiohttp.FormData()
+        form.add_field("prompt", prompt)
+        if duration > 0:
+            form.add_field("duration", str(duration))
+        endpoint = ApiEndpoint(path="/proxy/sonilo/t2m/generate", method="POST")
+        audio_bytes = await _stream_sonilo_music(cls, endpoint, form)
+        return IO.NodeOutput(audio_bytes_to_audio_input(audio_bytes))
+
+
+async def _stream_sonilo_music(cls: type[IO.ComfyNode], endpoint: ApiEndpoint, form: aiohttp.FormData) -> bytes:
+    """POST ``form`` to Sonilo, read the NDJSON stream, and return the first stream's audio bytes.
+
+    Each line is one JSON event dispatched on ``type``: ``error`` aborts, ``duration`` and
+    ``title``/``titles`` update the progress text, ``audio_chunk`` carries base64 audio for a
+    ``stream_index``, ``complete`` ends the loop; lines that are not JSON objects are skipped.
+
+    Returns stream 0's joined chunks (else those of the lowest stream index received). Raises
+    ``Exception`` on HTTP >= 400, an ``error`` event or no audio; ``ProcessingInterrupted`` on cancel.
+    """
+    url = urljoin(default_base_url().rstrip("/") + "/", endpoint.path.lstrip("/"))
+
+    headers: dict[str, str] = dict(get_auth_header(cls))
+    headers.update(endpoint.headers)
+
+    node_id = get_node_id(cls)
+    start_ts = time.monotonic()
+    last_chunk_status_ts = 0.0
+    audio_streams: dict[int, list[bytes]] = {}
+    title: str | None = None
+
+    timeout = aiohttp.ClientTimeout(total=1200.0, sock_read=300.0)
+    async with aiohttp.ClientSession(timeout=timeout) as session:
+        PromptServer.instance.send_progress_text("Status: Queued", node_id)
+        async with session.post(url, data=form, headers=headers) as resp:
+            if resp.status >= 400:
+                msg = await _extract_error_message(resp)
+                raise Exception(f"Sonilo API error ({resp.status}): {msg}")
+
+            while True:
+                if is_processing_interrupted():
+                    raise ProcessingInterrupted("Task cancelled")
+
+                # NOTE(review): aiohttp's readline() raises ValueError when a line exceeds the
+                # stream's buffer limit - confirm Sonilo's base64 chunk lines stay below it.
+                raw_line = await resp.content.readline()
+                if not raw_line:
+                    break
+                line = raw_line.decode("utf-8").strip()
+                if not line:
+                    continue
+
+                try:
+                    evt = json.loads(line)
+                except json.JSONDecodeError:
+                    evt = None
+                # Valid JSON that is not an object has no ``.get``; skip it like any malformed line.
+                if not isinstance(evt, dict):
+                    logger.warning("Sonilo: skipping malformed NDJSON line")
+                    continue
+
+                evt_type = evt.get("type")
+                if evt_type == "error":
+                    code = evt.get("code", "UNKNOWN")
+                    message = evt.get("message", "Unknown error")
+                    raise Exception(f"Sonilo generation error ({code}): {message}")
+                if evt_type == "duration":
+                    duration_sec = evt.get("duration_sec")
+                    # ``:.1f`` raises on anything but a number, so non-numeric values are ignored.
+                    if isinstance(duration_sec, (int, float)):
+                        PromptServer.instance.send_progress_text(
+                            f"Status: Generating\nVideo duration: {duration_sec:.1f}s", node_id
+                        )
+                elif evt_type in ("titles", "title"):
+                    # v2m sends a "titles" list, t2m sends a scalar "title"
+                    if evt_type == "titles":
+                        titles = evt.get("titles", [])
+                        if titles:
+                            title = titles[0]
+                    else:
+                        title = evt.get("title") or title
+                    if title:
+                        PromptServer.instance.send_progress_text(f"Status: Generating\nTitle: {title}", node_id)
+                elif evt_type == "audio_chunk":
+                    chunk_data = base64.b64decode(evt["data"])
+                    audio_streams.setdefault(evt.get("stream_index", 0), []).append(chunk_data)
+
+                    # Refresh the status text at most once per second while chunks arrive.
+                    now = time.monotonic()
+                    if now - last_chunk_status_ts >= 1.0:
+                        total_chunks = sum(len(chunks) for chunks in audio_streams.values())
+                        status_lines = ["Status: Receiving audio"]
+                        if title:
+                            status_lines.append(f"Title: {title}")
+                        status_lines.append(f"Chunks received: {total_chunks}")
+                        status_lines.append(f"Time elapsed: {int(now - start_ts)}s")
+                        PromptServer.instance.send_progress_text("\n".join(status_lines), node_id)
+                        last_chunk_status_ts = now
+                elif evt_type == "complete":
+                    break
+
+    if not audio_streams:
+        raise Exception("Sonilo API returned no audio data.")
+
+    PromptServer.instance.send_progress_text("Status: Completed", node_id)
+    # Prefer stream 0; otherwise use the lowest stream index that produced audio.
+    return b"".join(audio_streams[0 if 0 in audio_streams else min(audio_streams)])
+
+
+async def _extract_error_message(resp: aiohttp.ClientResponse) -> str:
+    """Return ``detail.message`` (or ``detail``) from a JSON error body, else the raw body text."""
+    try:
+        error_body = await resp.json()
+        detail = error_body.get("detail")  # a non-object body raises here and lands in ``except``
+        if isinstance(detail, dict):
+            detail = detail.get("message") or detail  # prefer the nested message when it is set
+        return str(detail) if detail else await resp.text()  # missing/empty detail used to yield "{}"
+    except Exception:
+        return await resp.text()
+
+
+class SoniloExtension(ComfyExtension):  # extension exposing the two Sonilo nodes defined in this module
+ @override
+ async def get_node_list(self) -> list[type[IO.ComfyNode]]:  # node classes provided by this extension
+ return [SoniloVideoToMusic, SoniloTextToMusic]
+
+
+async def comfy_entrypoint() -> SoniloExtension:  # presumably the hook the ComfyUI loader looks up - TODO confirm
+ return SoniloExtension()  # a fresh extension instance on every call
diff --git a/comfy_api_nodes/nodes_sora.py b/comfy_api_nodes/nodes_sora.py
index afc18bb25..83cfca495 100644
--- a/comfy_api_nodes/nodes_sora.py
+++ b/comfy_api_nodes/nodes_sora.py
@@ -33,9 +33,13 @@ class OpenAIVideoSora2(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="OpenAIVideoSora2",
- display_name="OpenAI Sora - Video",
- category="api node/video/Sora",
- description="OpenAI video and audio generation.",
+ display_name="OpenAI Sora - Video (DEPRECATED)",
+ category="video/partner/Sora",
+ description=(
+ "OpenAI video and audio generation.\n\n"
+ "DEPRECATION NOTICE: OpenAI will stop serving the Sora v2 API in September 2026. "
+ "This node will be removed from ComfyUI at that time."
+ ),
inputs=[
IO.Combo.Input(
"model",
diff --git a/comfy_api_nodes/nodes_stability.py b/comfy_api_nodes/nodes_stability.py
index 9ef13c83b..a1753d647 100644
--- a/comfy_api_nodes/nodes_stability.py
+++ b/comfy_api_nodes/nodes_stability.py
@@ -62,7 +62,7 @@ class StabilityStableImageUltraNode(IO.ComfyNode):
return IO.Schema(
node_id="StabilityStableImageUltraNode",
display_name="Stability AI Stable Image Ultra",
- category="api node/image/Stability AI",
+ category="image/partner/Stability AI",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.String.Input(
@@ -197,7 +197,7 @@ class StabilityStableImageSD_3_5Node(IO.ComfyNode):
return IO.Schema(
node_id="StabilityStableImageSD_3_5Node",
display_name="Stability AI Stable Diffusion 3.5 Image",
- category="api node/image/Stability AI",
+ category="image/partner/Stability AI",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.String.Input(
@@ -354,7 +354,7 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode):
return IO.Schema(
node_id="StabilityUpscaleConservativeNode",
display_name="Stability AI Upscale Conservative",
- category="api node/image/Stability AI",
+ category="image/partner/Stability AI",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Image.Input("image"),
@@ -401,7 +401,7 @@ class StabilityUpscaleConservativeNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.25}""",
+ expr="""{"type":"usd","usd":0.4}""",
),
)
@@ -457,7 +457,7 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode):
return IO.Schema(
node_id="StabilityUpscaleCreativeNode",
display_name="Stability AI Upscale Creative",
- category="api node/image/Stability AI",
+ category="image/partner/Stability AI",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Image.Input("image"),
@@ -510,7 +510,7 @@ class StabilityUpscaleCreativeNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.25}""",
+ expr="""{"type":"usd","usd":0.6}""",
),
)
@@ -578,7 +578,7 @@ class StabilityUpscaleFastNode(IO.ComfyNode):
return IO.Schema(
node_id="StabilityUpscaleFastNode",
display_name="Stability AI Upscale Fast",
- category="api node/image/Stability AI",
+ category="image/partner/Stability AI",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Image.Input("image"),
@@ -593,7 +593,7 @@ class StabilityUpscaleFastNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.01}""",
+ expr="""{"type":"usd","usd":0.02}""",
),
)
@@ -630,7 +630,7 @@ class StabilityTextToAudio(IO.ComfyNode):
return IO.Schema(
node_id="StabilityTextToAudio",
display_name="Stability AI Text To Audio",
- category="api node/audio/Stability AI",
+ category="audio/partner/Stability AI",
essentials_category="Audio",
description=cleandoc(cls.__doc__ or ""),
inputs=[
@@ -708,7 +708,7 @@ class StabilityAudioToAudio(IO.ComfyNode):
return IO.Schema(
node_id="StabilityAudioToAudio",
display_name="Stability AI Audio To Audio",
- category="api node/audio/Stability AI",
+ category="audio/partner/Stability AI",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Combo.Input(
@@ -802,7 +802,7 @@ class StabilityAudioInpaint(IO.ComfyNode):
return IO.Schema(
node_id="StabilityAudioInpaint",
display_name="Stability AI Audio Inpaint",
- category="api node/audio/Stability AI",
+ category="audio/partner/Stability AI",
description=cleandoc(cls.__doc__ or ""),
inputs=[
IO.Combo.Input(
diff --git a/comfy_api_nodes/nodes_topaz.py b/comfy_api_nodes/nodes_topaz.py
index b18b31af1..d0906ee44 100644
--- a/comfy_api_nodes/nodes_topaz.py
+++ b/comfy_api_nodes/nodes_topaz.py
@@ -36,11 +36,15 @@ from comfy_api_nodes.util import (
)
UPSCALER_MODELS_MAP = {
+ "Astra 2": "ast-2",
"Starlight (Astra) Fast": "slf-1",
"Starlight (Astra) Creative": "slc-1",
"Starlight Precise 2.5": "slp-2.5",
}
+AST2_MAX_FRAMES = 9000  # input-frame cap for Astra 2; TopazVideoEnhanceV2.execute raises ValueError above it
+AST2_MAX_FRAMES_WITH_PROMPT = 450  # lower Astra 2 cap applied when a prompt is set (~15s @ 30fps)
+
class TopazImageEnhance(IO.ComfyNode):
@classmethod
@@ -48,7 +52,7 @@ class TopazImageEnhance(IO.ComfyNode):
return IO.Schema(
node_id="TopazImageEnhance",
display_name="Topaz Image Enhance",
- category="api node/image/Topaz",
+ category="image/partner/Topaz",
description="Industry-standard upscaling and image enhancement.",
inputs=[
IO.Combo.Input("model", options=["Reimagine"]),
@@ -230,13 +234,20 @@ class TopazVideoEnhance(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="TopazVideoEnhance",
- display_name="Topaz Video Enhance",
- category="api node/video/Topaz",
+ display_name="Topaz Video Enhance (Legacy)",
+ category="video/partner/Topaz",
description="Breathe new life into video with powerful upscaling and recovery technology.",
inputs=[
IO.Video.Input("video"),
IO.Boolean.Input("upscaler_enabled", default=True),
- IO.Combo.Input("upscaler_model", options=list(UPSCALER_MODELS_MAP.keys())),
+ IO.Combo.Input(
+ "upscaler_model",
+ options=[
+ "Starlight (Astra) Fast",
+ "Starlight (Astra) Creative",
+ "Starlight Precise 2.5",
+ ],
+ ),
IO.Combo.Input("upscaler_resolution", options=["FullHD (1080p)", "4K (2160p)"]),
IO.Combo.Input(
"upscaler_creativity",
@@ -304,6 +315,7 @@ class TopazVideoEnhance(IO.ComfyNode):
IO.Hidden.unique_id,
],
is_api_node=True,
+ is_deprecated=True,
)
@classmethod
@@ -453,7 +465,350 @@ class TopazVideoEnhance(IO.ComfyNode):
progress_extractor=lambda x: getattr(x, "progress", 0),
price_extractor=lambda x: (x.estimates.cost[0] * 0.08 if x.estimates and x.estimates.cost[0] else None),
poll_interval=10.0,
- max_poll_attempts=320,
+ )
+ return IO.NodeOutput(await download_url_to_video_output(final_response.download.url))
+
+
+class TopazVideoEnhanceV2(IO.ComfyNode):
+    """Topaz video node: optional upscaling plus optional frame interpolation in a single Topaz task."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the per-model upscaler options, interpolation options, compression level and badge."""
+        return IO.Schema(
+            node_id="TopazVideoEnhanceV2",
+            display_name="Topaz Video Enhance",
+            category="video/partner/Topaz",
+            description="Breathe new life into video with powerful upscaling and recovery technology.",
+            inputs=[
+                IO.Video.Input("video"),
+                IO.DynamicCombo.Input(
+                    "upscaler_model",
+                    options=[
+                        IO.DynamicCombo.Option(
+                            "Astra 2",
+                            [
+                                IO.Combo.Input("upscaler_resolution", options=["FullHD (1080p)", "4K (2160p)"]),
+                                IO.Float.Input(
+                                    "creativity",
+                                    default=0.5,
+                                    min=0.0,
+                                    max=1.0,
+                                    step=0.1,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Creative strength of the upscale.",
+                                ),
+                                IO.String.Input(
+                                    "prompt",
+                                    multiline=True,
+                                    default="",
+                                    tooltip="Optional descriptive (not instructive) scene prompt. "
+                                    f"Capping input at {AST2_MAX_FRAMES_WITH_PROMPT} frames (~15s @ 30fps) when set.",
+                                ),
+                                IO.Float.Input(
+                                    "sharp",
+                                    default=0.5,
+                                    min=0.0,
+                                    max=1.0,
+                                    step=0.01,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Pre-enhance sharpness: "
+                                    "0.0=Gaussian blur, 0.5=passthrough (default), 1.0=USM sharpening.",
+                                    advanced=True,
+                                ),
+                                IO.Float.Input(
+                                    "realism",
+                                    default=0.0,
+                                    min=0.0,
+                                    max=1.0,
+                                    step=0.01,
+                                    display_mode=IO.NumberDisplay.slider,
+                                    tooltip="Pulls output toward photographic realism. "
+                                    "Leave at 0 for the model default.",
+                                    advanced=True,
+                                ),
+                            ],
+                        ),
+                        IO.DynamicCombo.Option(
+                            "Starlight (Astra) Fast",
+                            [IO.Combo.Input("upscaler_resolution", options=["FullHD (1080p)", "4K (2160p)"])],
+                        ),
+                        IO.DynamicCombo.Option(
+                            "Starlight (Astra) Creative",
+                            [
+                                IO.Combo.Input("upscaler_resolution", options=["FullHD (1080p)", "4K (2160p)"]),
+                                IO.Combo.Input(
+                                    "creativity",
+                                    options=["low", "middle", "high"],
+                                    default="low",
+                                    tooltip="Creative strength of the upscale.",
+                                ),
+                            ],
+                        ),
+                        IO.DynamicCombo.Option(
+                            "Starlight Precise 2.5",
+                            [IO.Combo.Input("upscaler_resolution", options=["FullHD (1080p)", "4K (2160p)"])],
+                        ),
+                        IO.DynamicCombo.Option("Disabled", []),  # no upscaling; execute() then needs interpolation
+                    ],
+                ),
+                IO.DynamicCombo.Input(
+                    "interpolation_model",
+                    options=[
+                        IO.DynamicCombo.Option("Disabled", []),
+                        IO.DynamicCombo.Option(
+                            "apo-8",
+                            [
+                                IO.Int.Input(
+                                    "interpolation_frame_rate",
+                                    default=60,
+                                    min=15,
+                                    max=240,
+                                    display_mode=IO.NumberDisplay.number,
+                                    tooltip="Output frame rate.",
+                                ),
+                                IO.Int.Input(
+                                    "interpolation_slowmo",
+                                    default=1,
+                                    min=1,
+                                    max=16,
+                                    display_mode=IO.NumberDisplay.number,
+                                    tooltip="Slow-motion factor applied to the input video. "
+                                    "For example, 2 makes the output twice as slow and doubles the duration.",
+                                    advanced=True,
+                                ),
+                                IO.Boolean.Input(
+                                    "interpolation_duplicate",
+                                    default=False,
+                                    tooltip="Analyze the input for duplicate frames and remove them.",
+                                    advanced=True,
+                                ),
+                                IO.Float.Input(
+                                    "interpolation_duplicate_threshold",
+                                    default=0.01,
+                                    min=0.001,
+                                    max=0.1,
+                                    step=0.001,
+                                    display_mode=IO.NumberDisplay.number,
+                                    tooltip="Detection sensitivity for duplicate frames.",
+                                    advanced=True,
+                                ),
+                            ],
+                        ),
+                    ],
+                ),
+                IO.Combo.Input(
+                    "dynamic_compression_level",
+                    options=["Low", "Mid", "High"],
+                    default="Low",
+                    tooltip="CQP level.",
+                    optional=True,
+                ),
+            ],
+            outputs=[IO.Video.Output()],
+            hidden=[IO.Hidden.auth_token_comfy_org, IO.Hidden.api_key_comfy_org, IO.Hidden.unique_id],
+            is_api_node=True,
+            price_badge=IO.PriceBadge(  # text badge: credits per source frame, a range when interpolating
+                depends_on=IO.PriceBadgeDepends(widgets=[
+                    "upscaler_model",
+                    "upscaler_model.upscaler_resolution",
+                    "interpolation_model",
+                ]),
+                expr="""
+ (
+ $model := $lookup(widgets, "upscaler_model");
+ $res := $lookup(widgets, "upscaler_model.upscaler_resolution");
+ $interp := $lookup(widgets, "interpolation_model");
+ $is4k := $contains($res, "4k");
+ $hasInterp := $interp != "disabled";
+ $rates := {
+ "starlight (astra) fast": {"hd": 0.43, "uhd": 0.85},
+ "starlight precise 2.5": {"hd": 0.70, "uhd": 1.54},
+ "astra 2": {"hd": 1.72, "uhd": 2.85},
+ "starlight (astra) creative": {"hd": 2.25, "uhd": 3.99}
+ };
+ $surcharge := $is4k ? 0.28 : 0.14;
+ $entry := $lookup($rates, $model);
+ $base := $is4k ? $entry.uhd : $entry.hd;
+ $hi := $base + ($hasInterp ? $surcharge : 0);
+ $model = "disabled"
+ ? {"type":"text","text":"Interpolation only"}
+ : ($hasInterp
+ ? {"type":"text","text":"~" & $string($base) & "–" & $string($hi) & " credits/src frame"}
+ : {"type":"text","text":"~" & $string($base) & " credits/src frame"})
+ )
+ """,
+            ),
+        )
+
+    @classmethod
+    async def execute(
+        cls,
+        video: Input.Video,
+        upscaler_model: dict,
+        interpolation_model: dict,
+        dynamic_compression_level: str = "Low",
+    ) -> IO.NodeOutput:
+        """Create a Topaz video task, upload the source, wait for completion and download the result.
+
+        Args:
+            video: Source clip; its container must pass ``validate_container_format_is_mp4``.
+            upscaler_model: Selected upscaler under ``"upscaler_model"`` plus that option's widget values.
+            interpolation_model: Selected model under ``"interpolation_model"`` plus its widget values.
+            dynamic_compression_level: Forwarded as the output ``dynamicCompressionLevel``.
+
+        Raises:
+            ValueError: Both stages are disabled, or the clip exceeds an Astra 2 frame limit.
+            NotImplementedError: Topaz returned more than one upload URL.
+        """
+        upscaler_choice = upscaler_model["upscaler_model"]
+        interpolation_choice = interpolation_model["interpolation_model"]
+        if upscaler_choice == "Disabled" and interpolation_choice == "Disabled":
+            raise ValueError("There is nothing to do: both upscaling and interpolation are disabled.")
+        validate_container_format_is_mp4(video)
+        src_width, src_height = video.get_dimensions()
+        src_frame_rate = int(video.get_frame_rate())  # NOTE(review): int() drops a fractional rate (29.97 -> 29)
+        duration_sec = video.get_duration()
+        src_video_stream = video.get_stream_source()
+        target_width = src_width  # defaults: keep source geometry and rate unless a stage changes them
+        target_height = src_height
+        target_frame_rate = src_frame_rate
+        filters = []
+        if upscaler_choice != "Disabled":
+            if "1080p" in upscaler_model["upscaler_resolution"]:
+                target_pixel_p = 1080
+                max_long_side = 1920
+            else:
+                target_pixel_p = 2160
+                max_long_side = 3840
+            ar = src_width / src_height
+            if src_width >= src_height:
+                # Landscape or Square; Attempt to set height to target (e.g., 2160), calculate width
+                target_height = target_pixel_p
+                target_width = int(target_height * ar)
+                # Check if width exceeds standard bounds (for ultra-wide e.g., 21:9 ARs)
+                if target_width > max_long_side:
+                    target_width = max_long_side
+                    target_height = int(target_width / ar)
+            else:
+                # Portrait; Attempt to set width to target (e.g., 2160), calculate height
+                target_width = target_pixel_p
+                target_height = int(target_width / ar)
+                # Check if height exceeds standard bounds
+                if target_height > max_long_side:
+                    target_height = max_long_side
+                    target_width = int(target_height * ar)
+            if target_width % 2 != 0:  # round odd dimensions up to the next even value
+                target_width += 1
+            if target_height % 2 != 0:
+                target_height += 1
+            model_id = UPSCALER_MODELS_MAP[upscaler_choice]
+            if model_id == "slc-1":
+                filters.append(
+                    VideoEnhancementFilter(
+                        model=model_id,
+                        creativity=upscaler_model["creativity"],
+                        isOptimizedMode=True,
+                    )
+                )
+            elif model_id == "ast-2":
+                n_frames = video.get_frame_count()
+                ast2_prompt = (upscaler_model["prompt"] or "").strip()
+                if ast2_prompt and n_frames > AST2_MAX_FRAMES_WITH_PROMPT:
+                    raise ValueError(
+                        f"Astra 2 with a prompt is limited to {AST2_MAX_FRAMES_WITH_PROMPT} input frames "
+                        f"(~15s @ 30fps); video has {n_frames}. Clear the prompt or shorten the clip."
+                    )
+                if n_frames > AST2_MAX_FRAMES:
+                    raise ValueError(f"Astra 2 is limited to {AST2_MAX_FRAMES} input frames; video has {n_frames}.")
+                realism = upscaler_model["realism"]
+                filters.append(
+                    VideoEnhancementFilter(
+                        model=model_id,
+                        creativity=upscaler_model["creativity"],
+                        prompt=(ast2_prompt or None),
+                        sharp=upscaler_model["sharp"],
+                        realism=(realism if realism > 0 else None),  # 0 means "model default": send nothing
+                    )
+                )
+            else:
+                filters.append(VideoEnhancementFilter(model=model_id))
+        if interpolation_choice != "Disabled":
+            target_frame_rate = interpolation_model["interpolation_frame_rate"]
+            filters.append(
+                VideoFrameInterpolationFilter(
+                    model=interpolation_choice,
+                    slowmo=interpolation_model["interpolation_slowmo"],
+                    fps=interpolation_model["interpolation_frame_rate"],
+                    duplicate=interpolation_model["interpolation_duplicate"],
+                    duplicate_threshold=interpolation_model["interpolation_duplicate_threshold"],
+                ),
+            )
+        initial_res = await sync_op(
+            cls,
+            ApiEndpoint(path="/proxy/topaz/video/", method="POST"),
+            response_model=CreateVideoResponse,
+            data=CreateVideoRequest(
+                source=CreateVideoRequestSource(
+                    container="mp4",
+                    size=get_fs_object_size(src_video_stream),
+                    duration=int(duration_sec),
+                    frameCount=video.get_frame_count(),
+                    frameRate=src_frame_rate,
+                    resolution=Resolution(width=src_width, height=src_height),
+                ),
+                filters=filters,
+                output=OutputInformationVideo(
+                    resolution=Resolution(width=target_width, height=target_height),
+                    frameRate=target_frame_rate,
+                    audioCodec="AAC",
+                    audioTransfer="Copy",
+                    dynamicCompressionLevel=dynamic_compression_level,
+                ),
+            ),
+            wait_label="Creating task",
+            final_label_on_success="Task created",
+        )
+        upload_res = await sync_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/topaz/video/{initial_res.requestId}/accept", method="PATCH"),
+            response_model=VideoAcceptResponse,
+            wait_label="Preparing upload",
+            final_label_on_success="Upload started",
+        )
+        if len(upload_res.urls) > 1:
+            raise NotImplementedError(
+                "Large files are not currently supported. Please open an issue in the ComfyUI repository."
+            )
+        async with aiohttp.ClientSession(headers={"Content-Type": "video/mp4"}) as session:
+            if isinstance(src_video_stream, BytesIO):
+                src_video_stream.seek(0)
+                async with session.put(upload_res.urls[0], data=src_video_stream, raise_for_status=True) as res:
+                    upload_etag = res.headers["Etag"]
+            else:
+                with builtins.open(src_video_stream, "rb") as video_file:
+                    async with session.put(upload_res.urls[0], data=video_file, raise_for_status=True) as res:
+                        upload_etag = res.headers["Etag"]
+        await sync_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/topaz/video/{initial_res.requestId}/complete-upload", method="PATCH"),
+            response_model=VideoCompleteUploadResponse,
+            data=VideoCompleteUploadRequest(
+                uploadResults=[
+                    VideoCompleteUploadRequestPart(partNum=1, eTag=upload_etag),
+                ],
+            ),
+            wait_label="Finalizing upload",
+            final_label_on_success="Upload completed",
+        )
+        final_response = await poll_op(
+            cls,
+            ApiEndpoint(path=f"/proxy/topaz/video/{initial_res.requestId}/status"),
+            response_model=VideoStatusResponse,
+            status_extractor=lambda x: x.status,
+            progress_extractor=lambda x: getattr(x, "progress", 0),
+            price_extractor=lambda x: (x.estimates.cost[0] * 0.08 if x.estimates and x.estimates.cost[0] else None),
+            poll_interval=10.0,
         )
         return IO.NodeOutput(await download_url_to_video_output(final_response.download.url))
@@ -464,6 +819,7 @@ class TopazExtension(ComfyExtension):
return [
TopazImageEnhance,
TopazVideoEnhance,
+ TopazVideoEnhanceV2,
]
diff --git a/comfy_api_nodes/nodes_tripo.py b/comfy_api_nodes/nodes_tripo.py
index 9f4298dce..6ee674a18 100644
--- a/comfy_api_nodes/nodes_tripo.py
+++ b/comfy_api_nodes/nodes_tripo.py
@@ -60,6 +60,7 @@ async def poll_until_finished(
],
status_extractor=lambda x: x.data.status,
progress_extractor=lambda x: x.data.progress,
+ price_extractor=lambda x: x.data.consumed_credit * 0.01 if x.data.consumed_credit else None,
estimated_duration=average_duration,
)
if response_poll.data.status == TripoTaskStatus.SUCCESS:
@@ -79,7 +80,7 @@ class TripoTextToModelNode(IO.ComfyNode):
return IO.Schema(
node_id="TripoTextToModelNode",
display_name="Tripo: Text to Model",
- category="api node/3d/Tripo",
+ category="3d/partner/Tripo",
inputs=[
IO.String.Input("prompt", multiline=True),
IO.String.Input("negative_prompt", multiline=True, optional=True),
@@ -113,7 +114,6 @@ class TripoTextToModelNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(
widgets=[
"model_version",
- "style",
"texture",
"pbr",
"quad",
@@ -124,20 +124,17 @@ class TripoTextToModelNode(IO.ComfyNode):
expr="""
(
$isV14 := $contains(widgets.model_version,"v1.4");
- $style := widgets.style;
- $hasStyle := ($style != "" and $style != "none");
+ $isV3OrLater := $contains(widgets.model_version,"v3.");
$withTexture := widgets.texture or widgets.pbr;
$isHdTexture := (widgets.texture_quality = "detailed");
$isDetailedGeometry := (widgets.geometry_quality = "detailed");
- $baseCredits :=
- $isV14 ? 20 : ($withTexture ? 20 : 10);
- $credits :=
- $baseCredits
- + ($hasStyle ? 5 : 0)
+ $credits := $isV14 ? 20 : (
+ ($withTexture ? 20 : 10)
+ (widgets.quad ? 5 : 0)
+ ($isHdTexture ? 10 : 0)
- + ($isDetailedGeometry ? 20 : 0);
- {"type":"usd","usd": $round($credits * 0.01, 2)}
+ + (($isDetailedGeometry and $isV3OrLater) ? 20 : 0)
+ );
+ {"type":"usd","usd": $round($credits * 0.01, 2), "format": {"approximate": true}}
)
""",
),
@@ -198,7 +195,7 @@ class TripoImageToModelNode(IO.ComfyNode):
return IO.Schema(
node_id="TripoImageToModelNode",
display_name="Tripo: Image to Model",
- category="api node/3d/Tripo",
+ category="3d/partner/Tripo",
inputs=[
IO.Image.Input("image"),
IO.Combo.Input(
@@ -239,7 +236,6 @@ class TripoImageToModelNode(IO.ComfyNode):
depends_on=IO.PriceBadgeDepends(
widgets=[
"model_version",
- "style",
"texture",
"pbr",
"quad",
@@ -250,20 +246,17 @@ class TripoImageToModelNode(IO.ComfyNode):
expr="""
(
$isV14 := $contains(widgets.model_version,"v1.4");
- $style := widgets.style;
- $hasStyle := ($style != "" and $style != "none");
+ $isV3OrLater := $contains(widgets.model_version,"v3.");
$withTexture := widgets.texture or widgets.pbr;
$isHdTexture := (widgets.texture_quality = "detailed");
$isDetailedGeometry := (widgets.geometry_quality = "detailed");
- $baseCredits :=
- $isV14 ? 30 : ($withTexture ? 30 : 20);
- $credits :=
- $baseCredits
- + ($hasStyle ? 5 : 0)
+ $credits := $isV14 ? 30 : (
+ ($withTexture ? 30 : 20)
+ (widgets.quad ? 5 : 0)
+ ($isHdTexture ? 10 : 0)
- + ($isDetailedGeometry ? 20 : 0);
- {"type":"usd","usd": $round($credits * 0.01, 2)}
+ + (($isDetailedGeometry and $isV3OrLater) ? 20 : 0)
+ );
+ {"type":"usd","usd": $round($credits * 0.01, 2), "format": {"approximate": true}}
)
""",
),
@@ -330,7 +323,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
return IO.Schema(
node_id="TripoMultiviewToModelNode",
display_name="Tripo: Multiview to Model",
- category="api node/3d/Tripo",
+ category="3d/partner/Tripo",
inputs=[
IO.Image.Input("image"),
IO.Image.Input("image_left", optional=True),
@@ -358,7 +351,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
"texture_alignment", default="original_image", options=["original_image", "geometry"], optional=True, advanced=True
),
IO.Int.Input("face_limit", default=-1, min=-1, max=500000, optional=True, advanced=True),
- IO.Boolean.Input("quad", default=False, optional=True, advanced=True),
+ IO.Boolean.Input("quad", default=False, optional=True, advanced=True, tooltip="This parameter is deprecated and does nothing."),
IO.Combo.Input("geometry_quality", default="standard", options=["standard", "detailed"], optional=True, advanced=True),
],
outputs=[
@@ -379,7 +372,6 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
"model_version",
"texture",
"pbr",
- "quad",
"texture_quality",
"geometry_quality",
],
@@ -387,17 +379,16 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
expr="""
(
$isV14 := $contains(widgets.model_version,"v1.4");
+ $isV3OrLater := $contains(widgets.model_version,"v3.");
$withTexture := widgets.texture or widgets.pbr;
$isHdTexture := (widgets.texture_quality = "detailed");
$isDetailedGeometry := (widgets.geometry_quality = "detailed");
- $baseCredits :=
- $isV14 ? 30 : ($withTexture ? 30 : 20);
- $credits :=
- $baseCredits
- + (widgets.quad ? 5 : 0)
+ $credits := $isV14 ? 30 : (
+ ($withTexture ? 30 : 20)
+ ($isHdTexture ? 10 : 0)
- + ($isDetailedGeometry ? 20 : 0);
- {"type":"usd","usd": $round($credits * 0.01, 2)}
+ + (($isDetailedGeometry and $isV3OrLater) ? 20 : 0)
+ );
+ {"type":"usd","usd": $round($credits * 0.01, 2), "format": {"approximate": true}}
)
""",
),
@@ -457,7 +448,7 @@ class TripoMultiviewToModelNode(IO.ComfyNode):
geometry_quality=geometry_quality,
texture_alignment=texture_alignment,
face_limit=face_limit if face_limit != -1 else None,
- quad=quad,
+ quad=None,
),
)
return await poll_until_finished(cls, response, average_duration=80)
@@ -470,7 +461,7 @@ class TripoTextureNode(IO.ComfyNode):
return IO.Schema(
node_id="TripoTextureNode",
display_name="Tripo: Texture model",
- category="api node/3d/Tripo",
+ category="3d/partner/Tripo",
inputs=[
IO.Custom("MODEL_TASK_ID").Input("model_task_id"),
IO.Boolean.Input("texture", default=True, optional=True),
@@ -498,7 +489,7 @@ class TripoTextureNode(IO.ComfyNode):
expr="""
(
$tq := widgets.texture_quality;
- {"type":"usd","usd": ($contains($tq,"detailed") ? 0.2 : 0.1)}
+ {"type":"usd","usd": ($contains($tq,"detailed") ? 0.2 : 0.1), "format": {"approximate": true}}
)
""",
),
@@ -537,7 +528,7 @@ class TripoRefineNode(IO.ComfyNode):
return IO.Schema(
node_id="TripoRefineNode",
display_name="Tripo: Refine Draft model",
- category="api node/3d/Tripo",
+ category="3d/partner/Tripo",
description="Refine a draft model created by v1.4 Tripo models only.",
inputs=[
IO.Custom("MODEL_TASK_ID").Input("model_task_id", tooltip="Must be a v1.4 Tripo model"),
@@ -555,7 +546,7 @@ class TripoRefineNode(IO.ComfyNode):
is_api_node=True,
is_output_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.3}""",
+ expr="""{"type":"usd","usd":0.3, "format": {"approximate": true}}""",
),
)
@@ -577,7 +568,7 @@ class TripoRigNode(IO.ComfyNode):
return IO.Schema(
node_id="TripoRigNode",
display_name="Tripo: Rig model",
- category="api node/3d/Tripo",
+ category="3d/partner/Tripo",
inputs=[IO.Custom("MODEL_TASK_ID").Input("original_model_task_id")],
outputs=[
IO.String.Output(display_name="model_file"), # for backward compatibility only
@@ -592,7 +583,7 @@ class TripoRigNode(IO.ComfyNode):
is_api_node=True,
is_output_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.25}""",
+ expr="""{"type":"usd","usd":0.25, "format": {"approximate": true}}""",
),
)
@@ -614,7 +605,7 @@ class TripoRetargetNode(IO.ComfyNode):
return IO.Schema(
node_id="TripoRetargetNode",
display_name="Tripo: Retarget rigged model",
- category="api node/3d/Tripo",
+ category="3d/partner/Tripo",
inputs=[
IO.Custom("RIG_TASK_ID").Input("original_model_task_id"),
IO.Combo.Input(
@@ -652,7 +643,7 @@ class TripoRetargetNode(IO.ComfyNode):
is_api_node=True,
is_output_node=True,
price_badge=IO.PriceBadge(
- expr="""{"type":"usd","usd":0.1}""",
+ expr="""{"type":"usd","usd":0.1, "format": {"approximate": true}}""",
),
)
@@ -679,7 +670,7 @@ class TripoConversionNode(IO.ComfyNode):
return IO.Schema(
node_id="TripoConversionNode",
display_name="Tripo: Convert model",
- category="api node/3d/Tripo",
+ category="3d/partner/Tripo",
inputs=[
IO.Custom("MODEL_TASK_ID,RIG_TASK_ID,RETARGET_TASK_ID").Input("original_model_task_id"),
IO.Combo.Input("format", options=["GLTF", "USDZ", "FBX", "OBJ", "STL", "3MF"]),
@@ -761,19 +752,10 @@ class TripoConversionNode(IO.ComfyNode):
"face_limit",
"texture_size",
"texture_format",
- "force_symmetry",
"flatten_bottom",
"flatten_bottom_threshold",
"pivot_to_center_bottom",
"scale_factor",
- "with_animation",
- "pack_uv",
- "bake",
- "part_names",
- "fbx_preset",
- "export_vertex_colors",
- "export_orientation",
- "animate_in_place",
],
),
expr="""
@@ -783,28 +765,16 @@ class TripoConversionNode(IO.ComfyNode):
$flatThresh := (widgets.flatten_bottom_threshold != null) ? widgets.flatten_bottom_threshold : 0;
$scale := (widgets.scale_factor != null) ? widgets.scale_factor : 1;
$texFmt := (widgets.texture_format != "" ? widgets.texture_format : "jpeg");
- $part := widgets.part_names;
- $fbx := (widgets.fbx_preset != "" ? widgets.fbx_preset : "blender");
- $orient := (widgets.export_orientation != "" ? widgets.export_orientation : "default");
$advanced :=
widgets.quad or
- widgets.force_symmetry or
widgets.flatten_bottom or
widgets.pivot_to_center_bottom or
- widgets.with_animation or
- widgets.pack_uv or
- widgets.bake or
- widgets.export_vertex_colors or
- widgets.animate_in_place or
($face != -1) or
($texSize != 4096) or
($flatThresh != 0) or
($scale != 1) or
- ($texFmt != "jpeg") or
- ($part != "") or
- ($fbx != "blender") or
- ($orient != "default");
- {"type":"usd","usd": ($advanced ? 0.1 : 0.05)}
+ ($texFmt != "jpeg");
+ {"type":"usd","usd": ($advanced ? 0.1 : 0.05), "format": {"approximate": true}}
)
""",
),
diff --git a/comfy_api_nodes/nodes_veo2.py b/comfy_api_nodes/nodes_veo2.py
index 13fc1cc36..068862397 100644
--- a/comfy_api_nodes/nodes_veo2.py
+++ b/comfy_api_nodes/nodes_veo2.py
@@ -24,8 +24,9 @@ from comfy_api_nodes.util import (
AVERAGE_DURATION_VIDEO_GEN = 32
MODELS_MAP = {
"veo-2.0-generate-001": "veo-2.0-generate-001",
- "veo-3.1-generate": "veo-3.1-generate-preview",
- "veo-3.1-fast-generate": "veo-3.1-fast-generate-preview",
+ "veo-3.1-generate": "veo-3.1-generate-001",
+ "veo-3.1-fast-generate": "veo-3.1-fast-generate-001",
+ "veo-3.1-lite": "veo-3.1-lite-generate-001",
"veo-3.0-generate-001": "veo-3.0-generate-001",
"veo-3.0-fast-generate-001": "veo-3.0-fast-generate-001",
}
@@ -44,7 +45,7 @@ class VeoVideoGenerationNode(IO.ComfyNode):
return IO.Schema(
node_id="VeoVideoGenerationNode",
display_name="Google Veo 2 Video Generation",
- category="api node/video/Veo",
+ category="video/partner/Veo",
description="Generates videos from text prompts using Google's Veo 2 API",
inputs=[
IO.String.Input(
@@ -247,24 +248,15 @@ class VeoVideoGenerationNode(IO.ComfyNode):
raise Exception("Video generation completed but no video was returned")
-class Veo3VideoGenerationNode(VeoVideoGenerationNode):
- """
- Generates videos from text prompts using Google's Veo 3 API.
-
- Supported models:
- - veo-3.0-generate-001
- - veo-3.0-fast-generate-001
-
- This node extends the base Veo node with Veo 3 specific features including
- audio generation and fixed 8-second duration.
- """
+class Veo3VideoGenerationNode(IO.ComfyNode):
+ """Generates videos from text prompts using Google's Veo 3 API."""
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="Veo3VideoGenerationNode",
display_name="Google Veo 3 Video Generation",
- category="api node/video/Veo",
+ category="video/partner/Veo",
description="Generates videos from text prompts using Google's Veo 3 API",
inputs=[
IO.String.Input(
@@ -279,6 +271,13 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
default="16:9",
tooltip="Aspect ratio of the output video",
),
+ IO.Combo.Input(
+ "resolution",
+ options=["720p", "1080p", "4k"],
+ default="720p",
+ tooltip="Output video resolution. 4K is not available for veo-3.1-lite and veo-3.0 models.",
+ optional=True,
+ ),
IO.String.Input(
"negative_prompt",
multiline=True,
@@ -289,11 +288,11 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
IO.Int.Input(
"duration_seconds",
default=8,
- min=8,
+ min=4,
max=8,
- step=1,
+ step=2,
display_mode=IO.NumberDisplay.number,
- tooltip="Duration of the output video in seconds (Veo 3 only supports 8 seconds)",
+ tooltip="Duration of the output video in seconds",
optional=True,
),
IO.Boolean.Input(
@@ -332,10 +331,10 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
options=[
"veo-3.1-generate",
"veo-3.1-fast-generate",
+ "veo-3.1-lite",
"veo-3.0-generate-001",
"veo-3.0-fast-generate-001",
],
- default="veo-3.0-generate-001",
tooltip="Veo 3 model to use for video generation",
optional=True,
),
@@ -356,21 +355,111 @@ class Veo3VideoGenerationNode(VeoVideoGenerationNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "resolution", "duration_seconds"]),
expr="""
(
$m := widgets.model;
+ $r := widgets.resolution;
$a := widgets.generate_audio;
- ($contains($m,"veo-3.0-fast-generate-001") or $contains($m,"veo-3.1-fast-generate"))
- ? {"type":"usd","usd": ($a ? 1.2 : 0.8)}
- : ($contains($m,"veo-3.0-generate-001") or $contains($m,"veo-3.1-generate"))
- ? {"type":"usd","usd": ($a ? 3.2 : 1.6)}
- : {"type":"range_usd","min_usd":0.8,"max_usd":3.2}
+ $seconds := widgets.duration_seconds;
+ $pps :=
+ $contains($m, "lite")
+ ? ($r = "1080p" ? ($a ? 0.08 : 0.05) : ($a ? 0.05 : 0.03))
+ : $contains($m, "3.1-fast")
+ ? ($r = "4k" ? ($a ? 0.30 : 0.25) : $r = "1080p" ? ($a ? 0.12 : 0.10) : ($a ? 0.10 : 0.08))
+ : $contains($m, "3.1-generate")
+ ? ($r = "4k" ? ($a ? 0.60 : 0.40) : ($a ? 0.40 : 0.20))
+ : $contains($m, "3.0-fast")
+ ? ($a ? 0.15 : 0.10)
+ : ($a ? 0.40 : 0.20);
+ {"type":"usd","usd": $pps * $seconds}
)
""",
),
)
+ @classmethod
+ async def execute(  # Send a Veo 3 generate request, poll the operation until done, and return the video.
+ cls,
+ prompt,
+ aspect_ratio="16:9",
+ resolution="720p",
+ negative_prompt="",
+ duration_seconds=8,
+ enhance_prompt=True,  # NOTE(review): not read anywhere in this method - confirm this is intentional
+ person_generation="ALLOW",
+ seed=0,
+ image=None,
+ model="veo-3.0-generate-001",
+ generate_audio=False,
+ ):
+ if resolution == "4k" and ("lite" in model or "3.0" in model):  # checked before any request is sent
+ raise Exception("4K resolution is not supported by the veo-3.1-lite or veo-3.0 models.")
+
+ model = MODELS_MAP[model]  # from here on `model` is the mapped value used in the endpoint paths
+
+ instances = [{"prompt": prompt}]
+ if image is not None:  # optional image is attached to the single instance, declared as image/png
+ image_base64 = tensor_to_base64_string(image)
+ if image_base64:
+ instances[0]["image"] = {"bytesBase64Encoded": image_base64, "mimeType": "image/png"}
+
+ parameters = {
+ "aspectRatio": aspect_ratio,
+ "personGeneration": person_generation,
+ "durationSeconds": duration_seconds,
+ "enhancePrompt": True,  # always sent as True, independent of the enhance_prompt argument
+ "generateAudio": generate_audio,
+ }
+ if negative_prompt:  # an empty negative prompt is not sent
+ parameters["negativePrompt"] = negative_prompt
+ if seed > 0:  # a seed of 0 is not sent
+ parameters["seed"] = seed
+ if "veo-3.1" in model:  # resolution is only sent when the mapped model value contains "veo-3.1"
+ parameters["resolution"] = resolution
+
+ initial_response = await sync_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/veo/{model}/generate", method="POST"),
+ response_model=VeoGenVidResponse,
+ data=VeoGenVidRequest(
+ instances=instances,
+ parameters=parameters,
+ ),
+ )
+
+ poll_response = await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/veo/{model}/poll", method="POST"),
+ response_model=VeoGenVidPollResponse,
+ status_extractor=lambda r: "completed" if r.done else "pending",  # maps the `done` flag to a status string
+ data=VeoGenVidPollRequest(operationName=initial_response.name),  # poll the operation named by the generate call
+ poll_interval=9.0,
+ estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
+ )
+
+ if poll_response.error:  # an error on the finished operation is raised with its message and code
+ raise Exception(f"Veo API error: {poll_response.error.message} (code: {poll_response.error.code})")
+
+ response = poll_response.response
+ filtered_count = response.raiMediaFilteredCount
+ if filtered_count:  # a non-zero filtered count is raised as an error, with the first reason if present
+ reasons = response.raiMediaFilteredReasons or []
+ reason_part = f": {reasons[0]}" if reasons else ""
+ raise Exception(
+ f"Content blocked by Google's Responsible AI filters{reason_part} "
+ f"({filtered_count} video{'s' if filtered_count != 1 else ''} filtered)."
+ )
+
+ if response.videos:  # only the first returned video is used
+ video = response.videos[0]
+ if video.bytesBase64Encoded:  # inline base64 payload is checked first
+ return IO.NodeOutput(InputImpl.VideoFromFile(BytesIO(base64.b64decode(video.bytesBase64Encoded))))
+ if video.gcsUri:  # otherwise the video is downloaded from gcsUri
+ return IO.NodeOutput(await download_url_to_video_output(video.gcsUri))
+ raise Exception("Video returned but no data or URL was provided")
+ raise Exception("Video generation completed but no video was returned")
+
class Veo3FirstLastFrameNode(IO.ComfyNode):
@@ -379,7 +468,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
return IO.Schema(
node_id="Veo3FirstLastFrameNode",
display_name="Google Veo 3 First-Last-Frame to Video",
- category="api node/video/Veo",
+ category="video/partner/Veo",
description="Generate video using prompt and first and last frames.",
inputs=[
IO.String.Input(
@@ -394,7 +483,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
default="",
tooltip="Negative text prompt to guide what to avoid in the video",
),
- IO.Combo.Input("resolution", options=["720p", "1080p"]),
+ IO.Combo.Input("resolution", options=["720p", "1080p", "4k"]),
IO.Combo.Input(
"aspect_ratio",
options=["16:9", "9:16"],
@@ -424,8 +513,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
IO.Image.Input("last_frame", tooltip="End frame"),
IO.Combo.Input(
"model",
- options=["veo-3.1-generate", "veo-3.1-fast-generate"],
- default="veo-3.1-fast-generate",
+ options=["veo-3.1-generate", "veo-3.1-fast-generate", "veo-3.1-lite"],
),
IO.Boolean.Input(
"generate_audio",
@@ -443,26 +531,20 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
],
is_api_node=True,
price_badge=IO.PriceBadge(
- depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration"]),
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "generate_audio", "duration", "resolution"]),
expr="""
(
- $prices := {
- "veo-3.1-fast-generate": { "audio": 0.15, "no_audio": 0.10 },
- "veo-3.1-generate": { "audio": 0.40, "no_audio": 0.20 }
- };
$m := widgets.model;
- $ga := (widgets.generate_audio = "true");
+ $r := widgets.resolution;
+ $ga := widgets.generate_audio;
$seconds := widgets.duration;
- $modelKey :=
- $contains($m, "veo-3.1-fast-generate") ? "veo-3.1-fast-generate" :
- $contains($m, "veo-3.1-generate") ? "veo-3.1-generate" :
- "";
- $audioKey := $ga ? "audio" : "no_audio";
- $modelPrices := $lookup($prices, $modelKey);
- $pps := $lookup($modelPrices, $audioKey);
- ($pps != null)
- ? {"type":"usd","usd": $pps * $seconds}
- : {"type":"range_usd","min_usd": 0.4, "max_usd": 3.2}
+ $pps :=
+ $contains($m, "lite")
+ ? ($r = "1080p" ? ($ga ? 0.08 : 0.05) : ($ga ? 0.05 : 0.03))
+ : $contains($m, "fast")
+ ? ($r = "4k" ? ($ga ? 0.30 : 0.25) : $r = "1080p" ? ($ga ? 0.12 : 0.10) : ($ga ? 0.10 : 0.08))
+ : ($r = "4k" ? ($ga ? 0.60 : 0.40) : ($ga ? 0.40 : 0.20));
+ {"type":"usd","usd": $pps * $seconds}
)
""",
),
@@ -482,6 +564,9 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
model: str,
generate_audio: bool,
):
+ if "lite" in model and resolution == "4k":
+ raise Exception("4K resolution is not supported by the veo-3.1-lite model.")
+
model = MODELS_MAP[model]
initial_response = await sync_op(
cls,
@@ -519,7 +604,7 @@ class Veo3FirstLastFrameNode(IO.ComfyNode):
data=VeoGenVidPollRequest(
operationName=initial_response.name,
),
- poll_interval=5.0,
+ poll_interval=9.0,
estimated_duration=AVERAGE_DURATION_VIDEO_GEN,
)
diff --git a/comfy_api_nodes/nodes_vidu.py b/comfy_api_nodes/nodes_vidu.py
index f04407eb5..16f6113de 100644
--- a/comfy_api_nodes/nodes_vidu.py
+++ b/comfy_api_nodes/nodes_vidu.py
@@ -38,7 +38,7 @@ async def execute_task(
cls: type[IO.ComfyNode],
vidu_endpoint: str,
payload: TaskCreationRequest | TaskExtendCreationRequest | TaskMultiFrameCreationRequest,
- max_poll_attempts: int = 320,
+ max_poll_attempts: int = 480,
) -> list[TaskResult]:
task_creation_response = await sync_op(
cls,
@@ -71,7 +71,7 @@ class ViduTextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="ViduTextToVideoNode",
display_name="Vidu Text To Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate video from a text prompt",
inputs=[
IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"),
@@ -169,7 +169,7 @@ class ViduImageToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="ViduImageToVideoNode",
display_name="Vidu Image To Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate video from image and optional prompt",
inputs=[
IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"),
@@ -273,7 +273,7 @@ class ViduReferenceVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="ViduReferenceVideoNode",
display_name="Vidu Reference To Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate video from multiple images and a prompt",
inputs=[
IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"),
@@ -388,7 +388,7 @@ class ViduStartEndToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="ViduStartEndToVideoNode",
display_name="Vidu Start End To Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate a video from start and end frames and a prompt",
inputs=[
IO.Combo.Input("model", options=["viduq1"], tooltip="Model name"),
@@ -492,7 +492,7 @@ class Vidu2TextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="Vidu2TextToVideoNode",
display_name="Vidu2 Text-to-Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate video from a text prompt",
inputs=[
IO.Combo.Input("model", options=["viduq2"]),
@@ -584,7 +584,7 @@ class Vidu2ImageToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="Vidu2ImageToVideoNode",
display_name="Vidu2 Image-to-Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate a video from an image and an optional prompt.",
inputs=[
IO.Combo.Input("model", options=["viduq2-pro-fast", "viduq2-pro", "viduq2-turbo"]),
@@ -714,7 +714,7 @@ class Vidu2ReferenceVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="Vidu2ReferenceVideoNode",
display_name="Vidu2 Reference-to-Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate a video from multiple reference images and a prompt.",
inputs=[
IO.Combo.Input("model", options=["viduq2"]),
@@ -849,7 +849,7 @@ class Vidu2StartEndToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="Vidu2StartEndToVideoNode",
display_name="Vidu2 Start/End Frame-to-Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate a video from a start frame, an end frame, and a prompt.",
inputs=[
IO.Combo.Input("model", options=["viduq2-pro-fast", "viduq2-pro", "viduq2-turbo"]),
@@ -969,7 +969,7 @@ class ViduExtendVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="ViduExtendVideoNode",
display_name="Vidu Video Extension",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Extend an existing video by generating additional frames.",
inputs=[
IO.DynamicCombo.Input(
@@ -1097,7 +1097,6 @@ class ViduExtendVideoNode(IO.ComfyNode):
video_url=await upload_video_to_comfyapi(cls, video, wait_label="Uploading video"),
images=[image_url] if image_url else None,
),
- max_poll_attempts=480,
)
return IO.NodeOutput(await download_url_to_video_output(results[0].url))
@@ -1139,7 +1138,7 @@ class ViduMultiFrameVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="ViduMultiFrameVideoNode",
display_name="Vidu Multi-Frame Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate a video with multiple keyframe transitions.",
inputs=[
IO.Combo.Input("model", options=["viduq2-pro", "viduq2-turbo"]),
@@ -1285,7 +1284,7 @@ class Vidu3TextToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="Vidu3TextToVideoNode",
display_name="Vidu Q3 Text-to-Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate video from a text prompt.",
inputs=[
IO.DynamicCombo.Input(
@@ -1430,7 +1429,7 @@ class Vidu3ImageToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="Vidu3ImageToVideoNode",
display_name="Vidu Q3 Image-to-Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate a video from an image and an optional prompt.",
inputs=[
IO.DynamicCombo.Input(
@@ -1572,7 +1571,7 @@ class Vidu3StartEndToVideoNode(IO.ComfyNode):
return IO.Schema(
node_id="Vidu3StartEndToVideoNode",
display_name="Vidu Q3 Start/End Frame-to-Video Generation",
- category="api node/video/Vidu",
+ category="video/partner/Vidu",
description="Generate a video from a start frame, an end frame, and a prompt.",
inputs=[
IO.DynamicCombo.Input(
diff --git a/comfy_api_nodes/nodes_wan.py b/comfy_api_nodes/nodes_wan.py
index d1470894a..a235dc387 100644
--- a/comfy_api_nodes/nodes_wan.py
+++ b/comfy_api_nodes/nodes_wan.py
@@ -61,7 +61,7 @@ class WanTextToImageApi(IO.ComfyNode):
return IO.Schema(
node_id="WanTextToImageApi",
display_name="Wan Text to Image",
- category="api node/image/Wan",
+ category="image/partner/Wan",
description="Generates an image based on a text prompt.",
inputs=[
IO.Combo.Input(
@@ -184,7 +184,7 @@ class WanImageToImageApi(IO.ComfyNode):
return IO.Schema(
node_id="WanImageToImageApi",
display_name="Wan Image to Image",
- category="api node/image/Wan",
+ category="image/partner/Wan",
description="Generates an image from one or two input images and a text prompt. "
"The output image is currently fixed at 1.6 MP, and its aspect ratio matches the input image(s).",
inputs=[
@@ -312,7 +312,7 @@ class WanTextToVideoApi(IO.ComfyNode):
return IO.Schema(
node_id="WanTextToVideoApi",
display_name="Wan Text to Video",
- category="api node/video/Wan",
+ category="video/partner/Wan",
description="Generates a video based on a text prompt.",
inputs=[
IO.Combo.Input(
@@ -495,7 +495,7 @@ class WanImageToVideoApi(IO.ComfyNode):
return IO.Schema(
node_id="WanImageToVideoApi",
display_name="Wan Image to Video",
- category="api node/video/Wan",
+ category="video/partner/Wan",
description="Generates a video from the first frame and a text prompt.",
inputs=[
IO.Combo.Input(
@@ -674,7 +674,7 @@ class WanReferenceVideoApi(IO.ComfyNode):
return IO.Schema(
node_id="WanReferenceVideoApi",
display_name="Wan Reference to Video",
- category="api node/video/Wan",
+ category="video/partner/Wan",
description="Use the character and voice from input videos, combined with a prompt, "
"to generate a new video that maintains character consistency.",
inputs=[
@@ -818,7 +818,6 @@ class WanReferenceVideoApi(IO.ComfyNode):
response_model=VideoTaskStatusResponse,
status_extractor=lambda x: x.output.task_status,
poll_interval=6,
- max_poll_attempts=280,
)
return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))
@@ -829,7 +828,7 @@ class Wan2TextToVideoApi(IO.ComfyNode):
return IO.Schema(
node_id="Wan2TextToVideoApi",
display_name="Wan 2.7 Text to Video",
- category="api node/video/Wan",
+ category="video/partner/Wan",
description="Generates a video based on a text prompt using the Wan 2.7 model.",
inputs=[
IO.DynamicCombo.Input(
@@ -982,7 +981,7 @@ class Wan2ImageToVideoApi(IO.ComfyNode):
return IO.Schema(
node_id="Wan2ImageToVideoApi",
display_name="Wan 2.7 Image to Video",
- category="api node/video/Wan",
+ category="video/partner/Wan",
description="Generate a video from a first-frame image, with optional last-frame image and audio.",
inputs=[
IO.DynamicCombo.Input(
@@ -1153,7 +1152,7 @@ class Wan2VideoContinuationApi(IO.ComfyNode):
return IO.Schema(
node_id="Wan2VideoContinuationApi",
display_name="Wan 2.7 Video Continuation",
- category="api node/video/Wan",
+ category="video/partner/Wan",
description="Continue a video from where it left off, with optional last-frame control.",
inputs=[
IO.DynamicCombo.Input(
@@ -1320,7 +1319,7 @@ class Wan2VideoEditApi(IO.ComfyNode):
return IO.Schema(
node_id="Wan2VideoEditApi",
display_name="Wan 2.7 Video Edit",
- category="api node/video/Wan",
+ category="video/partner/Wan",
description="Edit a video using text instructions, reference images, or style transfer.",
inputs=[
IO.DynamicCombo.Input(
@@ -1478,7 +1477,7 @@ class Wan2ReferenceVideoApi(IO.ComfyNode):
return IO.Schema(
node_id="Wan2ReferenceVideoApi",
display_name="Wan 2.7 Reference to Video",
- category="api node/video/Wan",
+ category="video/partner/Wan",
description="Generate a video featuring a person or object from reference materials. "
"Supports single-character performances and multi-character interactions.",
inputs=[
@@ -1646,6 +1645,557 @@ class Wan2ReferenceVideoApi(IO.ComfyNode):
return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))
+class HappyHorseTextToVideoApi(IO.ComfyNode):  # Text-to-video API node for the HappyHorse model, sent through the /proxy/wan endpoints.
+ @classmethod
+ def define_schema(cls):  # Declares the node's inputs, output, hidden auth fields and price badge.
+ return IO.Schema(
+ node_id="HappyHorseTextToVideoApi",
+ display_name="HappyHorse Text to Video",
+ category="video/partner/Wan",
+ description="Generates a video based on a text prompt using the HappyHorse model.",
+ inputs=[
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "happyhorse-1.0-t2v",
+ [
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="Prompt describing the elements and visual features. "
+ "Supports English and Chinese.",
+ ),
+ IO.Combo.Input(
+ "resolution",
+ options=["720P", "1080P"],
+ ),
+ IO.Combo.Input(
+ "ratio",
+ options=["16:9", "9:16", "1:1", "4:3", "3:4"],
+ ),
+ IO.Int.Input(
+ "duration",
+ default=5,
+ min=3,
+ max=15,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ ),
+ ],
+ ),
+ ],
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed to use for generation.",
+ ),
+ IO.Boolean.Input(
+ "watermark",
+ default=False,
+ tooltip="Whether to add an AI-generated watermark to the result.",
+ advanced=True,
+ ),
+ ],
+ outputs=[
+ IO.Video.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),  # the expression below reads model.resolution and model.duration
+ expr="""
+ (
+ $res := $lookup(widgets, "model.resolution");
+ $dur := $lookup(widgets, "model.duration");
+ $ppsTable := { "720p": 0.14, "1080p": 0.24 };
+ $pps := $lookup($ppsTable, $res);
+ { "type": "usd", "usd": $pps * $dur }
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(  # Create the generation task, poll it, and return the downloaded video.
+ cls,
+ model: dict,  # selected option plus its nested fields; read below as model["model"], ["prompt"], ["resolution"], ["ratio"], ["duration"]
+ seed: int,
+ watermark: bool,
+ ):
+ validate_string(model["prompt"], strip_whitespace=False, min_length=1)  # prompt is required here (min_length=1, no stripping)
+ initial_response = await sync_op(
+ cls,
+ ApiEndpoint(
+ path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
+ method="POST",
+ ),
+ response_model=TaskCreationResponse,
+ data=Wan27Text2VideoTaskCreationRequest(  # reuses the Wan 2.7 text-to-video request models
+ model=model["model"],
+ input=Text2VideoInputField(
+ prompt=model["prompt"],
+ negative_prompt=None,  # this node declares no negative prompt input
+ ),
+ parameters=Wan27Text2VideoParametersField(
+ resolution=model["resolution"],
+ ratio=model["ratio"],
+ duration=model["duration"],
+ seed=seed,
+ watermark=watermark,
+ ),
+ ),
+ )
+ if not initial_response.output:  # no output on task creation: raise with the response code and message
+ raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}")
+ response = await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"),  # task endpoint keyed by the created task_id
+ response_model=VideoTaskStatusResponse,
+ status_extractor=lambda x: x.output.task_status,
+ poll_interval=7,
+ )
+ return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))  # video is fetched from output.video_url
+
+
+class HappyHorseImageToVideoApi(IO.ComfyNode):  # Image-to-video API node for the HappyHorse model, sent through the /proxy/wan endpoints.
+ @classmethod
+ def define_schema(cls):  # Declares the node's inputs, output, hidden auth fields and price badge.
+ return IO.Schema(
+ node_id="HappyHorseImageToVideoApi",
+ display_name="HappyHorse Image to Video",
+ category="video/partner/Wan",
+ description="Generate a video from a first-frame image using the HappyHorse model.",
+ inputs=[
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "happyhorse-1.0-i2v",
+ [
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="Prompt describing the elements and visual features. "
+ "Supports English and Chinese.",
+ ),
+ IO.Combo.Input(
+ "resolution",
+ options=["720P", "1080P"],
+ ),
+ IO.Int.Input(
+ "duration",
+ default=5,
+ min=3,
+ max=15,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ ),
+ ],
+ ),
+ ],
+ ),
+ IO.Image.Input(
+ "first_frame",
+ tooltip="First frame image. The output aspect ratio is derived from this image.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed to use for generation.",
+ ),
+ IO.Boolean.Input(
+ "watermark",
+ default=False,
+ tooltip="Whether to add an AI-generated watermark to the result.",
+ advanced=True,
+ ),
+ ],
+ outputs=[
+ IO.Video.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),  # the expression below reads model.resolution and model.duration
+ expr="""
+ (
+ $res := $lookup(widgets, "model.resolution");
+ $dur := $lookup(widgets, "model.duration");
+ $ppsTable := { "720p": 0.14, "1080p": 0.24 };
+ $pps := $lookup($ppsTable, $res);
+ { "type": "usd", "usd": $pps * $dur }
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(  # Upload the first frame, create the generation task, poll it, and return the downloaded video.
+ cls,
+ model: dict,  # selected option plus its nested fields; read below as model["model"], ["prompt"], ["resolution"], ["duration"]
+ first_frame: Input.Image,
+ seed: int,
+ watermark: bool,
+ ):
+ media = [  # a single media item: the uploaded first frame
+ Wan27MediaItem(
+ type="first_frame",
+ url=await upload_image_to_comfyapi(cls, image=first_frame),
+ )
+ ]
+ initial_response = await sync_op(
+ cls,
+ ApiEndpoint(
+ path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
+ method="POST",
+ ),
+ response_model=TaskCreationResponse,
+ data=Wan27ImageToVideoTaskCreationRequest(  # reuses the Wan 2.7 image-to-video request models
+ model=model["model"],
+ input=Wan27ImageToVideoInputField(
+ prompt=model["prompt"] or None,  # an empty prompt is sent as None; the prompt is not validated in this node
+ negative_prompt=None,  # this node declares no negative prompt input
+ media=media,
+ ),
+ parameters=Wan27ImageToVideoParametersField(
+ resolution=model["resolution"],
+ duration=model["duration"],
+ seed=seed,
+ watermark=watermark,
+ ),
+ ),
+ )
+ if not initial_response.output:  # no output on task creation: raise with the response code and message
+ raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}")
+ response = await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"),  # task endpoint keyed by the created task_id
+ response_model=VideoTaskStatusResponse,
+ status_extractor=lambda x: x.output.task_status,
+ poll_interval=7,
+ )
+ return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))  # video is fetched from output.video_url
+
+
+class HappyHorseVideoEditApi(IO.ComfyNode):  # Video-edit API node for the HappyHorse model, sent through the /proxy/wan endpoints.
+ @classmethod
+ def define_schema(cls):  # Declares the node's inputs, output, hidden auth fields and price badge.
+ return IO.Schema(
+ node_id="HappyHorseVideoEditApi",
+ display_name="HappyHorse Video Edit",
+ category="video/partner/Wan",
+ description="Edit a video using text instructions or reference images with the HappyHorse model. "
+ "Output duration is 3-15s and matches the input video; inputs longer than 15s are truncated.",
+ inputs=[
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "happyhorse-1.0-video-edit",
+ [
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="Editing instructions or style transfer requirements.",
+ ),
+ IO.Combo.Input(
+ "resolution",
+ options=["720P", "1080P"],
+ ),
+ IO.Combo.Input(
+ "ratio",
+ options=["16:9", "9:16", "1:1", "4:3", "3:4"],
+ tooltip="Aspect ratio. If not changed, approximates the input video ratio.",
+ ),
+ IO.Autogrow.Input(
+ "reference_images",
+ template=IO.Autogrow.TemplateNames(
+ IO.Image.Input("reference_image"),
+ names=[
+ "image1",
+ "image2",
+ "image3",
+ "image4",
+ "image5",
+ ],
+ min=0,
+ ),
+ ),
+ ],
+ ),
+ ],
+ ),
+ IO.Video.Input(
+ "video",
+ tooltip="The video to edit.",
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed to use for generation.",
+ ),
+ IO.Boolean.Input(
+ "watermark",
+ default=False,
+ tooltip="Whether to add an AI-generated watermark to the result.",
+ advanced=True,
+ ),
+ ],
+ outputs=[
+ IO.Video.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution"]),  # the expression below reads only model.resolution and yields a rate with a "/second" suffix
+ expr="""
+ (
+ $res := $lookup(widgets, "model.resolution");
+ $ppsTable := { "720p": 0.14, "1080p": 0.24 };
+ $pps := $lookup($ppsTable, $res);
+ { "type": "usd", "usd": $pps, "format": { "suffix": "/second" } }
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(  # Upload the video and any reference images, create the edit task, poll it, and return the downloaded video.
+ cls,
+ model: dict,  # selected option plus its nested fields; read below as model["model"], ["prompt"], ["resolution"], ["ratio"], optional ["reference_images"]
+ video: Input.Video,
+ seed: int,
+ watermark: bool,
+ ):
+ validate_string(model["prompt"], strip_whitespace=False, min_length=1)  # prompt is required here (min_length=1, no stripping)
+ validate_video_duration(video, min_duration=3, max_duration=60)  # NOTE(review): allows up to 60s while the description says inputs over 15s are truncated - confirm where truncation happens
+ media = [Wan27MediaItem(type="video", url=await upload_video_to_comfyapi(cls, video))]  # the input video is always the first media item
+ reference_images = model.get("reference_images", {})  # a missing key is treated as no reference images
+ for key in reference_images:  # each reference image is uploaded and appended after the video
+ media.append(
+ Wan27MediaItem(
+ type="reference_image", url=await upload_image_to_comfyapi(cls, image=reference_images[key])
+ )
+ )
+ initial_response = await sync_op(
+ cls,
+ ApiEndpoint(
+ path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
+ method="POST",
+ ),
+ response_model=TaskCreationResponse,
+ data=Wan27VideoEditTaskCreationRequest(  # reuses the Wan 2.7 video-edit request models
+ model=model["model"],
+ input=Wan27VideoEditInputField(prompt=model["prompt"], media=media),
+ parameters=Wan27VideoEditParametersField(
+ resolution=model["resolution"],
+ ratio=model["ratio"],
+ duration=None,  # no duration is sent; the node description says the output matches the input video
+ watermark=watermark,
+ seed=seed,
+ ),
+ ),
+ )
+ if not initial_response.output:  # no output on task creation: raise with the response code and message
+ raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}")
+ response = await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"),  # task endpoint keyed by the created task_id
+ response_model=VideoTaskStatusResponse,
+ status_extractor=lambda x: x.output.task_status,
+ poll_interval=7,
+ )
+ return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))  # video is fetched from output.video_url
+
+
+class HappyHorseReferenceVideoApi(IO.ComfyNode):  # Reference-to-video API node for the HappyHorse model, sent through the /proxy/wan endpoints.
+ @classmethod
+ def define_schema(cls):  # Declares the node's inputs, output, hidden auth fields and price badge.
+ return IO.Schema(
+ node_id="HappyHorseReferenceVideoApi",
+ display_name="HappyHorse Reference to Video",
+ category="video/partner/Wan",
+ description="Generate a video featuring a person or object from reference materials with the HappyHorse "
+ "model. Supports single-character performances and multi-character interactions.",
+ inputs=[
+ IO.DynamicCombo.Input(
+ "model",
+ options=[
+ IO.DynamicCombo.Option(
+ "happyhorse-1.0-r2v",
+ [
+ IO.String.Input(
+ "prompt",
+ multiline=True,
+ default="",
+ tooltip="Prompt describing the video. Use identifiers such as 'character1' and "
+ "'character2' to refer to the reference characters.",
+ ),
+ IO.Combo.Input(
+ "resolution",
+ options=["720P", "1080P"],
+ ),
+ IO.Combo.Input(
+ "ratio",
+ options=["16:9", "9:16", "1:1", "4:3", "3:4"],
+ ),
+ IO.Int.Input(
+ "duration",
+ default=5,
+ min=3,
+ max=15,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ ),
+ IO.Autogrow.Input(
+ "reference_images",
+ template=IO.Autogrow.TemplateNames(
+ IO.Image.Input("reference_image"),
+ names=[
+ "image1",
+ "image2",
+ "image3",
+ "image4",
+ "image5",
+ "image6",
+ "image7",
+ "image8",
+ "image9",
+ ],
+ min=1,
+ ),
+ ),
+ ],
+ ),
+ ],
+ ),
+ IO.Int.Input(
+ "seed",
+ default=0,
+ min=0,
+ max=2147483647,
+ step=1,
+ display_mode=IO.NumberDisplay.number,
+ control_after_generate=True,
+ tooltip="Seed to use for generation.",
+ ),
+ IO.Boolean.Input(
+ "watermark",
+ default=False,
+ tooltip="Whether to add an AI-generated watermark to the result.",
+ advanced=True,
+ ),
+ ],
+ outputs=[
+ IO.Video.Output(),
+ ],
+ hidden=[
+ IO.Hidden.auth_token_comfy_org,
+ IO.Hidden.api_key_comfy_org,
+ IO.Hidden.unique_id,
+ ],
+ is_api_node=True,
+ price_badge=IO.PriceBadge(
+ depends_on=IO.PriceBadgeDepends(widgets=["model", "model.resolution", "model.duration"]),  # the expression below reads model.resolution and model.duration
+ expr="""
+ (
+ $res := $lookup(widgets, "model.resolution");
+ $dur := $lookup(widgets, "model.duration");
+ $ppsTable := { "720p": 0.14, "1080p": 0.24 };
+ $pps := $lookup($ppsTable, $res);
+ { "type": "usd", "usd": $pps * $dur }
+ )
+ """,
+ ),
+ )
+
+ @classmethod
+ async def execute(  # Upload the reference images, create the generation task, poll it, and return the downloaded video.
+ cls,
+ model: dict,  # selected option plus its nested fields; read below as model["model"], ["prompt"], ["resolution"], ["ratio"], ["duration"], optional ["reference_images"]
+ seed: int,
+ watermark: bool,
+ ):
+ validate_string(model["prompt"], strip_whitespace=False, min_length=1)  # prompt is required here (min_length=1, no stripping)
+ media = []
+ reference_images = model.get("reference_images", {})  # a missing key is treated as no reference images
+ for key in reference_images:  # each reference image is uploaded and becomes one media item
+ media.append(
+ Wan27MediaItem(
+ type="reference_image",
+ url=await upload_image_to_comfyapi(cls, image=reference_images[key]),
+ )
+ )
+ if not media:  # raises ValueError when no reference image was supplied, before any task is created
+ raise ValueError("At least one reference image must be provided.")
+
+ initial_response = await sync_op(
+ cls,
+ ApiEndpoint(
+ path="/proxy/wan/api/v1/services/aigc/video-generation/video-synthesis",
+ method="POST",
+ ),
+ response_model=TaskCreationResponse,
+ data=Wan27ReferenceVideoTaskCreationRequest(  # reuses the Wan 2.7 reference-to-video request models
+ model=model["model"],
+ input=Wan27ReferenceVideoInputField(
+ prompt=model["prompt"],
+ negative_prompt=None,  # this node declares no negative prompt input
+ media=media,
+ ),
+ parameters=Wan27ReferenceVideoParametersField(
+ resolution=model["resolution"],
+ ratio=model["ratio"],
+ duration=model["duration"],
+ watermark=watermark,
+ seed=seed,
+ ),
+ ),
+ )
+ if not initial_response.output:  # no output on task creation: raise with the response code and message
+ raise Exception(f"An unknown error occurred: {initial_response.code} - {initial_response.message}")
+ response = await poll_op(
+ cls,
+ ApiEndpoint(path=f"/proxy/wan/api/v1/tasks/{initial_response.output.task_id}"),  # task endpoint keyed by the created task_id
+ response_model=VideoTaskStatusResponse,
+ status_extractor=lambda x: x.output.task_status,
+ poll_interval=7,
+ )
+ return IO.NodeOutput(await download_url_to_video_output(response.output.video_url))  # video is fetched from output.video_url
+
+
class WanApiExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -1660,6 +2210,10 @@ class WanApiExtension(ComfyExtension):
Wan2VideoContinuationApi,
Wan2VideoEditApi,
Wan2ReferenceVideoApi,
+ HappyHorseTextToVideoApi,
+ HappyHorseImageToVideoApi,
+ HappyHorseVideoEditApi,
+ HappyHorseReferenceVideoApi,
]
diff --git a/comfy_api_nodes/nodes_wavespeed.py b/comfy_api_nodes/nodes_wavespeed.py
index c59fafd3b..a250015c3 100644
--- a/comfy_api_nodes/nodes_wavespeed.py
+++ b/comfy_api_nodes/nodes_wavespeed.py
@@ -27,7 +27,7 @@ class WavespeedFlashVSRNode(IO.ComfyNode):
return IO.Schema(
node_id="WavespeedFlashVSRNode",
display_name="FlashVSR Video Upscale",
- category="api node/video/WaveSpeed",
+ category="video/partner/WaveSpeed",
description="Fast, high-quality video upscaler that "
"boosts resolution and restores clarity for low-resolution or blurry footage.",
inputs=[
@@ -84,7 +84,6 @@ class WavespeedFlashVSRNode(IO.ComfyNode):
response_model=TaskResultResponse,
status_extractor=lambda x: "failed" if x.data is None else x.data.status,
poll_interval=10.0,
- max_poll_attempts=480,
)
if final_response.code != 200:
raise ValueError(
@@ -99,7 +98,7 @@ class WavespeedImageUpscaleNode(IO.ComfyNode):
return IO.Schema(
node_id="WavespeedImageUpscaleNode",
display_name="WaveSpeed Image Upscale",
- category="api node/image/WaveSpeed",
+ category="image/partner/WaveSpeed",
description="Boost image resolution and quality, upscaling photos to 4K or 8K for sharp, detailed results.",
inputs=[
IO.Combo.Input("model", options=["SeedVR2", "Ultimate"]),
@@ -156,7 +155,6 @@ class WavespeedImageUpscaleNode(IO.ComfyNode):
response_model=TaskResultResponse,
status_extractor=lambda x: "failed" if x.data is None else x.data.status,
poll_interval=10.0,
- max_poll_attempts=480,
)
if final_response.code != 200:
raise ValueError(
diff --git a/comfy_api_nodes/util/__init__.py b/comfy_api_nodes/util/__init__.py
index 0cb9a47c7..25cb88869 100644
--- a/comfy_api_nodes/util/__init__.py
+++ b/comfy_api_nodes/util/__init__.py
@@ -16,6 +16,7 @@ from .conversions import (
convert_mask_to_image,
downscale_image_tensor,
downscale_image_tensor_by_max_side,
+ downscale_video_to_max_pixels,
image_tensor_pair_to_batch,
pil_to_bytesio,
resize_mask_to_image,
@@ -25,6 +26,7 @@ from .conversions import (
text_filepath_to_base64_string,
text_filepath_to_data_uri,
trim_video,
+ upscale_video_to_min_pixels,
video_to_base64_string,
)
from .download_helpers import (
@@ -87,6 +89,7 @@ __all__ = [
"convert_mask_to_image",
"downscale_image_tensor",
"downscale_image_tensor_by_max_side",
+ "downscale_video_to_max_pixels",
"image_tensor_pair_to_batch",
"pil_to_bytesio",
"resize_mask_to_image",
@@ -96,6 +99,7 @@ __all__ = [
"text_filepath_to_base64_string",
"text_filepath_to_data_uri",
"trim_video",
+ "upscale_video_to_min_pixels",
"video_to_base64_string",
# Validation utilities
"get_image_dimensions",
diff --git a/comfy_api_nodes/util/client.py b/comfy_api_nodes/util/client.py
index 9d730b81a..57c501724 100644
--- a/comfy_api_nodes/util/client.py
+++ b/comfy_api_nodes/util/client.py
@@ -19,6 +19,8 @@ from comfy import utils
from comfy_api.latest import IO
from server import PromptServer
+from comfy.deploy_environment import get_deploy_environment
+
from . import request_logger
from ._helpers import (
default_base_url,
@@ -84,7 +86,7 @@ class _PollUIState:
_RETRY_STATUS = {408, 500, 502, 503, 504} # status 429 is handled separately
COMPLETED_STATUSES = ["succeeded", "succeed", "success", "completed", "finished", "done", "complete"]
FAILED_STATUSES = ["cancelled", "canceled", "canceling", "fail", "failed", "error"]
-QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing", "wait"]
+QUEUED_STATUSES = ["created", "queued", "queueing", "submitted", "initializing", "wait", "in_queue"]
async def sync_op(
@@ -148,7 +150,7 @@ async def poll_op(
queued_statuses: list[str | int] | None = None,
data: BaseModel | None = None,
poll_interval: float = 5.0,
- max_poll_attempts: int = 160,
+ max_poll_attempts: int = 480,
timeout_per_poll: float = 120.0,
max_retries_per_poll: int = 10,
retry_delay_per_poll: float = 1.0,
@@ -156,6 +158,7 @@ async def poll_op(
estimated_duration: int | None = None,
cancel_endpoint: ApiEndpoint | None = None,
cancel_timeout: float = 10.0,
+ extra_text: str | None = None,
) -> M:
raw = await poll_op_raw(
cls,
@@ -176,6 +179,7 @@ async def poll_op(
estimated_duration=estimated_duration,
cancel_endpoint=cancel_endpoint,
cancel_timeout=cancel_timeout,
+ extra_text=extra_text,
)
if not isinstance(raw, dict):
raise Exception("Expected JSON response to validate into a Pydantic model, got non-JSON (binary or text).")
@@ -252,7 +256,7 @@ async def poll_op_raw(
queued_statuses: list[str | int] | None = None,
data: dict[str, Any] | BaseModel | None = None,
poll_interval: float = 5.0,
- max_poll_attempts: int = 160,
+ max_poll_attempts: int = 480,
timeout_per_poll: float = 120.0,
max_retries_per_poll: int = 10,
retry_delay_per_poll: float = 1.0,
@@ -260,6 +264,7 @@ async def poll_op_raw(
estimated_duration: int | None = None,
cancel_endpoint: ApiEndpoint | None = None,
cancel_timeout: float = 10.0,
+ extra_text: str | None = None,
) -> dict[str, Any]:
"""
Polls an endpoint until the task reaches a terminal state. Displays time while queued/processing,
@@ -299,6 +304,7 @@ async def poll_op_raw(
price=state.price,
is_queued=state.is_queued,
processing_elapsed_seconds=int(proc_elapsed),
+ extra_text=extra_text,
)
await asyncio.sleep(1.0)
except Exception as exc:
@@ -389,6 +395,7 @@ async def poll_op_raw(
price=state.price,
is_queued=False,
processing_elapsed_seconds=int(state.base_processing_elapsed),
+ extra_text=extra_text,
)
return resp_json
@@ -462,6 +469,7 @@ def _display_time_progress(
price: float | None = None,
is_queued: bool | None = None,
processing_elapsed_seconds: int | None = None,
+ extra_text: str | None = None,
) -> None:
if estimated_total is not None and estimated_total > 0 and is_queued is False:
pe = processing_elapsed_seconds if processing_elapsed_seconds is not None else elapsed_seconds
@@ -469,7 +477,8 @@ def _display_time_progress(
time_line = f"Time elapsed: {int(elapsed_seconds)}s (~{remaining}s remaining)"
else:
time_line = f"Time elapsed: {int(elapsed_seconds)}s"
- _display_text(node_cls, time_line, status=status, price=price)
+ text = f"{time_line}\n\n{extra_text}" if extra_text else time_line
+ _display_text(node_cls, text, status=status, price=price)
async def _diagnose_connectivity() -> dict[str, bool]:
@@ -479,10 +488,30 @@ async def _diagnose_connectivity() -> dict[str, bool]:
"api_accessible": False,
}
timeout = aiohttp.ClientTimeout(total=5.0)
+
+    # Probe Google and Baidu in parallel: Google is blocked by the GFW in mainland China, so the Baidu probe is
+    # needed to avoid reporting users there as offline when their internet connection actually works.
+ internet_probe_urls = ("https://www.google.com", "https://www.baidu.com")
+
async with aiohttp.ClientSession(timeout=timeout) as session:
- with contextlib.suppress(ClientError, OSError):
- async with session.get("https://www.google.com") as resp:
- results["internet_accessible"] = resp.status < 500
+ async def _probe(url: str) -> bool:
+ try:
+ async with session.get(url) as resp:
+ return resp.status < 500
+ except (ClientError, OSError, asyncio.TimeoutError):
+ return False
+
+ probe_tasks = [asyncio.create_task(_probe(u)) for u in internet_probe_urls]
+ try:
+ for fut in asyncio.as_completed(probe_tasks):
+ if await fut:
+ results["internet_accessible"] = True
+ break
+ finally:
+ for t in probe_tasks:
+ if not t.done():
+ t.cancel()
+ await asyncio.gather(*probe_tasks, return_exceptions=True)
if not results["internet_accessible"]:
return results
@@ -617,6 +646,7 @@ async def _request_base(cfg: _RequestConfig, expect_binary: bool):
payload_headers = {"Accept": "*/*"} if expect_binary else {"Accept": "application/json"}
if not parsed_url.scheme and not parsed_url.netloc: # is URL relative?
payload_headers.update(get_auth_header(cfg.node_cls))
+ payload_headers["Comfy-Env"] = get_deploy_environment()
if cfg.endpoint.headers:
payload_headers.update(cfg.endpoint.headers)
diff --git a/comfy_api_nodes/util/conversions.py b/comfy_api_nodes/util/conversions.py
index 82b6d22a5..5738df57f 100644
--- a/comfy_api_nodes/util/conversions.py
+++ b/comfy_api_nodes/util/conversions.py
@@ -129,22 +129,38 @@ def pil_to_bytesio(img: Image.Image, mime_type: str = "image/png") -> BytesIO:
return img_byte_arr
+def _compute_downscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None:
+    """Compute even output dimensions that keep a frame within a pixel budget.
+
+    Returns ``None`` when ``src_w * src_h`` is already at most ``total_pixels``. Otherwise both sides are
+    multiplied by ``sqrt(total_pixels / (src_w * src_h))``, truncated to integers, clamped to at least 2 and
+    rounded down to the nearest even number (many codecs require divisible-by-2 dimensions), so the aspect
+    ratio may drift by a fraction of a percent.
+    """
+    source_area = src_w * src_h
+    if source_area <= total_pixels:
+        return None
+    factor = math.sqrt(total_pixels / source_area)
+
+    def _even_floor(side: int) -> int:
+        scaled = max(2, int(side * factor))
+        return scaled - scaled % 2
+
+    return _even_floor(src_w), _even_floor(src_h)
+
+
def downscale_image_tensor(image: torch.Tensor, total_pixels: int = 1536 * 1024) -> torch.Tensor:
- """Downscale input image tensor to roughly the specified total pixels."""
+ """Downscale input image tensor to roughly the specified total pixels.
+
+ Output dimensions are rounded down to even values so that the result is guaranteed to fit within ``total_pixels``
+ and is compatible with codecs that require even dimensions (e.g. yuv420p).
+ """
samples = image.movedim(-1, 1)
- total = int(total_pixels)
- scale_by = math.sqrt(total / (samples.shape[3] * samples.shape[2]))
- if scale_by >= 1:
+ dims = _compute_downscale_dims(samples.shape[3], samples.shape[2], int(total_pixels))
+ if dims is None:
return image
- width = round(samples.shape[3] * scale_by)
- height = round(samples.shape[2] * scale_by)
-
- s = common_upscale(samples, width, height, "lanczos", "disabled")
- s = s.movedim(1, -1)
- return s
+ new_w, new_h = dims
+ return common_upscale(samples, new_w, new_h, "lanczos", "disabled").movedim(1, -1)
-def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
+def downscale_image_tensor_by_max_side(image: torch.Tensor, *, max_side: int) -> torch.Tensor:
"""Downscale input image tensor so the largest dimension is at most max_side pixels."""
samples = image.movedim(-1, 1)
height, width = samples.shape[2], samples.shape[3]
@@ -399,6 +415,106 @@ def trim_video(video: Input.Video, duration_sec: float) -> Input.Video:
raise RuntimeError(f"Failed to trim video: {str(e)}") from e
+def downscale_video_to_max_pixels(video: Input.Video, max_pixels: int) -> Input.Video:
+    """Shrink ``video`` so that width * height does not exceed ``max_pixels``.
+
+    The very same video object is returned when it already fits. Otherwise the video is re-encoded at the
+    dimensions chosen by ``_compute_downscale_dims`` (even values, aspect ratio kept up to rounding).
+    """
+    width, height = video.get_dimensions()
+    target_dims = _compute_downscale_dims(width, height, max_pixels)
+    return video if target_dims is None else _apply_video_scale(video, target_dims)
+
+
+def _compute_upscale_dims(src_w: int, src_h: int, total_pixels: int) -> tuple[int, int] | None:
+    """Compute even output dimensions that reach at least ``total_pixels``.
+
+    Returns ``None`` when ``src_w * src_h`` already meets ``total_pixels``. Otherwise both sides are multiplied
+    by ``sqrt(total_pixels / (src_w * src_h))``, rounded up to integers and then up to the next even number
+    (many codecs require divisible-by-2 dimensions), so the aspect ratio may drift by a fraction of a percent.
+    """
+    source_area = src_w * src_h
+    if source_area >= total_pixels:
+        return None
+    factor = math.sqrt(total_pixels / source_area)
+    # Round up first, then bump odd values to the next even number.
+    width, height = (math.ceil(side * factor) for side in (src_w, src_h))
+    return width + width % 2, height + height % 2
+
+
+def upscale_video_to_min_pixels(video: Input.Video, min_pixels: int) -> Input.Video:
+    """Enlarge ``video`` so that width * height reaches at least ``min_pixels``.
+
+    The very same video object is returned when it is already large enough. Otherwise the video is re-encoded
+    at the dimensions chosen by ``_compute_upscale_dims`` (even values, aspect ratio kept up to rounding).
+    Note: upscaling a low-resolution source does not add real detail; downstream model quality may suffer.
+    """
+    width, height = video.get_dimensions()
+    target_dims = _compute_upscale_dims(width, height, min_pixels)
+    return video if target_dims is None else _apply_video_scale(video, target_dims)
+
+
+def _apply_video_scale(video: Input.Video, scale_dims: tuple[int, int]) -> Input.Video:
+    """Re-encode ``video`` at ``scale_dims`` into an in-memory MP4.
+
+    Video frames are decoded, reformatted to ``scale_dims`` in ``yuv420p`` and encoded as H.264 at the source
+    frame rate. If the input has an audio stream, the first one found is re-encoded as AAC with the same
+    sample rate and layout.
+
+    Args:
+        video: Source video; read through ``get_stream_source()`` and ``get_frame_rate()``.
+        scale_dims: Target ``(width, height)``.
+
+    Returns:
+        An ``InputImpl.VideoFromFile`` backed by the in-memory buffer holding the re-encoded file.
+
+    Raises:
+        RuntimeError: Wraps any exception raised while decoding, encoding or muxing.
+    """
+    out_w, out_h = scale_dims
+    output_buffer = BytesIO()
+    input_container = None
+    output_container = None
+
+    try:
+        input_source = video.get_stream_source()
+        input_container = av.open(input_source, mode="r")
+        output_container = av.open(output_buffer, mode="w", format="mp4")
+
+        video_stream = output_container.add_stream("h264", rate=video.get_frame_rate())
+        video_stream.width = out_w
+        video_stream.height = out_h
+        video_stream.pix_fmt = "yuv420p"
+
+        # Mirror only the first audio stream of the input, if there is one.
+        audio_stream = None
+        for stream in input_container.streams:
+            if isinstance(stream, av.AudioStream):
+                audio_stream = output_container.add_stream("aac", rate=stream.sample_rate)
+                audio_stream.sample_rate = stream.sample_rate
+                audio_stream.layout = stream.layout
+                break
+
+        for frame in input_container.decode(video=0):
+            frame = frame.reformat(width=out_w, height=out_h, format="yuv420p")
+            for packet in video_stream.encode(frame):
+                output_container.mux(packet)
+        # Calling encode() without a frame flushes packets still buffered in the encoder.
+        for packet in video_stream.encode():
+            output_container.mux(packet)
+
+        if audio_stream is not None:
+            # The video pass consumed the container, so rewind before decoding audio.
+            input_container.seek(0)
+            for audio_frame in input_container.decode(audio=0):
+                for packet in audio_stream.encode(audio_frame):
+                    output_container.mux(packet)
+            for packet in audio_stream.encode():
+                output_container.mux(packet)
+
+        output_container.close()
+        input_container.close()
+        # Rewind so the returned video object reads the buffer from the start.
+        output_buffer.seek(0)
+        return InputImpl.VideoFromFile(output_buffer)
+
+    except Exception as e:
+        # Release whichever containers were opened before re-raising with context.
+        if input_container is not None:
+            input_container.close()
+        if output_container is not None:
+            output_container.close()
+        raise RuntimeError(f"Failed to resize video: {str(e)}") from e
+
+
def _f32_pcm(wav: torch.Tensor) -> torch.Tensor:
"""Convert audio to float 32 bits PCM format. Copy-paste from nodes_audio.py file."""
if wav.dtype.is_floating_point:
diff --git a/comfy_execution/caching.py b/comfy_execution/caching.py
index f9c913bdb..ba1e8bc84 100644
--- a/comfy_execution/caching.py
+++ b/comfy_execution/caching.py
@@ -5,6 +5,7 @@ import psutil
import time
import torch
from typing import Sequence, Mapping, Dict
+from comfy.model_patcher import ModelPatcher
from comfy_execution.graph import DynamicPrompt
from abc import ABC, abstractmethod
@@ -523,13 +524,15 @@ class RAMPressureCache(LRUCache):
self.timestamps[self.cache_key_set.get_data_key(node_id)] = time.time()
super().set_local(node_id, value)
- def ram_release(self, target):
+ def ram_release(self, target, free_active=False):
if psutil.virtual_memory().available >= target:
return
clean_list = []
for key, cache_entry in self.cache.items():
+ if not free_active and self.used_generation[key] == self.generation:
+ continue
oom_score = RAM_CACHE_OLD_WORKFLOW_OOM_MULTIPLIER ** (self.generation - self.used_generation[key])
ram_usage = RAM_CACHE_DEFAULT_RAM_USAGE
@@ -542,6 +545,9 @@ class RAMPressureCache(LRUCache):
scan_list_for_ram_usage(output)
elif isinstance(output, torch.Tensor) and output.device.type == 'cpu':
ram_usage += output.numel() * output.element_size()
+ elif isinstance(output, ModelPatcher) and self.used_generation[key] != self.generation:
+ #old ModelPatchers are the first to go
+ ram_usage = 1e30
scan_list_for_ram_usage(cache_entry.outputs)
oom_score *= ram_usage
diff --git a/comfy_execution/graph.py b/comfy_execution/graph.py
index c47f3c79b..479ee8a53 100644
--- a/comfy_execution/graph.py
+++ b/comfy_execution/graph.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
from typing import Type, Literal
import nodes
diff --git a/comfy_execution/progress.py b/comfy_execution/progress.py
index f951a3350..731b8dc66 100644
--- a/comfy_execution/progress.py
+++ b/comfy_execution/progress.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
from typing import TypedDict, Dict, Optional, Tuple
from typing_extensions import override
from PIL import Image
diff --git a/comfy_execution/validation.py b/comfy_execution/validation.py
index e73624bd1..ae9a2376c 100644
--- a/comfy_execution/validation.py
+++ b/comfy_execution/validation.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
from comfy_api.latest import IO
diff --git a/comfy_extras/frame_interpolation_models/film_net.py b/comfy_extras/frame_interpolation_models/film_net.py
new file mode 100644
index 000000000..36bc79dc3
--- /dev/null
+++ b/comfy_extras/frame_interpolation_models/film_net.py
@@ -0,0 +1,261 @@
+"""FILM: Frame Interpolation for Large Motion (ECCV 2022)."""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ops
+
+ops = comfy.ops.disable_weight_init
+
+
+class FilmConv2d(nn.Module):
+    """Square-kernel convolution with an optional LeakyReLU(0.2) and FILM-style padding.
+
+    Odd kernel sizes use symmetric ``size // 2`` padding inside the convolution. Even kernel sizes use no
+    built-in padding; instead the input is padded by one element on the right and bottom in ``forward``.
+    """
+
+    def __init__(self, in_channels, out_channels, size, activation=True, device=None, dtype=None, operations=ops):
+        super().__init__()
+        is_odd = bool(size % 2)
+        self.even_pad = not is_odd
+        conv_padding = size // 2 if is_odd else 0
+        self.conv = operations.Conv2d(in_channels, out_channels, kernel_size=size, padding=conv_padding, device=device, dtype=dtype)
+        self.activation = nn.LeakyReLU(0.2) if activation else None
+
+    def forward(self, x):
+        """Apply the (optionally pre-padded) convolution followed by the activation, if any."""
+        padded = F.pad(x, (0, 1, 0, 1)) if self.even_pad else x
+        out = self.conv(padded)
+        return out if self.activation is None else self.activation(out)
+
+
+def _warp_core(image, flow, grid_x, grid_y):
+    """Bilinearly resample ``image`` at the base grid displaced by ``flow``.
+
+    ``flow[:, 0]`` and ``flow[:, 1]`` are divided by half the flow width and height respectively before being
+    added to ``grid_x`` and ``grid_y``. Sampling runs in float32 with border padding; the result is cast back
+    to ``image.dtype``.
+    """
+    out_dtype = image.dtype
+    height, width = flow.shape[2], flow.shape[3]
+    offset_x = flow[:, 0].float() / (width * 0.5)
+    offset_y = flow[:, 1].float() / (height * 0.5)
+    sample_x = grid_x[None, None, :] + offset_x
+    sample_y = grid_y[None, :, None] + offset_y
+    grid = torch.stack([sample_x, sample_y], dim=3)
+    warped = F.grid_sample(image.float(), grid, mode="bilinear", padding_mode="border", align_corners=False)
+    return warped.to(out_dtype)
+
+
+def build_image_pyramid(image, pyramid_levels):
+    """Return ``[image, ...]`` where every further level is the previous one 2x2 average-pooled.
+
+    The list always starts with ``image`` itself and holds ``pyramid_levels`` entries (one entry when
+    ``pyramid_levels`` is 1 or less).
+    """
+    levels = [image]
+    while len(levels) < pyramid_levels:
+        levels.append(F.avg_pool2d(levels[-1], 2, 2))
+    return levels
+
+
+def flow_pyramid_synthesis(residual_pyramid):
+    """Accumulate per-level residual flows into full flows, walking from the last level to the first.
+
+    Starting from ``residual_pyramid[-1]``, the running flow is bilinearly resized to the next level's
+    spatial size, doubled, and summed with that level's residual. The returned list is ordered like the
+    input (index 0 corresponds to ``residual_pyramid[0]``).
+    """
+    flow = residual_pyramid[-1]
+    synthesized = [flow]
+    for residual in reversed(residual_pyramid[:-1]):
+        upsampled = F.interpolate(flow, size=residual.shape[2:4], mode="bilinear", scale_factor=None)
+        # interpolate returns a fresh tensor, so the in-place ops do not touch the caller's data.
+        flow = upsampled.mul_(2).add_(residual)
+        synthesized.append(flow)
+    return synthesized[::-1]
+
+
+def multiply_pyramid(pyramid, scalar):
+    """Multiply every level by ``scalar`` broadcast as ``scalar[:, None, None, None]``."""
+    per_sample = scalar[:, None, None, None]
+    return [level * per_sample for level in pyramid]
+
+
+def pyramid_warp(feature_pyramid, flow_pyramid, warp_fn):
+    """Apply ``warp_fn(features, flow)`` level by level; stops at the shorter of the two pyramids."""
+    warped = []
+    for features, flow in zip(feature_pyramid, flow_pyramid):
+        warped.append(warp_fn(features, flow))
+    return warped
+
+
+def concatenate_pyramids(pyramid1, pyramid2):
+    """Concatenate matching levels of both pyramids along dim 1; stops at the shorter pyramid."""
+    merged = []
+    for first, second in zip(pyramid1, pyramid2):
+        merged.append(torch.cat([first, second], dim=1))
+    return merged
+
+
+class SubTreeExtractor(nn.Module):
+    """Stack of ``n_layers`` double 3x3 convolution stages; stage ``i`` outputs ``channels << i`` channels."""
+
+    def __init__(self, in_channels=3, channels=64, n_layers=4, device=None, dtype=None, operations=ops):
+        super().__init__()
+        stages = []
+        stage_in = in_channels
+        for depth in range(n_layers):
+            stage_out = channels << depth
+            stages.append(nn.Sequential(
+                FilmConv2d(stage_in, stage_out, 3, device=device, dtype=dtype, operations=operations),
+                FilmConv2d(stage_out, stage_out, 3, device=device, dtype=dtype, operations=operations)))
+            stage_in = stage_out
+        self.convs = nn.ModuleList(stages)
+
+    def forward(self, image, n):
+        """Run every stage and return the list of stage outputs.
+
+        A 2x2 average pooling is applied between stages only while the stage index is below ``n - 1``;
+        later stages still run, at the last pooled resolution.
+        """
+        features = []
+        x = image
+        for depth, stage in enumerate(self.convs):
+            x = stage(x)
+            features.append(x)
+            if depth < n - 1:
+                x = F.avg_pool2d(x, 2, 2)
+        return features
+
+
+class FeatureExtractor(nn.Module):
+    """Builds a multi-scale feature pyramid by running one shared ``SubTreeExtractor`` on every image level."""
+
+    def __init__(self, in_channels=3, channels=64, sub_levels=4, device=None, dtype=None, operations=ops):
+        super().__init__()
+        self.extract_sublevels = SubTreeExtractor(in_channels, channels, sub_levels, device=device, dtype=dtype, operations=operations)
+        self.sub_levels = sub_levels
+
+    def forward(self, image_pyramid):
+        """Return one feature tensor per entry of ``image_pyramid``.
+
+        The features of level ``i`` are the channel-wise concatenation of ``sub_pyramids[i][0]``,
+        ``sub_pyramids[i - 1][1]``, ... for every ``j < sub_levels`` with ``j <= i``.
+        """
+        # The pooling depth passed to the sub-tree is capped by the number of levels remaining below ``i``.
+        sub_pyramids = [self.extract_sublevels(image_pyramid[i], min(len(image_pyramid) - i, self.sub_levels))
+                        for i in range(len(image_pyramid))]
+        feature_pyramid = []
+        for i in range(len(image_pyramid)):
+            features = sub_pyramids[i][0]
+            for j in range(1, self.sub_levels):
+                if j <= i:
+                    features = torch.cat([features, sub_pyramids[i - j][j]], dim=1)
+            feature_pyramid.append(features)
+            # Free sub-pyramids no longer needed by future levels
+            if i >= self.sub_levels - 1:
+                sub_pyramids[i - self.sub_levels + 1] = None
+        return feature_pyramid
+
+
+class FlowEstimator(nn.Module):
+    """Convolution stack mapping two concatenated feature maps to a 2-channel output.
+
+    Layout: ``num_convs`` 3x3 layers with ``num_filters`` channels, one 1x1 layer with ``num_filters // 2``
+    channels, and a final 1x1 layer with 2 channels and no activation.
+    """
+
+    def __init__(self, in_channels, num_convs, num_filters, device=None, dtype=None, operations=ops):
+        super().__init__()
+        layers = []
+        width = in_channels
+        for _ in range(num_convs):
+            layers.append(FilmConv2d(width, num_filters, 3, device=device, dtype=dtype, operations=operations))
+            width = num_filters
+        layers.append(FilmConv2d(width, num_filters // 2, 1, device=device, dtype=dtype, operations=operations))
+        layers.append(FilmConv2d(num_filters // 2, 2, 1, activation=False, device=device, dtype=dtype, operations=operations))
+        self._convs = nn.ModuleList(layers)
+
+    def forward(self, features_a, features_b):
+        """Concatenate both inputs along dim 1 and run them through every layer in order."""
+        x = torch.cat([features_a, features_b], dim=1)
+        for layer in self._convs:
+            x = layer(x)
+        return x
+
+
+class PyramidFlowEstimator(nn.Module):
+    """Coarse-to-fine residual flow estimation over a feature pyramid.
+
+    One ``FlowEstimator`` is built per entry of ``flow_convs``. The last one is kept as ``_predictor`` and is
+    reused for all deeper levels; the others are stored in reverse order in ``_predictors``.
+    """
+
+    def __init__(self, filters=64, flow_convs=(3, 3, 3, 3), flow_filters=(32, 64, 128, 256), device=None, dtype=None, operations=ops):
+        super().__init__()
+        in_channels = filters << 1
+        predictors = []
+        for i in range(len(flow_convs)):
+            predictors.append(FlowEstimator(in_channels, flow_convs[i], flow_filters[i], device=device, dtype=dtype, operations=operations))
+            in_channels += filters << (i + 2)
+        self._predictor = predictors[-1]
+        self._predictors = nn.ModuleList(predictors[:-1][::-1])
+
+    def forward(self, feature_pyramid_a, feature_pyramid_b, warp_fn):
+        """Return the per-level residual flows, ordered so that index 0 belongs to pyramid level 0.
+
+        ``warp_fn(features, flow)`` is used to warp ``feature_pyramid_b`` with the running flow estimate
+        before each residual prediction.
+        """
+        levels = len(feature_pyramid_a)
+        # The last pyramid level is predicted directly, without any warping.
+        v = self._predictor(feature_pyramid_a[-1], feature_pyramid_b[-1])
+        residuals = [v]
+        # Coarse-to-fine: shared predictor for deep levels, then specialized predictors for fine levels
+        steps = [(i, self._predictor) for i in range(levels - 2, len(self._predictors) - 1, -1)]
+        steps += [(len(self._predictors) - 1 - k, p) for k, p in enumerate(self._predictors)]
+        for i, predictor in steps:
+            # Resize the running flow to this level's resolution and double it.
+            v = F.interpolate(v, size=feature_pyramid_a[i].shape[2:4], mode="bilinear").mul_(2)
+            v_residual = predictor(feature_pyramid_a[i], warp_fn(feature_pyramid_b[i], v))
+            residuals.append(v_residual)
+            v = v.add_(v_residual)
+        residuals.reverse()
+        return residuals
+
+
+def _get_fusion_channels(level, filters):
+    """Channel count of the fusion input at ``level``.
+
+    Per direction: multi-scale features (``filters << i`` for each ``i < level``) + RGB image (3ch) +
+    flow (2ch); the total is doubled for both directions.
+    """
+    feature_channels = 0
+    for i in range(level):
+        feature_channels += filters << i
+    return (feature_channels + 3 + 2) * 2
+
+
+class Fusion(nn.Module):
+    """Decoder that merges an aligned pyramid level by level and ends in a 1x1 convolution to 3 channels."""
+
+    def __init__(self, n_layers=4, specialized_layers=3, filters=64, device=None, dtype=None, operations=ops):
+        super().__init__()
+        self.output_conv = operations.Conv2d(filters, 3, kernel_size=1, device=device, dtype=dtype)
+        self.convs = nn.ModuleList()
+        in_channels = _get_fusion_channels(n_layers, filters)
+        increase = 0
+        # Stages are created from the highest level index down to 0.
+        for i in range(n_layers)[::-1]:
+            # Width doubles per level up to ``specialized_layers``, then stays constant.
+            num_filters = (filters << i) if i < specialized_layers else (filters << specialized_layers)
+            self.convs.append(nn.ModuleList([
+                FilmConv2d(in_channels, num_filters, 2, activation=False, device=device, dtype=dtype, operations=operations),
+                FilmConv2d(in_channels + (increase or num_filters), num_filters, 3, device=device, dtype=dtype, operations=operations),
+                FilmConv2d(num_filters, num_filters, 3, device=device, dtype=dtype, operations=operations)]))
+            in_channels = num_filters
+            increase = _get_fusion_channels(i, filters) - num_filters // 2
+
+    def forward(self, pyramid):
+        """Fuse ``pyramid`` starting from its last entry and return the 3-channel result."""
+        net = pyramid[-1]
+        for k, layers in enumerate(self.convs):
+            # Stage k works at pyramid level i, counting down to level 0.
+            i = len(self.convs) - 1 - k
+            # Nearest-neighbour resize to this level's size, then the 2x2 convolution.
+            net = layers[0](F.interpolate(net, size=pyramid[i].shape[2:4], mode="nearest"))
+            # Join with this level's aligned features and refine with the two 3x3 convolutions.
+            net = layers[2](layers[1](torch.cat([pyramid[i], net], dim=1)))
+        return self.output_conv(net)
+
+
+class FILMNet(nn.Module):
+    """FILM frame interpolator: feature pyramids -> bidirectional flow pyramids -> fused RGB frames.
+
+    Warp grids are cached for the (height, width, device) of the most recent input and rebuilt whenever
+    any of the three changes.
+    """
+
+    def __init__(self, pyramid_levels=7, fusion_pyramid_levels=5, specialized_levels=3, sub_levels=4,
+                 filters=64, flow_convs=(3, 3, 3, 3), flow_filters=(32, 64, 128, 256), device=None, dtype=None, operations=ops):
+        super().__init__()
+        self.pyramid_levels = pyramid_levels
+        self.fusion_pyramid_levels = fusion_pyramid_levels
+        self.extract = FeatureExtractor(3, filters, sub_levels, device=device, dtype=dtype, operations=operations)
+        self.predict_flow = PyramidFlowEstimator(filters, flow_convs, flow_filters, device=device, dtype=dtype, operations=operations)
+        self.fuse = Fusion(sub_levels, specialized_levels, filters, device=device, dtype=dtype, operations=operations)
+        self._warp_grids = {}
+        # (H, W, device) the cached grids were built for; None until the first build.
+        self._warp_grids_key = None
+
+    def get_dtype(self):
+        """Return the dtype of the first feature-extractor convolution weight."""
+        return self.extract.extract_sublevels.convs[0][0].conv.weight.dtype
+
+    def memory_used_forward(self, shape, dtype):
+        """Heuristic memory estimate: ``1700 * shape[1] * shape[2] * dtype.itemsize``."""
+        return 1700 * shape[1] * shape[2] * dtype.itemsize
+
+    def _build_warp_grids(self, H, W, device):
+        """Pre-compute warp grids for all pyramid levels of an ``H`` x ``W`` input on ``device``.
+
+        The cache is validated against the full-resolution size and the device it was built for. Checking
+        only whether ``(H, W)`` is a key is not enough: the dict also holds the halved sizes of the previous
+        input, so a smaller input whose size matches one of those entries would skip the rebuild and later
+        fail in ``warp`` on the coarser levels that were never created.
+        """
+        cache_key = (H, W, device)
+        if self._warp_grids_key == cache_key:
+            return
+        # Start from an empty dict so grids of the previous resolution are released.
+        grids = {}
+        for _ in range(self.pyramid_levels):
+            grids[(H, W)] = (
+                torch.linspace(-(1 - 1 / W), 1 - 1 / W, W, dtype=torch.float32, device=device),
+                torch.linspace(-(1 - 1 / H), 1 - 1 / H, H, dtype=torch.float32, device=device),
+            )
+            H, W = H // 2, W // 2
+        self._warp_grids = grids
+        self._warp_grids_key = cache_key
+
+    def warp(self, image, flow):
+        """Warp ``image`` by ``flow`` using the cached grid that matches the flow's spatial size."""
+        grid_x, grid_y = self._warp_grids[(flow.shape[2], flow.shape[3])]
+        return _warp_core(image, flow, grid_x, grid_y)
+
+    def extract_features(self, img):
+        """Extract image and feature pyramids for a single frame. Can be cached across pairs."""
+        image_pyramid = build_image_pyramid(img, self.pyramid_levels)
+        feature_pyramid = self.extract(image_pyramid)
+        return image_pyramid, feature_pyramid
+
+    def forward(self, img0, img1, timestep=0.5, cache=None):
+        """Interpolate a single frame between ``img0`` and ``img1`` at ``timestep``.
+
+        A tensor ``timestep`` is reduced to one Python scalar via ``mean(dim=(1, 2, 3)).item()``, which only
+        works when that mean holds a single element.
+        """
+        # FILM uses a scalar timestep per batch element (spatially-varying timesteps not supported)
+        t = timestep.mean(dim=(1, 2, 3)).item() if isinstance(timestep, torch.Tensor) else timestep
+        return self.forward_multi_timestep(img0, img1, [t], cache=cache)
+
+    def forward_multi_timestep(self, img0, img1, timesteps, cache=None):
+        """Compute flow once, synthesize at multiple timesteps. Expects batch=1 inputs.
+
+        ``cache`` may hold precomputed ``extract_features`` results under the keys ``"img0"`` / ``"img1"``.
+        The outputs for all timesteps are concatenated along dim 0.
+        """
+        self._build_warp_grids(img0.shape[2], img0.shape[3], img0.device)
+
+        image_pyr0, feat_pyr0 = cache["img0"] if cache and "img0" in cache else self.extract_features(img0)
+        image_pyr1, feat_pyr1 = cache["img1"] if cache and "img1" in cache else self.extract_features(img1)
+
+        fwd_flow = flow_pyramid_synthesis(self.predict_flow(feat_pyr0, feat_pyr1, self.warp))[:self.fusion_pyramid_levels]
+        bwd_flow = flow_pyramid_synthesis(self.predict_flow(feat_pyr1, feat_pyr0, self.warp))[:self.fusion_pyramid_levels]
+
+        # Build warp targets and free full pyramids (only first fpl levels needed from here)
+        fpl = self.fusion_pyramid_levels
+        p2w = [concatenate_pyramids(image_pyr0[:fpl], feat_pyr0[:fpl]),
+               concatenate_pyramids(image_pyr1[:fpl], feat_pyr1[:fpl])]
+        del image_pyr0, image_pyr1, feat_pyr0, feat_pyr1
+
+        results = []
+        dt_tensors = torch.tensor(timesteps, device=img0.device, dtype=img0.dtype)
+        for idx in range(len(timesteps)):
+            batch_dt = dt_tensors[idx:idx + 1]
+            bwd_scaled = multiply_pyramid(bwd_flow, batch_dt)
+            fwd_scaled = multiply_pyramid(fwd_flow, 1 - batch_dt)
+            fwd_warped = pyramid_warp(p2w[0], bwd_scaled, self.warp)
+            bwd_warped = pyramid_warp(p2w[1], fwd_scaled, self.warp)
+            aligned = [torch.cat([fw, bw, bf, ff], dim=1)
+                       for fw, bw, bf, ff in zip(fwd_warped, bwd_warped, bwd_scaled, fwd_scaled)]
+            # Drop intermediates before fusing to keep peak memory down.
+            del fwd_warped, bwd_warped, bwd_scaled, fwd_scaled
+            results.append(self.fuse(aligned))
+            del aligned
+        return torch.cat(results, dim=0)
diff --git a/comfy_extras/frame_interpolation_models/ifnet.py b/comfy_extras/frame_interpolation_models/ifnet.py
new file mode 100644
index 000000000..ad6edbec9
--- /dev/null
+++ b/comfy_extras/frame_interpolation_models/ifnet.py
@@ -0,0 +1,131 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+import comfy.ops
+
+ops = comfy.ops.disable_weight_init
+
+
+def _warp(img, flow, warp_grids):
+    """Bilinearly resample ``img`` at the cached base grid displaced by the 2-channel ``flow``.
+
+    ``warp_grids[(H, W)]`` must hold ``(base_grid, flow_div)`` for the image size; flow channel 0 is divided
+    by ``flow_div[0]`` and channel 1 by ``flow_div[1]``. Sampling runs in float32 with border padding and
+    ``align_corners=True``; the result is cast back to ``img.dtype``.
+    """
+    batch, _, height, width = img.shape
+    base_grid, flow_div = warp_grids[(height, width)]
+    norm_x = flow[:, 0:1] / flow_div[0]
+    norm_y = flow[:, 1:2] / flow_div[1]
+    offsets = torch.cat([norm_x, norm_y], 1).float()
+    grid = (base_grid.expand(batch, -1, -1, -1) + offsets).permute(0, 2, 3, 1)
+    sampled = F.grid_sample(img.float(), grid, mode="bilinear", padding_mode="border", align_corners=True)
+    return sampled.to(img.dtype)
+
+
+class Head(nn.Module):
+    """Small encoder: a stride-2 convolution, two 3x3 convolutions and a stride-2 transposed convolution.
+
+    The three convolutions are each followed by an in-place LeakyReLU(0.2); the transposed convolution
+    producing ``out_ch`` channels is not.
+    """
+
+    def __init__(self, out_ch=4, device=None, dtype=None, operations=ops):
+        super().__init__()
+        self.cnn0 = operations.Conv2d(3, 16, 3, 2, 1, device=device, dtype=dtype)
+        self.cnn1 = operations.Conv2d(16, 16, 3, 1, 1, device=device, dtype=dtype)
+        self.cnn2 = operations.Conv2d(16, 16, 3, 1, 1, device=device, dtype=dtype)
+        self.cnn3 = operations.ConvTranspose2d(16, out_ch, 4, 2, 1, device=device, dtype=dtype)
+        self.relu = nn.LeakyReLU(0.2, True)
+
+    def forward(self, x):
+        """Encode a 3-channel input into ``out_ch`` feature channels."""
+        for conv in (self.cnn0, self.cnn1, self.cnn2):
+            x = self.relu(conv(x))
+        return self.cnn3(x)
+
+
+class ResConv(nn.Module):
+    """Residual 3x3 convolution: ``LeakyReLU(x + conv(x) * beta)`` with a learnable per-channel ``beta``."""
+
+    def __init__(self, c, device=None, dtype=None, operations=ops):
+        super().__init__()
+        self.conv = operations.Conv2d(c, c, 3, 1, 1, device=device, dtype=dtype)
+        self.beta = nn.Parameter(torch.ones((1, c, 1, 1), device=device, dtype=dtype))
+        self.relu = nn.LeakyReLU(0.2, True)
+
+    def forward(self, x):
+        """Return the activated sum of ``x`` and the ``beta``-scaled convolution of ``x``."""
+        residual = torch.addcmul(x, self.conv(x), self.beta)
+        return self.relu(residual)
+
+
+class IFBlock(nn.Module):
+    """One refinement stage: two stride-2 convolutions, eight ``ResConv`` layers, then a transposed
+    convolution with ``4 * 13`` channels followed by ``PixelShuffle(2)``."""
+
+    def __init__(self, in_planes, c=64, device=None, dtype=None, operations=ops):
+        super().__init__()
+        half = c // 2
+        self.conv0 = nn.Sequential(
+            nn.Sequential(operations.Conv2d(in_planes, half, 3, 2, 1, device=device, dtype=dtype), nn.LeakyReLU(0.2, True)),
+            nn.Sequential(operations.Conv2d(half, c, 3, 2, 1, device=device, dtype=dtype), nn.LeakyReLU(0.2, True)))
+        res_layers = [ResConv(c, device=device, dtype=dtype, operations=operations) for _ in range(8)]
+        self.convblock = nn.Sequential(*res_layers)
+        self.lastconv = nn.Sequential(operations.ConvTranspose2d(c, 4 * 13, 4, 2, 1, device=device, dtype=dtype), nn.PixelShuffle(2))
+
+    def forward(self, x, flow=None, scale=1):
+        """Process ``x`` (and optionally ``flow``) at ``1 / scale`` resolution.
+
+        Returns a 3-tuple sliced from the block output after it is resized back by ``scale``: channels
+        ``[:4]`` multiplied by ``scale``, channel ``[4:5]``, and the remaining channels ``[5:]``.
+        """
+        inv_scale = 1.0 / scale
+        x = F.interpolate(x, scale_factor=inv_scale, mode="bilinear")
+        if flow is not None:
+            # The resized flow is a fresh tensor, so dividing in place leaves the caller's flow untouched.
+            flow = F.interpolate(flow, scale_factor=inv_scale, mode="bilinear").div_(scale)
+            x = torch.cat((x, flow), 1)
+        feat = self.convblock(self.conv0(x))
+        out = F.interpolate(self.lastconv(feat), scale_factor=scale, mode="bilinear")
+        flow_delta = out[:, :4] * scale
+        return flow_delta, out[:, 4:5], out[:, 5:]
+
+
+class IFNet(nn.Module):
+    """Intermediate-flow interpolation network: a ``Head`` encoder plus five ``IFBlock`` stages that run
+    at scales 16, 8, 4, 2 and 1.
+
+    The warp grid is cached for the most recent (height, width) and rebuilt when the size or the device
+    of the input changes.
+    """
+
+    def __init__(self, head_ch=4, channels=(192, 128, 96, 64, 32), device=None, dtype=None, operations=ops):
+        super().__init__()
+        self.encode = Head(out_ch=head_ch, device=device, dtype=dtype, operations=operations)
+        # First block input: img0 + img1 (6) + timestep (1) + both head features.
+        # Later blocks additionally receive the 4 flow, 1 mask and 8 feature channels of the previous block.
+        block_in = [7 + 2 * head_ch] + [8 + 4 + 8 + 2 * head_ch] * 4
+        self.blocks = nn.ModuleList([IFBlock(block_in[i], channels[i], device=device, dtype=dtype, operations=operations) for i in range(5)])
+        self.scale_list = [16, 8, 4, 2, 1]
+        self.pad_align = 64
+        self._warp_grids = {}
+
+    def get_dtype(self):
+        """Return the dtype of the first encoder convolution weight."""
+        return self.encode.cnn0.weight.dtype
+
+    def memory_used_forward(self, shape, dtype):
+        """Heuristic memory estimate: ``300 * shape[1] * shape[2] * dtype.itemsize``."""
+        return 300 * shape[1] * shape[2] * dtype.itemsize
+
+    def _build_warp_grids(self, H, W, device):
+        """Build the base sampling grid and flow divisor for ``H`` x ``W`` inputs on ``device``.
+
+        The cached entry is reused only if it lives on ``device``; a grid left on another device would
+        otherwise be added to the flow in ``_warp`` and fail with a device mismatch.
+        """
+        cached = self._warp_grids.get((H, W))
+        if cached is not None and cached[0].device == device:
+            return
+        self._warp_grids = {}  # clear old resolution grids to prevent memory leaks
+        grid_y, grid_x = torch.meshgrid(
+            torch.linspace(-1.0, 1.0, H, device=device, dtype=torch.float32),
+            torch.linspace(-1.0, 1.0, W, device=device, dtype=torch.float32), indexing="ij")
+        self._warp_grids[(H, W)] = (
+            torch.stack((grid_x, grid_y), dim=0).unsqueeze(0),
+            torch.tensor([(W - 1.0) / 2.0, (H - 1.0) / 2.0], dtype=torch.float32, device=device))
+
+    def warp(self, img, flow):
+        """Warp ``img`` by the 2-channel ``flow`` using the cached grid."""
+        return _warp(img, flow, self._warp_grids)
+
+    def extract_features(self, img):
+        """Extract head features for a single frame. Can be cached across pairs."""
+        return self.encode(img)
+
+    def forward(self, img0, img1, timestep=0.5, cache=None):
+        """Interpolate between ``img0`` and ``img1`` at ``timestep``.
+
+        A non-tensor ``timestep`` is expanded to a ``(B, 1, H, W)`` tensor. ``cache`` may hold precomputed
+        ``extract_features`` results under the keys ``"img0"`` / ``"img1"``; they are expanded to the batch size.
+        """
+        if not isinstance(timestep, torch.Tensor):
+            timestep = torch.full((img0.shape[0], 1, img0.shape[2], img0.shape[3]), timestep, device=img0.device, dtype=img0.dtype)
+
+        self._build_warp_grids(img0.shape[2], img0.shape[3], img0.device)
+
+        B = img0.shape[0]
+        f0 = cache["img0"].expand(B, -1, -1, -1) if cache and "img0" in cache else self.encode(img0)
+        f1 = cache["img1"].expand(B, -1, -1, -1) if cache and "img1" in cache else self.encode(img1)
+        flow = mask = feat = None
+        warped_img0, warped_img1 = img0, img1
+        for i, block in enumerate(self.blocks):
+            if flow is None:
+                # First stage: no flow yet, so the raw frames and features are used.
+                flow, mask, feat = block(torch.cat((img0, img1, f0, f1, timestep), 1), None, scale=self.scale_list[i])
+            else:
+                # Later stages predict a flow correction from the warped frames and warped features.
+                fd, mask, feat = block(
+                    torch.cat((warped_img0, warped_img1, self.warp(f0, flow[:, :2]), self.warp(f1, flow[:, 2:4]), timestep, mask, feat), 1),
+                    flow, scale=self.scale_list[i])
+                flow = flow.add_(fd)
+            warped_img0 = self.warp(img0, flow[:, :2])
+            warped_img1 = self.warp(img1, flow[:, 2:4])
+        # Blend the two warped frames with the sigmoid of the final mask.
+        return torch.lerp(warped_img1, warped_img0, torch.sigmoid(mask))
+
+
+def detect_rife_config(state_dict):
+    """Read the head width and the five block widths from a checkpoint's weight shapes.
+
+    Returns ``(head_ch, channels)``. Raises ``ValueError`` unless all five
+    ``blocks.{i}.conv0.1.0.weight`` entries are present.
+    """
+    head_ch = state_dict["encode.cnn3.weight"].shape[1]  # ConvTranspose2d: (in_ch, out_ch, kH, kW)
+    block_keys = (f"blocks.{i}.conv0.1.0.weight" for i in range(5))
+    channels = [state_dict[key].shape[0] for key in block_keys if key in state_dict]
+    if len(channels) != 5:
+        raise ValueError(f"Unsupported RIFE model: expected 5 blocks, found {len(channels)}")
+    return head_ch, channels
diff --git a/comfy_extras/mediapipe/face_geometry.py b/comfy_extras/mediapipe/face_geometry.py
new file mode 100644
index 000000000..4f3813430
--- /dev/null
+++ b/comfy_extras/mediapipe/face_geometry.py
@@ -0,0 +1,110 @@
+"""Pure-numpy port of MediaPipe's face_geometry (FACE_LANDMARK_PIPELINE mode)
+and its weighted Procrustes solver. Computes the 4x4 facial transformation matrix.
+"""
+
+
+import math
+import numpy as np
+
+
+def _solve_weighted_orthogonal_problem(src: np.ndarray, tgt: np.ndarray, weights: np.ndarray) -> np.ndarray:
+    """Weighted orthogonal Procrustes (similarity): returns the 4x4 M with `target ≈ M @ homogeneous(source)`
+    in the weighted LS sense. `src`/`tgt` are (3, N), `weights` is (N,). fp64 for SVD stability. Port of
+    procrustes_solver.cc. Raises ValueError on a non-positive weight sum, degenerate source or collapsed scale."""
+    sqrt_w = np.sqrt(weights.astype(np.float64))
+    w_total = float((sqrt_w ** 2).sum())
+    if not w_total > 1e-12:  # also catches NaN (negative weights), which would otherwise poison the SVD
+        raise ValueError("Procrustes point weights must be non-negative with a positive sum.")
+    ws = src.astype(np.float64) * sqrt_w
+    wt = tgt.astype(np.float64) * sqrt_w
+
+    c_w = (ws @ sqrt_w) / w_total  # weighted centre of mass of the source points
+    centered = ws - np.outer(c_w, sqrt_w)
+    U, _S, Vt = np.linalg.svd(wt @ centered.T, full_matrices=True)
+    # Disallow reflection: flip the least-significant axis when det(U)·det(V)<0.
+    if np.linalg.det(U) * np.linalg.det(Vt) < 0:  # det(V) == det(Vt)
+        U[:, 2] *= -1.0
+    R = U @ Vt
+    denom = float((centered * ws).sum())
+    if denom < 1e-12:
+        raise ValueError("Procrustes denominator collapsed (degenerate source).")
+    scale = float((R @ centered * wt).sum()) / denom
+    if scale < 1e-12:  # mirrors the "scale is too small" check of procrustes_solver.cc
+        raise ValueError("Procrustes scale collapsed (degenerate target).")
+    translation = ((wt - scale * (R @ ws)) @ sqrt_w) / w_total
+    M = np.eye(4, dtype=np.float64)
+    M[:3, :3], M[:3, 3] = scale * R, translation
+    return M
+
+
+def _estimate_scale(canonical: np.ndarray, runtime: np.ndarray, weights: np.ndarray) -> float:
+    """Scale of the Procrustes fit canonical → runtime: ‖first column of M[:3]‖ per geometry_pipeline.cc::EstimateScale."""
+    return float(np.linalg.norm(_solve_weighted_orthogonal_problem(canonical, runtime, weights)[:3, 0]))
+
+
+def solve_facial_transformation_matrix(
+    landmarks_normalized: np.ndarray,
+    canonical_vertices: np.ndarray,
+    procrustes_indices: np.ndarray,
+    procrustes_weights: np.ndarray,
+    image_width: int,
+    image_height: int,
+    # face_geometry_calculator_options.pbtxt defaults
+    vertical_fov_degrees: float = 63.0,
+    near: float = 1.0,
+) -> np.ndarray:
+    """Solve the 4x4 facial transformation matrix via two-pass scale recovery.
+    `landmarks_normalized` is (N, 3) in MediaPipe normalized convention: x, y
+    in [0,1] with TOP-LEFT origin, z in width-scaled units. `procrustes_indices`
+    selects the rows of `landmarks_normalized` / `canonical_vertices` used for the fit.
+    Returns a float32 (4, 4) matrix."""
+    h_near = 2.0 * near * math.tan(0.5 * math.radians(vertical_fov_degrees))  # near-plane height for the vertical FOV
+    w_near = image_width * h_near / image_height  # near-plane width from the image aspect ratio
+
+    sub = procrustes_indices.astype(np.int64)
+    screen = landmarks_normalized[sub].T.astype(np.float64).copy()  # (3, K): rows are x, y, z
+    canon = canonical_vertices[sub].T.astype(np.float64).copy()
+    weights = procrustes_weights.astype(np.float64)
+
+    # ProjectXY (TOP_LEFT y-flip, then scale all 3 axes; z uses x-scale).
+    screen[1] = 1.0 - screen[1]
+    screen[0] = screen[0] * w_near - 0.5 * w_near
+    screen[1] = screen[1] * h_near - 0.5 * h_near
+    screen[2] = screen[2] * w_near
+    depth_offset = float(screen[2].mean())
+
+    def _unproject(s: np.ndarray, scale: float) -> np.ndarray:
+        s = s.copy()  # never mutate the shared `screen` array
+        s[2] = (s[2] - depth_offset + near) / scale  # re-centre z on the near plane, then rescale
+        s[0] *= s[2] / near
+        s[1] *= s[2] / near
+        s[2] *= -1.0  # same z sign flip as applied to `first` below
+        return s
+
+    first = screen.copy()
+    first[2] *= -1.0
+    s1 = _estimate_scale(canon, first, weights)  # 1st pass: Procrustes on projected XY
+    s2 = _estimate_scale(canon, _unproject(screen, s1), weights)  # 2nd pass: rescale z by s1, un-project XY
+    return _solve_weighted_orthogonal_problem(canon, _unproject(screen, s1 * s2), weights).astype(np.float32)
+
+
+def transformation_matrix_from_detection(face_dict: dict, image_width: int, image_height: int, canonical_data: dict) -> np.ndarray:
+    """Convert a FaceLandmarker face dict into MediaPipe's normalized landmark convention, then solve.
+    FaceMesh landmarks are (x, y, z) in 192-canonical units, whereas MP's geometry wants
+    z_norm = z_canonical * scale_x / image_width; scale_x is read off an affine canonical-xy → image-xy fit."""
+    xy_px = face_dict["landmarks_xy"]
+    canon_3d = face_dict["landmarks_3d"]
+    count = xy_px.shape[0]
+    # Least-squares affine map [x_c, y_c, 1] → (x_px, y_px); row 0 carries the canonical-x axis.
+    design = np.concatenate([canon_3d[:, :2].astype(np.float64), np.ones((count, 1))], axis=1)
+    affine = np.linalg.lstsq(design, xy_px.astype(np.float64), rcond=None)[0]
+    scale_x = float(np.linalg.norm(affine[0]))
+    z_scale = (scale_x if scale_x > 1e-6 else 1.0) / image_width  # unit scale when the fit is degenerate
+    columns = (xy_px[:, 0] / image_width, xy_px[:, 1] / image_height, canon_3d[:, 2] * z_scale)
+    normalized = np.empty((count, 3), dtype=np.float32)
+    for axis, values in enumerate(columns):
+        normalized[:, axis] = values
+    return solve_facial_transformation_matrix(
+        normalized, canonical_data["canonical_vertices"],
+        canonical_data["procrustes_indices"], canonical_data["procrustes_weights"],
+        image_width=image_width, image_height=image_height)
diff --git a/comfy_extras/mediapipe/face_landmarker.py b/comfy_extras/mediapipe/face_landmarker.py
new file mode 100644
index 000000000..e6b463c4c
--- /dev/null
+++ b/comfy_extras/mediapipe/face_landmarker.py
@@ -0,0 +1,681 @@
+"""Pure-PyTorch port of MediaPipe's face_landmarker_v2_with_blendshapes.task:
+BlazeFace detector → FaceMesh v2 → ARKit-52 blendshapes."""
+
+
+import math
+from functools import lru_cache
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from scipy.special import expit
+from torch import Tensor, nn
+
+
+# Values below must stay verbatim with the published face_landmarker_v2 graph
+
+# face_blendshapes_graph.cc::kLandmarksSubsetIdxs — the 146 FaceMesh landmark indices fed to the blendshape model.
+_BS_INPUT_INDICES: Tuple[int, ...] = (
+    0, 1, 4, 5, 6, 7, 8, 10, 13, 14, 17, 21, 33, 37, 39, 40, 46, 52, 53, 54,
+    55, 58, 61, 63, 65, 66, 67, 70, 78, 80, 81, 82, 84, 87, 88, 91, 93, 95,
+    103, 105, 107, 109, 127, 132, 133, 136, 144, 145, 146, 148, 149, 150, 152,
+    153, 154, 155, 157, 158, 159, 160, 161, 162, 163, 168, 172, 173, 176, 178,
+    181, 185, 191, 195, 197, 234, 246, 249, 251, 263, 267, 269, 270, 276, 282,
+    283, 284, 285, 288, 291, 293, 295, 296, 297, 300, 308, 310, 311, 312, 314,
+    317, 318, 321, 323, 324, 332, 334, 336, 338, 356, 361, 362, 365, 373, 374,
+    375, 377, 378, 379, 380, 381, 382, 384, 385, 386, 387, 388, 389, 390, 397,
+    398, 400, 402, 405, 409, 415, 454, 466, 468, 469, 470, 471, 472, 473, 474,
+    475, 476, 477,
+)
+
+# face_blendshapes_graph.cc::kCategoryNames — 52 names, zipped in this order with the blendshape model's outputs.
+BLENDSHAPE_NAMES: Tuple[str, ...] = (
+    "_neutral", "browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft",
+    "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight",
+    "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight",
+    "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight",
+    "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight",
+    "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen",
+    "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight",
+    "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft",
+    "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft",
+    "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower",
+    "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft",
+    "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight",
+    "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight",
+)
+
+# face_detection.pbtxt — short-range BlazeFace.
+_BF_NUM_LAYERS = 4
+_BF_INPUT_SIZE = 128  # detector input is 128x128
+_BF_STRIDES = (8, 16, 16, 16)  # per-layer strides; consecutive equal strides share one anchor grid
+_BF_ANCHOR_OFFSET_X = 0.5
+_BF_ANCHOR_OFFSET_Y = 0.5
+_BF_ASPECT_RATIOS = (1.0,)
+_BF_INTERP_SCALE_AR = 1.0  # > 0 adds one extra anchor per layer in _blazeface_anchors
+_BF_BOX_SCALE = 128.0  # raw regressor outputs are divided by this before decoding
+_BF_KP_OFFSET = 4  # regressor columns 0-3 are the box; keypoint (x, y) pairs start here
+_BF_SCORE_CLIP = 100.0  # logits are clipped to ±this before the sigmoid
+_BF_MIN_SCORE = 0.5  # default score threshold of both decoders
+
+# face_detection_full_range.pbtxt — 48x48 grid at stride 4, 1 anchor/cell.
+_BF_FR_INPUT_SIZE = 192
+_BF_FR_GRID = 48
+_BF_FR_NUM_ANCHORS = _BF_FR_GRID * _BF_FR_GRID
+_BF_FR_BOX_SCALE = 192.0  # full-range counterpart of _BF_BOX_SCALE
+_BF_FR_SCORE_CLIP = 100.0
+
+_FM_INPUT_SIZE = 192  # FaceMesh crop side; landmark x/y come back in these units
+
+# Face ROI: 1.5xbbox rect warped anisotropically into 192x192.
+_FACE_LEFT_EYE_KP = 0  # keypoint indices used to derive the ROI rotation
+_FACE_RIGHT_EYE_KP = 1
+_FACE_ROI_SCALE_X = 1.5
+_FACE_ROI_SCALE_Y = 1.5
+_FACE_ROI_TARGET_ANGLE = 0.0
+
+
+def _tf_same_pad(x: Tensor, kernel: int, stride: int) -> Tensor:
+    """Zero-pad the last two dims the way TF SAME does: an odd total puts the extra pixel on the right/bottom edge."""
+    total_h, total_w = (max((-(-size // stride) - 1) * stride + kernel - size, 0) for size in x.shape[-2:])
+    if not (total_h or total_w):
+        return x
+    top, left = total_h // 2, total_w // 2
+    # F.pad takes (left, right, top, bottom) for the last two dims.
+    return F.pad(x, (left, total_w - left, top, total_h - top))
+
+
+# BlazeFace short-range: stem 5x5/s2 → 16 BlazeBlocks → parallel heads at
+# 16²x88 (2 anchors/cell) and 8²x96 (6/cell) = 896 anchors. (in, out, stride):
+_BLAZEFACE_BLOCKS = [
+    (24, 24, 1), (24, 28, 1), (28, 32, 2), (32, 36, 1),  # blocks 0-3
+    (36, 42, 1), (42, 48, 2), (48, 56, 1), (56, 64, 1),  # blocks 4-7
+    (64, 72, 1), (72, 80, 1), (80, 88, 1), (88, 96, 2),  # blocks 8-11; block 11 (88→96, stride 2) follows the 16x16 head tap
+    (96, 96, 1), (96, 96, 1), (96, 96, 1), (96, 96, 1),  # blocks 12-15
+]
+
+
+class BlazeFaceBlock(nn.Module):
+    """Depthwise 3x3 → pointwise 1x1 plus a skip path; the skip is max-pooled when stride > 1 and channel-padded when out_ch > in_ch."""
+
+    def __init__(self, in_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None):
+        super().__init__()
+        ops = nn if operations is None else operations
+        self.in_ch, self.out_ch, self.stride = in_ch, out_ch, stride
+        self.depthwise = ops.Conv2d(in_ch, in_ch, 3, stride=stride, padding=0, groups=in_ch, bias=True, device=device, dtype=dtype)
+        self.pointwise = ops.Conv2d(in_ch, out_ch, 1, padding=0, bias=True, device=device, dtype=dtype)
+
+    def forward(self, x: Tensor) -> Tensor:
+        skip = x if self.stride <= 1 else F.max_pool2d(x, 2, 2)
+        if self.in_ch < self.out_ch:
+            skip = F.pad(skip, (0, 0, 0, 0, 0, self.out_ch - self.in_ch))  # grow the channel dim with zeros
+        padded = F.pad(x, (1, 1, 1, 1)) if self.stride <= 1 else _tf_same_pad(x, 3, self.stride)
+        return F.relu(self.pointwise(self.depthwise(padded)) + skip)
+
+
+class BlazeFace(nn.Module):
+    """Short-range BlazeFace detector: (B, 3, 128, 128) in [-1, 1] → (reg, cls) over 896 anchors, 16 + 1 values each."""
+
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+        ops = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        self.stem = ops.Conv2d(3, 24, 5, stride=2, padding=0, bias=True, **factory)
+        self.blocks = nn.ModuleList(
+            BlazeFaceBlock(cin, cout, stride, operations=operations, **factory) for (cin, cout, stride) in _BLAZEFACE_BLOCKS)
+        # Anchor budget: 16²x2 + 8²x6 = 512 + 384 = 896, with 16 regressed values per anchor.
+        self.cls_16 = ops.Conv2d(88, 2, 1, padding=0, bias=True, **factory)
+        self.cls_8 = ops.Conv2d(96, 6, 1, padding=0, bias=True, **factory)
+        self.reg_16 = ops.Conv2d(88, 32, 1, padding=0, bias=True, **factory)
+        self.reg_8 = ops.Conv2d(96, 96, 1, padding=0, bias=True, **factory)
+
+    def forward(self, image_chw_normalized: Tensor) -> tuple[Tensor, Tensor]:
+        x = F.relu(self.stem(_tf_same_pad(image_chw_normalized, 5, 2)))
+        # Blocks 0-10 end at the 16x16 tap (before the 88→96 stride-2 block 11); blocks 11-15 end at the 8x8 tap.
+        taps = []
+        for first, last in ((0, 11), (11, 16)):
+            for blk in self.blocks[first:last]:
+                x = blk(x)
+            taps.append(x)
+        feat_16, feat_8 = taps
+
+        def to_rows(t, anchors, values):  # NCHW head output → NHWC → (B, H*W*anchors, values)
+            batch, _, rows, cols = t.shape
+            return t.permute(0, 2, 3, 1).reshape(batch, rows * cols * anchors, values)
+
+        cls = torch.cat([to_rows(self.cls_16(feat_16), 2, 1), to_rows(self.cls_8(feat_8), 6, 1)], dim=1)
+        reg = torch.cat([to_rows(self.reg_16(feat_16), 2, 16), to_rows(self.reg_8(feat_8), 6, 16)], dim=1)
+        return reg, cls
+
+
+# BlazeFace full-range (face_detection_full_range_sparse.tflite): MobileNetV2-ish
+# backbone + top-down FPN, 192² input → 2304 anchors at the 48x48 grid.
+class FRBlock(nn.Module):
+    """Double inverted residual block: DW → PW(mid) → DW → PW(out), with a skip when shapes allow.
+
+    Following the source tflite, neither depthwise conv has an activation, pw1 is always followed
+    by ReLU, and the final ReLU is applied after the skip add when the block has one.
+    """
+
+    def __init__(self, in_ch: int, mid_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None):
+        super().__init__()
+        ops = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        self.has_residual = stride == 1 and in_ch == out_ch
+        self.dw1 = ops.Conv2d(in_ch, in_ch, 3, stride=stride, padding=0, groups=in_ch, bias=True, **factory)
+        self.pw1 = ops.Conv2d(in_ch, mid_ch, 1, padding=0, bias=True, **factory)
+        self.dw2 = ops.Conv2d(mid_ch, mid_ch, 3, stride=1, padding=0, groups=mid_ch, bias=True, **factory)
+        self.pw2 = ops.Conv2d(mid_ch, out_ch, 1, padding=0, bias=True, **factory)
+
+    def forward(self, x: Tensor) -> Tensor:
+        pad1 = (1, 1, 1, 1)  # explicit symmetric padding for both 3x3 depthwise convs
+        hidden = F.relu(self.pw1(self.dw1(F.pad(x, pad1))))
+        out = self.pw2(self.dw2(F.pad(hidden, pad1)))
+        return F.relu(out + x) if self.has_residual else F.relu(out)
+
+
+# (in_ch, mid_ch, out_ch, stride). Stages downsample 96²x32 → 48²x64 → 24²x128
+# → 12²x192 → 6²x384. Lateral taps at indices 4, 7, 10 (see _FR_LATERAL_*).
+_FR_BACKBONE_BLOCKS = [
+    (32, 8, 32, 1), (32, 8, 32, 1),                                             # 96²x32
+    (32, 16, 64, 2), (64, 16, 64, 1), (64, 16, 64, 1),                          # 48²x64 — tap[0]
+    (64, 32, 128, 2), (128, 32, 128, 1), (128, 32, 128, 1),                     # 24²x128 — tap[1]
+    (128, 48, 192, 2), (192, 48, 192, 1), (192, 48, 192, 1),                    # 12²x192 — tap[2]
+    (192, 96, 384, 2), (384, 96, 384, 1), (384, 96, 384, 1), (384, 96, 384, 1), # 6²x384
+]
+_FR_LATERAL_TAP_INDICES = (4, 7, 10)  # backbone block indices whose outputs are kept as FPN laterals
+_FR_LATERAL_CHANNELS = ((64, 48), (128, 64), (192, 96))  # (in, out) per side-conv
+
+# Decoder blocks per FPN level (after upsample-and-merge with the lateral), listed deepest level first.
+_FR_DECODER_BLOCKS = [
+    [(96, 48, 96, 1), (96, 48, 96, 1)],   # 12²x96
+    [(64, 32, 64, 1), (64, 32, 64, 1)],   # 24²x64
+    [(48, 24, 48, 1)],                    # 48²x48 — feeds the heads
+]
+
+
+def _dcr_depth_to_space(t: Tensor, r: int, c_out: int) -> Tensor:
+    """TF DEPTH_TO_SPACE with DCR channel layout, i.e. input channels ordered (i, j, c_out).
+    torch's pixel_shuffle assumes CRD order, which would permute the output channels when c_out > 1."""
+    batch, _, rows, cols = t.shape
+    # (B, i, j, c, H, W) → (B, c, H, i, W, j): interleave the r×r sub-pixels into H and W.
+    blocks = t.reshape(batch, r, r, c_out, rows, cols).permute(0, 3, 4, 1, 5, 2)
+    return blocks.contiguous().reshape(batch, c_out, rows * r, cols * r)
+
+
+class BlazeFaceFullRange(nn.Module):
+    """Full-range face detector: (B, 3, 192, 192) in [-1, 1] → (reg, cls) over 2304 anchors, 16 + 1 values each."""
+
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+        ops = nn if operations is None else operations
+        kw = dict(device=device, dtype=dtype)
+        def make_block(spec):  # spec = (in_ch, mid_ch, out_ch, stride)
+            return FRBlock(*spec, operations=operations, **kw)
+        self.stem = ops.Conv2d(3, 32, 3, stride=2, padding=0, bias=True, **kw)
+        self.backbone = nn.ModuleList(make_block(spec) for spec in _FR_BACKBONE_BLOCKS)
+        self.lateral_convs = nn.ModuleList(ops.Conv2d(cin, cout, 1, padding=0, bias=True, **kw) for (cin, cout) in _FR_LATERAL_CHANNELS)
+        self.top_conv = ops.Conv2d(384, 96, 1, padding=0, bias=True, **kw)
+        self.decoder_levels = nn.ModuleList(
+            nn.ModuleList(make_block(spec) for spec in level) for level in _FR_DECODER_BLOCKS)
+        # Channel reductions after decoder levels 0 and 1: 96→64 before 12→24, 64→48 before 24→48.
+        self.decoder_reduce_convs = nn.ModuleList([
+            ops.Conv2d(96, 64, 1, padding=0, bias=True, **kw),
+            ops.Conv2d(64, 48, 1, padding=0, bias=True, **kw),
+        ])
+        # Heads mix 2x2-cell info via DW-stride-2 + depth_to_space block_size=2.
+        self.cls_conv = ops.Conv2d(48, 4, 1, padding=0, bias=True, **kw)
+        self.cls_dw = ops.Conv2d(4, 4, 3, stride=2, padding=0, groups=4, bias=True, **kw)
+        self.reg_conv = ops.Conv2d(48, 64, 1, padding=0, bias=True, **kw)
+        self.reg_dw = ops.Conv2d(64, 64, 3, stride=2, padding=0, groups=64, bias=True, **kw)
+
+    def forward(self, image_chw_normalized: Tensor) -> tuple[Tensor, Tensor]:
+        # Symmetric pad-1 throughout (full-range tflite uses explicit TF PAD, not SAME).
+        x = F.relu(self.stem(F.pad(image_chw_normalized, (1, 1, 1, 1))))
+        laterals: list[Tensor] = []
+        for idx, blk in enumerate(self.backbone):
+            x = blk(x)
+            if idx in _FR_LATERAL_TAP_INDICES:
+                laterals.append(x)
+
+        # top_conv / lateral_convs / decoder_reduce_convs all have fused ReLU in the tflite.
+        p = F.relu(self.top_conv(x))
+        num_reduce = len(self.decoder_reduce_convs)
+        for level, stage in enumerate(self.decoder_levels):
+            # Walk the pyramid top-down: the deepest lateral (and its side-conv) is consumed first.
+            lateral, side_conv = laterals[-1 - level], self.lateral_convs[-1 - level]
+            p = F.interpolate(p, size=lateral.shape[-2:], mode="bilinear", align_corners=False)
+            p = p + F.relu(side_conv(lateral))
+            for blk in stage:
+                p = blk(p)
+            if level < num_reduce:
+                p = F.relu(self.decoder_reduce_convs[level](p))
+
+        # Each head: 1x1 conv → pad-1 depthwise stride-2 → depth-to-space back onto the anchor grid.
+        cls_map = self.cls_dw(F.pad(self.cls_conv(p), (1, 1, 1, 1)))
+        cls_map = _dcr_depth_to_space(cls_map, r=2, c_out=1)
+        reg_map = self.reg_dw(F.pad(self.reg_conv(p), (1, 1, 1, 1)))
+        reg_map = _dcr_depth_to_space(reg_map, r=2, c_out=16)
+        batch = cls_map.shape[0]
+        cls_out = cls_map.permute(0, 2, 3, 1).reshape(batch, _BF_FR_NUM_ANCHORS, 1)
+        reg_out = reg_map.permute(0, 2, 3, 1).reshape(batch, _BF_FR_NUM_ANCHORS, 16)
+        return reg_out, cls_out
+
+
+@lru_cache(maxsize=1)
+def _blazeface_full_range_anchors() -> np.ndarray:
+    """Build the 2304 (cx, cy, 1, 1) anchors of the 48x48 grid, row-major; anchor_w=anchor_h=1 (fixed_anchor_size)."""
+    centers = (np.arange(_BF_FR_GRID, dtype=np.float32) + 0.5) / _BF_FR_GRID
+    cy, cx = np.meshgrid(centers, centers, indexing="ij")
+    unit = np.ones_like(cx)
+    return np.stack([cx, cy, unit, unit], axis=-1).reshape(_BF_FR_NUM_ANCHORS, 4)
+
+
+def _decode_blazeface_full_range(regressors: np.ndarray, classificators: np.ndarray,
+                                 score_thresh: float = _BF_MIN_SCORE) -> np.ndarray:
+    """Decode full-range outputs into (N, 17) rows [xyxy, keypoint x/y pairs, score]; same math as the
+    short-range decoder, but over the 2304-anchor grid and with box_scale=192."""
+    probs = expit(np.clip(classificators[:, 0], -_BF_FR_SCORE_CLIP, _BF_FR_SCORE_CLIP))
+    keep = probs >= score_thresh
+    if not keep.any():
+        return np.empty((0, 17), dtype=np.float32)
+    raw = regressors[keep] / _BF_FR_BOX_SCALE
+    anchor_cx, anchor_cy, anchor_w, anchor_h = np.split(_blazeface_full_range_anchors()[keep], 4, axis=1)
+    box_cx, box_cy = raw[:, 0:1] * anchor_w + anchor_cx, raw[:, 1:2] * anchor_h + anchor_cy
+    half_w, half_h = raw[:, 2:3] * anchor_w / 2, raw[:, 3:4] * anchor_h / 2
+    decoded = np.empty((raw.shape[0], 17), dtype=np.float32)
+    decoded[:, 0:4] = np.concatenate([box_cx - half_w, box_cy - half_h, box_cx + half_w, box_cy + half_h], axis=1)
+    decoded[:, 4:16:2] = raw[:, _BF_KP_OFFSET::2] * anchor_w + anchor_cx
+    decoded[:, 5:16:2] = raw[:, _BF_KP_OFFSET + 1::2] * anchor_h + anchor_cy
+    decoded[:, 16] = probs[keep]
+    return decoded
+
+
+# FaceMesh (face_landmarks_detector.tflite): PReLU variant of BlazeBlock,
+# 17 blocks, heads for 478x3 landmarks + presence.
+_FACEMESH_BLOCKS = [  # (in_ch, out_ch, stride); each stride-2 entry halves the spatial size
+    (16, 16, 1), (16, 16, 1), (16, 32, 2), (32, 32, 1), (32, 32, 1), (32, 64, 2),
+    (64, 64, 1), (64, 64, 1), (64, 128, 2), (128, 128, 1), (128, 128, 1), (128, 128, 2),
+    (128, 128, 1), (128, 128, 1), (128, 128, 2), (128, 128, 1), (128, 128, 1),
+]
+
+
+class FaceMeshBlock(nn.Module):
+    """BlazeBlock variant with PReLU: one PReLU sits between depthwise and pointwise, another follows the residual add."""
+
+    def __init__(self, in_ch: int, out_ch: int, stride: int, device=None, dtype=None, operations=None):
+        super().__init__()
+        ops = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        self.in_ch, self.out_ch, self.stride = in_ch, out_ch, stride
+        self.depthwise = ops.Conv2d(in_ch, in_ch, 3, stride=stride, padding=0, groups=in_ch, bias=True, **factory)
+        self.prelu_dwise = nn.PReLU(num_parameters=in_ch, **factory)
+        self.pointwise = ops.Conv2d(in_ch, out_ch, 1, padding=0, bias=True, **factory)
+        self.prelu_out = nn.PReLU(num_parameters=out_ch, **factory)
+
+    def forward(self, x: Tensor) -> Tensor:
+        skip = x if self.stride <= 1 else F.max_pool2d(x, 2, 2)
+        if self.in_ch < self.out_ch:
+            skip = F.pad(skip, (0, 0, 0, 0, 0, self.out_ch - self.in_ch))  # grow the channel dim with zeros
+        padded = F.pad(x, (1, 1, 1, 1)) if self.stride <= 1 else _tf_same_pad(x, 3, self.stride)
+        return self.prelu_out(self.pointwise(self.prelu_dwise(self.depthwise(padded))) + skip)
+
+
+class FaceMesh(nn.Module):
+    NUM_LANDMARKS = 478  # landmarks regressed per face, 3 values each
+
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+        ops = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        self.stem = ops.Conv2d(3, 16, 3, stride=2, padding=0, bias=True, **factory)
+        self.prelu_stem = nn.PReLU(num_parameters=16, **factory)
+        self.blocks = nn.ModuleList(
+            FaceMeshBlock(cin, cout, stride, operations=operations, **factory) for (cin, cout, stride) in _FACEMESH_BLOCKS)
+        self.head_reduce = ops.Conv2d(128, 8, 1, padding=0, bias=True, **factory)
+        self.prelu_head_reduce = nn.PReLU(num_parameters=8, **factory)
+        self.head_block = FaceMeshBlock(8, 8, 1, operations=operations, **factory)
+        self.head_presence = ops.Conv2d(8, 1, 3, padding=0, bias=True, **factory)
+        self.head_landmarks = ops.Conv2d(8, self.NUM_LANDMARKS * 3, 3, padding=0, bias=True, **factory)
+
+    def forward(self, face_chw_normalized: Tensor) -> tuple[Tensor, Tensor]:
+        """Map (B, 3, 192, 192) crops in [0, 1] to ((B, 478, 3) landmarks in 192-canonical units, (B,) presence)."""
+        feats = self.prelu_stem(self.stem(_tf_same_pad(face_chw_normalized, 3, 2)))
+        for stage in self.blocks:
+            feats = stage(feats)
+        feats = self.head_block(self.prelu_head_reduce(self.head_reduce(feats)))
+        batch = feats.shape[0]
+        # Both heads are unpadded 3x3 convs; the reshapes below rely on them collapsing the spatial dims to 1x1.
+        presence = self.head_presence(feats).reshape(batch)
+        landmarks = self.head_landmarks(feats).reshape(batch, self.NUM_LANDMARKS, 3)
+        return landmarks, presence
+
+
+# FaceBlendshapes (MLP-Mixer "GhumMarkerPoserMlpMixerGeneral"):
+# 146x2 → token-reduce 146→96 → embed 2→64 → +cls token → 4x mixer → cls→52.
+_BS_NUM_INPUT_LANDMARKS = 146  # equals len(_BS_INPUT_INDICES)
+_BS_NUM_TOKENS_REDUCED = 96
+_BS_NUM_TOKENS = 97  # +1 cls
+_BS_TOKEN_DIM = 64
+_BS_TOKEN_MIX_HIDDEN = 384  # hidden width of the token-mixing MLP
+_BS_CHANNEL_MIX_HIDDEN = 256  # hidden width of the channel-mixing MLP
+_BS_NUM_BLENDSHAPES = 52  # equals len(BLENDSHAPE_NAMES)
+_BS_LN_EPS = 1e-6
+
+
+class MlpMixerBlock(nn.Module):
+    """One MLP-Mixer layer: a token-mixing MLP (across tokens) followed by a channel-mixing MLP (across dim).
+    Each half is pre-LayerNorm with a residual connection; the LayerNorms carry no beta (bias=False) to match MP."""
+
+    def __init__(self, num_tokens: int, token_dim: int, token_hidden: int, channel_hidden: int,
+                 device=None, dtype=None, operations=None):
+        super().__init__()
+        ops = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        # bias=False → no LN beta (matches MP).
+        self.ln1 = ops.LayerNorm(token_dim, eps=_BS_LN_EPS, bias=False, **factory)
+        self.ln2 = ops.LayerNorm(token_dim, eps=_BS_LN_EPS, bias=False, **factory)
+        self.token_mlp1 = ops.Linear(num_tokens, token_hidden, bias=True, **factory)
+        self.token_mlp2 = ops.Linear(token_hidden, num_tokens, bias=True, **factory)
+        self.channel_mlp1 = ops.Linear(token_dim, channel_hidden, bias=True, **factory)
+        self.channel_mlp2 = ops.Linear(channel_hidden, token_dim, bias=True, **factory)
+
+    def forward(self, x: Tensor) -> Tensor:
+        mixed = self.token_mlp2(F.relu(self.token_mlp1(self.ln1(x).transpose(1, 2))))  # MLP runs over the token axis
+        x = x + mixed.transpose(1, 2)
+        return x + self.channel_mlp2(F.relu(self.channel_mlp1(self.ln2(x))))
+
+
+class FaceBlendshapes(nn.Module):
+    def __init__(self, device=None, dtype=None, operations=None):
+        super().__init__()
+        ops = nn if operations is None else operations
+        factory = dict(device=device, dtype=dtype)
+        self.token_reduce = ops.Linear(_BS_NUM_INPUT_LANDMARKS, _BS_NUM_TOKENS_REDUCED, bias=True, **factory)
+        self.token_embed = ops.Linear(2, _BS_TOKEN_DIM, bias=True, **factory)
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, _BS_TOKEN_DIM, **factory))
+        mixer_dims = (_BS_NUM_TOKENS, _BS_TOKEN_DIM, _BS_TOKEN_MIX_HIDDEN, _BS_CHANNEL_MIX_HIDDEN)
+        self.blocks = nn.ModuleList(
+            MlpMixerBlock(*mixer_dims, operations=operations, **factory) for _ in range(4)
+        )
+        self.head = ops.Linear(_BS_TOKEN_DIM, _BS_NUM_BLENDSHAPES, bias=True, **factory)
+
+    @staticmethod
+    def _input_normalize(landmarks_2d: Tensor) -> Tensor:
+        # Centroid-subtract → L2 scale → x0.5. The 0.5 is baked into training.
+        centered = landmarks_2d - landmarks_2d.mean(dim=1, keepdim=True)
+        radii = torch.sqrt((centered * centered).sum(dim=-1, keepdim=True))
+        # The clamp keeps the division finite when every landmark coincides with the centroid.
+        mean_radius = radii.mean(dim=1, keepdim=True).clamp(min=1e-12)
+        return (centered / mean_radius) * 0.5
+
+    def forward(self, landmarks_2d: Tensor) -> Tensor:
+        """Map (B, 146, 2) landmarks to (B, 52) scores in [0, 1]; input units are irrelevant (centroid + L2 normalize)."""
+        tokens = self._input_normalize(landmarks_2d)
+        tokens = self.token_embed(self.token_reduce(tokens.transpose(1, 2)).transpose(1, 2))
+        cls_tok = self.cls_token.expand(tokens.shape[0], -1, -1)
+        seq = torch.cat([cls_tok, tokens], dim=1)
+        for mixer in self.blocks:
+            seq = mixer(seq)
+        # Only the cls token (position 0) is read out by the head.
+        return torch.sigmoid(self.head(seq[:, 0]))
+
+
+@lru_cache(maxsize=1)
+def _blazeface_anchors() -> np.ndarray:
+    """896 anchors per SsdAnchorsCalculator (fixed_anchor_size → anchor_w=anchor_h=1). Rows are (cx, cy, 1, 1), centres normalized by the grid size."""
+    per_ar = len(_BF_ASPECT_RATIOS) + (1 if _BF_INTERP_SCALE_AR > 0 else 0)  # anchors contributed by one layer
+    layer_anchors: List[np.ndarray] = []
+    layer = 0
+    while layer < _BF_NUM_LAYERS:
+        stride = _BF_STRIDES[layer]
+        last = layer
+        while last < _BF_NUM_LAYERS and _BF_STRIDES[last] == stride:  # group consecutive layers sharing this stride
+            last += 1
+        per_cell = per_ar * (last - layer)  # anchors per grid cell for the whole group
+        feat = (_BF_INPUT_SIZE + stride - 1) // stride  # grid side = ceil(input / stride)
+        yy, xx = np.meshgrid(np.arange(feat, dtype=np.float32), np.arange(feat, dtype=np.float32), indexing="ij")
+        cx, cy, ones = (xx + _BF_ANCHOR_OFFSET_X) / feat, (yy + _BF_ANCHOR_OFFSET_Y) / feat, np.ones_like(xx)
+        cell = np.stack([cx, cy, ones, ones], axis=-1).reshape(-1, 4)
+        layer_anchors.append(np.repeat(cell, per_cell, axis=0))  # per_cell identical rows, contiguous per cell
+        layer = last  # jump past the whole same-stride group
+    out = np.concatenate(layer_anchors, axis=0)
+    assert out.shape == (896, 4), out.shape
+    return out
+
+
+def _decode_blazeface(regressors: np.ndarray, classificators: np.ndarray,
+                      score_thresh: float = _BF_MIN_SCORE) -> np.ndarray:
+    """Decode (regs (896,16), cls (896,1)) → (N, 17) = [xyxy, kp0x..kp5y, score] in [0, 1]."""
+    probs = expit(np.clip(classificators[:, 0], -_BF_SCORE_CLIP, _BF_SCORE_CLIP))
+    keep = probs >= score_thresh
+    if not keep.any():
+        return np.empty((0, 17), dtype=np.float32)
+    raw = regressors[keep] / _BF_BOX_SCALE
+    # Anchor rows are (cx, cy, 1, 1); split them into four (N, 1) columns.
+    anchor_cx, anchor_cy, anchor_w, anchor_h = np.split(_blazeface_anchors()[keep], 4, axis=1)
+    box_cx, box_cy = raw[:, 0:1] * anchor_w + anchor_cx, raw[:, 1:2] * anchor_h + anchor_cy
+    half_w, half_h = raw[:, 2:3] * anchor_w / 2, raw[:, 3:4] * anchor_h / 2
+    decoded = np.empty((raw.shape[0], 17), dtype=np.float32)
+    decoded[:, 0:4] = np.concatenate([box_cx - half_w, box_cy - half_h, box_cx + half_w, box_cy + half_h], axis=1)
+    decoded[:, 4:16:2] = raw[:, _BF_KP_OFFSET::2] * anchor_w + anchor_cx
+    decoded[:, 5:16:2] = raw[:, _BF_KP_OFFSET + 1::2] * anchor_h + anchor_cy
+    decoded[:, 16] = probs[keep]
+    return decoded
+
+
+def _weighted_nms(detections: np.ndarray, iou_thresh: float = 0.5) -> np.ndarray:
+    """MediaPipe-style weighted NMS over (N, 17) rows [xyxy, 12 keypoint values, score]: every kept row is the
+    score-weighted mean of the cluster it suppresses and keeps the cluster's top score."""
+    if len(detections) == 0:
+        return detections
+    dets = detections[np.argsort(-detections[:, 16])]
+    count = dets.shape[0]
+    areas = np.clip(dets[:, 2] - dets[:, 0], 0, None) * np.clip(dets[:, 3] - dets[:, 1], 0, None)
+    taken = np.zeros(count, dtype=bool)
+    merged_rows: List[np.ndarray] = []
+    for lead in range(count):
+        if taken[lead]:
+            continue
+        taken[lead] = True
+        lx1, ly1, lx2, ly2 = dets[lead, 0:4]
+        members = [lead]
+        for other in range(lead + 1, count):
+            if taken[other]:
+                continue
+            ox1, oy1, ox2, oy2 = dets[other, 0:4]
+            overlap_w = max(0.0, min(lx2, ox2) - max(lx1, ox1))
+            overlap_h = max(0.0, min(ly2, oy2) - max(ly1, oy1))
+            inter = overlap_w * overlap_h
+            union = areas[lead] + areas[other] - inter
+            if union > 0 and inter / union > iou_thresh:  # strict > matches MP
+                members.append(other)
+                taken[other] = True
+        cluster = dets[members]
+        weights = cluster[:, 16:17]
+        row = np.copy(cluster[0])
+        if weights.sum() > 0:
+            row[:16] = (cluster[:, :16] * weights).sum(axis=0) / weights.sum()
+        merged_rows.append(row)
+    return np.stack(merged_rows, axis=0) if merged_rows else np.empty((0, 17), dtype=np.float32)
+
+
+def _detection_to_face_rect(detection: np.ndarray, image_w: int, image_h: int) -> Tuple[float, float, float, float, float]:
+    """Turn one normalized detection row into (cx, cy, width, height, angle) in image pixels: the bbox scaled by
+    _FACE_ROI_SCALE_X/Y per axis (anisotropic) and rotated by the eye-to-eye angle."""
+    xmin, ymin, xmax, ymax = detection[0:4]
+    # Keypoints are stored as (x, y) pairs after the four bbox values.
+    left_at, right_at = 4 + _FACE_LEFT_EYE_KP * 2, 4 + _FACE_RIGHT_EYE_KP * 2
+    lx, ly = detection[left_at] * image_w, detection[left_at + 1] * image_h
+    rx, ry = detection[right_at] * image_w, detection[right_at + 1] * image_h
+    # Image-y-down convention: angle = target - atan2(-dy, dx).
+    angle = _FACE_ROI_TARGET_ANGLE - math.atan2(ly - ry, rx - lx)
+    center_x, center_y = (xmin + xmax) * 0.5 * image_w, (ymin + ymax) * 0.5 * image_h
+    width = (xmax - xmin) * image_w * _FACE_ROI_SCALE_X
+    height = (ymax - ymin) * image_h * _FACE_ROI_SCALE_Y
+    return float(center_x), float(center_y), float(width), float(height), float(angle)
+
+
+def _sample_warp(image_chw: Tensor, src_x: Tensor, src_y: Tensor, padding_mode: str) -> Tensor:
+    """Bilinearly sample `image_chw` (C, H, W) at pixel coordinates (src_x, src_y); integer coordinates hit pixel centres."""
+    height, width = (int(dim) for dim in image_chw.shape[-2:])
+    grid = torch.stack([(2.0 * src_x + 1.0) / width - 1.0, (2.0 * src_y + 1.0) / height - 1.0], dim=-1)
+    sampled = F.grid_sample(image_chw[None], grid[None], mode="bilinear",
+                            align_corners=False, padding_mode=padding_mode)
+    return sampled[0]
+
+
+def _warp_face_crop(image_chw: Tensor, cx: float, cy: float, width: float, height: float,
+                    angle: float, output_size: int = _FM_INPUT_SIZE) -> Tensor:
+    """Resample the rotated rect (cx, cy, width, height, angle) to output_size² with BORDER_REPLICATE. image_chw must be in [0, 1]."""
+    step_x, step_y = width / output_size, height / output_size
+    cos_a, sin_a = math.cos(angle), math.sin(angle)
+    offsets = torch.arange(output_size, dtype=image_chw.dtype, device=image_chw.device) - output_size * 0.5
+    rows, cols = torch.meshgrid(offsets, offsets, indexing="ij")
+    # Crop-local offsets in source pixels, rotated about the rect centre below.
+    dx, dy = cols * step_x, rows * step_y
+    return _sample_warp(image_chw, cx + dx * cos_a - dy * sin_a, cy + dx * sin_a + dy * cos_a, "border")
+
+
+def _blazeface_input_warp(image_chw_raw: Tensor, target: int = _BF_INPUT_SIZE) -> Tuple[Tensor, float, float, float]:
+    """Resample the centred max(W, H) square of the image to target² (BORDER_ZERO outside), then map to [-1, 1].
+
+    Sampling is sub-pixel via grid_sample; integer-pad-then-resize drifts the bbox ~5%.
+    Returns (warped, sub_rect_cx, sub_rect_cy, sub_rect_size); the last three map
+    tensor-normalized [0,1] detections back to image pixels.
+    """
+    height, width = int(image_chw_raw.shape[1]), int(image_chw_raw.shape[2])
+    side = float(max(width, height))
+    center_x, center_y = width * 0.5, height * 0.5
+    offsets = (torch.arange(target, dtype=image_chw_raw.dtype, device=image_chw_raw.device) - target * 0.5) * (side / target)
+    rows, cols = torch.meshgrid(offsets, offsets, indexing="ij")
+    warped = _sample_warp(image_chw_raw, center_x + cols, center_y + rows, "zeros")
+    # x / 127.5 - 1 lands in [-1, 1] for raw pixel values in [0, 255].
+    return (warped / 127.5) - 1.0, center_x, center_y, side
+
+
+class FaceLandmarker(nn.Module):
+    """BlazeFace → FaceMesh v2 → blendshapes. `detector_variant` selects 'short'
+    (128², ≤2m) or 'full' (192² FPN, ≤5m); any other value raises ValueError.
+    State dict uses inner-module prefixes `detector.*` / `mesh.*` / `blendshapes.*`;
+    the outer FaceLandmarkerModel wrapper rewrites `detector_{variant}.*` keys to
+    `detector.*` before loading."""
+
+    def __init__(self, device=None, dtype=None, operations=None, detector_variant: str = "short"):
+        super().__init__()
+        detectors = {"short": BlazeFace, "full": BlazeFaceFullRange}
+        if detector_variant not in detectors:  # fail early instead of calling None below
+            raise ValueError(f"Unknown detector_variant {detector_variant!r}; expected one of {sorted(detectors)}.")
+
+        self.detector_variant = detector_variant
+        self.detector = detectors[detector_variant](device=device, dtype=dtype, operations=operations)
+        self.mesh = FaceMesh(device=device, dtype=dtype, operations=operations)
+        self.blendshapes = FaceBlendshapes(device=device, dtype=dtype, operations=operations)
+        self.register_buffer("_bs_idx", torch.tensor(_BS_INPUT_INDICES, dtype=torch.long), persistent=False)
+
+    def run_detector_batch(self, images_rgb_uint8: List[np.ndarray],
+                           score_thresh: float = _BF_MIN_SCORE,
+                           iou_thresh: float = 0.5):
+        """Batched detector pass. Returns (img_raws, sub_rects, sizes, per_frame_decoded)
+        where per_frame_decoded[b] is (N, 17) in tensor-normalized [0,1] coords."""
+        if not images_rgb_uint8:
+            return [], [], [], []
+        device, dtype = self.detector.stem.weight.device, self.detector.stem.weight.dtype
+        det_input_size, decode_fn = ((_BF_FR_INPUT_SIZE, _decode_blazeface_full_range)
+                                     if self.detector_variant == "full"
+                                     else (_BF_INPUT_SIZE, _decode_blazeface))
+
+        # Same-size frames: stack once and transfer once. Variable size falls back
+        # to per-image (only triggers for SAM3DBody's head crops).
+        sizes = [tuple(img.shape[:2]) for img in images_rgb_uint8]
+        if len(set(sizes)) == 1:
+            batch_chw = torch.from_numpy(np.stack(images_rgb_uint8, axis=0)).to(device, dtype).movedim(-1, -3).contiguous()
+            img_raws = [batch_chw[bi] for bi in range(batch_chw.shape[0])]
+        else:
+            img_raws = [torch.from_numpy(img).to(device, dtype).movedim(-1, -3).contiguous() for img in images_rgb_uint8]
+
+        warps = [_blazeface_input_warp(img_raw, det_input_size) for img_raw in img_raws]
+        det_crops = [w[0] for w in warps]
+        sub_rects = [(w[1], w[2], w[3]) for w in warps]
+
+        regs_b, cls_b = self.detector(torch.stack(det_crops, dim=0))
+        regs_np, cls_np = regs_b.float().cpu().numpy(), cls_b.float().cpu().numpy()
+        per_frame = []
+        for b in range(len(images_rgb_uint8)):
+            decoded = decode_fn(regs_np[b], cls_np[b], score_thresh=score_thresh)
+            per_frame.append(_weighted_nms(decoded, iou_thresh=iou_thresh) if decoded.shape[0] > 0 else decoded)
+        return img_raws, sub_rects, sizes, per_frame
+
+    def detect_batch(self, images_rgb_uint8: List[np.ndarray], num_faces: int = 1,
+                     score_thresh: float = _BF_MIN_SCORE) -> List[List[dict]]:
+        """Full pipeline batched across `images_rgb_uint8`. Returns one face-dict
+        list per image (empty if nothing detected). Face dict:
+          bbox_xyxy (4,) image pixels, blendshapes {52} ∈ [0,1],
+          landmarks_xy (478, 2) image pixels, landmarks_3d (478, 3) in
+          192-canonical (pre-transformation) units, presence float (raw logit).
+        """
+        img_raws, sub_rects, sizes, per_frame_dets = self.run_detector_batch(images_rgb_uint8, score_thresh=score_thresh)
+        # tensor-normalized → image-normalized [0,1] for _detection_to_face_rect.
+        for b, decoded in enumerate(per_frame_dets):
+            if decoded.shape[0] == 0:
+                continue
+            cx, cy, size = sub_rects[b]
+            H, W = sizes[b]
+            sx0, sy0 = cx - size * 0.5, cy - size * 0.5
+            decoded[:, 0:16:2] = (sx0 + size * decoded[:, 0:16:2]) / W
+            decoded[:, 1:16:2] = (sy0 + size * decoded[:, 1:16:2]) / H
+            if num_faces > 0:
+                per_frame_dets[b] = decoded[: int(num_faces)]
+
+        # Collect every detected face across all frames into one mesh input.
+        face_params: List[Tuple[int, float, float, float, float, float, float]] = []
+        mesh_crops: List[Tensor] = []
+        for b, dets in enumerate(per_frame_dets):
+            if dets.shape[0] == 0:
+                continue
+            H, W = sizes[b]
+            img_for_mesh = img_raws[b] / 255.0
+            for det in dets:
+                cx, cy, w, h, angle = _detection_to_face_rect(det, W, H)
+                mesh_crops.append(_warp_face_crop(img_for_mesh, cx, cy, w, h, angle, _FM_INPUT_SIZE))
+                face_params.append((b, float(det[16]), cx, cy, w, h, angle))
+
+        results: List[List[dict]] = [[] for _ in range(len(images_rgb_uint8))]
+        if not mesh_crops:
+            return results
+
+        lmks_canon_b, presence_b = self.mesh(torch.stack(mesh_crops, dim=0))
+        bs_out_b = self.blendshapes(lmks_canon_b[:, self._bs_idx, :2])
+
+        # Batched canonical→image affine
+        params_t = torch.tensor(
+            [(cx, cy, w, h, math.cos(a), math.sin(a)) for (_b, _s, cx, cy, w, h, a) in face_params],
+            device=lmks_canon_b.device, dtype=lmks_canon_b.dtype,
+        )
+        cxs, cys, ws, hs, cos_a, sin_a = params_t.unbind(dim=1)
+        inv = 1.0 / _FM_INPUT_SIZE
+        u = lmks_canon_b[..., 0] - _FM_INPUT_SIZE * 0.5
+        v = lmks_canon_b[..., 1] - _FM_INPUT_SIZE * 0.5
+        lmks_xy_t = torch.stack([
+            cxs[:, None] + u * (ws * inv * cos_a)[:, None] - v * (hs * inv * sin_a)[:, None],
+            cys[:, None] + u * (ws * inv * sin_a)[:, None] + v * (hs * inv * cos_a)[:, None],
+        ], dim=-1)
+
+        lmks_xy_np = lmks_xy_t.float().cpu().numpy()
+        lmks_canon_np = lmks_canon_b.float().cpu().numpy()
+        presence_np = presence_b.float().cpu().numpy()
+        bs_np = bs_out_b.float().cpu().numpy()
+
+        for i, (b, score, *_) in enumerate(face_params):
+            lmks_xy = lmks_xy_np[i]
+            mn, mx = lmks_xy.min(0), lmks_xy.max(0)
+            results[b].append({
+                "bbox_xyxy": np.array([mn[0], mn[1], mx[0], mx[1]], dtype=np.float32),
+                "blendshapes": dict(zip(BLENDSHAPE_NAMES, bs_np[i].tolist())),
+                "landmarks_xy": lmks_xy,
+                "landmarks_3d": lmks_canon_np[i],
+                "presence": float(presence_np[i]),
+                "score": score,
+            })
+        return results
diff --git a/comfy_extras/nodes_ace.py b/comfy_extras/nodes_ace.py
index cbfaf913d..044077b18 100644
--- a/comfy_extras/nodes_ace.py
+++ b/comfy_extras/nodes_ace.py
@@ -3,136 +3,136 @@ from typing_extensions import override
import comfy.model_management
import node_helpers
-from comfy_api.latest import ComfyExtension, io
+from comfy_api.latest import ComfyExtension, IO
-class TextEncodeAceStepAudio(io.ComfyNode):
+class TextEncodeAceStepAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
- return io.Schema(
+ return IO.Schema(
node_id="TextEncodeAceStepAudio",
- category="conditioning",
+ category="model/conditioning",
inputs=[
- io.Clip.Input("clip"),
- io.String.Input("tags", multiline=True, dynamic_prompts=True),
- io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
- io.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
+ IO.Clip.Input("clip"),
+ IO.String.Input("tags", multiline=True, dynamic_prompts=True),
+ IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
+ IO.Float.Input("lyrics_strength", default=1.0, min=0.0, max=10.0, step=0.01),
],
- outputs=[io.Conditioning.Output()],
+ outputs=[IO.Conditioning.Output()],
)
@classmethod
- def execute(cls, clip, tags, lyrics, lyrics_strength) -> io.NodeOutput:
+ def execute(cls, clip, tags, lyrics, lyrics_strength) -> IO.NodeOutput:
tokens = clip.tokenize(tags, lyrics=lyrics)
conditioning = clip.encode_from_tokens_scheduled(tokens)
conditioning = node_helpers.conditioning_set_values(conditioning, {"lyrics_strength": lyrics_strength})
- return io.NodeOutput(conditioning)
+ return IO.NodeOutput(conditioning)
-class TextEncodeAceStepAudio15(io.ComfyNode):
+class TextEncodeAceStepAudio15(IO.ComfyNode):
@classmethod
def define_schema(cls):
- return io.Schema(
+ return IO.Schema(
node_id="TextEncodeAceStepAudio1.5",
- category="conditioning",
+ category="model/conditioning",
inputs=[
- io.Clip.Input("clip"),
- io.String.Input("tags", multiline=True, dynamic_prompts=True),
- io.String.Input("lyrics", multiline=True, dynamic_prompts=True),
- io.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
- io.Int.Input("bpm", default=120, min=10, max=300),
- io.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
- io.Combo.Input("timesignature", options=['2', '3', '4', '6']),
- io.Combo.Input("language", options=["en", "ja", "zh", "es", "de", "fr", "pt", "ru", "it", "nl", "pl", "tr", "vi", "cs", "fa", "id", "ko", "uk", "hu", "ar", "sv", "ro", "el"]),
- io.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
- io.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
- io.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
- io.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
- io.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
- io.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
- io.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
+ IO.Clip.Input("clip"),
+ IO.String.Input("tags", multiline=True, dynamic_prompts=True),
+ IO.String.Input("lyrics", multiline=True, dynamic_prompts=True),
+ IO.Int.Input("seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True),
+ IO.Int.Input("bpm", default=120, min=10, max=300),
+ IO.Float.Input("duration", default=120.0, min=0.0, max=2000.0, step=0.1),
+ IO.Combo.Input("timesignature", options=['2', '3', '4', '6']),
+ IO.Combo.Input("language", options=['ar', 'az', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fr', 'he', 'hi', 'hr', 'ht', 'hu', 'id', 'is', 'it', 'ja', 'ko', 'la', 'lt', 'ms', 'ne', 'nl', 'no', 'pa', 'pl', 'pt', 'ro', 'ru', 'sa', 'sk', 'sr', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'yue', 'zh', 'unknown'], default='en'),
+ IO.Combo.Input("keyscale", options=[f"{root} {quality}" for quality in ["major", "minor"] for root in ["C", "C#", "Db", "D", "D#", "Eb", "E", "F", "F#", "Gb", "G", "G#", "Ab", "A", "A#", "Bb", "B"]]),
+ IO.Boolean.Input("generate_audio_codes", default=True, tooltip="Enable the LLM that generates audio codes. This can be slow but will increase the quality of the generated audio. Turn this off if you are giving the model an audio reference.", advanced=True),
+ IO.Float.Input("cfg_scale", default=2.0, min=0.0, max=100.0, step=0.1, advanced=True),
+ IO.Float.Input("temperature", default=0.85, min=0.0, max=2.0, step=0.01, advanced=True),
+ IO.Float.Input("top_p", default=0.9, min=0.0, max=2000.0, step=0.01, advanced=True),
+ IO.Int.Input("top_k", default=0, min=0, max=100, advanced=True),
+ IO.Float.Input("min_p", default=0.000, min=0.0, max=1.0, step=0.001, advanced=True),
],
- outputs=[io.Conditioning.Output()],
+ outputs=[IO.Conditioning.Output()],
)
@classmethod
- def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> io.NodeOutput:
+ def execute(cls, clip, tags, lyrics, seed, bpm, duration, timesignature, language, keyscale, generate_audio_codes, cfg_scale, temperature, top_p, top_k, min_p) -> IO.NodeOutput:
tokens = clip.tokenize(tags, lyrics=lyrics, bpm=bpm, duration=duration, timesignature=int(timesignature), language=language, keyscale=keyscale, seed=seed, generate_audio_codes=generate_audio_codes, cfg_scale=cfg_scale, temperature=temperature, top_p=top_p, top_k=top_k, min_p=min_p)
conditioning = clip.encode_from_tokens_scheduled(tokens)
- return io.NodeOutput(conditioning)
+ return IO.NodeOutput(conditioning)
-class EmptyAceStepLatentAudio(io.ComfyNode):
+class EmptyAceStepLatentAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
- return io.Schema(
+ return IO.Schema(
node_id="EmptyAceStepLatentAudio",
display_name="Empty Ace Step 1.0 Latent Audio",
- category="latent/audio",
+ category="model/latent/audio",
inputs=[
- io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
- io.Int.Input(
+ IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.1),
+ IO.Int.Input(
"batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
),
],
- outputs=[io.Latent.Output()],
+ outputs=[IO.Latent.Output()],
)
@classmethod
- def execute(cls, seconds, batch_size) -> io.NodeOutput:
+ def execute(cls, seconds, batch_size) -> IO.NodeOutput:
length = int(seconds * 44100 / 512 / 8)
latent = torch.zeros([batch_size, 8, 16, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
- return io.NodeOutput({"samples": latent, "type": "audio"})
+ return IO.NodeOutput({"samples": latent, "type": "audio"})
-class EmptyAceStep15LatentAudio(io.ComfyNode):
+class EmptyAceStep15LatentAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
- return io.Schema(
+ return IO.Schema(
node_id="EmptyAceStep1.5LatentAudio",
display_name="Empty Ace Step 1.5 Latent Audio",
- category="latent/audio",
+ category="model/latent/audio",
inputs=[
- io.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
- io.Int.Input(
+ IO.Float.Input("seconds", default=120.0, min=1.0, max=1000.0, step=0.01),
+ IO.Int.Input(
"batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."
),
],
- outputs=[io.Latent.Output()],
+ outputs=[IO.Latent.Output()],
)
@classmethod
- def execute(cls, seconds, batch_size) -> io.NodeOutput:
+ def execute(cls, seconds, batch_size) -> IO.NodeOutput:
length = round((seconds * 48000 / 1920))
latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
- return io.NodeOutput({"samples": latent, "type": "audio"})
+ return IO.NodeOutput({"samples": latent, "type": "audio", "downscale_ratio_temporal": 1764})
-class ReferenceAudio(io.ComfyNode):
+class ReferenceAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
- return io.Schema(
+ return IO.Schema(
node_id="ReferenceTimbreAudio",
display_name="Reference Audio",
category="advanced/conditioning/audio",
is_experimental=True,
description="This node sets the reference audio for ace step 1.5",
inputs=[
- io.Conditioning.Input("conditioning"),
- io.Latent.Input("latent", optional=True),
+ IO.Conditioning.Input("conditioning"),
+ IO.Latent.Input("latent", optional=True),
],
outputs=[
- io.Conditioning.Output(),
+ IO.Conditioning.Output(),
]
)
@classmethod
- def execute(cls, conditioning, latent=None) -> io.NodeOutput:
+ def execute(cls, conditioning, latent=None) -> IO.NodeOutput:
if latent is not None:
conditioning = node_helpers.conditioning_set_values(conditioning, {"reference_audio_timbre_latents": [latent["samples"]]}, append=True)
- return io.NodeOutput(conditioning)
+ return IO.NodeOutput(conditioning)
class AceExtension(ComfyExtension):
@override
- async def get_node_list(self) -> list[type[io.ComfyNode]]:
+ async def get_node_list(self) -> list[type[IO.ComfyNode]]:
return [
TextEncodeAceStepAudio,
EmptyAceStepLatentAudio,
diff --git a/comfy_extras/nodes_advanced_samplers.py b/comfy_extras/nodes_advanced_samplers.py
index 7f716cd76..77a561e30 100644
--- a/comfy_extras/nodes_advanced_samplers.py
+++ b/comfy_extras/nodes_advanced_samplers.py
@@ -45,7 +45,7 @@ class SamplerLCMUpscale(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="SamplerLCMUpscale",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Float.Input("scale_ratio", default=1.0, min=0.1, max=20.0, step=0.01, advanced=True),
io.Int.Input("scale_steps", default=-1, min=-1, max=1000, step=1, advanced=True),
@@ -86,13 +86,44 @@ def sample_euler_pp(model, x, sigmas, extra_args=None, callback=None, disable=No
return x
+class SamplerLCM(io.ComfyNode):
+ @classmethod
+ def define_schema(cls) -> io.Schema:
+ return io.Schema(
+ node_id="SamplerLCM",
+ category="model/sampling/samplers",
+ description=("LCM sampler with tunable per-step noise. s_noise is a multiplier on the model's training noise scale"),
+ inputs=[
+ io.Float.Input("s_noise", default=1.0, min=0.0, max=64.0, step=0.01,
+ tooltip="Per-step noise multiplier at the first step (1.0 = match training)."),
+ io.Float.Input("s_noise_end", default=1.0, min=0.0, max=64.0, step=0.01,
+ tooltip="Per-step noise multiplier at the last step. Set equal to s_noise for a constant schedule."),
+ io.Float.Input("noise_clip_std", default=0.0, min=0.0, max=10.0, step=0.01,
+ tooltip="Clamp per-step noise to +/- N*std. 0 disables."),
+ ],
+ outputs=[io.Sampler.Output()],
+ )
+
+ @classmethod
+ def execute(cls, s_noise, s_noise_end, noise_clip_std) -> io.NodeOutput:
+ sampler = comfy.samplers.ksampler(
+ "lcm",
+ {
+ "s_noise": float(s_noise),
+ "s_noise_end": float(s_noise_end),
+ "noise_clip_std": float(noise_clip_std),
+ },
+ )
+ return io.NodeOutput(sampler)
+
+
class SamplerEulerCFGpp(io.ComfyNode):
@classmethod
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="SamplerEulerCFGpp",
display_name="SamplerEulerCFG++",
- category="_for_testing", # "sampling/custom_sampling/samplers"
+ category="experimental", # "sampling/samplers"
inputs=[
io.Combo.Input("version", options=["regular", "alternative"], advanced=True),
],
@@ -114,6 +145,7 @@ class AdvancedSamplersExtension(ComfyExtension):
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [
SamplerLCMUpscale,
+ SamplerLCM,
SamplerEulerCFGpp,
]
diff --git a/comfy_extras/nodes_align_your_steps.py b/comfy_extras/nodes_align_your_steps.py
index 4fc511d2c..f89a809bb 100644
--- a/comfy_extras/nodes_align_your_steps.py
+++ b/comfy_extras/nodes_align_your_steps.py
@@ -29,7 +29,7 @@ class AlignYourStepsScheduler(io.ComfyNode):
return io.Schema(
node_id="AlignYourStepsScheduler",
search_aliases=["AYS scheduler"],
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Combo.Input("model_type", options=["SD1", "SDXL", "SVD"]),
io.Int.Input("steps", default=10, min=1, max=10000),
diff --git a/comfy_extras/nodes_apg.py b/comfy_extras/nodes_apg.py
index fd561d360..4a352038a 100644
--- a/comfy_extras/nodes_apg.py
+++ b/comfy_extras/nodes_apg.py
@@ -16,7 +16,7 @@ class APG(io.ComfyNode):
return io.Schema(
node_id="APG",
display_name="Adaptive Projected Guidance",
- category="sampling/custom_sampling",
+ category="model/sampling/custom_sampling",
inputs=[
io.Model.Input("model"),
io.Float.Input(
diff --git a/comfy_extras/nodes_ar_video.py b/comfy_extras/nodes_ar_video.py
new file mode 100644
index 000000000..c22359eb2
--- /dev/null
+++ b/comfy_extras/nodes_ar_video.py
@@ -0,0 +1,136 @@
+"""
+ComfyUI nodes for autoregressive video generation (Causal Forcing, Self-Forcing, etc.).
+ - EmptyARVideoLatent: create 5D [B, C, T, H, W] video latent tensors
+ - SamplerARVideo: SAMPLER for the block-by-block autoregressive denoising loop
+ - ARVideoI2V: image-to-video conditioning for AR models (seeds KV cache with start image)
+"""
+
+import torch
+from typing_extensions import override
+
+import comfy.model_management
+import comfy.samplers
+import comfy.utils
+from comfy_api.latest import ComfyExtension, io
+
+
+class EmptyARVideoLatent(io.ComfyNode):
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="EmptyARVideoLatent",
+ category="model/latent/video",
+ inputs=[
+ io.Int.Input("width", default=832, min=16, max=8192, step=16),
+ io.Int.Input("height", default=480, min=16, max=8192, step=16),
+ io.Int.Input("length", default=81, min=1, max=1024, step=4),
+ io.Int.Input("batch_size", default=1, min=1, max=64),
+ ],
+ outputs=[
+ io.Latent.Output(display_name="LATENT"),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, width, height, length, batch_size) -> io.NodeOutput:
+ lat_t = ((length - 1) // 4) + 1
+ latent = torch.zeros(
+ [batch_size, 16, lat_t, height // 8, width // 8],
+ device=comfy.model_management.intermediate_device(),
+ )
+ return io.NodeOutput({"samples": latent})
+
+
+class SamplerARVideo(io.ComfyNode):
+ """Sampler for autoregressive video models (Causal Forcing, Self-Forcing).
+
+ All AR-loop parameters are owned by this node so they live in the workflow.
+ Add new widgets here as the AR sampler grows new options.
+ """
+
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="SamplerARVideo",
+ display_name="Sampler AR Video",
+ category="model/sampling/samplers",
+ inputs=[
+ io.Int.Input(
+ "num_frame_per_block",
+ default=1, min=1, max=64,
+ tooltip="Frames per autoregressive block. 1 = framewise, "
+ "3 = chunkwise. Must match the checkpoint's training mode.",
+ ),
+ ],
+ outputs=[io.Sampler.Output()],
+ )
+
+ @classmethod
+ def execute(cls, num_frame_per_block) -> io.NodeOutput:
+ extra_options = {
+ "num_frame_per_block": num_frame_per_block,
+ }
+ return io.NodeOutput(comfy.samplers.ksampler("ar_video", extra_options))
+
+
+class ARVideoI2V(io.ComfyNode):
+ """Image-to-video setup for AR video models (Causal Forcing, Self-Forcing).
+
+ VAE-encodes the start image and stores it in the model's transformer_options
+ so that sample_ar_video can seed the KV cache before denoising.
+ Uses the same T2V model checkpoint -- no separate I2V architecture needed.
+ """
+
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="ARVideoI2V",
+ category="model/conditioning/video_models",
+ inputs=[
+ io.Model.Input("model"),
+ io.Vae.Input("vae"),
+ io.Image.Input("start_image"),
+ io.Int.Input("width", default=832, min=16, max=8192, step=16),
+ io.Int.Input("height", default=480, min=16, max=8192, step=16),
+ io.Int.Input("length", default=81, min=1, max=1024, step=4),
+ io.Int.Input("batch_size", default=1, min=1, max=64),
+ ],
+ outputs=[
+ io.Model.Output(display_name="MODEL"),
+ io.Latent.Output(display_name="LATENT"),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, model, vae, start_image, width, height, length, batch_size) -> io.NodeOutput:
+ start_image = comfy.utils.common_upscale(
+ start_image[:1].movedim(-1, 1), width, height, "bilinear", "center"
+ ).movedim(1, -1)
+
+ initial_latent = vae.encode(start_image[:, :, :, :3])
+
+ m = model.clone()
+ to = m.model_options.setdefault("transformer_options", {})
+ ar_cfg = to.setdefault("ar_config", {})
+ ar_cfg["initial_latent"] = initial_latent
+
+ lat_t = ((length - 1) // 4) + 1
+ latent = torch.zeros(
+ [batch_size, 16, lat_t, height // 8, width // 8],
+ device=comfy.model_management.intermediate_device(),
+ )
+ return io.NodeOutput(m, {"samples": latent})
+
+
+class ARVideoExtension(ComfyExtension):
+ @override
+ async def get_node_list(self) -> list[type[io.ComfyNode]]:
+ return [
+ EmptyARVideoLatent,
+ SamplerARVideo,
+ ARVideoI2V,
+ ]
+
+
+async def comfy_entrypoint() -> ARVideoExtension:
+ return ARVideoExtension()
diff --git a/comfy_extras/nodes_attention_multiply.py b/comfy_extras/nodes_attention_multiply.py
index 060a5c9be..f4ee6a689 100644
--- a/comfy_extras/nodes_attention_multiply.py
+++ b/comfy_extras/nodes_attention_multiply.py
@@ -25,7 +25,7 @@ class UNetSelfAttentionMultiply(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="UNetSelfAttentionMultiply",
- category="_for_testing/attention_experiments",
+ category="experimental/attention_experiments",
inputs=[
io.Model.Input("model"),
io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01, advanced=True),
@@ -48,7 +48,7 @@ class UNetCrossAttentionMultiply(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="UNetCrossAttentionMultiply",
- category="_for_testing/attention_experiments",
+ category="experimental/attention_experiments",
inputs=[
io.Model.Input("model"),
io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01, advanced=True),
@@ -72,7 +72,7 @@ class CLIPAttentionMultiply(io.ComfyNode):
return io.Schema(
node_id="CLIPAttentionMultiply",
search_aliases=["clip attention scale", "text encoder attention"],
- category="_for_testing/attention_experiments",
+ category="experimental/attention_experiments",
inputs=[
io.Clip.Input("clip"),
io.Float.Input("q", default=1.0, min=0.0, max=10.0, step=0.01, advanced=True),
@@ -106,7 +106,7 @@ class UNetTemporalAttentionMultiply(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="UNetTemporalAttentionMultiply",
- category="_for_testing/attention_experiments",
+ category="experimental/attention_experiments",
inputs=[
io.Model.Input("model"),
io.Float.Input("self_structural", default=1.0, min=0.0, max=10.0, step=0.01, advanced=True),
diff --git a/comfy_extras/nodes_audio.py b/comfy_extras/nodes_audio.py
index a395392d8..ff078f74c 100644
--- a/comfy_extras/nodes_audio.py
+++ b/comfy_extras/nodes_audio.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
import av
import torchaudio
import torch
@@ -18,7 +16,7 @@ class EmptyLatentAudio(IO.ComfyNode):
return IO.Schema(
node_id="EmptyLatentAudio",
display_name="Empty Latent Audio",
- category="latent/audio",
+ category="model/latent/audio",
essentials_category="Audio",
inputs=[
IO.Float.Input("seconds", default=47.6, min=1.0, max=1000.0, step=0.1),
@@ -33,7 +31,7 @@ class EmptyLatentAudio(IO.ComfyNode):
def execute(cls, seconds, batch_size) -> IO.NodeOutput:
length = round((seconds * 44100 / 2048) / 2) * 2
latent = torch.zeros([batch_size, 64, length], device=comfy.model_management.intermediate_device())
- return IO.NodeOutput({"samples":latent, "type": "audio"})
+ return IO.NodeOutput({"samples": latent, "type": "audio", "downscale_ratio_temporal": 2048})
generate = execute # TODO: remove
@@ -43,7 +41,7 @@ class ConditioningStableAudio(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="ConditioningStableAudio",
- category="conditioning",
+ category="model/conditioning",
inputs=[
IO.Conditioning.Input("positive"),
IO.Conditioning.Input("negative"),
@@ -72,7 +70,7 @@ class VAEEncodeAudio(IO.ComfyNode):
node_id="VAEEncodeAudio",
search_aliases=["audio to latent"],
display_name="VAE Encode Audio",
- category="latent/audio",
+ category="model/latent/audio",
inputs=[
IO.Audio.Input("audio"),
IO.Vae.Input("vae"),
@@ -82,6 +80,8 @@ class VAEEncodeAudio(IO.ComfyNode):
@classmethod
def execute(cls, vae, audio) -> IO.NodeOutput:
+ if audio is None:
+ raise ValueError("VAEEncodeAudio: input audio is None (source video may have no audio track).")
sample_rate = audio["sample_rate"]
vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
if vae_sample_rate != sample_rate:
@@ -104,7 +104,7 @@ def vae_decode_audio(vae, samples, tile=None, overlap=None):
std = torch.std(audio, dim=[1, 2], keepdim=True) * 5.0
std[std < 1.0] = 1.0
audio /= std
- vae_sample_rate = getattr(vae, "audio_sample_rate", 44100)
+ vae_sample_rate = getattr(vae, "audio_sample_rate_output", getattr(vae, "audio_sample_rate", 44100))
return {"waveform": audio, "sample_rate": vae_sample_rate if "sample_rate" not in samples else samples["sample_rate"]}
@@ -115,7 +115,7 @@ class VAEDecodeAudio(IO.ComfyNode):
node_id="VAEDecodeAudio",
search_aliases=["latent to audio"],
display_name="VAE Decode Audio",
- category="latent/audio",
+ category="model/latent/audio",
inputs=[
IO.Latent.Input("samples"),
IO.Vae.Input("vae"),
@@ -137,7 +137,7 @@ class VAEDecodeAudioTiled(IO.ComfyNode):
node_id="VAEDecodeAudioTiled",
search_aliases=["latent to audio"],
display_name="VAE Decode Audio (Tiled)",
- category="latent/audio",
+ category="model/latent/audio",
inputs=[
IO.Latent.Input("samples"),
IO.Vae.Input("vae"),
@@ -171,6 +171,8 @@ class SaveAudio(IO.ComfyNode):
@classmethod
def execute(cls, audio, filename_prefix="ComfyUI", format="flac") -> IO.NodeOutput:
+ if audio is None:
+ raise ValueError("SaveAudio: input audio is None (source video may have no audio track).")
return IO.NodeOutput(
ui=UI.AudioSaveHelper.get_save_audio_ui(audio, filename_prefix=filename_prefix, cls=cls, format=format)
)
@@ -198,6 +200,8 @@ class SaveAudioMP3(IO.ComfyNode):
@classmethod
def execute(cls, audio, filename_prefix="ComfyUI", format="mp3", quality="128k") -> IO.NodeOutput:
+ if audio is None:
+ raise ValueError("SaveAudioMP3: input audio is None (source video may have no audio track).")
return IO.NodeOutput(
ui=UI.AudioSaveHelper.get_save_audio_ui(
audio, filename_prefix=filename_prefix, cls=cls, format=format, quality=quality
@@ -226,6 +230,8 @@ class SaveAudioOpus(IO.ComfyNode):
@classmethod
def execute(cls, audio, filename_prefix="ComfyUI", format="opus", quality="V3") -> IO.NodeOutput:
+ if audio is None:
+ raise ValueError("SaveAudioOpus: input audio is None (source video may have no audio track).")
return IO.NodeOutput(
ui=UI.AudioSaveHelper.get_save_audio_ui(
audio, filename_prefix=filename_prefix, cls=cls, format=format, quality=quality
@@ -252,6 +258,8 @@ class PreviewAudio(IO.ComfyNode):
@classmethod
def execute(cls, audio) -> IO.NodeOutput:
+ if audio is None:
+ raise ValueError("PreviewAudio: input audio is None (source video may have no audio track).")
return IO.NodeOutput(ui=UI.PreviewAudio(audio, cls=cls))
save_flac = execute # TODO: remove
@@ -297,6 +305,7 @@ class LoadAudio(IO.ComfyNode):
@classmethod
def define_schema(cls):
input_dir = folder_paths.get_input_directory()
+ os.makedirs(input_dir, exist_ok=True)
files = folder_paths.filter_files_content_types(os.listdir(input_dir), ["audio", "video"])
return IO.Schema(
node_id="LoadAudio",
@@ -391,21 +400,26 @@ class TrimAudioDuration(IO.ComfyNode):
@classmethod
def execute(cls, audio, start_index, duration) -> IO.NodeOutput:
+ if audio is None:
+ return IO.NodeOutput(None)
waveform = audio["waveform"]
sample_rate = audio["sample_rate"]
audio_length = waveform.shape[-1]
+ if audio_length == 0:
+ return IO.NodeOutput(audio)
+
if start_index < 0:
start_frame = audio_length + int(round(start_index * sample_rate))
else:
start_frame = int(round(start_index * sample_rate))
- start_frame = max(0, min(start_frame, audio_length - 1))
+ start_frame = max(0, min(start_frame, audio_length))
end_frame = start_frame + int(round(duration * sample_rate))
end_frame = max(0, min(end_frame, audio_length))
if start_frame >= end_frame:
- raise ValueError("AudioTrim: Start time must be less than end time and be within the audio length.")
+ raise ValueError("TrimAudioDuration: Start time must be less than end time and be within the audio length.")
return IO.NodeOutput({"waveform": waveform[..., start_frame:end_frame], "sample_rate": sample_rate})
@@ -432,11 +446,13 @@ class SplitAudioChannels(IO.ComfyNode):
@classmethod
def execute(cls, audio) -> IO.NodeOutput:
+ if audio is None:
+ return IO.NodeOutput(None, None)
waveform = audio["waveform"]
sample_rate = audio["sample_rate"]
if waveform.shape[1] != 2:
- raise ValueError("AudioSplit: Input audio has only one channel.")
+ raise ValueError(f"AudioSplit: Input audio must be stereo (2 channels), got {waveform.shape[1]} channel(s).")
left_channel = waveform[..., 0:1, :]
right_channel = waveform[..., 1:2, :]
@@ -464,6 +480,12 @@ class JoinAudioChannels(IO.ComfyNode):
@classmethod
def execute(cls, audio_left, audio_right) -> IO.NodeOutput:
+ if audio_left is None and audio_right is None:
+ return IO.NodeOutput(None)
+ if audio_left is None:
+ return IO.NodeOutput(audio_right)
+ if audio_right is None:
+ return IO.NodeOutput(audio_left)
waveform_left = audio_left["waveform"]
sample_rate_left = audio_left["sample_rate"]
waveform_right = audio_right["waveform"]
@@ -519,7 +541,7 @@ class AudioConcat(IO.ComfyNode):
return IO.Schema(
node_id="AudioConcat",
search_aliases=["join audio", "combine audio", "append audio"],
- display_name="Audio Concat",
+ display_name="Concatenate Audio",
description="Concatenates the audio1 to audio2 in the specified direction.",
category="audio",
inputs=[
@@ -537,6 +559,12 @@ class AudioConcat(IO.ComfyNode):
@classmethod
def execute(cls, audio1, audio2, direction) -> IO.NodeOutput:
+ if audio1 is None and audio2 is None:
+ return IO.NodeOutput(None)
+ if audio1 is None:
+ return IO.NodeOutput(audio2)
+ if audio2 is None:
+ return IO.NodeOutput(audio1)
waveform_1 = audio1["waveform"]
waveform_2 = audio2["waveform"]
sample_rate_1 = audio1["sample_rate"]
@@ -567,7 +595,7 @@ class AudioMerge(IO.ComfyNode):
return IO.Schema(
node_id="AudioMerge",
search_aliases=["mix audio", "overlay audio", "layer audio"],
- display_name="Audio Merge",
+ display_name="Merge Audio",
description="Combine two audio tracks by overlaying their waveforms.",
category="audio",
inputs=[
@@ -584,6 +612,12 @@ class AudioMerge(IO.ComfyNode):
@classmethod
def execute(cls, audio1, audio2, merge_method) -> IO.NodeOutput:
+ if audio1 is None and audio2 is None:
+ return IO.NodeOutput(None)
+ if audio1 is None:
+ return IO.NodeOutput(audio2)
+ if audio2 is None:
+ return IO.NodeOutput(audio1)
waveform_1 = audio1["waveform"]
waveform_2 = audio2["waveform"]
sample_rate_1 = audio1["sample_rate"]
@@ -594,6 +628,9 @@ class AudioMerge(IO.ComfyNode):
length_1 = waveform_1.shape[-1]
length_2 = waveform_2.shape[-1]
+ if length_1 == 0 or length_2 == 0:
+ return IO.NodeOutput({"waveform": waveform_1, "sample_rate": output_sample_rate})
+
if length_2 > length_1:
logging.info(f"AudioMerge: Trimming audio2 from {length_2} to {length_1} samples to match audio1 length.")
waveform_2 = waveform_2[..., :length_1]
@@ -628,8 +665,9 @@ class AudioAdjustVolume(IO.ComfyNode):
return IO.Schema(
node_id="AudioAdjustVolume",
search_aliases=["audio gain", "loudness", "audio level"],
- display_name="Audio Adjust Volume",
+ display_name="Adjust Audio Volume",
category="audio",
+ description="Adjust the volume of the audio by a specified amount in decibels (dB).",
inputs=[
IO.Audio.Input("audio"),
IO.Int.Input(
@@ -645,6 +683,8 @@ class AudioAdjustVolume(IO.ComfyNode):
@classmethod
def execute(cls, audio, volume) -> IO.NodeOutput:
+ if audio is None:
+ return IO.NodeOutput(None)
if volume == 0:
return IO.NodeOutput(audio)
waveform = audio["waveform"]
@@ -728,8 +768,14 @@ class AudioEqualizer3Band(IO.ComfyNode):
@classmethod
def execute(cls, audio, low_gain_dB, low_freq, mid_gain_dB, mid_freq, mid_q, high_gain_dB, high_freq) -> IO.NodeOutput:
+ if audio is None:
+ return IO.NodeOutput(None)
waveform = audio["waveform"]
sample_rate = audio["sample_rate"]
+
+ if waveform.shape[-1] == 0:
+ return IO.NodeOutput(audio)
+
eq_waveform = waveform.clone()
# 1. Apply Low Shelf (Bass)
diff --git a/comfy_extras/nodes_audio_encoder.py b/comfy_extras/nodes_audio_encoder.py
index 13aacd41a..2ae30d321 100644
--- a/comfy_extras/nodes_audio_encoder.py
+++ b/comfy_extras/nodes_audio_encoder.py
@@ -10,7 +10,8 @@ class AudioEncoderLoader(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="AudioEncoderLoader",
- category="loaders",
+ display_name="Load Audio Encoder",
+ category="model/loaders",
inputs=[
io.Combo.Input(
"audio_encoder_name",
@@ -35,7 +36,7 @@ class AudioEncoderEncode(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="AudioEncoderEncode",
- category="conditioning",
+ category="model/conditioning",
inputs=[
io.AudioEncoder.Input("audio_encoder"),
io.Audio.Input("audio"),
diff --git a/comfy_extras/nodes_bg_removal.py b/comfy_extras/nodes_bg_removal.py
new file mode 100644
index 000000000..9dc9ad854
--- /dev/null
+++ b/comfy_extras/nodes_bg_removal.py
@@ -0,0 +1,61 @@
+import folder_paths
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, IO
+from comfy.bg_removal_model import load
+
+
+class LoadBackgroundRemovalModel(IO.ComfyNode):
+ @classmethod
+ def define_schema(cls):
+ files = folder_paths.get_filename_list("background_removal")
+ return IO.Schema(
+ node_id="LoadBackgroundRemovalModel",
+ display_name="Load Background Removal Model",
+ category="model/loaders",
+ inputs=[
+ IO.Combo.Input("bg_removal_name", options=sorted(files), tooltip="The model used to remove backgrounds from images"),
+ ],
+ outputs=[
+ IO.BackgroundRemoval.Output("bg_model")
+ ]
+ )
+ @classmethod
+ def execute(cls, bg_removal_name):
+ path = folder_paths.get_full_path_or_raise("background_removal", bg_removal_name)
+ bg = load(path)
+ if bg is None:
+ raise RuntimeError("ERROR: background model file is invalid and does not contain a valid background removal model.")
+ return IO.NodeOutput(bg)
+
+class RemoveBackground(IO.ComfyNode):
+ @classmethod
+ def define_schema(cls):
+ return IO.Schema(
+ node_id="RemoveBackground",
+ display_name="Remove Background",
+ category="image/background removal",
+ description="Generates a foreground mask to remove the background from an image using a background removal model.",
+ inputs=[
+ IO.Image.Input("image", tooltip="Input image to remove the background from"),
+ IO.BackgroundRemoval.Input("bg_removal_model", tooltip="Background removal model used to generate the mask")
+ ],
+ outputs=[
+ IO.Mask.Output("mask", tooltip="Generated foreground mask")
+ ]
+ )
+ @classmethod
+ def execute(cls, image, bg_removal_model):
+ mask = bg_removal_model.encode_image(image)
+ return IO.NodeOutput(mask)
+
+class BackgroundRemovalExtension(ComfyExtension):
+ @override
+ async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+ return [
+ LoadBackgroundRemovalModel,
+ RemoveBackground
+ ]
+
+
+async def comfy_entrypoint() -> BackgroundRemovalExtension:
+ return BackgroundRemovalExtension()
diff --git a/comfy_extras/nodes_camera_trajectory.py b/comfy_extras/nodes_camera_trajectory.py
index e7efa29ba..13a1448f4 100644
--- a/comfy_extras/nodes_camera_trajectory.py
+++ b/comfy_extras/nodes_camera_trajectory.py
@@ -153,7 +153,7 @@ class WanCameraEmbedding(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanCameraEmbedding",
- category="camera",
+ category="model/conditioning/video_models",
inputs=[
io.Combo.Input(
"camera_pose",
diff --git a/comfy_extras/nodes_canny.py b/comfy_extras/nodes_canny.py
index 648b4279d..462f6fea0 100644
--- a/comfy_extras/nodes_canny.py
+++ b/comfy_extras/nodes_canny.py
@@ -11,9 +11,9 @@ class Canny(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Canny",
- display_name="Canny",
+ display_name="Detect Edges (Canny)",
search_aliases=["edge detection", "outline", "contour detection", "line art"],
- category="image/preprocessors",
+ category="image/filters",
essentials_category="Image Tools",
inputs=[
io.Image.Input("image"),
diff --git a/comfy_extras/nodes_cfg.py b/comfy_extras/nodes_cfg.py
index 4ebb4b51e..b585c560f 100644
--- a/comfy_extras/nodes_cfg.py
+++ b/comfy_extras/nodes_cfg.py
@@ -57,24 +57,55 @@ class CFGNorm(io.ComfyNode):
inputs=[
io.Model.Input("model"),
io.Float.Input("strength", default=1.0, min=0.0, max=100.0, step=0.01),
+ io.Boolean.Input(
+ "pre_cfg",
+ default=False,
+ optional=True,
+ tooltip=(
+ "If true, rescale the combined noise BEFORE the sampler's CFG combine, "
+ "without clamping (can amplify). Matches the norm-scaled CFG used by "
+ "models like Lens. Default false keeps the original post-CFG x0-space "
+ "attenuate-only behavior."
+ ),
+ ),
],
outputs=[io.Model.Output(display_name="patched_model")],
is_experimental=True,
)
@classmethod
- def execute(cls, model, strength) -> io.NodeOutput:
+ def execute(cls, model, strength, pre_cfg=False) -> io.NodeOutput:
+ # Patches a clone of `model` and returns the clone. With pre_cfg=True a sampler CFG
+ # function (cfg_norm_pre) is installed; otherwise the post-CFG function (cfg_norm) is.
m = model.clone()
- def cfg_norm(args):
- cond_p = args['cond_denoised']
- pred_text_ = args["denoised"]
+ if pre_cfg:
+ def cfg_norm_pre(args):
+ cond = args["cond"]
+ uncond = args["uncond"]
+ cond_scale = args["cond_scale"]
+ # Linear CFG combine of the two predictions.
+ comb = uncond + cond_scale * (cond - uncond)
+ # Norms are taken along dim=1 and kept as a size-1 dim so they broadcast.
+ cond_norm = torch.linalg.vector_norm(cond, dim=1, keepdim=True)
+ comb_norm = torch.linalg.vector_norm(comb, dim=1, keepdim=True)
+ # Factor that gives comb the same dim=1 norm as cond; 1 where comb_norm is 0.
+ # clamp_min keeps the division finite even for entries where the fallback is selected.
+ rescale = torch.where(
+ comb_norm > 0,
+ cond_norm / comb_norm.clamp_min(1e-12),
+ torch.ones_like(comb_norm),
+ )
+ rescaled = comb * rescale
+ # strength blends back toward standard linear CFG (1.0 = full rescale).
+ if strength != 1.0:
+ rescaled = strength * rescaled + (1.0 - strength) * comb
+ return rescaled
+ m.set_model_sampler_cfg_function(cfg_norm_pre)
+ else:
+ def cfg_norm(args):
+ cond_p = args['cond_denoised']
+ pred_text_ = args["denoised"]
- norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True)
- norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True)
- scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0)
- return pred_text_ * scale * strength
+ # Ratio of the cond_denoised norm to the denoised norm along dim=1, clamped to
+ # [0, 1] before strength is applied; 1e-8 guards against a zero denominator.
+ norm_full_cond = torch.norm(cond_p, dim=1, keepdim=True)
+ norm_pred_text = torch.norm(pred_text_, dim=1, keepdim=True)
+ scale = (norm_full_cond / (norm_pred_text + 1e-8)).clamp(min=0.0, max=1.0)
+ return pred_text_ * scale * strength
- m.set_model_sampler_post_cfg_function(cfg_norm)
+ m.set_model_sampler_post_cfg_function(cfg_norm)
return io.NodeOutput(m)
diff --git a/comfy_extras/nodes_chroma_radiance.py b/comfy_extras/nodes_chroma_radiance.py
index 509436062..ca427e5cb 100644
--- a/comfy_extras/nodes_chroma_radiance.py
+++ b/comfy_extras/nodes_chroma_radiance.py
@@ -13,7 +13,7 @@ class EmptyChromaRadianceLatentImage(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="EmptyChromaRadianceLatentImage",
- category="latent/chroma_radiance",
+ category="model/latent/chroma_radiance",
inputs=[
io.Int.Input(id="width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input(id="height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
@@ -33,7 +33,7 @@ class ChromaRadianceOptions(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="ChromaRadianceOptions",
- category="model_patches/chroma_radiance",
+ category="model/patch/chroma_radiance",
description="Allows setting advanced options for the Chroma Radiance model.",
inputs=[
io.Model.Input(id="model"),
diff --git a/comfy_extras/nodes_color.py b/comfy_extras/nodes_color.py
index 80ba121cd..01a05035e 100644
--- a/comfy_extras/nodes_color.py
+++ b/comfy_extras/nodes_color.py
@@ -8,7 +8,7 @@ class ColorToRGBInt(io.ComfyNode):
return io.Schema(
node_id="ColorToRGBInt",
display_name="Color to RGB Int",
- category="utils",
+ category="utilities",
description="Convert a color to a RGB integer value.",
inputs=[
io.Color.Input("color"),
diff --git a/comfy_extras/nodes_compositing.py b/comfy_extras/nodes_compositing.py
index 3bc9fccb3..8fcbe720e 100644
--- a/comfy_extras/nodes_compositing.py
+++ b/comfy_extras/nodes_compositing.py
@@ -111,7 +111,7 @@ class PorterDuffImageComposite(io.ComfyNode):
node_id="PorterDuffImageComposite",
search_aliases=["alpha composite", "blend modes", "layer blend", "transparency blend"],
display_name="Porter-Duff Image Composite",
- category="mask/compositing",
+ category="image/compositing",
inputs=[
io.Image.Input("source"),
io.Mask.Input("source_alpha"),
@@ -168,7 +168,7 @@ class SplitImageWithAlpha(io.ComfyNode):
node_id="SplitImageWithAlpha",
search_aliases=["extract alpha", "separate transparency", "remove alpha"],
display_name="Split Image with Alpha",
- category="mask/compositing",
+ category="image/compositing",
inputs=[
io.Image.Input("image"),
],
@@ -192,7 +192,7 @@ class JoinImageWithAlpha(io.ComfyNode):
node_id="JoinImageWithAlpha",
search_aliases=["add transparency", "apply alpha", "composite alpha", "RGBA"],
display_name="Join Image with Alpha",
- category="mask/compositing",
+ category="image/compositing",
inputs=[
io.Image.Input("image"),
io.Mask.Input("alpha"),
@@ -202,14 +202,11 @@ class JoinImageWithAlpha(io.ComfyNode):
@classmethod
def execute(cls, image: torch.Tensor, alpha: torch.Tensor) -> io.NodeOutput:
- batch_size = min(len(image), len(alpha))
- out_images = []
-
- alpha = 1.0 - resize_mask(alpha, image.shape[1:])
- for i in range(batch_size):
- out_images.append(torch.cat((image[i][:,:,:3], alpha[i].unsqueeze(2)), dim=2))
-
- return io.NodeOutput(torch.stack(out_images))
+ # Target batch length is the larger of the two inputs (the removed code used the smaller).
+ batch_size = max(len(image), len(alpha))
+ # alpha is converted to image's dtype/device, passed to resize_mask with image.shape[1:], then inverted.
+ alpha = 1.0 - resize_mask(alpha.to(image), image.shape[1:])
+ # Both tensors go through comfy.utils.repeat_to_batch_size with the target batch length.
+ alpha = comfy.utils.repeat_to_batch_size(alpha, batch_size)
+ image = comfy.utils.repeat_to_batch_size(image, batch_size)
+ # Keep the first three entries of image's last axis and append alpha as one more entry on that axis.
+ return io.NodeOutput(torch.cat((image[..., :3], alpha.unsqueeze(-1)), dim=-1))
class CompositingExtension(ComfyExtension):
diff --git a/comfy_extras/nodes_cond.py b/comfy_extras/nodes_cond.py
index 86426a780..b745a43af 100644
--- a/comfy_extras/nodes_cond.py
+++ b/comfy_extras/nodes_cond.py
@@ -8,7 +8,7 @@ class CLIPTextEncodeControlnet(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="CLIPTextEncodeControlnet",
- category="_for_testing/conditioning",
+ category="experimental/conditioning",
inputs=[
io.Clip.Input("clip"),
io.Conditioning.Input("conditioning"),
@@ -35,7 +35,7 @@ class T5TokenizerOptions(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="T5TokenizerOptions",
- category="_for_testing/conditioning",
+ category="experimental/conditioning",
inputs=[
io.Clip.Input("clip"),
io.Int.Input("min_padding", default=0, min=0, max=10000, step=1, advanced=True),
diff --git a/comfy_extras/nodes_context_windows.py b/comfy_extras/nodes_context_windows.py
index 0e43f2e44..d9e32b9d9 100644
--- a/comfy_extras/nodes_context_windows.py
+++ b/comfy_extras/nodes_context_windows.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
from comfy_api.latest import ComfyExtension, io
import comfy.context_windows
import nodes
@@ -10,7 +9,7 @@ class ContextWindowsManualNode(io.ComfyNode):
return io.Schema(
node_id="ContextWindowsManual",
display_name="Context Windows (Manual)",
- category="context",
+ category="model/patch",
description="Manually set context windows.",
inputs=[
io.Model.Input("model", tooltip="The model to apply context windows to during sampling."),
@@ -29,6 +28,7 @@ class ContextWindowsManualNode(io.ComfyNode):
io.Boolean.Input("freenoise", default=False, tooltip="Whether to apply FreeNoise noise shuffling, improves window blending."),
io.String.Input("cond_retain_index_list", default="", tooltip="List of latent indices to retain in the conditioning tensors for each window, for example setting this to '0' will use the initial start image for each window."),
io.Boolean.Input("split_conds_to_windows", default=False, tooltip="Whether to split multiple conditionings (created by ConditionCombine) to each window based on region index."),
+ io.Boolean.Input("causal_window_fix", default=True, tooltip="Whether to add a causal fix frame to non-0-indexed context windows."),
],
outputs=[
io.Model.Output(tooltip="The model with context windows applied during sampling."),
@@ -38,7 +38,7 @@ class ContextWindowsManualNode(io.ComfyNode):
@classmethod
def execute(cls, model: io.Model.Type, context_length: int, context_overlap: int, context_schedule: str, context_stride: int, closed_loop: bool, fuse_method: str, dim: int, freenoise: bool,
- cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False) -> io.Model:
+ cond_retain_index_list: list[int]=[], split_conds_to_windows: bool=False, causal_window_fix: bool=True) -> io.Model:
model = model.clone()
model.model_options["context_handler"] = comfy.context_windows.IndexListContextHandler(
context_schedule=comfy.context_windows.get_matching_context_schedule(context_schedule),
@@ -50,7 +50,8 @@ class ContextWindowsManualNode(io.ComfyNode):
dim=dim,
freenoise=freenoise,
cond_retain_index_list=cond_retain_index_list,
- split_conds_to_windows=split_conds_to_windows
+ split_conds_to_windows=split_conds_to_windows,
+ causal_window_fix=causal_window_fix,
)
# make memory usage calculation only take into account the context window latents
comfy.context_windows.create_prepare_sampling_wrapper(model)
diff --git a/comfy_extras/nodes_controlnet.py b/comfy_extras/nodes_controlnet.py
index 847cb0bdf..17d965405 100644
--- a/comfy_extras/nodes_controlnet.py
+++ b/comfy_extras/nodes_controlnet.py
@@ -9,7 +9,7 @@ class SetUnionControlNetType(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SetUnionControlNetType",
- category="conditioning/controlnet",
+ category="model/conditioning/controlnet",
inputs=[
io.ControlNet.Input("control_net"),
io.Combo.Input("type", options=["auto"] + list(UNION_CONTROLNET_TYPES.keys())),
@@ -39,7 +39,7 @@ class ControlNetInpaintingAliMamaApply(io.ComfyNode):
return io.Schema(
node_id="ControlNetInpaintingAliMamaApply",
search_aliases=["masked controlnet"],
- category="conditioning/controlnet",
+ category="model/conditioning/controlnet",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
diff --git a/comfy_extras/nodes_cosmos.py b/comfy_extras/nodes_cosmos.py
index 7dd129d19..d754ab442 100644
--- a/comfy_extras/nodes_cosmos.py
+++ b/comfy_extras/nodes_cosmos.py
@@ -13,7 +13,7 @@ class EmptyCosmosLatentVideo(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="EmptyCosmosLatentVideo",
- category="latent/video",
+ category="model/latent/video",
inputs=[
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=704, min=16, max=nodes.MAX_RESOLUTION, step=16),
@@ -45,7 +45,7 @@ class CosmosImageToVideoLatent(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="CosmosImageToVideoLatent",
- category="conditioning/inpaint",
+ category="model/conditioning/inpaint",
inputs=[
io.Vae.Input("vae"),
io.Int.Input("width", default=1280, min=16, max=nodes.MAX_RESOLUTION, step=16),
@@ -88,7 +88,7 @@ class CosmosPredict2ImageToVideoLatent(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="CosmosPredict2ImageToVideoLatent",
- category="conditioning/inpaint",
+ category="model/conditioning/inpaint",
inputs=[
io.Vae.Input("vae"),
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
diff --git a/comfy_extras/nodes_curve.py b/comfy_extras/nodes_curve.py
index 9803e8034..aa2d94bb6 100644
--- a/comfy_extras/nodes_curve.py
+++ b/comfy_extras/nodes_curve.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
import numpy as np
from comfy_api.latest import ComfyExtension, io
@@ -13,7 +11,7 @@ class CurveEditor(io.ComfyNode):
return io.Schema(
node_id="CurveEditor",
display_name="Curve Editor",
- category="utils",
+ category="utilities",
inputs=[
io.Curve.Input("curve"),
io.Histogram.Input("histogram", optional=True),
@@ -40,7 +38,7 @@ class ImageHistogram(io.ComfyNode):
return io.Schema(
node_id="ImageHistogram",
display_name="Image Histogram",
- category="utils",
+ category="utilities",
inputs=[
io.Image.Input("image"),
],
diff --git a/comfy_extras/nodes_custom_sampler.py b/comfy_extras/nodes_custom_sampler.py
index 1e957c09b..c3346bf09 100644
--- a/comfy_extras/nodes_custom_sampler.py
+++ b/comfy_extras/nodes_custom_sampler.py
@@ -17,7 +17,7 @@ class BasicScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="BasicScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Model.Input("model"),
io.Combo.Input("scheduler", options=comfy.samplers.SCHEDULER_NAMES),
@@ -47,7 +47,7 @@ class KarrasScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="KarrasScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Int.Input("steps", default=20, min=1, max=10000),
io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True),
@@ -69,7 +69,7 @@ class ExponentialScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ExponentialScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Int.Input("steps", default=20, min=1, max=10000),
io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True),
@@ -90,7 +90,7 @@ class PolyexponentialScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="PolyexponentialScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Int.Input("steps", default=20, min=1, max=10000),
io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True),
@@ -112,7 +112,7 @@ class LaplaceScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LaplaceScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Int.Input("steps", default=20, min=1, max=10000),
io.Float.Input("sigma_max", default=14.614642, min=0.0, max=5000.0, step=0.01, round=False, advanced=True),
@@ -136,7 +136,7 @@ class SDTurboScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SDTurboScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Model.Input("model"),
io.Int.Input("steps", default=1, min=1, max=10),
@@ -160,7 +160,7 @@ class BetaSamplingScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="BetaSamplingScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Model.Input("model"),
io.Int.Input("steps", default=20, min=1, max=10000),
@@ -182,7 +182,7 @@ class VPScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="VPScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Int.Input("steps", default=20, min=1, max=10000),
io.Float.Input("beta_d", default=19.9, min=0.0, max=5000.0, step=0.01, round=False, advanced=True), #TODO: fix default values
@@ -204,7 +204,7 @@ class SplitSigmas(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SplitSigmas",
- category="sampling/custom_sampling/sigmas",
+ category="model/sampling/sigmas",
inputs=[
io.Sigmas.Input("sigmas"),
io.Int.Input("step", default=0, min=0, max=10000),
@@ -228,7 +228,7 @@ class SplitSigmasDenoise(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SplitSigmasDenoise",
- category="sampling/custom_sampling/sigmas",
+ category="model/sampling/sigmas",
inputs=[
io.Sigmas.Input("sigmas"),
io.Float.Input("denoise", default=1.0, min=0.0, max=1.0, step=0.01),
@@ -254,7 +254,7 @@ class FlipSigmas(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="FlipSigmas",
- category="sampling/custom_sampling/sigmas",
+ category="model/sampling/sigmas",
inputs=[io.Sigmas.Input("sigmas")],
outputs=[io.Sigmas.Output()]
)
@@ -276,7 +276,7 @@ class SetFirstSigma(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SetFirstSigma",
- category="sampling/custom_sampling/sigmas",
+ category="model/sampling/sigmas",
inputs=[
io.Sigmas.Input("sigmas"),
io.Float.Input("sigma", default=136.0, min=0.0, max=20000.0, step=0.001, round=False),
@@ -298,7 +298,7 @@ class ExtendIntermediateSigmas(io.ComfyNode):
return io.Schema(
node_id="ExtendIntermediateSigmas",
search_aliases=["interpolate sigmas"],
- category="sampling/custom_sampling/sigmas",
+ category="model/sampling/sigmas",
inputs=[
io.Sigmas.Input("sigmas"),
io.Int.Input("steps", default=2, min=1, max=100),
@@ -351,7 +351,7 @@ class SamplingPercentToSigma(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplingPercentToSigma",
- category="sampling/custom_sampling/sigmas",
+ category="model/sampling/sigmas",
inputs=[
io.Model.Input("model"),
io.Float.Input("sampling_percent", default=0.0, min=0.0, max=1.0, step=0.0001),
@@ -379,7 +379,7 @@ class KSamplerSelect(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="KSamplerSelect",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[io.Combo.Input("sampler_name", options=comfy.samplers.SAMPLER_NAMES)],
outputs=[io.Sampler.Output()]
)
@@ -396,7 +396,7 @@ class SamplerDPMPP_3M_SDE(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerDPMPP_3M_SDE",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True),
io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True),
@@ -421,7 +421,7 @@ class SamplerDPMPP_2M_SDE(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerDPMPP_2M_SDE",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Combo.Input("solver_type", options=['midpoint', 'heun']),
io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True),
@@ -448,7 +448,7 @@ class SamplerDPMPP_SDE(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerDPMPP_SDE",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True),
io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True),
@@ -474,7 +474,7 @@ class SamplerDPMPP_2S_Ancestral(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerDPMPP_2S_Ancestral",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False),
@@ -494,7 +494,7 @@ class SamplerEulerAncestral(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerEulerAncestral",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True),
io.Float.Input("s_noise", default=1.0, min=0.0, max=100.0, step=0.01, round=False, advanced=True),
@@ -515,7 +515,7 @@ class SamplerEulerAncestralCFGPP(io.ComfyNode):
return io.Schema(
node_id="SamplerEulerAncestralCFGPP",
display_name="SamplerEulerAncestralCFG++",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Float.Input("eta", default=1.0, min=0.0, max=1.0, step=0.01, round=False),
io.Float.Input("s_noise", default=1.0, min=0.0, max=10.0, step=0.01, round=False),
@@ -537,7 +537,7 @@ class SamplerLMS(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerLMS",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[io.Int.Input("order", default=4, min=1, max=100, advanced=True)],
outputs=[io.Sampler.Output()]
)
@@ -554,7 +554,7 @@ class SamplerDPMAdaptative(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerDPMAdaptative",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Int.Input("order", default=3, min=2, max=3, advanced=True),
io.Float.Input("rtol", default=0.05, min=0.0, max=100.0, step=0.01, round=False, advanced=True),
@@ -585,7 +585,7 @@ class SamplerER_SDE(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerER_SDE",
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Combo.Input("solver_type", options=["ER-SDE", "Reverse-time SDE", "ODE"]),
io.Int.Input("max_stage", default=3, min=1, max=3, advanced=True),
@@ -623,7 +623,7 @@ class SamplerSASolver(io.ComfyNode):
return io.Schema(
node_id="SamplerSASolver",
search_aliases=["sde"],
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Model.Input("model"),
io.Float.Input("eta", default=1.0, min=0.0, max=10.0, step=0.01, round=False, advanced=True),
@@ -668,7 +668,7 @@ class SamplerSEEDS2(io.ComfyNode):
return io.Schema(
node_id="SamplerSEEDS2",
search_aliases=["sde", "exp heun"],
- category="sampling/custom_sampling/samplers",
+ category="model/sampling/samplers",
inputs=[
io.Combo.Input("solver_type", options=["phi_1", "phi_2"]),
io.Float.Input("eta", default=1.0, min=0.0, max=100.0, step=0.01, round=False, tooltip="Stochastic strength", advanced=True),
@@ -727,7 +727,7 @@ class SamplerCustom(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerCustom",
- category="sampling/custom_sampling",
+ category="model/sampling/custom_sampling",
inputs=[
io.Model.Input("model"),
io.Boolean.Input("add_noise", default=True, advanced=True),
@@ -750,7 +750,7 @@ class SamplerCustom(io.ComfyNode):
latent = latent_image
latent_image = latent["samples"]
latent = latent.copy()
- latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None))
+ latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None), latent.get("downscale_ratio_temporal", None))
latent["samples"] = latent_image
if not add_noise:
@@ -770,6 +770,7 @@ class SamplerCustom(io.ComfyNode):
out = latent.copy()
out.pop("downscale_ratio_spacial", None)
+ out.pop("downscale_ratio_temporal", None)
out["samples"] = samples
if "x0" in x0_output:
x0_out = model.model.process_latent_out(x0_output["x0"].cpu())
@@ -793,7 +794,8 @@ class BasicGuider(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="BasicGuider",
- category="sampling/custom_sampling/guiders",
+ display_name="Basic Guider",
+ category="model/sampling/guiders",
inputs=[
io.Model.Input("model"),
io.Conditioning.Input("conditioning"),
@@ -814,7 +816,8 @@ class CFGGuider(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CFGGuider",
- category="sampling/custom_sampling/guiders",
+ display_name="CFG Guider",
+ category="model/sampling/guiders",
inputs=[
io.Model.Input("model"),
io.Conditioning.Input("positive"),
@@ -868,7 +871,8 @@ class DualCFGGuider(io.ComfyNode):
return io.Schema(
node_id="DualCFGGuider",
search_aliases=["dual prompt guidance"],
- category="sampling/custom_sampling/guiders",
+ display_name="Dual CFG Guider",
+ category="model/sampling/guiders",
inputs=[
io.Model.Input("model"),
io.Conditioning.Input("cond1"),
@@ -896,7 +900,7 @@ class DisableNoise(io.ComfyNode):
return io.Schema(
node_id="DisableNoise",
search_aliases=["zero noise"],
- category="sampling/custom_sampling/noise",
+ category="model/sampling/noise",
inputs=[],
outputs=[io.Noise.Output()]
)
@@ -913,7 +917,7 @@ class RandomNoise(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="RandomNoise",
- category="sampling/custom_sampling/noise",
+ category="model/sampling/noise",
inputs=[io.Int.Input("noise_seed", default=0, min=0, max=0xffffffffffffffff, control_after_generate=True)],
outputs=[io.Noise.Output()]
)
@@ -930,7 +934,7 @@ class SamplerCustomAdvanced(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SamplerCustomAdvanced",
- category="sampling/custom_sampling",
+ category="model/sampling/custom_sampling",
inputs=[
io.Noise.Input("noise"),
io.Guider.Input("guider"),
@@ -949,7 +953,7 @@ class SamplerCustomAdvanced(io.ComfyNode):
latent = latent_image
latent_image = latent["samples"]
latent = latent.copy()
- latent_image = comfy.sample.fix_empty_latent_channels(guider.model_patcher, latent_image, latent.get("downscale_ratio_spacial", None))
+ latent_image = comfy.sample.fix_empty_latent_channels(guider.model_patcher, latent_image, latent.get("downscale_ratio_spacial", None), latent.get("downscale_ratio_temporal", None))
latent["samples"] = latent_image
noise_mask = None
@@ -965,6 +969,7 @@ class SamplerCustomAdvanced(io.ComfyNode):
out = latent.copy()
out.pop("downscale_ratio_spacial", None)
+ out.pop("downscale_ratio_temporal", None)
out["samples"] = samples
if "x0" in x0_output:
x0_out = guider.model_patcher.model.process_latent_out(x0_output["x0"].cpu())
@@ -984,7 +989,7 @@ class AddNoise(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="AddNoise",
- category="_for_testing/custom_sampling/noise",
+ category="experimental/custom_sampling/noise",
is_experimental=True,
inputs=[
io.Model.Input("model"),
@@ -1034,7 +1039,7 @@ class ManualSigmas(io.ComfyNode):
return io.Schema(
node_id="ManualSigmas",
search_aliases=["custom noise schedule", "define sigmas"],
- category="_for_testing/custom_sampling",
+ category="experimental/custom_sampling",
is_experimental=True,
inputs=[
io.String.Input("sigmas", default="1, 0.5", multiline=False)
diff --git a/comfy_extras/nodes_dataset.py b/comfy_extras/nodes_dataset.py
index 98ed25d7e..35a164ec8 100644
--- a/comfy_extras/nodes_dataset.py
+++ b/comfy_extras/nodes_dataset.py
@@ -47,8 +47,10 @@ class LoadImageDataSetFromFolderNode(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LoadImageDataSetFromFolder",
- display_name="Load Image Dataset from Folder",
- category="dataset",
+ search_aliases=["load folder", "load from folder", "load dataset", "load images", "import dataset"],
+ display_name="Load Image (from Folder)",
+ category="image",
+ description="Load a dataset of images from a specified folder and return a list of images. Supported formats: PNG, JPG, JPEG, WEBP.",
is_experimental=True,
inputs=[
io.Combo.Input(
@@ -84,14 +86,16 @@ class LoadImageTextDataSetFromFolderNode(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LoadImageTextDataSetFromFolder",
- display_name="Load Image and Text Dataset from Folder",
- category="dataset",
+ search_aliases=["load folder", "load from folder", "load dataset", "load images", "import dataset"],
+ display_name="Load Image-Text (from Folder)",
+ category="image",
+ description="Load a dataset of pairs of images and text captions from a specified folder and return them as a list. Supported formats: PNG, JPG, JPEG, WEBP.",
is_experimental=True,
inputs=[
io.Combo.Input(
"folder",
options=folder_paths.get_input_subfolders(),
- tooltip="The folder to load images from.",
+ tooltip="The folder to load images and text captions from.",
)
],
outputs=[
@@ -206,8 +210,10 @@ class SaveImageDataSetToFolderNode(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SaveImageDataSetToFolder",
- display_name="Save Image Dataset to Folder",
- category="dataset",
+ search_aliases=["save folder", "save to folder", "save dataset", "save images", "export dataset"],
+ display_name="Save Image (to Folder) (DEPRECATED)",
+ category="image",
+ description="Save a dataset of images to a specified folder. Supported formats: PNG.",
is_experimental=True,
is_output_node=True,
is_input_list=True, # Receive images as list
@@ -226,6 +232,7 @@ class SaveImageDataSetToFolderNode(io.ComfyNode):
),
],
outputs=[],
+ is_deprecated=True, # This node is redundant and superseded by existing Save Image nodes where the target folder can be specified in the filename_prefix
)
@classmethod
@@ -246,14 +253,20 @@ class SaveImageTextDataSetToFolderNode(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SaveImageTextDataSetToFolder",
- display_name="Save Image and Text Dataset to Folder",
- category="dataset",
+ search_aliases=["save folder", "save to folder", "save dataset", "save images", "save text", "export dataset"],
+ display_name="Save Image-Text (to Folder)",
+ category="image",
+ description="Save a dataset of pairs of images and text captions to a specified folder. Images are saved as PNG files and captions are saved as TXT files with the same filename_prefix.",
is_experimental=True,
is_output_node=True,
is_input_list=True, # Receive both images and texts as lists
inputs=[
io.Image.Input("images", tooltip="List of images to save."),
- io.String.Input("texts", tooltip="List of text captions to save."),
+ io.String.Input("texts",
+ optional=True,
+ force_input=True,
+ tooltip="List of text captions to save."
+ ),
io.String.Input(
"folder_name",
default="dataset",
@@ -270,7 +283,7 @@ class SaveImageTextDataSetToFolderNode(io.ComfyNode):
)
@classmethod
- def execute(cls, images, texts, folder_name, filename_prefix):
+ def execute(cls, images, folder_name, filename_prefix, texts=None):
# Extract scalar values
folder_name = folder_name[0]
filename_prefix = filename_prefix[0]
@@ -279,11 +292,12 @@ class SaveImageTextDataSetToFolderNode(io.ComfyNode):
saved_files = save_images_to_folder(images, output_dir, filename_prefix)
# Save captions
- for idx, (filename, caption) in enumerate(zip(saved_files, texts)):
- caption_filename = filename.replace(".png", ".txt")
- caption_path = os.path.join(output_dir, caption_filename)
- with open(caption_path, "w", encoding="utf-8") as f:
- f.write(caption)
+ if texts:
+ for idx, (filename, caption) in enumerate(zip(saved_files, texts)):
+ caption_filename = filename.replace(".png", ".txt")
+ caption_path = os.path.join(output_dir, caption_filename)
+ with open(caption_path, "w", encoding="utf-8") as f:
+ f.write(caption)
logging.info(f"Saved {len(saved_files)} images and captions to {output_dir}.")
return io.NodeOutput()
@@ -314,11 +328,13 @@ class ImageProcessingNode(io.ComfyNode):
Child classes should set:
node_id: Unique node identifier (required)
+ search_aliases: List of search aliases (optional)
display_name: Display name (optional, defaults to node_id)
description: Node description (optional)
extra_inputs: List of additional io.Input objects beyond "images" (optional)
is_group_process: None (auto-detect), True (group), or False (individual) (optional)
is_output_list: True (list output) or False (single output) (optional, default True)
+ is_deprecated: True if the node is deprecated (optional, default False)
Child classes must implement ONE of:
_process(cls, image, **kwargs) -> tensor (for single-item processing)
@@ -326,12 +342,13 @@ class ImageProcessingNode(io.ComfyNode):
"""
node_id = None
+ search_aliases = []
display_name = None
description = None
extra_inputs = []
is_group_process = None # None = auto-detect, True/False = explicit
is_output_list = None # None = auto-detect based on processing mode
-
+ is_deprecated = False
@classmethod
def _detect_processing_mode(cls):
"""Detect whether this node uses group or individual processing.
@@ -402,8 +419,10 @@ class ImageProcessingNode(io.ComfyNode):
return io.Schema(
node_id=cls.node_id,
+ search_aliases=cls.search_aliases,
display_name=cls.display_name or cls.node_id,
- category="dataset/image",
+ category=cls.category,
+ description=cls.description,
is_experimental=True,
is_input_list=is_group, # True for group, False for individual
inputs=inputs,
@@ -472,11 +491,13 @@ class TextProcessingNode(io.ComfyNode):
Child classes should set:
node_id: Unique node identifier (required)
+ search_aliases: List of search aliases (optional)
display_name: Display name (optional, defaults to node_id)
description: Node description (optional)
extra_inputs: List of additional io.Input objects beyond "texts" (optional)
is_group_process: None (auto-detect), True (group), or False (individual) (optional)
is_output_list: True (list output) or False (single output) (optional, default True)
+ is_deprecated: True if the node is deprecated (optional, default False)
Child classes must implement ONE of:
_process(cls, text, **kwargs) -> str (for single-item processing)
@@ -484,12 +505,13 @@ class TextProcessingNode(io.ComfyNode):
"""
node_id = None
+ search_aliases = []
display_name = None
description = None
extra_inputs = []
is_group_process = None # None = auto-detect, True/False = explicit
is_output_list = None # None = auto-detect based on processing mode
-
+ is_deprecated = False
@classmethod
def _detect_processing_mode(cls):
"""Detect whether this node uses group or individual processing.
@@ -552,7 +574,9 @@ class TextProcessingNode(io.ComfyNode):
         return io.Schema(
             node_id=cls.node_id,
+            search_aliases=cls.search_aliases,  # forward subclass aliases, as ImageProcessingNode does
             display_name=cls.display_name or cls.node_id,
-            category="dataset/text",
+            category="text",
+            description=cls.description,  # subclasses define descriptions; surface them in the schema
             is_experimental=True,
             is_input_list=is_group,  # True for group, False for individual
             inputs=inputs,
@@ -627,15 +649,17 @@ class TextProcessingNode(io.ComfyNode):
class ResizeImagesByShorterEdgeNode(ImageProcessingNode):
node_id = "ResizeImagesByShorterEdge"
- display_name = "Resize Images by Shorter Edge"
- description = "Resize images so that the shorter edge matches the specified length while preserving aspect ratio."
+ display_name = "Resize Images by Shorter Edge (DEPRECATED)"
+ category = "image/transform"
+ description = "Resize images so that the shorter edge matches the specified dimension while preserving aspect ratio."
+ is_deprecated = True # This node is superseded by Resize Image/Mask with resize_type = scale shorter dimension
extra_inputs = [
io.Int.Input(
"shorter_edge",
default=512,
min=1,
max=8192,
- tooltip="Target length for the shorter edge.",
+ tooltip="Target dimension for the shorter edge.",
),
]
@@ -655,15 +679,17 @@ class ResizeImagesByShorterEdgeNode(ImageProcessingNode):
class ResizeImagesByLongerEdgeNode(ImageProcessingNode):
node_id = "ResizeImagesByLongerEdge"
- display_name = "Resize Images by Longer Edge"
- description = "Resize images so that the longer edge matches the specified length while preserving aspect ratio."
+ display_name = "Resize Images by Longer Edge (DEPRECATED)"
+ category = "image/transform"
+ description = "Resize images so that the longer edge matches the specified dimension while preserving aspect ratio."
+ is_deprecated = True # This node is superseded by Resize Image/Mask with resize_type = scale longer dimension
extra_inputs = [
io.Int.Input(
"longer_edge",
default=1024,
min=1,
max=8192,
- tooltip="Target length for the longer edge.",
+ tooltip="Target dimension for the longer edge.",
),
]
@@ -686,8 +712,10 @@ class ResizeImagesByLongerEdgeNode(ImageProcessingNode):
class CenterCropImagesNode(ImageProcessingNode):
node_id = "CenterCropImages"
- display_name = "Center Crop Images"
- description = "Center crop all images to the specified dimensions."
+ search_aliases=["crop", "cut", "trim"]
+ display_name="Crop Image (Center)"
+ category="image/transform"
+ description = "Center crop an image to the specified dimensions."
extra_inputs = [
io.Int.Input("width", default=512, min=1, max=8192, tooltip="Crop width."),
io.Int.Input("height", default=512, min=1, max=8192, tooltip="Crop height."),
@@ -706,10 +734,11 @@ class CenterCropImagesNode(ImageProcessingNode):
class RandomCropImagesNode(ImageProcessingNode):
node_id = "RandomCropImages"
- display_name = "Random Crop Images"
- description = (
- "Randomly crop all images to the specified dimensions (for data augmentation)."
- )
+ search_aliases=["crop", "cut", "trim"]
+ display_name = "Crop Image (Random)"
+ category="image/transform"
+ description = "Randomly crop an image to the specified dimensions."
+
extra_inputs = [
io.Int.Input("width", default=512, min=1, max=8192, tooltip="Crop width."),
io.Int.Input("height", default=512, min=1, max=8192, tooltip="Crop height."),
@@ -734,7 +763,9 @@ class RandomCropImagesNode(ImageProcessingNode):
class NormalizeImagesNode(ImageProcessingNode):
node_id = "NormalizeImages"
- display_name = "Normalize Images"
+ search_aliases=["normalize", "normalize colors"]
+ display_name = "Normalize Image Colors"
+ category = "image/color"
description = "Normalize images using mean and standard deviation."
extra_inputs = [
io.Float.Input(
@@ -762,8 +793,10 @@ class NormalizeImagesNode(ImageProcessingNode):
class AdjustBrightnessNode(ImageProcessingNode):
node_id = "AdjustBrightness"
+ search_aliases=["brightness"]
display_name = "Adjust Brightness"
- description = "Adjust brightness of all images."
+ category="image/adjustments"
+ description = "Adjust the brightness of an image."
extra_inputs = [
io.Float.Input(
"factor",
@@ -781,8 +814,10 @@ class AdjustBrightnessNode(ImageProcessingNode):
class AdjustContrastNode(ImageProcessingNode):
node_id = "AdjustContrast"
+ search_aliases=["contrast"]
display_name = "Adjust Contrast"
- description = "Adjust contrast of all images."
+ category="image/adjustments"
+ description = "Adjust the contrast of an image."
extra_inputs = [
io.Float.Input(
"factor",
@@ -800,8 +835,10 @@ class AdjustContrastNode(ImageProcessingNode):
class ShuffleDatasetNode(ImageProcessingNode):
node_id = "ShuffleDataset"
- display_name = "Shuffle Image Dataset"
- description = "Randomly shuffle the order of images in the dataset."
+ search_aliases=["shuffle", "randomize", "mix"]
+ display_name = "Shuffle Images List"
+ category = "image/batch"
+ description = "Randomly shuffle the order of images in a list."
is_group_process = True # Requires full list to shuffle
extra_inputs = [
io.Int.Input(
@@ -823,13 +860,15 @@ class ShuffleImageTextDatasetNode(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ShuffleImageTextDataset",
- display_name="Shuffle Image-Text Dataset",
- category="dataset/image",
+ search_aliases=["shuffle", "randomize", "mix"],
+ display_name = "Shuffle Pairs of Image-Text",
+ category = "image/batch",
+ description = "Randomly shuffle the order of pairs of image-text in a list.",
is_experimental=True,
is_input_list=True,
inputs=[
io.Image.Input("images", tooltip="List of images to shuffle."),
- io.String.Input("texts", tooltip="List of texts to shuffle."),
+ io.String.Input("texts", tooltip="List of texts to shuffle.", force_input=True),
io.Int.Input(
"seed",
default=0,
@@ -865,8 +904,11 @@ class ShuffleImageTextDatasetNode(io.ComfyNode):
class TextToLowercaseNode(TextProcessingNode):
node_id = "TextToLowercase"
- display_name = "Text to Lowercase"
- description = "Convert all texts to lowercase."
+ search_aliases=["lowercase"]
+ display_name = "Convert Text to Lowercase (DEPRECATED)"
+ category = "text"
+ description = "Convert text to lowercase."
+ is_deprecated = True # This node is superseded by the Convert Text Case node
@classmethod
def _process(cls, text):
@@ -875,8 +917,11 @@ class TextToLowercaseNode(TextProcessingNode):
class TextToUppercaseNode(TextProcessingNode):
node_id = "TextToUppercase"
- display_name = "Text to Uppercase"
- description = "Convert all texts to uppercase."
+ search_aliases=["uppercase"]
+ display_name = "Convert Text to Uppercase (DEPRECATED)"
+ category = "text"
+ description = "Convert text to uppercase."
+ is_deprecated = True # This node is superseded by the Convert Text Case node
@classmethod
def _process(cls, text):
@@ -885,8 +930,10 @@ class TextToUppercaseNode(TextProcessingNode):
class TruncateTextNode(TextProcessingNode):
node_id = "TruncateText"
+ search_aliases=["truncate", "cut", "shorten"]
display_name = "Truncate Text"
- description = "Truncate all texts to a maximum length."
+ category = "text"
+ description = "Truncate text to a maximum length."
extra_inputs = [
io.Int.Input(
"max_length", default=77, min=1, max=10000, tooltip="Maximum text length."
@@ -900,8 +947,10 @@ class TruncateTextNode(TextProcessingNode):
class AddTextPrefixNode(TextProcessingNode):
node_id = "AddTextPrefix"
- display_name = "Add Text Prefix"
+ display_name = "Add Text Prefix (DEPRECATED)"
+ category = "text"
description = "Add a prefix to all texts."
+ is_deprecated = True # This node is superseded by the Concatenate Text node
extra_inputs = [
io.String.Input("prefix", default="", tooltip="Prefix to add."),
]
@@ -913,8 +962,10 @@ class AddTextPrefixNode(TextProcessingNode):
class AddTextSuffixNode(TextProcessingNode):
node_id = "AddTextSuffix"
- display_name = "Add Text Suffix"
+ display_name = "Add Text Suffix (DEPRECATED)"
+ category = "text"
description = "Add a suffix to all texts."
+ is_deprecated = True # This node is superseded by the Concatenate Text node
extra_inputs = [
io.String.Input("suffix", default="", tooltip="Suffix to add."),
]
@@ -926,8 +977,10 @@ class AddTextSuffixNode(TextProcessingNode):
class ReplaceTextNode(TextProcessingNode):
node_id = "ReplaceText"
- display_name = "Replace Text"
+ display_name = "Replace Text (DEPRECATED)"
+ category = "text"
description = "Replace text in all texts."
+ is_deprecated = True # This node is superseded by the other Replace Text node
extra_inputs = [
io.String.Input("find", default="", tooltip="Text to find."),
io.String.Input("replace", default="", tooltip="Text to replace with."),
@@ -940,8 +993,10 @@ class ReplaceTextNode(TextProcessingNode):
class StripWhitespaceNode(TextProcessingNode):
node_id = "StripWhitespace"
- display_name = "Strip Whitespace"
+ display_name = "Strip Whitespace (DEPRECATED)"
+ category = "text"
description = "Strip leading and trailing whitespace from all texts."
+ is_deprecated = True # This node is superseded by the Trim Text node
@classmethod
def _process(cls, text):
@@ -952,11 +1007,13 @@ class StripWhitespaceNode(TextProcessingNode):
class ImageDeduplicationNode(ImageProcessingNode):
- """Remove duplicate or very similar images from the dataset using perceptual hashing."""
+ """Remove duplicate or very similar images from a list using perceptual hashing."""
node_id = "ImageDeduplication"
- display_name = "Image Deduplication"
- description = "Remove duplicate or very similar images from the dataset."
+ search_aliases=["deduplicate", "remove duplicates", "similarity filter"]
+ display_name = "Deduplicate Images"
+ category = "image/batch"
+ description = "Remove duplicate or very similar images from a list."
is_group_process = True # Requires full list to compare images
extra_inputs = [
io.Float.Input(
@@ -1026,7 +1083,9 @@ class ImageGridNode(ImageProcessingNode):
"""Combine multiple images into a single grid/collage."""
node_id = "ImageGrid"
- display_name = "Image Grid"
+ search_aliases=["grid", "collage", "combine"]
+ display_name = "Make Image Grid"
+ category="image/batch"
description = "Arrange multiple images into a grid layout."
is_group_process = True # Requires full list to create grid
is_output_list = False # Outputs single grid image
@@ -1102,9 +1161,12 @@ class MergeImageListsNode(ImageProcessingNode):
"""Merge multiple image lists into a single list."""
node_id = "MergeImageLists"
- display_name = "Merge Image Lists"
+ search_aliases=["list", "merge list", "make list"]
+ display_name = "Merge Image Lists (DEPRECATED)"
+ category = "image/batch"
description = "Concatenate multiple image lists into one."
is_group_process = True # Receives images as list
+ is_deprecated = True # This node is superseded by the Create List node
@classmethod
def _group_process(cls, images):
@@ -1119,9 +1181,11 @@ class MergeTextListsNode(TextProcessingNode):
"""Merge multiple text lists into a single list."""
node_id = "MergeTextLists"
- display_name = "Merge Text Lists"
+ display_name = "Merge Text Lists (DEPRECATED)"
+ category = "text"
description = "Concatenate multiple text lists into one."
is_group_process = True # Receives texts as list
+ is_deprecated = True # This node is superseded by the Create List node
@classmethod
def _group_process(cls, texts):
@@ -1142,8 +1206,10 @@ class ResolutionBucket(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ResolutionBucket",
+ search_aliases=["bucket by resolution", "group by resolution", "batch by resolution"],
display_name="Resolution Bucket",
- category="dataset",
+ category="model/training",
+ description="Group latents and conditionings into buckets",
is_experimental=True,
is_input_list=True,
inputs=[
@@ -1236,7 +1302,8 @@ class MakeTrainingDataset(io.ComfyNode):
node_id="MakeTrainingDataset",
search_aliases=["encode dataset"],
display_name="Make Training Dataset",
- category="dataset",
+ category="model/training",
+ description="Encode images with VAE and texts with CLIP to create a training dataset of latents and conditionings.",
is_experimental=True,
is_input_list=True, # images and texts as lists
inputs=[
@@ -1251,6 +1318,7 @@ class MakeTrainingDataset(io.ComfyNode):
"texts",
optional=True,
tooltip="List of text captions. Can be length n (matching images), 1 (repeated for all), or omitted (uses empty string).",
+ force_input=True
),
],
outputs=[
@@ -1320,9 +1388,10 @@ class SaveTrainingDataset(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SaveTrainingDataset",
- search_aliases=["export training data"],
+ search_aliases=["export dataset", "save dataset"],
display_name="Save Training Dataset",
- category="dataset",
+ category="model/training",
+ description="Save encoded training dataset (latents + conditioning) to disk for efficient loading during training.",
is_experimental=True,
is_output_node=True,
is_input_list=True, # Receive lists
@@ -1424,7 +1493,8 @@ class LoadTrainingDataset(io.ComfyNode):
node_id="LoadTrainingDataset",
search_aliases=["import dataset", "training data"],
display_name="Load Training Dataset",
- category="dataset",
+ category="model/training",
+ description="Load encoded training dataset (latents + conditioning) from disk for use in training.",
is_experimental=True,
inputs=[
io.String.Input(
diff --git a/comfy_extras/nodes_differential_diffusion.py b/comfy_extras/nodes_differential_diffusion.py
index 34ffb9a89..4fa61ad0e 100644
--- a/comfy_extras/nodes_differential_diffusion.py
+++ b/comfy_extras/nodes_differential_diffusion.py
@@ -13,7 +13,7 @@ class DifferentialDiffusion(io.ComfyNode):
node_id="DifferentialDiffusion",
search_aliases=["inpaint gradient", "variable denoise strength"],
display_name="Differential Diffusion",
- category="_for_testing",
+ category="experimental",
inputs=[
io.Model.Input("model"),
io.Float.Input(
diff --git a/comfy_extras/nodes_eps.py b/comfy_extras/nodes_eps.py
index 0fb3871c8..8c397f132 100644
--- a/comfy_extras/nodes_eps.py
+++ b/comfy_extras/nodes_eps.py
@@ -18,7 +18,7 @@ class EpsilonScaling(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Epsilon Scaling",
- category="model_patches/unet",
+ category="model/patch/unet",
inputs=[
io.Model.Input("model"),
io.Float.Input(
@@ -84,7 +84,7 @@ class TemporalScoreRescaling(io.ComfyNode):
return io.Schema(
node_id="TemporalScoreRescaling",
display_name="TSR - Temporal Score Rescaling",
- category="model_patches/unet",
+ category="model/patch/unet",
inputs=[
io.Model.Input("model"),
io.Float.Input(
diff --git a/comfy_extras/nodes_flux.py b/comfy_extras/nodes_flux.py
index 3a23c7d04..afc663b22 100644
--- a/comfy_extras/nodes_flux.py
+++ b/comfy_extras/nodes_flux.py
@@ -40,7 +40,7 @@ class EmptyFlux2LatentImage(io.ComfyNode):
return io.Schema(
node_id="EmptyFlux2LatentImage",
display_name="Empty Flux 2 Latent",
- category="latent",
+ category="model/latent",
inputs=[
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
@@ -102,7 +102,7 @@ class FluxDisableGuidance(io.ComfyNode):
append = execute # TODO: remove
-PREFERED_KONTEXT_RESOLUTIONS = [
+PREFERRED_KONTEXT_RESOLUTIONS = [
(672, 1568),
(688, 1504),
(720, 1456),
@@ -143,7 +143,7 @@ class FluxKontextImageScale(io.ComfyNode):
width = image.shape[2]
height = image.shape[1]
aspect_ratio = width / height
- _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS)
+ _, width, height = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS)
image = comfy.utils.common_upscale(image.movedim(-1, 1), width, height, "lanczos", "center").movedim(1, -1)
return io.NodeOutput(image)
@@ -215,7 +215,7 @@ class Flux2Scheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Flux2Scheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Int.Input("steps", default=20, min=1, max=4096),
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=1),
@@ -263,7 +263,7 @@ class FluxKVCache(io.ComfyNode):
node_id="FluxKVCache",
display_name="Flux KV Cache",
description="Enables KV Cache optimization for reference images on Flux family models.",
- category="",
+ category="experimental",
is_experimental=True,
inputs=[
io.Model.Input("model", tooltip="The model to use KV Cache on."),
diff --git a/comfy_extras/nodes_frame_interpolation.py b/comfy_extras/nodes_frame_interpolation.py
new file mode 100644
index 000000000..4d5bca17e
--- /dev/null
+++ b/comfy_extras/nodes_frame_interpolation.py
@@ -0,0 +1,208 @@
+import torch
+from tqdm import tqdm
+from typing_extensions import override
+
+import comfy.model_patcher
+import comfy.utils
+import folder_paths
+from comfy import model_management
+from comfy_extras.frame_interpolation_models.ifnet import IFNet, detect_rife_config
+from comfy_extras.frame_interpolation_models.film_net import FILMNet
+from comfy_api.latest import ComfyExtension, io
+
+FrameInterpolationModel = io.Custom("INTERP_MODEL")
+
+
+class FrameInterpolationModelLoader(io.ComfyNode):
+    """Load a FILM or RIFE checkpoint from the 'frame_interpolation' model folder and wrap it in a model patcher."""
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="FrameInterpolationModelLoader",
+            display_name="Load Frame Interpolation Model",
+            category="model/loaders",
+            inputs=[
+                io.Combo.Input("model_name", options=folder_paths.get_filename_list("frame_interpolation"),
+                               tooltip="Select a frame interpolation model to load. Models must be placed in the 'frame_interpolation' folder."),
+            ],
+            outputs=[FrameInterpolationModel.Output()],
+        )
+
+    @classmethod
+    def execute(cls, model_name) -> io.NodeOutput:
+        """Load the checkpoint `model_name`, build the matching network in eval mode and return it inside a CoreModelPatcher."""
+        model_path = folder_paths.get_full_path_or_raise("frame_interpolation", model_name)
+        sd = comfy.utils.load_torch_file(model_path, safe_load=True)
+        model = cls._detect_and_load(sd)
+        load_device = model_management.get_torch_device()  # resolved once; drives both the dtype choice and the patcher
+        dtype = torch.float16 if model_management.should_use_fp16(load_device) else torch.float32
+        model.eval().to(dtype)
+        patcher = comfy.model_patcher.CoreModelPatcher(
+            model,
+            load_device=load_device,
+            offload_device=model_management.unet_offload_device(),
+        )
+        return io.NodeOutput(patcher)
+
+    @classmethod
+    def _detect_and_load(cls, sd):
+        """Build a FILMNet or IFNet from state dict `sd`; raises ValueError when the RIFE config cannot be detected."""
+        # FILM checkpoints are recognized by the presence of this key
+        if "extract.extract_sublevels.convs.0.0.conv.weight" in sd:
+            model = FILMNet()
+            model.load_state_dict(sd)
+            return model
+
+        # Try RIFE (needs key remapping for raw checkpoints)
+        sd = comfy.utils.state_dict_prefix_replace(sd, {"module.": "", "flownet.": ""})
+        key_map = {}
+        for k in sd:
+            for i in range(5):  # rename "block0." .. "block4." keys to "blocks.<i>."
+                if k.startswith(f"block{i}."):
+                    key_map[k] = f"blocks.{i}.{k[len(f'block{i}.'):]}"
+        if key_map:
+            sd = {key_map.get(k, k): v for k, v in sd.items()}
+        sd = {k: v for k, v in sd.items() if not k.startswith(("teacher.", "caltime."))}  # drop teacher./caltime. entries
+        try:
+            head_ch, channels = detect_rife_config(sd)
+        except (KeyError, ValueError) as err:
+            raise ValueError("Unrecognized frame interpolation model format") from err  # keep the root cause in the traceback
+        model = IFNet(head_ch=head_ch, channels=channels)
+        model.load_state_dict(sd)
+        return model
+
+
+class FrameInterpolate(io.ComfyNode):  # node that inserts model-generated frames between consecutive input images
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="FrameInterpolate",
+            display_name="Frame Interpolate",
+            category="video",
+            search_aliases=["rife", "film", "frame interpolation", "slow motion", "interpolate frames", "vfi"],
+            inputs=[
+                FrameInterpolationModel.Input("interp_model"),
+                io.Image.Input("images"),
+                io.Int.Input("multiplier", default=2, min=2, max=16),
+            ],
+            outputs=[
+                io.Image.Output(),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, interp_model, images, multiplier) -> io.NodeOutput:
+        """Return (num_frames - 1) * multiplier + 1 frames clamped to [0, 1]; input is returned unchanged when there is nothing to interpolate."""
+        offload_device = model_management.intermediate_device()
+        num_frames = images.shape[0]
+        if num_frames < 2 or multiplier < 2:
+            return io.NodeOutput(images)
+
+        device = interp_model.load_device
+        dtype = interp_model.model_dtype()
+        inference_model = interp_model.model
+        activation_mem = inference_model.memory_used_forward(images.shape, dtype)
+        model_management.load_models_gpu([interp_model], memory_required=activation_mem)
+        align = getattr(inference_model, "pad_align", 1)  # models without pad_align get no padding
+        H, W = images.shape[1], images.shape[2]  # images is indexed as (frame, height, width, channel)
+
+        # Prepare a single padded frame on device for determining output dimensions
+        def prepare_frame(idx):
+            frame = images[idx:idx + 1].movedim(-1, 1).to(dtype=dtype, device=device)  # channels-last -> channels-first, batch of 1
+            if align > 1:
+                from comfy.ldm.common_dit import pad_to_patch_size
+                frame = pad_to_patch_size(frame, (align, align), padding_mode="reflect")
+            return frame
+
+        # Count total interpolation passes for progress bar
+        total_pairs = num_frames - 1
+        num_interp = multiplier - 1  # new frames generated per consecutive pair
+        total_steps = total_pairs * num_interp
+        pbar = comfy.utils.ProgressBar(total_steps)
+        tqdm_bar = tqdm(total=total_steps, desc="Frame interpolation")
+
+        batch = num_interp  # reduced on OOM and persists across pairs (same resolution = same limit)
+        t_values = [t / multiplier for t in range(1, multiplier)]  # evenly spaced timesteps strictly between 0 and 1
+        # Output buffer is preallocated with 3 channels; NOTE(review): assumes RGB input - confirm
+        out_dtype = model_management.intermediate_dtype()
+        total_out_frames = total_pairs * multiplier + 1
+        result = torch.empty((total_out_frames, 3, H, W), dtype=out_dtype, device=offload_device)
+        result[0] = images[0].movedim(-1, 0).to(out_dtype)
+        out_idx = 1  # next free slot in result
+
+        # Pre-compute timestep tensor on device (padded dimensions needed)
+        sample = prepare_frame(0)
+        pH, pW = sample.shape[2], sample.shape[3]
+        ts_full = torch.tensor(t_values, device=device, dtype=dtype).reshape(num_interp, 1, 1, 1)
+        ts_full = ts_full.expand(-1, 1, pH, pW)
+        del sample
+
+        multi_fn = getattr(inference_model, "forward_multi_timestep", None)  # optional fast path; None when the model lacks it
+        feat_cache = {}
+        prev_frame = None
+
+        try:
+            for i in range(total_pairs):
+                img0_single = prev_frame if prev_frame is not None else prepare_frame(i)  # reuse the previous pair's second frame
+                img1_single = prepare_frame(i + 1)
+                prev_frame = img1_single
+
+                # Cache features: img1 of pair N becomes img0 of pair N+1
+                feat_cache["img0"] = feat_cache.pop("next") if "next" in feat_cache else inference_model.extract_features(img0_single)
+                feat_cache["img1"] = inference_model.extract_features(img1_single)
+                feat_cache["next"] = feat_cache["img1"]
+
+                used_multi = False
+                if multi_fn is not None:
+                    # Models with timestep-independent flow can compute it once for all timesteps
+                    try:
+                        mids = multi_fn(img0_single, img1_single, t_values, cache=feat_cache)
+                        result[out_idx:out_idx + num_interp] = mids[:, :, :H, :W].to(out_dtype)  # crop back to the unpadded size
+                        out_idx += num_interp
+                        pbar.update(num_interp)
+                        tqdm_bar.update(num_interp)
+                        used_multi = True
+                    except model_management.OOM_EXCEPTION:
+                        model_management.soft_empty_cache()
+                        multi_fn = None  # fall through to single-timestep path
+
+                if not used_multi:
+                    j = 0  # number of timesteps already produced for this pair
+                    while j < num_interp:
+                        b = min(batch, num_interp - j)
+                        try:
+                            img0 = img0_single.expand(b, -1, -1, -1)
+                            img1 = img1_single.expand(b, -1, -1, -1)
+                            mids = inference_model(img0, img1, timestep=ts_full[j:j + b], cache=feat_cache)
+                            result[out_idx:out_idx + b] = mids[:, :, :H, :W].to(out_dtype)  # crop back to the unpadded size
+                            out_idx += b
+                            pbar.update(b)
+                            tqdm_bar.update(b)
+                            j += b
+                        except model_management.OOM_EXCEPTION:
+                            if batch <= 1:
+                                raise  # already at the smallest batch; nothing left to shrink
+                            batch = max(1, batch // 2)  # halve and retry the same timesteps (j is unchanged)
+                            model_management.soft_empty_cache()
+
+                result[out_idx] = images[i + 1].movedim(-1, 0).to(out_dtype)  # original second frame of the pair, copied as-is
+                out_idx += 1
+        finally:
+            tqdm_bar.close()  # close the console bar even when interpolation raises
+
+        # BCHW -> BHWC
+        result = result.movedim(1, -1).clamp_(0.0, 1.0)
+        return io.NodeOutput(result)
+
+
+class FrameInterpolationExtension(ComfyExtension):  # extension object listing this module's node classes
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [
+            FrameInterpolationModelLoader,  # loads FILM / RIFE checkpoints
+            FrameInterpolate,  # generates the in-between frames
+        ]
+
+
+async def comfy_entrypoint() -> FrameInterpolationExtension:
+    return FrameInterpolationExtension()  # builds a new extension instance on each call
diff --git a/comfy_extras/nodes_freelunch.py b/comfy_extras/nodes_freelunch.py
index 248efdef3..ccbd1fd90 100644
--- a/comfy_extras/nodes_freelunch.py
+++ b/comfy_extras/nodes_freelunch.py
@@ -29,7 +29,7 @@ class FreeU(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="FreeU",
- category="model_patches/unet",
+ category="model/patch/unet",
inputs=[
IO.Model.Input("model"),
IO.Float.Input("b1", default=1.1, min=0.0, max=10.0, step=0.01, advanced=True),
@@ -76,7 +76,7 @@ class FreeU_V2(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="FreeU_V2",
- category="model_patches/unet",
+ category="model/patch/unet",
inputs=[
IO.Model.Input("model"),
IO.Float.Input("b1", default=1.3, min=0.0, max=10.0, step=0.01, advanced=True),
diff --git a/comfy_extras/nodes_fresca.py b/comfy_extras/nodes_fresca.py
index eab4f303f..173f42154 100644
--- a/comfy_extras/nodes_fresca.py
+++ b/comfy_extras/nodes_fresca.py
@@ -60,7 +60,7 @@ class FreSca(io.ComfyNode):
node_id="FreSca",
search_aliases=["frequency guidance"],
display_name="FreSca",
- category="_for_testing",
+ category="experimental",
description="Applies frequency-dependent scaling to the guidance",
inputs=[
io.Model.Input("model"),
diff --git a/comfy_extras/nodes_gits.py b/comfy_extras/nodes_gits.py
index d48483862..434a24387 100644
--- a/comfy_extras/nodes_gits.py
+++ b/comfy_extras/nodes_gits.py
@@ -340,7 +340,7 @@ class GITSScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="GITSScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Float.Input("coeff", default=1.20, min=0.80, max=1.50, step=0.05, advanced=True),
io.Int.Input("steps", default=10, min=2, max=1000),
diff --git a/comfy_extras/nodes_hidream_o1.py b/comfy_extras/nodes_hidream_o1.py
new file mode 100644
index 000000000..8648d2e26
--- /dev/null
+++ b/comfy_extras/nodes_hidream_o1.py
@@ -0,0 +1,256 @@
+from typing_extensions import override
+
+import torch
+
+import comfy.model_management
+import comfy.patcher_extension
+import node_helpers
+from comfy_api.latest import ComfyExtension, io
+
+
+class EmptyHiDreamO1LatentImage(io.ComfyNode):
+    """Produce an all-zero (batch_size, 3, height, width) tensor as the 'samples' entry of a latent dict."""
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        summary = (
+            "Empty pixel-space latent for HiDream-O1-Image. The model was "
+            "trained at ~4 megapixels; lower resolutions go off-distribution "
+            "and quality regresses noticeably. Trained resolutions: "
+            "2048x2048, 2304x1728, 1728x2304, 2560x1440, 1440x2560, "
+            "2496x1664, 1664x2496, 3104x1312, 1312x3104, 2304x1792, 1792x2304."
+        )
+        return io.Schema(
+            node_id="EmptyHiDreamO1LatentImage",
+            display_name="Empty HiDream-O1 Latent Image",
+            category="model/latent/image",
+            description=summary,
+            inputs=[
+                io.Int.Input(id="width", default=2048, min=64, max=4096, step=32),
+                io.Int.Input(id="height", default=2048, min=64, max=4096, step=32),
+                io.Int.Input(id="batch_size", default=1, min=1, max=64),
+            ],
+            outputs=[io.Latent().Output()],
+        )
+
+    @classmethod
+    def execute(cls, *, width: int, height: int, batch_size: int = 1) -> io.NodeOutput:
+        shape = (batch_size, 3, height, width)  # three channels at full image resolution
+        zeros = torch.zeros(shape, device=comfy.model_management.intermediate_device())
+        return io.NodeOutput({"samples": zeros})
+
+
+class HiDreamO1ReferenceImages(io.ComfyNode):
+    """Attach reference images to both positive and negative conditioning."""
+
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        return io.Schema(
+            node_id="HiDreamO1ReferenceImages",
+            display_name="HiDream-O1 Reference Images",
+            category="model/conditioning/image",
+            description=(
+                "Attach 1-10 reference images to conditioning, one for edit instruction "
+                "or multiple for subject-driven personalization."
+            ),
+            inputs=[
+                io.Conditioning.Input(id="positive"),
+                io.Conditioning.Input(id="negative"),
+                io.Autogrow.Input(
+                    "images",
+                    template=io.Autogrow.TemplateNames(
+                        io.Image.Input("image"),
+                        names=[f"image_{i}" for i in range(1, 11)],
+                        min=1,
+                    ),
+                    tooltip="Reference images. 1 image = instruction edit; 2-10 images = multi reference.",
+                ),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, *, positive, negative, images: io.Autogrow.Type) -> io.NodeOutput:
+        # Collect the connected image_1..image_10 slots in order, then append them under "reference_latents" on both sides.
+        refs = [images[f"image_{i}"] for i in range(1, 11) if f"image_{i}" in images]
+        positive = node_helpers.conditioning_set_values(positive, {"reference_latents": refs}, append=True)
+        negative = node_helpers.conditioning_set_values(negative, {"reference_latents": refs}, append=True)
+        return io.NodeOutput(positive, negative)
+
+
+class HiDreamO1PatchSeamSmoothing(io.ComfyNode):
+ PATCH_SIZE = 32
+ EDGE_FEATHER = 4
+
+ # Shift presets per (pattern, N). 8-pass = 4-quadrant + 4 quarter-patch offsets.
+ SHIFTS_BY_PATTERN = {
+ ("single_shift", 2): [(0, 0), (16, 16)],
+ ("single_shift", 4): [(0, 0), (16, 0), (0, 16), (16, 16)],
+ ("single_shift", 8): [(0, 0), (16, 0), (0, 16), (16, 16),
+ (8, 8), (24, 8), (8, 24), (24, 24)],
+ ("symmetric", 2): [(-8, -8), (8, 8)],
+ ("symmetric", 4): [(-8, -8), (8, -8), (-8, 8), (8, 8)],
+ ("symmetric", 8): [(-12, -12), (4, -12), (-12, 4), (4, 4),
+ (-4, -4), (12, -4), (-4, 12), (12, 12)],
+ }
+ RAMP_LEVELS = {
+ "2": [2],
+ "4": [4],
+ "ramp_2_4": [2, 4],
+ "ramp_2_4_8": [2, 4, 8],
+ }
+
+ @staticmethod
+ def _hann_tile(cy: int, cx: int, size: int = 32) -> torch.Tensor:
+ """size x size Hann tile peaking at (cy, cx) within a patch."""
+ half = size // 2
+ yy = torch.arange(size).view(size, 1)
+ xx = torch.arange(size).view(1, size)
+ dy = ((yy - cy + half) % size) - half
+ dx = ((xx - cx + half) % size) - half
+ return 0.25 * (1 + torch.cos(torch.pi * dy / half)) * (1 + torch.cos(torch.pi * dx / half))
+
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        """Declare the node: one model input, the gating/blend controls, one model output."""
+
+        def unit_float(name: str, default: float, tip: str):
+            # Every float control on this node shares the 0..1 range and 0.01 step.
+            return io.Float.Input(id=name, default=default, min=0.0, max=1.0, step=0.01, tooltip=tip)
+
+        model_in = io.Model.Input(id="model")
+        start_in = unit_float(
+            "start_percent", 0.8,
+            "Sampling progress (0=start, 1=end) at which the blend turns ON.",
+        )
+        end_in = unit_float(
+            "end_percent", 1.0,
+            "Sampling progress at which the blend turns OFF.",
+        )
+        pattern_in = io.Combo.Input(
+            id="pattern",
+            options=["single_shift", "symmetric"],
+            default="single_shift",
+            tooltip="Shift layout. single_shift: one pass at the natural patch grid + others offset. symmetric: all passes off-grid, shifts split around origin.",
+        )
+        passes_in = io.Combo.Input(
+            id="passes",
+            options=["2", "4", "ramp_2_4", "ramp_2_4_8"],
+            default="2",
+            tooltip="Number of passes per gated step. 2/4 = fixed. ramp_*: pass count increases as sampling approaches end (more smoothing where seams are most visible).",
+        )
+        blend_in = io.Combo.Input(
+            id="blend",
+            options=["average", "window", "median"],
+            default="average",
+            tooltip="average: equal-weight mean. window: Hann-windowed weighting favoring each pass away from its patch boundaries. median: per-pixel median, rejects wraparound-outlier passes.",
+        )
+        strength_in = unit_float(
+            "strength", 1.0,
+            "Interpolation between the natural-grid pred (0) and the averaged result (1).",
+        )
+
+        summary = (
+            "Average the model output across multiple shifted patch-grid "
+            "positions during the late portion of sampling. Cancels seams."
+        )
+        return io.Schema(
+            node_id="HiDreamO1PatchSeamSmoothing",
+            display_name="HiDream-O1 Patch Seam Smoothing",
+            category="advanced/model",
+            is_experimental=True,
+            description=summary,
+            inputs=[model_in, start_in, end_in, pattern_in, passes_in, blend_in, strength_in],
+            outputs=[io.Model.Output()],
+        )
+
+    @classmethod
+    def execute(cls, *, model, start_percent: float, end_percent: float, pattern: str, passes: str, blend: str, strength: float) -> io.NodeOutput:
+        """Return a clone of ``model`` whose diffusion-model call is wrapped with seam smoothing.
+
+        Inside the gated range the wrapper re-runs the model on copies of the input
+        rolled by each configured ``(sy, sx)`` shift along the last two dims, rolls
+        the predictions back, blends them ("average", "window" or "median") and mixes
+        the blend into the unshifted prediction by ``strength`` under an edge mask.
+
+        The input model is returned unchanged when ``strength <= 0`` or when
+        ``end_percent <= start_percent`` (empty gate).
+        """
+        if strength <= 0.0 or end_percent <= start_percent:
+            return io.NodeOutput(model)
+
+        P = cls.PATCH_SIZE
+        half = P // 2
+        # One shift list per ramp level of the selected "passes" option.
+        shift_levels = [cls.SHIFTS_BY_PATTERN[(pattern, n)] for n in cls.RAMP_LEVELS[passes]]
+
+        if blend == "window":
+            # Per level: one P x P Hann tile per shift, stacked to (N, P, P).
+            window_tile_levels = [
+                torch.stack([cls._hann_tile((half - sy) % P, (half - sx) % P, P) for sy, sx in lst], dim=0)
+                for lst in shift_levels
+            ]
+        else:
+            window_tile_levels = [None] * len(shift_levels)
+
+        m = model.clone()
+        model_sampling = m.get_model_object("model_sampling")
+        multiplier = float(model_sampling.multiplier)
+        # Gate bounds scaled by the sampler multiplier; compared against args[1][0] below.
+        start_t = float(model_sampling.percent_to_sigma(start_percent)) * multiplier
+        end_t = float(model_sampling.percent_to_sigma(end_percent)) * multiplier
+
+        edge_ramp_cache: dict = {}
+
+        def get_edge_ramp(H: int, W: int, device, dtype) -> torch.Tensor:
+            """(H, W) mask: 0 within P px of any border, rising linearly to 1 over EDGE_FEATHER px."""
+            key = (H, W, device, dtype)
+            cached = edge_ramp_cache.get(key)
+            if cached is not None:
+                return cached
+            feather = cls.EDGE_FEATHER
+            # Distance of each row / column to its nearest image border.
+            ys = torch.minimum(torch.arange(H, device=device, dtype=torch.float32),
+                               (H - 1) - torch.arange(H, device=device, dtype=torch.float32))
+            xs = torch.minimum(torch.arange(W, device=device, dtype=torch.float32),
+                               (W - 1) - torch.arange(W, device=device, dtype=torch.float32))
+            y_mask = ((ys - P) / feather).clamp(0, 1)
+            x_mask = ((xs - P) / feather).clamp(0, 1)
+            ramp = (y_mask[:, None] * x_mask[None, :]).to(dtype)
+            edge_ramp_cache[key] = ramp
+            return ramp
+
+        def smoothing_wrapper(executor, *args, **kwargs):
+            """Run the wrapped call, then blend in shifted-grid predictions while gated."""
+            x = args[0]
+            t = float(args[1][0])
+            pred = executor(*args, **kwargs)
+            if not (end_t <= t <= start_t):
+                return pred
+            # Pick shift-level by sigma phase across the gated range.
+            if len(shift_levels) == 1:
+                level_idx = 0
+            else:
+                # phase runs from 0 at start_t to 1 at end_t.
+                phase = (start_t - t) / max(start_t - end_t, 1e-8)
+                level_idx = min(int(phase * len(shift_levels)), len(shift_levels) - 1)
+            shifts = shift_levels[level_idx]
+            window_tiles = window_tile_levels[level_idx]
+
+            preds = []
+            for sy, sx in shifts:
+                if sy == 0 and sx == 0:
+                    # The unshifted pass is the prediction already computed above.
+                    preds.append(pred)
+                    continue
+                x_rolled = torch.roll(x, shifts=(sy, sx), dims=(-2, -1))
+                pred_rolled = executor(x_rolled, *args[1:], **kwargs)
+                preds.append(torch.roll(pred_rolled, shifts=(-sy, -sx), dims=(-2, -1)))
+            stacked = torch.stack(preds, dim=0)  # (N, B, C, H, W)
+            _, _, _, H, W = stacked.shape
+            if blend == "window":
+                N = stacked.shape[0]
+                tiles = window_tiles.to(device=stacked.device, dtype=stacked.dtype)
+                # Repeat ceil(H / P) x ceil(W / P) times so the crop below always yields
+                # exactly (H, W); floor division left the map too small whenever H or W
+                # was not a multiple of P, which broke the broadcast against `stacked`.
+                w = tiles.repeat(1, -(-H // P), -(-W // P))[:, :H, :W]
+                sum_w = w.sum(dim=0, keepdim=True)
+                # Where all windows are (near) zero, fall back to equal weights.
+                w = torch.where(sum_w < 1e-3, torch.full_like(w, 1.0 / N), w / sum_w.clamp(min=1e-8))
+                avg = (stacked * w[:, None, None, :, :]).sum(dim=0)
+            elif blend == "median":
+                avg = torch.median(stacked, dim=0).values
+            else:
+                avg = stacked.mean(dim=0)
+
+            # Mask out the P-px wraparound contamination strip at each edge.
+            mask = get_edge_ramp(H, W, pred.device, pred.dtype)
+            return pred * (1.0 - mask * strength) + avg * (mask * strength)
+
+        m.add_wrapper_with_key(comfy.patcher_extension.WrappersMP.DIFFUSION_MODEL, "hidream_o1_patch_seam_smoothing", smoothing_wrapper)
+        return io.NodeOutput(m)
+
+
+class HiDreamO1Extension(ComfyExtension):
+    """Extension that exposes the HiDream-O1 node classes."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        node_types: list[type[io.ComfyNode]] = [
+            EmptyHiDreamO1LatentImage,
+            HiDreamO1ReferenceImages,
+            HiDreamO1PatchSeamSmoothing,
+        ]
+        return node_types
+
+
+async def comfy_entrypoint() -> HiDreamO1Extension:
+    """Create and return the extension instance for this module."""
+    extension = HiDreamO1Extension()
+    return extension
diff --git a/comfy_extras/nodes_hunyuan.py b/comfy_extras/nodes_hunyuan.py
index 4ea93a499..16fff12af 100644
--- a/comfy_extras/nodes_hunyuan.py
+++ b/comfy_extras/nodes_hunyuan.py
@@ -41,7 +41,7 @@ class EmptyHunyuanLatentVideo(io.ComfyNode):
return io.Schema(
node_id="EmptyHunyuanLatentVideo",
display_name="Empty HunyuanVideo 1.0 Latent",
- category="latent/video",
+ category="model/latent/video",
inputs=[
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
@@ -81,7 +81,7 @@ class HunyuanVideo15ImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="HunyuanVideo15ImageToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -131,6 +131,8 @@ class HunyuanVideo15SuperResolution(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="HunyuanVideo15SuperResolution",
+ display_name="Hunyuan Video 1.5 Super Resolution",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -176,7 +178,7 @@ class LatentUpscaleModelLoader(io.ComfyNode):
return io.Schema(
node_id="LatentUpscaleModelLoader",
display_name="Load Latent Upscale Model",
- category="loaders",
+ category="model/loaders",
inputs=[
io.Combo.Input("model_name", options=folder_paths.get_filename_list("latent_upscale_models")),
],
@@ -225,7 +227,7 @@ class HunyuanVideo15LatentUpscaleWithModel(io.ComfyNode):
return io.Schema(
node_id="HunyuanVideo15LatentUpscaleWithModel",
display_name="Hunyuan Video 15 Latent Upscale With Model",
- category="latent",
+ category="model/latent",
inputs=[
io.LatentUpscaleModel.Input("model"),
io.Latent.Input("samples"),
@@ -306,7 +308,7 @@ class HunyuanImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="HunyuanImageToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Vae.Input("vae"),
@@ -357,7 +359,7 @@ class EmptyHunyuanImageLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="EmptyHunyuanImageLatent",
- category="latent",
+ category="model/latent",
inputs=[
io.Int.Input("width", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
io.Int.Input("height", default=2048, min=64, max=nodes.MAX_RESOLUTION, step=32),
@@ -381,6 +383,8 @@ class HunyuanRefinerLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="HunyuanRefinerLatent",
+ display_name="Hunyuan Latent Refiner",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
diff --git a/comfy_extras/nodes_hunyuan3d.py b/comfy_extras/nodes_hunyuan3d.py
index df0c3e4b1..60e530626 100644
--- a/comfy_extras/nodes_hunyuan3d.py
+++ b/comfy_extras/nodes_hunyuan3d.py
@@ -1,12 +1,7 @@
import torch
-import os
-import json
-import struct
-import numpy as np
from comfy.ldm.modules.diffusionmodules.mmdit import get_1d_sincos_pos_embed_from_grid_torch
-import folder_paths
import comfy.model_management
-from comfy.cli_args import args
+from comfy_extras.nodes_save_3d import pack_variable_mesh_batch
from typing_extensions import override
from comfy_api.latest import ComfyExtension, IO, Types
from comfy_api.latest._util import MESH, VOXEL # only for backward compatibility if someone import it from this file (will be removed later) # noqa
@@ -17,7 +12,7 @@ class EmptyLatentHunyuan3Dv2(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="EmptyLatentHunyuan3Dv2",
- category="latent/3d",
+ category="model/latent/3d",
inputs=[
IO.Int.Input("resolution", default=3072, min=1, max=8192),
IO.Int.Input("batch_size", default=1, min=1, max=4096, tooltip="The number of latent images in the batch."),
@@ -40,7 +35,7 @@ class Hunyuan3Dv2Conditioning(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="Hunyuan3Dv2Conditioning",
- category="conditioning/video_models",
+ category="model/conditioning/3d_models",
inputs=[
IO.ClipVisionOutput.Input("clip_vision_output"),
],
@@ -65,7 +60,7 @@ class Hunyuan3Dv2ConditioningMultiView(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="Hunyuan3Dv2ConditioningMultiView",
- category="conditioning/video_models",
+ category="model/conditioning/3d_models",
inputs=[
IO.ClipVisionOutput.Input("front", optional=True),
IO.ClipVisionOutput.Input("left", optional=True),
@@ -102,7 +97,7 @@ class VAEDecodeHunyuan3D(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="VAEDecodeHunyuan3D",
- category="latent/3d",
+ category="model/latent/3d",
inputs=[
IO.Latent.Input("samples"),
IO.Vae.Input("vae"),
@@ -424,14 +419,17 @@ class VoxelToMeshBasic(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="VoxelToMeshBasic",
+ display_name="Voxel to Mesh (Basic) (DEPRECATED)",
category="3d",
+ description="Converts a voxel grid to a mesh.",
+ is_deprecated=True, # This node is superseded by the Voxel To Mesh node
inputs=[
IO.Voxel.Input("voxel"),
IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01),
],
outputs=[
IO.Mesh.Output(),
- ]
+ ],
)
@classmethod
@@ -443,7 +441,9 @@ class VoxelToMeshBasic(IO.ComfyNode):
vertices.append(v)
faces.append(f)
- return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces)))
+ if vertices and all(v.shape == vertices[0].shape for v in vertices) and all(f.shape == faces[0].shape for f in faces):
+ return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces)))
+ return IO.NodeOutput(pack_variable_mesh_batch(vertices, faces))
decode = execute # TODO: remove
@@ -453,10 +453,12 @@ class VoxelToMesh(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="VoxelToMesh",
+ display_name="Voxel to Mesh",
category="3d",
+ description="Converts a voxel grid to a mesh.",
inputs=[
IO.Voxel.Input("voxel"),
- IO.Combo.Input("algorithm", options=["surface net", "basic"], advanced=True),
+ IO.Combo.Input("algorithm", options=["surface net", "basic"]),
IO.Float.Input("threshold", default=0.6, min=-1.0, max=1.0, step=0.01),
],
outputs=[
@@ -479,206 +481,13 @@ class VoxelToMesh(IO.ComfyNode):
vertices.append(v)
faces.append(f)
- return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces)))
+ if vertices and all(v.shape == vertices[0].shape for v in vertices) and all(f.shape == faces[0].shape for f in faces):
+ return IO.NodeOutput(Types.MESH(torch.stack(vertices), torch.stack(faces)))
+ return IO.NodeOutput(pack_variable_mesh_batch(vertices, faces))
decode = execute # TODO: remove
-def save_glb(vertices, faces, filepath, metadata=None):
- """
- Save PyTorch tensor vertices and faces as a GLB file without external dependencies.
-
- Parameters:
- vertices: torch.Tensor of shape (N, 3) - The vertex coordinates
- faces: torch.Tensor of shape (M, 3) - The face indices (triangle faces)
- filepath: str - Output filepath (should end with .glb)
- """
-
- # Convert tensors to numpy arrays
- vertices_np = vertices.cpu().numpy().astype(np.float32)
- faces_np = faces.cpu().numpy().astype(np.uint32)
-
- vertices_buffer = vertices_np.tobytes()
- indices_buffer = faces_np.tobytes()
-
- def pad_to_4_bytes(buffer):
- padding_length = (4 - (len(buffer) % 4)) % 4
- return buffer + b'\x00' * padding_length
-
- vertices_buffer_padded = pad_to_4_bytes(vertices_buffer)
- indices_buffer_padded = pad_to_4_bytes(indices_buffer)
-
- buffer_data = vertices_buffer_padded + indices_buffer_padded
-
- vertices_byte_length = len(vertices_buffer)
- vertices_byte_offset = 0
- indices_byte_length = len(indices_buffer)
- indices_byte_offset = len(vertices_buffer_padded)
-
- gltf = {
- "asset": {"version": "2.0", "generator": "ComfyUI"},
- "buffers": [
- {
- "byteLength": len(buffer_data)
- }
- ],
- "bufferViews": [
- {
- "buffer": 0,
- "byteOffset": vertices_byte_offset,
- "byteLength": vertices_byte_length,
- "target": 34962 # ARRAY_BUFFER
- },
- {
- "buffer": 0,
- "byteOffset": indices_byte_offset,
- "byteLength": indices_byte_length,
- "target": 34963 # ELEMENT_ARRAY_BUFFER
- }
- ],
- "accessors": [
- {
- "bufferView": 0,
- "byteOffset": 0,
- "componentType": 5126, # FLOAT
- "count": len(vertices_np),
- "type": "VEC3",
- "max": vertices_np.max(axis=0).tolist(),
- "min": vertices_np.min(axis=0).tolist()
- },
- {
- "bufferView": 1,
- "byteOffset": 0,
- "componentType": 5125, # UNSIGNED_INT
- "count": faces_np.size,
- "type": "SCALAR"
- }
- ],
- "meshes": [
- {
- "primitives": [
- {
- "attributes": {
- "POSITION": 0
- },
- "indices": 1,
- "mode": 4 # TRIANGLES
- }
- ]
- }
- ],
- "nodes": [
- {
- "mesh": 0
- }
- ],
- "scenes": [
- {
- "nodes": [0]
- }
- ],
- "scene": 0
- }
-
- if metadata is not None:
- gltf["asset"]["extras"] = metadata
-
- # Convert the JSON to bytes
- gltf_json = json.dumps(gltf).encode('utf8')
-
- def pad_json_to_4_bytes(buffer):
- padding_length = (4 - (len(buffer) % 4)) % 4
- return buffer + b' ' * padding_length
-
- gltf_json_padded = pad_json_to_4_bytes(gltf_json)
-
- # Create the GLB header
- # Magic glTF
- glb_header = struct.pack('<4sII', b'glTF', 2, 12 + 8 + len(gltf_json_padded) + 8 + len(buffer_data))
-
- # Create JSON chunk header (chunk type 0)
- json_chunk_header = struct.pack(' IO.NodeOutput:
- full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, folder_paths.get_output_directory())
- results = []
-
- metadata = {}
- if not args.disable_metadata:
- if cls.hidden.prompt is not None:
- metadata["prompt"] = json.dumps(cls.hidden.prompt)
- if cls.hidden.extra_pnginfo is not None:
- for x in cls.hidden.extra_pnginfo:
- metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x])
-
- if isinstance(mesh, Types.File3D):
- # Handle File3D input - save BytesIO data to output folder
- ext = mesh.format or "glb"
- f = f"{filename}_{counter:05}_.{ext}"
- mesh.save_to(os.path.join(full_output_folder, f))
- results.append({
- "filename": f,
- "subfolder": subfolder,
- "type": "output"
- })
- else:
- # Handle Mesh input - save vertices and faces as GLB
- for i in range(mesh.vertices.shape[0]):
- f = f"{filename}_{counter:05}_.glb"
- save_glb(mesh.vertices[i], mesh.faces[i], os.path.join(full_output_folder, f), metadata)
- results.append({
- "filename": f,
- "subfolder": subfolder,
- "type": "output"
- })
- counter += 1
- return IO.NodeOutput(ui={"3d": results})
-
-
class Hunyuan3dExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -689,7 +498,6 @@ class Hunyuan3dExtension(ComfyExtension):
VAEDecodeHunyuan3D,
VoxelToMeshBasic,
VoxelToMesh,
- SaveGLB,
]
diff --git a/comfy_extras/nodes_hypernetwork.py b/comfy_extras/nodes_hypernetwork.py
index 2a6a87a81..2d3f1bd05 100644
--- a/comfy_extras/nodes_hypernetwork.py
+++ b/comfy_extras/nodes_hypernetwork.py
@@ -102,7 +102,8 @@ class HypernetworkLoader(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="HypernetworkLoader",
- category="loaders",
+ display_name="Load Hypernetwork",
+ category="model/loaders",
inputs=[
IO.Model.Input("model"),
IO.Combo.Input("hypernetwork_name", options=folder_paths.get_filename_list("hypernetworks")),
diff --git a/comfy_extras/nodes_hypertile.py b/comfy_extras/nodes_hypertile.py
index 354d96db1..2a96416be 100644
--- a/comfy_extras/nodes_hypertile.py
+++ b/comfy_extras/nodes_hypertile.py
@@ -27,7 +27,7 @@ class HyperTile(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="HyperTile",
- category="model_patches/unet",
+ category="model/patch/unet",
inputs=[
io.Model.Input("model"),
io.Int.Input("tile_size", default=256, min=1, max=2048, advanced=True),
diff --git a/comfy_extras/nodes_image_compare.py b/comfy_extras/nodes_image_compare.py
index 3d943be67..58af9ae82 100644
--- a/comfy_extras/nodes_image_compare.py
+++ b/comfy_extras/nodes_image_compare.py
@@ -11,7 +11,7 @@ class ImageCompare(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="ImageCompare",
- display_name="Image Compare",
+ display_name="Compare Images",
description="Compares two images side by side with a slider.",
category="image",
essentials_category="Image Tools",
diff --git a/comfy_extras/nodes_images.py b/comfy_extras/nodes_images.py
index a77f0641f..469a7be55 100644
--- a/comfy_extras/nodes_images.py
+++ b/comfy_extras/nodes_images.py
@@ -1,17 +1,23 @@
-from __future__ import annotations
-
import nodes
import folder_paths
+import av
import json
+
import os
import re
import math
+import numpy as np
+import struct
import torch
+
+import zlib
import comfy.utils
+from fractions import Fraction
from server import PromptServer
from comfy_api.latest import ComfyExtension, IO, UI
+from comfy.cli_args import args
from typing_extensions import override
SVG = IO.SVG.Type # TODO: temporary solution for backward compatibility, will be removed later.
@@ -24,7 +30,7 @@ class ImageCrop(IO.ComfyNode):
return IO.Schema(
node_id="ImageCrop",
search_aliases=["trim"],
- display_name="Image Crop (Deprecated)",
+ display_name="Crop Image (DEPRECATED)",
category="image/transform",
is_deprecated=True,
essentials_category="Image Tools",
@@ -55,9 +61,10 @@ class ImageCropV2(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="ImageCropV2",
- search_aliases=["trim"],
- display_name="Image Crop",
+ search_aliases=["crop", "cut", "trim"],
+ display_name="Crop Image",
category="image/transform",
+ description = "Crop an image to the specified dimensions.",
essentials_category="Image Tools",
has_intermediate_output=True,
inputs=[
@@ -88,7 +95,7 @@ class BoundingBox(IO.ComfyNode):
return IO.Schema(
node_id="PrimitiveBoundingBox",
display_name="Bounding Box",
- category="utils/primitive",
+ category="utilities/primitive",
inputs=[
IO.Int.Input("x", default=0, min=0, max=MAX_RESOLUTION),
IO.Int.Input("y", default=0, min=0, max=MAX_RESOLUTION),
@@ -109,6 +116,7 @@ class RepeatImageBatch(IO.ComfyNode):
return IO.Schema(
node_id="RepeatImageBatch",
search_aliases=["duplicate image", "clone image"],
+ display_name="Repeat Image Batch",
category="image/batch",
inputs=[
IO.Image.Input("image"),
@@ -131,10 +139,11 @@ class ImageFromBatch(IO.ComfyNode):
return IO.Schema(
node_id="ImageFromBatch",
search_aliases=["select image", "pick from batch", "extract image"],
+ display_name="Get Image from Batch",
category="image/batch",
inputs=[
IO.Image.Input("image"),
- IO.Int.Input("batch_index", default=0, min=0, max=4095),
+ IO.Int.Input("batch_index", default=0, min=-MAX_RESOLUTION, max=MAX_RESOLUTION),
IO.Int.Input("length", default=1, min=1, max=4096),
],
outputs=[IO.Image.Output()],
@@ -143,7 +152,9 @@ class ImageFromBatch(IO.ComfyNode):
@classmethod
def execute(cls, image, batch_index, length) -> IO.NodeOutput:
s_in = image
- batch_index = min(s_in.shape[0] - 1, batch_index)
+ if batch_index < 0:
+ batch_index += s_in.shape[0]
+ batch_index = max(0, min(s_in.shape[0] - 1, batch_index))
length = min(s_in.shape[0] - batch_index, length)
s = s_in[batch_index:batch_index + length].clone()
return IO.NodeOutput(s)
@@ -157,7 +168,8 @@ class ImageAddNoise(IO.ComfyNode):
return IO.Schema(
node_id="ImageAddNoise",
search_aliases=["film grain"],
- category="image",
+ display_name="Add Noise to Image",
+ category="image/filters",
inputs=[
IO.Image.Input("image"),
IO.Int.Input(
@@ -189,7 +201,8 @@ class SaveAnimatedWEBP(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="SaveAnimatedWEBP",
- category="image/animation",
+ display_name="Save Animated WEBP",
+ category="image",
inputs=[
IO.Image.Input("images"),
IO.String.Input("filename_prefix", default="ComfyUI"),
@@ -226,7 +239,8 @@ class SaveAnimatedPNG(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="SaveAnimatedPNG",
- category="image/animation",
+ display_name="Save Animated PNG",
+ category="image",
inputs=[
IO.Image.Input("images"),
IO.String.Input("filename_prefix", default="ComfyUI"),
@@ -259,7 +273,7 @@ class ImageStitch(IO.ComfyNode):
return IO.Schema(
node_id="ImageStitch",
search_aliases=["combine images", "join images", "concatenate images", "side by side"],
- display_name="Image Stitch",
+ display_name="Stitch Images",
description="Stitches image2 to image1 in the specified direction.\n"
"If image2 is not provided, returns image1 unchanged.\n"
"Optional spacing can be added between images.",
@@ -434,6 +448,7 @@ class ResizeAndPadImage(IO.ComfyNode):
return IO.Schema(
node_id="ResizeAndPadImage",
search_aliases=["fit to size"],
+ display_name="Resize And Pad Image",
category="image/transform",
inputs=[
IO.Image.Input("image"),
@@ -485,8 +500,9 @@ class SaveSVGNode(IO.ComfyNode):
return IO.Schema(
node_id="SaveSVGNode",
search_aliases=["export vector", "save vector graphics"],
+ display_name="Save SVG",
description="Save SVG files on disk.",
- category="image/save",
+ category="image",
inputs=[
IO.SVG.Input("svg"),
IO.String.Input(
@@ -591,7 +607,7 @@ class ImageRotate(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="ImageRotate",
- display_name="Image Rotate",
+ display_name="Rotate Image",
search_aliases=["turn", "flip orientation"],
category="image/transform",
essentials_category="Image Tools",
@@ -624,6 +640,7 @@ class ImageFlip(IO.ComfyNode):
return IO.Schema(
node_id="ImageFlip",
search_aliases=["mirror", "reflect"],
+ display_name="Flip Image",
category="image/transform",
inputs=[
IO.Image.Input("image"),
@@ -650,6 +667,7 @@ class ImageScaleToMaxDimension(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="ImageScaleToMaxDimension",
+ display_name="Scale Image to Max Dimension",
category="image/upscaling",
inputs=[
IO.Image.Input("image"),
@@ -709,7 +727,7 @@ class SplitImageToTileList(IO.ComfyNode):
def get_grid_coords(width, height, tile_width, tile_height, overlap):
coords = []
stride_x = round(max(tile_width * 0.25, tile_width - overlap))
- stride_y = round(max(tile_width * 0.25, tile_height - overlap))
+ stride_y = round(max(tile_height * 0.25, tile_height - overlap))
y = 0
while y < height:
@@ -823,6 +841,405 @@ class ImageMergeTileList(IO.ComfyNode):
return IO.NodeOutput(merged_image)
+# ---------------------------------------------------------------------------
+# Format specifications
+# ---------------------------------------------------------------------------
+
+# Maps (file_format, bit_depth, has_alpha) -> encode spec dict with keys:
+# "scale" - multiplier applied to the tensor on the integer (PNG) path,
+# "dtype" - numpy dtype of the array handed to av.VideoFrame.from_ndarray,
+# "frame_fmt" - pixel format the frame is created in,
+# "stream_fmt" - pixel format set on the codec; the frame is reformatted to
+# it when it differs from "frame_fmt".
+# Keeps the encode path declarative instead of branchy.
+_FORMAT_SPECS = {
+ ("png", "8-bit", False): {"scale": 255.0, "dtype": np.uint8, "frame_fmt": "rgb24", "stream_fmt": "rgb24"},
+ ("png", "8-bit", True): {"scale": 255.0, "dtype": np.uint8, "frame_fmt": "rgba", "stream_fmt": "rgba"},
+ ("png", "16-bit", False): {"scale": 65535.0, "dtype": np.uint16, "frame_fmt": "rgb48le", "stream_fmt": "rgb48be"},
+ ("png", "16-bit", True): {"scale": 65535.0, "dtype": np.uint16, "frame_fmt": "rgba64le", "stream_fmt": "rgba64be"},
+ ("exr", "32-bit float", False): {"scale": 1.0, "dtype": np.float32, "frame_fmt": "gbrpf32le", "stream_fmt": "gbrpf32le"},
+ ("exr", "32-bit float", True): {"scale": 1.0, "dtype": np.float32, "frame_fmt": "gbrapf32le", "stream_fmt": "gbrapf32le"},
+}
+
+
+# ---------------------------------------------------------------------------
+# Color transforms
+# ---------------------------------------------------------------------------
+
+def srgb_to_linear(t: torch.Tensor) -> torch.Tensor:
+    """Decode sRGB-encoded values to linear light (inverse sRGB EOTF, IEC 61966-2-1).
+
+    When the last dimension has 4 entries the 4th is treated as alpha and is
+    returned untouched; only the first three channels are decoded.
+    """
+    if t.shape[-1] == 4:
+        decoded_rgb = srgb_to_linear(t[..., :3])
+        return torch.cat([decoded_rgb, t[..., 3:]], dim=-1)
+
+    # Two segments: a linear toe up to 0.04045 and a 2.4-power curve above it.
+    toe = t / 12.92
+    shifted = (t.clamp(min=0.0) + 0.055) / 1.055
+    curve = shifted ** 2.4
+    return torch.where(t <= 0.04045, toe, curve)
+
+
+# HLG OETF constants from BT.2100 Table 5.
+_HLG_A = 0.17883277
+_HLG_B = 0.28466892
+_HLG_C = 0.55991072928  # = 0.5 - a*ln(4*a)
+
+
+def hlg_to_linear(t: torch.Tensor) -> torch.Tensor:
+    """Decode an HLG-encoded signal to scene-linear light (inverse HLG OETF, BT.2100).
+
+    Only the OETF is inverted (no OOTF), so the result is scene light rather than
+    display light. When the last dimension has 4 entries the 4th is treated as
+    alpha and is returned untouched.
+    """
+    if t.shape[-1] == 4:
+        decoded_rgb = hlg_to_linear(t[..., :3])
+        return torch.cat([decoded_rgb, t[..., 3:]], dim=-1)
+
+    # Square-law segment, selected for signal values up to 0.5.
+    square_part = (t ** 2) / 3.0
+    # Exponential segment, selected above 0.5. The input is clamped to at least
+    # _HLG_C before exponentiation; values above 1.0 are not clamped.
+    exponent = (t.clamp(min=_HLG_C) - _HLG_C) / _HLG_A
+    exp_part = (torch.exp(exponent) + _HLG_B) / 12.0
+    return torch.where(t <= 0.5, square_part, exp_part)
+
+
+# ---------------------------------------------------------------------------
+# Metadata injection
+# ---------------------------------------------------------------------------
+
+_PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n"
+
+
+def _png_chunk(chunk_type: bytes, data: bytes) -> bytes:
+    """Serialize one PNG chunk as big-endian length, type, data, CRC32 over type+data."""
+    length_field = struct.pack(">I", len(data))
+    # Running CRC: seeding with the type's CRC equals hashing type + data in one go.
+    checksum = zlib.crc32(data, zlib.crc32(chunk_type)) & 0xFFFFFFFF
+    return b"".join((length_field, chunk_type, data, struct.pack(">I", checksum)))
+
+
+def _png_text_chunk(keyword: str, text: str) -> bytes:
+    """Build a tEXt chunk whose payload is keyword, a NUL byte, then text (both latin-1).
+
+    Characters in ``text`` that latin-1 cannot represent are substituted by the
+    codec's "replace" handler; ``keyword`` is encoded strictly.
+    """
+    key_bytes = keyword.encode("latin-1")
+    text_bytes = text.encode("latin-1", errors="replace")
+    return _png_chunk(b"tEXt", key_bytes + b"\x00" + text_bytes)
+
+
+def inject_png_metadata(png_bytes: bytes, prompt: dict | None, extra_pnginfo: dict | None) -> bytes:
+    """Return ``png_bytes`` with prompt/workflow tEXt chunks spliced in right after IHDR.
+
+    ``prompt`` is stored under the keyword "prompt"; every ``extra_pnginfo`` item
+    is stored under its own key. Values are JSON-encoded. The input is returned
+    unchanged when it does not start with the PNG signature or when there is
+    nothing to write.
+    """
+    if not png_bytes.startswith(_PNG_SIGNATURE):
+        return png_bytes
+
+    entries: list = []
+    if prompt is not None:
+        entries.append(("prompt", prompt))
+    if extra_pnginfo:
+        entries.extend(extra_pnginfo.items())
+    if not entries:
+        return png_bytes
+
+    text_chunks = b"".join(_png_text_chunk(key, json.dumps(value)) for key, value in entries)
+
+    # IHDR is the first chunk: its length field sits directly after the 8-byte signature.
+    (ihdr_length,) = struct.unpack(">I", png_bytes[8:12])
+    split_at = 8 + 8 + ihdr_length + 4  # signature + (len+type) + data + crc
+    return png_bytes[:split_at] + text_chunks + png_bytes[split_at:]
+
+
+# Standard chromaticities (CIE 1931 xy) for the colorspaces this node writes.
+# Each tuple is (Rx, Ry, Gx, Gy, Bx, By, Wx, Wy). All share D65 white point.
+# Looked up by name in inject_exr_metadata and serialized with
+# _pack_chromaticities as the payload of the EXR `chromaticities` attribute.
+_CHROMATICITIES = {
+ # ITU-R BT.709 / sRGB primaries
+ "Rec.709": (0.6400, 0.3300, 0.3000, 0.6000, 0.1500, 0.0600, 0.3127, 0.3290),
+ # ITU-R BT.2020 (UHDTV / wide-gamut HDR) primaries
+ "Rec.2020": (0.7080, 0.2920, 0.1700, 0.7970, 0.1310, 0.0460, 0.3127, 0.3290),
+}
+
+
+def _pack_chromaticities(primaries: tuple) -> bytes:
+    """Pack the 8 chromaticity values as little-endian float32 (32 bytes total)."""
+    layout = struct.Struct("<8f")
+    return layout.pack(*primaries)
+
+
+def _exr_attribute(name: str, attr_type: str, value: bytes) -> bytes:
+ """Serialize one EXR header attribute: name\\0 type\\0 size:int32 value."""
+ return (
+ name.encode("utf-8") + b"\x00"
+ + attr_type.encode("utf-8") + b"\x00"
+ + struct.pack(" bytes:
+ """Insert ComfyUI metadata and color-space info into an EXR header.
+
+ Color: EXR pixels are linear by convention. The standard way to describe
+ their RGB→XYZ relationship is the `chromaticities` attribute. We pick the
+ primaries that match what the user told us their input was:
+
+ colorspace="sRGB" → Rec. 709 / sRGB primaries (D65)
+ colorspace="HDR" → Rec. 2020 / BT.2100 primaries (D65)
+
+ Pixels are always converted to linear scene light upstream (sRGB EOTF
+ inverse for sRGB; HLG OETF inverse for HDR), so the file content is
+ scene-linear in the indicated gamut. OpenEXR has no standard transfer-
+ function attribute (the OpenEXR TSC has discussed adding one but it
+ doesn't exist), so we don't invent one — `chromaticities` plus the EXR
+ linear-by-convention rule fully specifies the color.
+
+ Prompt/workflow: written as plain `string` attributes using the same keys
+ (`prompt`, `workflow`, ...) that Comfy uses for PNG tEXt chunks, so the
+ same readers can pull them out symmetrically.
+
+ Implementation note: the chunk-offset table that follows the header stores
+ *absolute* byte offsets into the file. Inserting N bytes into the header
+ means every offset must be incremented by N or the file becomes unreadable.
+ """
+ if len(exr_bytes) < 8 or exr_bytes[:4] != b"\x76\x2f\x31\x01":
+ return exr_bytes
+
+ new_blob = b""
+ if prompt is not None:
+ new_blob += _exr_attribute("prompt", "string", json.dumps(prompt).encode("utf-8"))
+ if extra_pnginfo:
+ for key, value in extra_pnginfo.items():
+ new_blob += _exr_attribute(key, "string", json.dumps(value).encode("utf-8"))
+ if colorspace is not None:
+ # Map each colorspace option to the RGB primaries the linear pixels
+ # are now in. "sRGB" and "linear" both produce Rec. 709 linear; "HDR"
+ # (HLG-encoded Rec. 2020 input) produces Rec. 2020 linear.
+ primaries_name = {
+ "sRGB": "Rec.709",
+ "linear": "Rec.709",
+ "HDR": "Rec.2020",
+ }.get(colorspace, "Rec.709")
+ new_blob += _exr_attribute(
+ "chromaticities",
+ "chromaticities",
+ _pack_chromaticities(_CHROMATICITIES[primaries_name]),
+ )
+ if not new_blob:
+ return exr_bytes
+
+ # Walk header attributes to find the terminating null byte, and pick up
+ # dataWindow + compression so we know how many chunks the offset table has.
+ pos = 8 # past magic (4) + version (4)
+ data_window = None
+ compression = 0
+ while pos < len(exr_bytes) and exr_bytes[pos] != 0:
+ name_end = exr_bytes.index(b"\x00", pos)
+ attr_name = exr_bytes[pos:name_end].decode("latin-1", errors="replace")
+ type_end = exr_bytes.index(b"\x00", name_end + 1)
+ attr_type = exr_bytes[name_end + 1:type_end].decode("latin-1", errors="replace")
+ size = struct.unpack(" bytes:
+ """Encode a single HxWxC tensor to PNG or EXR bytes in memory.
+
+ For EXR the input is interpreted according to `colorspace` and converted
+ to scene-linear (EXR's convention) before writing:
+
+ "sRGB" → input is sRGB-encoded Rec. 709; apply inverse sRGB EOTF.
+ "HDR" → input is HLG-encoded Rec. 2020 (BT.2100); apply inverse HLG
+ OETF to get scene-linear, per BT.2100 Note 5a.
+ "linear" → input is already scene-linear (Rec. 709 primaries); write
+ through unchanged. Use this for renderer/compositor output.
+
+ For PNG, colorspace selection does not modify pixels — PNG is delivered
+ sRGB-encoded and there is no PNG path for wide-gamut HDR in this node.
+ """
+ height, width, num_channels = img_tensor.shape
+ has_alpha = num_channels == 4
+
+ spec = _FORMAT_SPECS[(file_format, bit_depth, has_alpha)]
+
+ if spec["dtype"] == np.float32:
+ # EXR path: preserve full range, no clamp.
+ if colorspace == "sRGB":
+ img_tensor = srgb_to_linear(img_tensor)
+ elif colorspace == "HDR":
+ img_tensor = hlg_to_linear(img_tensor)
+ img_np = img_tensor.cpu().numpy().astype(np.float32)
+ else:
+ # PNG path: quantize to integer range.
+ scaled = (img_tensor * spec["scale"]).clamp(0, spec["scale"])
+ img_np = scaled.to(torch.int32).cpu().numpy().astype(spec["dtype"])
+
+ # Encode directly via CodecContext. PyAV's `image2` muxer does NOT write to
+ # BytesIO (it expects a real file path), so we bypass the container entirely.
+ # For single-frame PNG/EXR the raw codec output IS the file.
+ codec = av.CodecContext.create(file_format, "w")
+ codec.width = width
+ codec.height = height
+ codec.pix_fmt = spec["stream_fmt"]
+ codec.time_base = Fraction(1, 1)
+
+ frame = av.VideoFrame.from_ndarray(img_np, format=spec["frame_fmt"])
+ if spec["frame_fmt"] != spec["stream_fmt"]:
+ frame = frame.reformat(format=spec["stream_fmt"])
+ frame.pts = 0
+ frame.time_base = codec.time_base
+
+ packets = list(codec.encode(frame)) + list(codec.encode(None)) # flush with None
+ return b"".join(bytes(p) for p in packets)
+
+
+# ---------------------------------------------------------------------------
+# Node
+# ---------------------------------------------------------------------------
+
+class SaveImageAdvanced(IO.ComfyNode):
+    """Output node that writes images as PNG (8-bit / 16-bit) or EXR (32-bit float)."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare inputs: the images, a filename prefix and a per-format option group."""
+        return IO.Schema(
+            node_id="SaveImageAdvanced",
+            search_aliases=["save", "save image", "export image", "output image", "write image"],
+            display_name="Save Image (Advanced)",
+            description="Saves the input images to your ComfyUI output directory.",
+            category="image",
+            essentials_category="Basics",
+            inputs=[
+                IO.Image.Input("images", tooltip="The images to save."),
+                IO.String.Input(
+                    "filename_prefix",
+                    default="ComfyUI",
+                    tooltip=(
+                        "The prefix for the file to save. May include formatting tokens "
+                        "such as %date:yyyy-MM-dd% or %Empty Latent Image.width%."
+                    ),
+                ),
+                IO.DynamicCombo.Input(
+                    "format",
+                    options=[
+                        IO.DynamicCombo.Option("png", [
+                            IO.Combo.Input("bit_depth", options=["8-bit", "16-bit"],
+                                           default="8-bit", advanced=True),
+                            IO.Combo.Input("input_color_space", options=["sRGB"],
+                                           default="sRGB", advanced=True),
+                        ]),
+                        IO.DynamicCombo.Option("exr", [
+                            IO.Combo.Input("bit_depth", options=["32-bit float"],
+                                           default="32-bit float", advanced=True),
+                            IO.Combo.Input(
+                                "input_color_space",
+                                options=["sRGB", "HDR", "linear"],
+                                default="sRGB",
+                                advanced=True,
+                                tooltip=(
+                                    "Colorspace of the input tensor. The EXR is "
+                                    "always written as scene-linear in the matching "
+                                    "gamut.\n"
+                                    " 'sRGB' — input is sRGB-encoded Rec.709; "
+                                    "the inverse sRGB EOTF is applied.\n"
+                                    " 'HDR' — input is HLG-encoded Rec.2020 "
+                                    "(BT.2100); the inverse HLG OETF is applied "
+                                    "to get scene-linear light.\n"
+                                    " 'linear' — input is already scene-linear "
+                                    "(Rec.709 primaries); written through unchanged. "
+                                    "Use this for renderer/compositor output."
+                                ),
+                            ),
+                        ]),
+                    ],
+                    tooltip="The file format in which to save the image.",
+                ),
+            ],
+            hidden=[IO.Hidden.prompt, IO.Hidden.extra_pnginfo],
+            is_output_node=True,
+        )
+
+    @classmethod
+    def execute(cls, images, filename_prefix: str, format: dict) -> IO.NodeOutput:
+        """Encode each image in ``images`` and write it to the output directory.
+
+        ``format`` carries the selected "format" plus its "bit_depth" and, optionally,
+        "input_color_space" (defaults to "sRGB"). Prompt / workflow metadata is
+        embedded unless ``args.disable_metadata`` is set. Returns a UI payload
+        listing every file written.
+        """
+        file_format = format["format"]
+        bit_depth = format["bit_depth"]
+        colorspace = format.get("input_color_space", "sRGB")
+
+        output_dir = folder_paths.get_output_directory()
+        full_output_folder, filename, counter, subfolder, filename_prefix = (
+            folder_paths.get_save_image_path(
+                filename_prefix, output_dir, images[0].shape[1], images[0].shape[0]
+            )
+        )
+
+        prompt = cls.hidden.prompt
+        extra_pnginfo = cls.hidden.extra_pnginfo
+        write_metadata = not args.disable_metadata
+
+        results = []
+        for batch_number, image in enumerate(images):
+            encoded = _encode_image(image, file_format, bit_depth, colorspace)
+
+            # NOTE(review): the EXR `chromaticities` attribute is written by
+            # inject_exr_metadata, so it is skipped too when metadata is disabled;
+            # confirm that is intended.
+            if write_metadata:
+                if file_format == "png":
+                    encoded = inject_png_metadata(encoded, prompt, extra_pnginfo)
+                elif file_format == "exr":
+                    encoded = inject_exr_metadata(encoded, prompt, extra_pnginfo, colorspace)
+
+            name = filename.replace("%batch_num%", str(batch_number))
+            # Keep the "<name>_<counter>_.<ext>" shape with a trailing underscore after
+            # the counter. NOTE(review): folder_paths.get_save_image_path appears to
+            # recover the next counter from names of that shape; without the underscore
+            # the counter would not advance and earlier files would be overwritten.
+            file = f"{name}_{counter:05}_.{file_format}"
+            with open(os.path.join(full_output_folder, file), "wb") as f:
+                f.write(encoded)
+
+            results.append({"filename": file, "subfolder": subfolder, "type": "output"})
+            counter += 1
+
+        return IO.NodeOutput(ui={"images": results})
+
+
class ImagesExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[IO.ComfyNode]]:
@@ -835,6 +1252,7 @@ class ImagesExtension(ComfyExtension):
ImageAddNoise,
SaveAnimatedWEBP,
SaveAnimatedPNG,
+ SaveImageAdvanced,
SaveSVGNode,
ImageStitch,
ResizeAndPadImage,
diff --git a/comfy_extras/nodes_ip2p.py b/comfy_extras/nodes_ip2p.py
index 78f29915d..9c80834f0 100644
--- a/comfy_extras/nodes_ip2p.py
+++ b/comfy_extras/nodes_ip2p.py
@@ -9,7 +9,7 @@ class InstructPixToPixConditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="InstructPixToPixConditioning",
- category="conditioning/instructpix2pix",
+ category="model/conditioning/instructpix2pix",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
diff --git a/comfy_extras/nodes_kandinsky5.py b/comfy_extras/nodes_kandinsky5.py
index 346c50cde..015965498 100644
--- a/comfy_extras/nodes_kandinsky5.py
+++ b/comfy_extras/nodes_kandinsky5.py
@@ -13,7 +13,7 @@ class Kandinsky5ImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Kandinsky5ImageToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -71,7 +71,7 @@ class NormalizeVideoLatentStart(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="NormalizeVideoLatentStart",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
description="Normalizes the initial frames of a video latent to match the mean and standard deviation of subsequent reference frames. Helps reduce differences between the starting frames and the rest of the video.",
inputs=[
io.Latent.Input("latent"),
diff --git a/comfy_extras/nodes_latent.py b/comfy_extras/nodes_latent.py
index 8bb368dec..32da9e8ac 100644
--- a/comfy_extras/nodes_latent.py
+++ b/comfy_extras/nodes_latent.py
@@ -22,7 +22,7 @@ class LatentAdd(io.ComfyNode):
return io.Schema(
node_id="LatentAdd",
search_aliases=["combine latents", "sum latents"],
- category="latent/advanced",
+ category="model/latent/advanced",
inputs=[
io.Latent.Input("samples1"),
io.Latent.Input("samples2"),
@@ -49,7 +49,7 @@ class LatentSubtract(io.ComfyNode):
return io.Schema(
node_id="LatentSubtract",
search_aliases=["difference latent", "remove features"],
- category="latent/advanced",
+ category="model/latent/advanced",
inputs=[
io.Latent.Input("samples1"),
io.Latent.Input("samples2"),
@@ -76,7 +76,7 @@ class LatentMultiply(io.ComfyNode):
return io.Schema(
node_id="LatentMultiply",
search_aliases=["scale latent", "amplify latent", "latent gain"],
- category="latent/advanced",
+ category="model/latent/advanced",
inputs=[
io.Latent.Input("samples"),
io.Float.Input("multiplier", default=1.0, min=-10.0, max=10.0, step=0.01),
@@ -100,7 +100,7 @@ class LatentInterpolate(io.ComfyNode):
return io.Schema(
node_id="LatentInterpolate",
search_aliases=["blend latent", "mix latent", "lerp latent", "transition"],
- category="latent/advanced",
+ category="model/latent/advanced",
inputs=[
io.Latent.Input("samples1"),
io.Latent.Input("samples2"),
@@ -139,7 +139,7 @@ class LatentConcat(io.ComfyNode):
return io.Schema(
node_id="LatentConcat",
search_aliases=["join latents", "stitch latents"],
- category="latent/advanced",
+ category="model/latent/advanced",
inputs=[
io.Latent.Input("samples1"),
io.Latent.Input("samples2"),
@@ -179,7 +179,7 @@ class LatentCut(io.ComfyNode):
return io.Schema(
node_id="LatentCut",
search_aliases=["crop latent", "slice latent", "extract region"],
- category="latent/advanced",
+ category="model/latent/advanced",
inputs=[
io.Latent.Input("samples"),
io.Combo.Input("dim", options=["x", "y", "t"]),
@@ -220,7 +220,7 @@ class LatentCutToBatch(io.ComfyNode):
return io.Schema(
node_id="LatentCutToBatch",
search_aliases=["slice to batch", "split latent", "tile latent"],
- category="latent/advanced",
+ category="model/latent/advanced",
inputs=[
io.Latent.Input("samples"),
io.Combo.Input("dim", options=["t", "x", "y"]),
@@ -262,7 +262,7 @@ class LatentBatch(io.ComfyNode):
return io.Schema(
node_id="LatentBatch",
search_aliases=["combine latents", "merge latents", "join latents"],
- category="latent/batch",
+ category="model/latent/batch",
is_deprecated=True,
inputs=[
io.Latent.Input("samples1"),
@@ -290,7 +290,7 @@ class LatentBatchSeedBehavior(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LatentBatchSeedBehavior",
- category="latent/advanced",
+ category="model/latent/advanced",
inputs=[
io.Latent.Input("samples"),
io.Combo.Input("seed_behavior", options=["random", "fixed"], default="fixed"),
@@ -319,7 +319,7 @@ class LatentApplyOperation(io.ComfyNode):
return io.Schema(
node_id="LatentApplyOperation",
search_aliases=["transform latent"],
- category="latent/advanced/operations",
+ category="model/latent/advanced/operations",
is_experimental=True,
inputs=[
io.Latent.Input("samples"),
@@ -343,7 +343,7 @@ class LatentApplyOperationCFG(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LatentApplyOperationCFG",
- category="latent/advanced/operations",
+ category="model/latent/advanced/operations",
is_experimental=True,
inputs=[
io.Model.Input("model"),
@@ -375,7 +375,7 @@ class LatentOperationTonemapReinhard(io.ComfyNode):
return io.Schema(
node_id="LatentOperationTonemapReinhard",
search_aliases=["hdr latent"],
- category="latent/advanced/operations",
+ category="model/latent/advanced/operations",
is_experimental=True,
inputs=[
io.Float.Input("multiplier", default=1.0, min=0.0, max=100.0, step=0.01),
@@ -410,7 +410,7 @@ class LatentOperationSharpen(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LatentOperationSharpen",
- category="latent/advanced/operations",
+ category="model/latent/advanced/operations",
is_experimental=True,
inputs=[
io.Int.Input("sharpen_radius", default=9, min=1, max=31, step=1, advanced=True),
@@ -447,7 +447,7 @@ class ReplaceVideoLatentFrames(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ReplaceVideoLatentFrames",
- category="latent/batch",
+ category="model/latent/batch",
inputs=[
io.Latent.Input("destination", tooltip="The destination latent where frames will be replaced."),
io.Latent.Input("source", optional=True, tooltip="The source latent providing frames to insert into the destination latent. If not provided, the destination latent is returned unchanged."),
diff --git a/comfy_extras/nodes_logic.py b/comfy_extras/nodes_logic.py
index c066064ac..95f6ab848 100644
--- a/comfy_extras/nodes_logic.py
+++ b/comfy_extras/nodes_logic.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
from typing import TypedDict
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
@@ -8,6 +7,82 @@ from comfy_api.latest import _io
MISSING = object()
+class NotNode(io.ComfyNode):
+    """Logic node that negates a single input of any type."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the "Not" node: one any-typed input and one boolean output."""
+        value_input = io.AnyType.Input("value")
+        result_output = io.Boolean.Output()
+        return io.Schema(
+            node_id="ComfyNotNode",
+            display_name="Not",
+            category="utilities/logic",
+            description="Logical NOT operation. Returns true if the value is falsy. Uses Python's rules for truthiness.",
+            search_aliases=["invert", "toggle", "negate", "flip boolean"],
+            inputs=[value_input],
+            outputs=[result_output],
+        )
+
+    @classmethod
+    def execute(cls, value) -> io.NodeOutput:
+        """Return the Python-truthiness negation of `value`."""
+        negated = not value
+        return io.NodeOutput(negated)
+
+
+class AndNode(io.ComfyNode):
+    """Logic node that ANDs together a growable group of any-typed inputs."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the "And" node: an autogrow "values" input and a boolean output."""
+        # Template for the growable slots: any-typed inputs using the "value" prefix.
+        growable_values = io.Autogrow.Input(
+            "values",
+            template=io.Autogrow.TemplatePrefix(
+                input=io.AnyType.Input("value"),
+                prefix="value",
+                min=1,
+            ),
+        )
+        return io.Schema(
+            node_id="ComfyAndNode",
+            display_name="And",
+            category="utilities/logic",
+            description="Logical AND operation. Returns true if all of the values are truthy. Uses Python's rules for truthiness.",
+            search_aliases=["all", "every"],
+            inputs=[growable_values],
+            outputs=[io.Boolean.Output()],
+        )
+
+    @classmethod
+    def execute(cls, values: io.Autogrow.Type) -> io.NodeOutput:
+        """Return True when every supplied value is truthy (True for an empty mapping)."""
+        supplied = values.values()
+        return io.NodeOutput(all(supplied))
+
+
+class OrNode(io.ComfyNode):
+    """Logic node that ORs together a growable group of any-typed inputs."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the "Or" node: an autogrow "values" input and a boolean output."""
+        # Template for the growable slots: any-typed inputs using the "value" prefix.
+        growable_values = io.Autogrow.Input(
+            "values",
+            template=io.Autogrow.TemplatePrefix(
+                input=io.AnyType.Input("value"),
+                prefix="value",
+                min=1,
+            ),
+        )
+        return io.Schema(
+            node_id="ComfyOrNode",
+            display_name="Or",
+            category="utilities/logic",
+            description="Logical OR operation. Returns true if any of the values are truthy. Uses Python's rules for truthiness.",
+            search_aliases=["any", "some"],
+            inputs=[growable_values],
+            outputs=[io.Boolean.Output()],
+        )
+
+    @classmethod
+    def execute(cls, values: io.Autogrow.Type) -> io.NodeOutput:
+        """Return True when at least one supplied value is truthy (False for an empty mapping)."""
+        supplied = values.values()
+        return io.NodeOutput(any(supplied))
+
+
class SwitchNode(io.ComfyNode):
@classmethod
def define_schema(cls):
@@ -15,7 +90,7 @@ class SwitchNode(io.ComfyNode):
return io.Schema(
node_id="ComfySwitchNode",
display_name="Switch",
- category="logic",
+ category="utilities/logic",
is_experimental=True,
inputs=[
io.Boolean.Input("switch"),
@@ -46,7 +121,7 @@ class SoftSwitchNode(io.ComfyNode):
return io.Schema(
node_id="ComfySoftSwitchNode",
display_name="Soft Switch",
- category="logic",
+ category="utilities/logic",
is_experimental=True,
inputs=[
io.Boolean.Input("switch"),
@@ -101,7 +176,7 @@ class CustomComboNode(io.ComfyNode):
return io.Schema(
node_id="CustomCombo",
display_name="Custom Combo",
- category="utils",
+ category="utilities",
is_experimental=True,
inputs=[io.Combo.Input("choice", options=[])],
outputs=[
@@ -136,7 +211,7 @@ class DCTestNode(io.ComfyNode):
return io.Schema(
node_id="DCTestNode",
display_name="DCTest",
- category="logic",
+ category="utilities/logic",
is_output_node=True,
inputs=[io.DynamicCombo.Input("combo", options=[
io.DynamicCombo.Option("option1", [io.String.Input("string")]),
@@ -174,7 +249,7 @@ class AutogrowNamesTestNode(io.ComfyNode):
return io.Schema(
node_id="AutogrowNamesTestNode",
display_name="AutogrowNamesTest",
- category="logic",
+ category="utilities/logic",
inputs=[
_io.Autogrow.Input("autogrow", template=template)
],
@@ -194,7 +269,7 @@ class AutogrowPrefixTestNode(io.ComfyNode):
return io.Schema(
node_id="AutogrowPrefixTestNode",
display_name="AutogrowPrefixTest",
- category="logic",
+ category="utilities/logic",
inputs=[
_io.Autogrow.Input("autogrow", template=template)
],
@@ -213,7 +288,7 @@ class ComboOutputTestNode(io.ComfyNode):
return io.Schema(
node_id="ComboOptionTestNode",
display_name="ComboOptionTest",
- category="logic",
+ category="utilities/logic",
inputs=[io.Combo.Input("combo", options=["option1", "option2", "option3"]),
io.Combo.Input("combo2", options=["option4", "option5", "option6"])],
outputs=[io.Combo.Output(), io.Combo.Output()],
@@ -230,7 +305,7 @@ class ConvertStringToComboNode(io.ComfyNode):
node_id="ConvertStringToComboNode",
search_aliases=["string to dropdown", "text to combo"],
display_name="Convert String to Combo",
- category="logic",
+ category="utilities/logic",
inputs=[io.String.Input("string")],
outputs=[io.Combo.Output()],
)
@@ -246,7 +321,7 @@ class InvertBooleanNode(io.ComfyNode):
node_id="InvertBooleanNode",
search_aliases=["not", "toggle", "negate", "flip boolean"],
display_name="Invert Boolean",
- category="logic",
+ category="utilities/logic",
inputs=[io.Boolean.Input("boolean")],
outputs=[io.Boolean.Output()],
)
@@ -261,6 +336,9 @@ class LogicExtension(ComfyExtension):
return [
SwitchNode,
CustomComboNode,
+ NotNode,
+ AndNode,
+ OrNode,
# SoftSwitchNode,
# ConvertStringToComboNode,
# DCTestNode,
diff --git a/comfy_extras/nodes_lora_debug.py b/comfy_extras/nodes_lora_debug.py
index 937a0fbfb..3f68064e5 100644
--- a/comfy_extras/nodes_lora_debug.py
+++ b/comfy_extras/nodes_lora_debug.py
@@ -30,7 +30,7 @@ class LoraLoaderBypass:
OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.")
FUNCTION = "load_lora"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
DESCRIPTION = "Apply LoRA in bypass mode. Unlike regular LoRA, this doesn't modify model weights - instead it injects the LoRA computation during forward pass. Useful for training scenarios."
EXPERIMENTAL = True
diff --git a/comfy_extras/nodes_lora_extract.py b/comfy_extras/nodes_lora_extract.py
index 975f90f45..bcd249c29 100644
--- a/comfy_extras/nodes_lora_extract.py
+++ b/comfy_extras/nodes_lora_extract.py
@@ -91,7 +91,7 @@ class LoraSave(io.ComfyNode):
node_id="LoraSave",
search_aliases=["export lora"],
display_name="Extract and Save Lora",
- category="_for_testing",
+ category="experimental",
inputs=[
io.String.Input("filename_prefix", default="loras/ComfyUI_extracted_lora"),
io.Int.Input("rank", default=8, min=1, max=4096, step=1, advanced=True),
diff --git a/comfy_extras/nodes_lotus.py b/comfy_extras/nodes_lotus.py
index 9f62ba2bf..9fe4c5c7b 100644
--- a/comfy_extras/nodes_lotus.py
+++ b/comfy_extras/nodes_lotus.py
@@ -10,7 +10,7 @@ class LotusConditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LotusConditioning",
- category="conditioning/lotus",
+ category="model/conditioning/lotus",
inputs=[],
outputs=[io.Conditioning.Output(display_name="conditioning")],
)
diff --git a/comfy_extras/nodes_lt.py b/comfy_extras/nodes_lt.py
index d7c2e8744..6d6078abe 100644
--- a/comfy_extras/nodes_lt.py
+++ b/comfy_extras/nodes_lt.py
@@ -1,6 +1,7 @@
import nodes
import node_helpers
import torch
+import torchaudio
import comfy.model_management
import comfy.model_sampling
import comfy.samplers
@@ -13,12 +14,55 @@ from typing_extensions import override
from comfy.ldm.lightricks.symmetric_patchifier import SymmetricPatchifier, latent_to_pixel_coords
from comfy_api.latest import ComfyExtension, io
+ICLoRAParameters = io.Custom("IC_LORA_PARAMETERS")
+
+
+class GetICLoRAParameters(io.ComfyNode):
+    """Node that reads IC-LoRA parameters from a model's "lora_metadata" attachment."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node: one model input and one IC_LORA_PARAMETERS output."""
+        return io.Schema(
+            node_id="GetICLoRAParameters",
+            display_name="Get IC-LoRA Parameters",
+            description="Extracts IC-LoRA parameters from the safetensors metadata of a LoRA-loaded "
+                        "model and outputs them for LTXVAddGuide (eg. reference_downscale_factor).",
+            category="model/conditioning/video_models",
+            search_aliases=["ic-lora", "ic lora", "iclora", "downscale factor", "reference downscale"],
+            inputs=[
+                io.Model.Input(
+                    "iclora_model",
+                    tooltip="Direct output from a LoRA Loader for the specific IC-LoRA "
+                            "from which to extract the metadata.",
+                ),
+            ],
+            outputs=[
+                ICLoRAParameters.Output(
+                    "iclora_parameters",
+                    tooltip="IC-LoRA parameters extracted from the LoRA metadata "
+                            "(eg. reference_downscale_factor). Connect to LTXVAddGuide "
+                            "if the LoRA requires special handling of the guides.",
+                ),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, iclora_model) -> io.NodeOutput:
+        """Build the IC-LoRA parameter dict for `iclora_model`.
+
+        Reads "reference_downscale_factor" from the model's "lora_metadata"
+        attachment, rounds it to an integer and clamps it to at least 1.
+        Missing metadata, or a missing, malformed or non-finite value, yields
+        the default of 1.
+
+        Returns:
+            io.NodeOutput wrapping {"reference_downscale_factor": <int >= 1>}.
+        """
+        metadata = iclora_model.get_attachment("lora_metadata")
+        factor = 1
+        if metadata:
+            try:
+                factor = max(1, round(float(metadata.get("reference_downscale_factor", 1))))
+            except (TypeError, ValueError, OverflowError):
+                # TypeError/ValueError: the value is not numeric, or is NaN.
+                # OverflowError: round() of +/-infinity (e.g. metadata "inf" or
+                # "1e999"); fall back to the default instead of failing the node.
+                factor = 1
+        parameters = {"reference_downscale_factor": factor}
+        return io.NodeOutput(parameters)
+
+
class EmptyLTXVLatentVideo(io.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
node_id="EmptyLTXVLatentVideo",
- category="latent/video/ltxv",
+ category="model/latent/video/ltxv",
inputs=[
io.Int.Input("width", default=768, min=64, max=nodes.MAX_RESOLUTION, step=32),
io.Int.Input("height", default=512, min=64, max=nodes.MAX_RESOLUTION, step=32),
@@ -33,7 +77,7 @@ class EmptyLTXVLatentVideo(io.ComfyNode):
@classmethod
def execute(cls, width, height, length, batch_size=1) -> io.NodeOutput:
latent = torch.zeros([batch_size, 128, ((length - 1) // 8) + 1, height // 32, width // 32], device=comfy.model_management.intermediate_device())
- return io.NodeOutput({"samples": latent})
+ return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 32})
generate = execute # TODO: remove
@@ -42,7 +86,7 @@ class LTXVImgToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVImgToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -87,7 +131,7 @@ class LTXVImgToVideoInplace(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVImgToVideoInplace",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Vae.Input("vae"),
io.Image.Input("image"),
@@ -105,12 +149,12 @@ class LTXVImgToVideoInplace(io.ComfyNode):
if bypass:
return (latent,)
- samples = latent["samples"]
+ samples = latent["samples"].clone()
_, height_scale_factor, width_scale_factor = (
vae.downscale_index_formula
)
- batch, _, latent_frames, latent_height, latent_width = samples.shape
+ _, _, _, latent_height, latent_width = samples.shape
width = latent_width * width_scale_factor
height = latent_height * height_scale_factor
@@ -123,11 +167,7 @@ class LTXVImgToVideoInplace(io.ComfyNode):
samples[:, :, :t.shape[2]] = t
- conditioning_latent_frames_mask = torch.ones(
- (batch, 1, latent_frames, 1, 1),
- dtype=torch.float32,
- device=samples.device,
- )
+ conditioning_latent_frames_mask = get_noise_mask(latent)
conditioning_latent_frames_mask[:, :, :t.shape[2]] = 1.0 - strength
return io.NodeOutput({"samples": samples, "noise_mask": conditioning_latent_frames_mask})
@@ -135,7 +175,7 @@ class LTXVImgToVideoInplace(io.ComfyNode):
generate = execute # TODO: remove
-def _append_guide_attention_entry(positive, negative, pre_filter_count, latent_shape, strength=1.0):
+def _append_guide_attention_entry(positive, negative, pre_filter_count, latent_shape, strength=1.0, attention_mask=None):
"""Append a guide_attention_entry to both positive and negative conditioning.
Each entry tracks one guide reference for per-reference attention control.
@@ -144,9 +184,10 @@ def _append_guide_attention_entry(positive, negative, pre_filter_count, latent_s
new_entry = {
"pre_filter_count": pre_filter_count,
"strength": strength,
- "pixel_mask": None,
+ "pixel_mask": attention_mask.unsqueeze(0).unsqueeze(0) if attention_mask is not None else None, # reshape to (1, 1, F, H, W)
"latent_shape": latent_shape,
}
+
results = []
for cond in (positive, negative):
# Read existing entries from this specific conditioning
@@ -156,8 +197,7 @@ def _append_guide_attention_entry(positive, negative, pre_filter_count, latent_s
if found is not None:
existing = found
break
- # Shallow copy and append (no deepcopy needed — entries contain
- # only scalars and None for pixel_mask at this call site).
+ # Shallow copy only and append (pixel_mask is never mutated).
entries = [*existing, new_entry]
results.append(node_helpers.conditioning_set_values(
cond, {"guide_attention_entries": entries}
@@ -186,10 +226,20 @@ def get_noise_mask(latent):
noise_mask = noise_mask.clone()
return noise_mask
-def get_keyframe_idxs(cond):
+def get_keyframe_idxs(cond, latent_shape=None):
keyframe_idxs = conditioning_get_any_value(cond, "keyframe_idxs", None)
if keyframe_idxs is None:
return None, 0
+ # Get number of keyframes from latent_shape or guide_attention_entries if available
+ if latent_shape is not None and len(latent_shape) == 5:
+ tokens_per_frame = latent_shape[-2] * latent_shape[-1]
+ num_keyframes = keyframe_idxs.shape[2] // tokens_per_frame
+ return keyframe_idxs, num_keyframes
+ entries = conditioning_get_any_value(cond, "guide_attention_entries", None)
+ if entries:
+ num_keyframes = sum(e["latent_shape"][0] for e in entries)
+ return keyframe_idxs, num_keyframes
+ # fallback, may under-count if keyframes share t-start
# keyframe_idxs contains start/end positions (last dimension), checking for unqiue values only for start
num_keyframes = torch.unique(keyframe_idxs[:, 0, :, 0]).shape[0]
return keyframe_idxs, num_keyframes
@@ -201,7 +251,7 @@ class LTXVAddGuide(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVAddGuide",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -222,7 +272,21 @@ class LTXVAddGuide(io.ComfyNode):
"For videos with 9+ frames, frame_idx must be divisible by 8, otherwise it will be rounded "
"down to the nearest multiple of 8. Negative values are counted from the end of the video.",
),
- io.Float.Input("strength", default=1.0, min=0.0, max=1.0, step=0.01),
+ io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
+ io.Mask.Input(
+ "attention_mask",
+ optional=True,
+ tooltip="Optional pixel-space spatial mask. Controls per-region "
+ "conditioning influence via self-attention, multiplied by strength.",
+ ),
+ ICLoRAParameters.Input(
+ "iclora_parameters",
+ optional=True,
+ tooltip="Optional IC-LoRA parameters from a Get IC-LoRA Parameters node. "
+ "Used for adjusting guide processing as required by certain IC-LoRAs "
+ "(eg. those with a reference_downscale_factor > 1). "
+ "When chained, each LTXVAddGuide uses only the parameters connected to it.",
+ ),
],
outputs=[
io.Conditioning.Output(display_name="positive"),
@@ -232,18 +296,45 @@ class LTXVAddGuide(io.ComfyNode):
)
@classmethod
- def encode(cls, vae, latent_width, latent_height, images, scale_factors):
+ def encode(cls, vae, latent_width, latent_height, images, scale_factors, latent_downscale_factor=1):
time_scale_factor, width_scale_factor, height_scale_factor = scale_factors
images = images[:(images.shape[0] - 1) // time_scale_factor * time_scale_factor + 1]
- pixels = comfy.utils.common_upscale(images.movedim(-1, 1), latent_width * width_scale_factor, latent_height * height_scale_factor, "bilinear", crop="disabled").movedim(1, -1)
+ target_width = int(latent_width * width_scale_factor / latent_downscale_factor)
+ target_height = int(latent_height * height_scale_factor / latent_downscale_factor)
+ pixels = comfy.utils.common_upscale(images.movedim(-1, 1), target_width, target_height, "bilinear", crop="center").movedim(1, -1)
encode_pixels = pixels[:, :, :, :3]
t = vae.encode(encode_pixels)
return encode_pixels, t
@classmethod
- def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors):
+    def dilate_latent(cls, guide_latent, latent_downscale_factor):
+        """Spread a guide latent over a grid that is `latent_downscale_factor` times larger in its last two dims.
+
+        Returns `(guide_latent, None)` untouched when the factor is <= 1.
+        Otherwise returns `(dilated, dilated_mask)`: `dilated` is zero except at
+        every `step`-th position of the last two dims, which hold the original
+        values; `dilated_mask` has a single channel and is 1.0 at those
+        positions and -1.0 everywhere else.
+        """
+        if latent_downscale_factor <= 1:
+            return guide_latent, None
+
+        step = int(latent_downscale_factor)
+        batch, channels, frames = guide_latent.shape[:3]
+        out_height = guide_latent.shape[3] * step
+        out_width = guide_latent.shape[4] * step
+        tensor_opts = {"device": guide_latent.device, "dtype": guide_latent.dtype}
+
+        # Original samples land on the strided positions; the rest stays zero.
+        dilated = torch.zeros((batch, channels, frames, out_height, out_width), **tensor_opts)
+        dilated[..., ::step, ::step] = guide_latent
+
+        # 1.0 marks positions holding original samples, -1.0 the inserted ones.
+        dilated_mask = torch.full((batch, 1, frames, out_height, out_width), -1.0, **tensor_opts)
+        dilated_mask[..., ::step, ::step] = 1.0
+        return dilated, dilated_mask
+
+    @classmethod
+    def get_reference_downscale_factor(cls, iclora_parameters):
+        """Return the integer reference downscale factor carried by `iclora_parameters`.
+
+        Args:
+            iclora_parameters: Mapping that may contain
+                "reference_downscale_factor", or a falsy value (None / empty).
+
+        Returns:
+            The value rounded to an int and clamped to at least 1. Returns 1
+            when the parameters are falsy or the value is missing, malformed
+            or non-finite.
+        """
+        if not iclora_parameters:
+            return 1
+        try:
+            factor = max(1, round(float(iclora_parameters.get("reference_downscale_factor", 1))))
+        except (TypeError, ValueError, OverflowError):
+            # TypeError/ValueError: the value is not numeric, or is NaN.
+            # OverflowError: round() of +/-infinity; treat it as "no downscale"
+            # instead of failing the node.
+            factor = 1
+        return factor
+
+ @classmethod
+ def get_latent_index(cls, cond, latent_length, guide_length, frame_idx, scale_factors, latent_shape=None):
time_scale_factor, _, _ = scale_factors
- _, num_keyframes = get_keyframe_idxs(cond)
+ _, num_keyframes = get_keyframe_idxs(cond, latent_shape)
latent_count = latent_length - num_keyframes
frame_idx = frame_idx if frame_idx >= 0 else max((latent_count - 1) * time_scale_factor + 1 + frame_idx, 0)
if guide_length > 1 and frame_idx != 0:
@@ -301,7 +392,7 @@ class LTXVAddGuide(io.ComfyNode):
else:
mask = torch.full(
(noise_mask.shape[0], 1, guiding_latent.shape[2], noise_mask.shape[3], noise_mask.shape[4]),
- 1.0 - strength,
+ max(0.0, 1.0 - strength), # clamp here to amplify only via the attention mask
dtype=noise_mask.dtype,
device=noise_mask.device,
)
@@ -321,7 +412,7 @@ class LTXVAddGuide(io.ComfyNode):
mask = torch.full(
(noise_mask.shape[0], 1, cond_length, 1, 1),
- 1.0 - strength,
+ max(0.0, 1.0 - strength), # clamp here to amplify only via the attention mask
dtype=noise_mask.dtype,
device=noise_mask.device,
)
@@ -335,15 +426,45 @@ class LTXVAddGuide(io.ComfyNode):
return latent_image, noise_mask
@classmethod
- def execute(cls, positive, negative, vae, latent, image, frame_idx, strength) -> io.NodeOutput:
+ def execute(cls, positive, negative, vae, latent, image, frame_idx, strength, attention_mask=None, iclora_parameters=None) -> io.NodeOutput:
scale_factors = vae.downscale_index_formula
latent_image = latent["samples"]
noise_mask = get_noise_mask(latent)
_, _, latent_length, latent_height, latent_width = latent_image.shape
- image, t = cls.encode(vae, latent_width, latent_height, image, scale_factors)
- frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors)
+ latent_downscale_factor = cls.get_reference_downscale_factor(iclora_parameters)
+ if latent_downscale_factor > 1:
+ if latent_width % latent_downscale_factor != 0 or latent_height % latent_downscale_factor != 0:
+ raise ValueError(
+ f"Latent spatial size {latent_width}x{latent_height} must be divisible by "
+ f"reference_downscale_factor {latent_downscale_factor} from the IC-LoRA parameters."
+ )
+
+ # For mid-video multi-frame guides, prepend+strip a throwaway first frame so the VAE's "first latent = 1 pixel frame" asymmetry lands on the discarded slot
+ time_scale_factor = scale_factors[0]
+ num_frames_to_keep = ((image.shape[0] - 1) // time_scale_factor) * time_scale_factor + 1
+ resolved_frame_idx = frame_idx
+ if frame_idx < 0:
+ _, num_keyframes = get_keyframe_idxs(positive, latent_image.shape)
+ resolved_frame_idx = max((latent_length - num_keyframes - 1) * time_scale_factor + 1 + frame_idx, 0)
+ causal_fix = resolved_frame_idx == 0 or num_frames_to_keep == 1
+
+ if not causal_fix:
+ image = torch.cat([image[:1], image], dim=0)
+
+ image, t = cls.encode(vae, latent_width, latent_height, image, scale_factors, latent_downscale_factor)
+
+ if not causal_fix:
+ t = t[:, :, 1:, :, :]
+ image = image[1:]
+
+ guide_latent_shape = list(t.shape[2:]) # pre-dilation [F, H, W] for spatial-mask downsampling
+ guide_mask = None
+ if latent_downscale_factor > 1:
+ t, guide_mask = cls.dilate_latent(t, latent_downscale_factor)
+
+ frame_idx, latent_idx = cls.get_latent_index(positive, latent_length, len(image), frame_idx, scale_factors, latent_shape=latent_image.shape)
assert latent_idx + t.shape[2] <= latent_length, "Conditioning frames exceed the length of the latent sequence."
positive, negative, latent_image, noise_mask = cls.append_keyframe(
@@ -355,13 +476,16 @@ class LTXVAddGuide(io.ComfyNode):
t,
strength,
scale_factors,
+ guide_mask=guide_mask,
+ latent_downscale_factor=latent_downscale_factor,
+ causal_fix=causal_fix,
)
# Track this guide for per-reference attention control.
pre_filter_count = t.shape[2] * t.shape[3] * t.shape[4]
- guide_latent_shape = list(t.shape[2:]) # [F, H, W]
positive, negative = _append_guide_attention_entry(
positive, negative, pre_filter_count, guide_latent_shape, strength=strength,
+ attention_mask=attention_mask,
)
return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask})
@@ -374,7 +498,7 @@ class LTXVCropGuides(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVCropGuides",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -392,7 +516,7 @@ class LTXVCropGuides(io.ComfyNode):
latent_image = latent["samples"].clone()
noise_mask = get_noise_mask(latent)
- _, num_keyframes = get_keyframe_idxs(positive)
+ _, num_keyframes = get_keyframe_idxs(positive, latent_image.shape)
if num_keyframes == 0:
return io.NodeOutput(positive, negative, {"samples": latent_image, "noise_mask": noise_mask},)
@@ -418,7 +542,7 @@ class LTXVConditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVConditioning",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -487,7 +611,7 @@ class LTXVScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Int.Input("steps", default=20, min=1, max=10000),
io.Float.Input("max_shift", default=2.05, min=0.0, max=100.0, step=0.01),
@@ -593,7 +717,8 @@ class LTXVPreprocess(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVPreprocess",
- category="image",
+ display_name="LTXV Preprocess",
+ category="video/preprocessors",
inputs=[
io.Image.Input("image"),
io.Int.Input(
@@ -621,7 +746,7 @@ class LTXVConcatAVLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVConcatAVLatent",
- category="latent/video/ltxv",
+ category="model/latent/video/ltxv",
inputs=[
io.Latent.Input("video_latent"),
io.Latent.Input("audio_latent"),
@@ -656,7 +781,7 @@ class LTXVSeparateAVLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="LTXVSeparateAVLatent",
- category="latent/video/ltxv",
+ category="model/latent/video/ltxv",
description="LTXV Separate AV Latent",
inputs=[
io.Latent.Input("av_latent"),
@@ -689,7 +814,7 @@ class LTXVReferenceAudio(io.ComfyNode):
return io.Schema(
node_id="LTXVReferenceAudio",
display_name="LTXV Reference Audio (ID-LoRA)",
- category="conditioning/audio",
+ category="model/conditioning/audio",
description="Set reference audio for ID-LoRA speaker identity transfer. Encodes a reference audio clip into the conditioning and optionally patches the model with identity guidance (extra forward pass without reference, amplifying the speaker identity effect).",
inputs=[
io.Model.Input("model"),
@@ -711,7 +836,14 @@ class LTXVReferenceAudio(io.ComfyNode):
@classmethod
def execute(cls, model, positive, negative, reference_audio, audio_vae, identity_guidance_scale, start_percent, end_percent) -> io.NodeOutput:
# Encode reference audio to latents and patchify
- audio_latents = audio_vae.encode(reference_audio)
+ sample_rate = reference_audio["sample_rate"]
+ vae_sample_rate = getattr(audio_vae, "audio_sample_rate", 44100)
+ if vae_sample_rate != sample_rate:
+ waveform = torchaudio.functional.resample(reference_audio["waveform"], sample_rate, vae_sample_rate)
+ else:
+ waveform = reference_audio["waveform"]
+
+ audio_latents = audio_vae.encode(waveform.movedim(1, -1))
b, c, t, f = audio_latents.shape
ref_tokens = audio_latents.permute(0, 2, 1, 3).reshape(b, t, c * f)
ref_audio = {"tokens": ref_tokens}
@@ -771,6 +903,7 @@ class LtxvExtension(ComfyExtension):
ModelSamplingLTXV,
LTXVConditioning,
LTXVScheduler,
+ GetICLoRAParameters,
LTXVAddGuide,
LTXVPreprocess,
LTXVCropGuides,
diff --git a/comfy_extras/nodes_lt_audio.py b/comfy_extras/nodes_lt_audio.py
index 3e4222264..052186083 100644
--- a/comfy_extras/nodes_lt_audio.py
+++ b/comfy_extras/nodes_lt_audio.py
@@ -3,17 +3,16 @@ import comfy.utils
import comfy.model_management
import torch
-from comfy.ldm.lightricks.vae.audio_vae import AudioVAE
from comfy_api.latest import ComfyExtension, io
-
+from comfy_extras.nodes_audio import VAEEncodeAudio
class LTXVAudioVAELoader(io.ComfyNode):
@classmethod
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="LTXVAudioVAELoader",
- display_name="LTXV Audio VAE Loader",
- category="audio",
+ display_name="Load LTXV Audio VAE",
+ category="model/loaders",
inputs=[
io.Combo.Input(
"ckpt_name",
@@ -28,16 +27,20 @@ class LTXVAudioVAELoader(io.ComfyNode):
def execute(cls, ckpt_name: str) -> io.NodeOutput:
ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
sd, metadata = comfy.utils.load_torch_file(ckpt_path, return_metadata=True)
- return io.NodeOutput(AudioVAE(sd, metadata))
+ sd = comfy.utils.state_dict_prefix_replace(sd, {"audio_vae.": "autoencoder.", "vocoder.": "vocoder."}, filter_keys=True)
+ vae = comfy.sd.VAE(sd=sd, metadata=metadata)
+ vae.throw_exception_if_invalid()
+
+ return io.NodeOutput(vae)
-class LTXVAudioVAEEncode(io.ComfyNode):
+class LTXVAudioVAEEncode(VAEEncodeAudio):
@classmethod
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="LTXVAudioVAEEncode",
display_name="LTXV Audio VAE Encode",
- category="audio",
+ category="model/latent/audio",
inputs=[
io.Audio.Input("audio", tooltip="The audio to be encoded."),
io.Vae.Input(
@@ -50,15 +53,8 @@ class LTXVAudioVAEEncode(io.ComfyNode):
)
@classmethod
- def execute(cls, audio, audio_vae: AudioVAE) -> io.NodeOutput:
- audio_latents = audio_vae.encode(audio)
- return io.NodeOutput(
- {
- "samples": audio_latents,
- "sample_rate": int(audio_vae.sample_rate),
- "type": "audio",
- }
- )
+ def execute(cls, audio, audio_vae) -> io.NodeOutput:
+ return super().execute(audio_vae, audio)
class LTXVAudioVAEDecode(io.ComfyNode):
@@ -67,7 +63,7 @@ class LTXVAudioVAEDecode(io.ComfyNode):
return io.Schema(
node_id="LTXVAudioVAEDecode",
display_name="LTXV Audio VAE Decode",
- category="audio",
+ category="model/latent/audio",
inputs=[
io.Latent.Input("samples", tooltip="The latent to be decoded."),
io.Vae.Input(
@@ -80,12 +76,12 @@ class LTXVAudioVAEDecode(io.ComfyNode):
)
@classmethod
- def execute(cls, samples, audio_vae: AudioVAE) -> io.NodeOutput:
+ def execute(cls, samples, audio_vae) -> io.NodeOutput:
audio_latent = samples["samples"]
if audio_latent.is_nested:
audio_latent = audio_latent.unbind()[-1]
- audio = audio_vae.decode(audio_latent).to(audio_latent.device)
- output_audio_sample_rate = audio_vae.output_sample_rate
+ audio = audio_vae.decode(audio_latent).movedim(-1, 1).to(audio_latent.device)
+ output_audio_sample_rate = audio_vae.first_stage_model.output_sample_rate
return io.NodeOutput(
{
"waveform": audio,
@@ -100,7 +96,7 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
return io.Schema(
node_id="LTXVEmptyLatentAudio",
display_name="LTXV Empty Latent Audio",
- category="latent/audio",
+ category="model/latent/audio",
inputs=[
io.Int.Input(
"frames_number",
@@ -143,17 +139,16 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
frames_number: int,
frame_rate: int,
batch_size: int,
- audio_vae: AudioVAE,
+ audio_vae,
) -> io.NodeOutput:
"""Generate empty audio latents matching the reference pipeline structure."""
assert audio_vae is not None, "Audio VAE model is required"
z_channels = audio_vae.latent_channels
- audio_freq = audio_vae.latent_frequency_bins
- sampling_rate = int(audio_vae.sample_rate)
+ audio_freq = audio_vae.first_stage_model.latent_frequency_bins
- num_audio_latents = audio_vae.num_of_latents_from_frames(frames_number, frame_rate)
+ num_audio_latents = audio_vae.first_stage_model.num_of_latents_from_frames(frames_number, frame_rate)
audio_latents = torch.zeros(
(batch_size, z_channels, num_audio_latents, audio_freq),
@@ -163,7 +158,6 @@ class LTXVEmptyLatentAudio(io.ComfyNode):
return io.NodeOutput(
{
"samples": audio_latents,
- "sample_rate": sampling_rate,
"type": "audio",
}
)
diff --git a/comfy_extras/nodes_lt_upsampler.py b/comfy_extras/nodes_lt_upsampler.py
index f99ba13fb..be9a36e69 100644
--- a/comfy_extras/nodes_lt_upsampler.py
+++ b/comfy_extras/nodes_lt_upsampler.py
@@ -1,32 +1,32 @@
from comfy import model_management
+from comfy_api.latest import ComfyExtension, IO
+from typing_extensions import override
import math
-class LTXVLatentUpsampler:
+
+class LTXVLatentUpsampler(IO.ComfyNode):
"""
Upsamples a video latent by a factor of 2.
"""
@classmethod
- def INPUT_TYPES(s):
- return {
- "required": {
- "samples": ("LATENT",),
- "upscale_model": ("LATENT_UPSCALE_MODEL",),
- "vae": ("VAE",),
- }
- }
+ def define_schema(cls):
+ return IO.Schema(
+ node_id="LTXVLatentUpsampler",
+ category="model/latent/video",
+ is_experimental=True,
+ inputs=[
+ IO.Latent.Input("samples"),
+ IO.LatentUpscaleModel.Input("upscale_model"),
+ IO.Vae.Input("vae"),
+ ],
+ outputs=[
+ IO.Latent.Output(),
+ ],
+ )
- RETURN_TYPES = ("LATENT",)
- FUNCTION = "upsample_latent"
- CATEGORY = "latent/video"
- EXPERIMENTAL = True
-
- def upsample_latent(
- self,
- samples: dict,
- upscale_model,
- vae,
- ) -> tuple:
+ @classmethod
+ def execute(cls, samples, upscale_model, vae) -> IO.NodeOutput:
"""
Upsample the input latent using the provided model.
@@ -34,7 +34,6 @@ class LTXVLatentUpsampler:
samples (dict): Input latent samples
upscale_model (LatentUpsampler): Loaded upscale model
vae: VAE model for normalization
- auto_tiling (bool): Whether to automatically tile the input for processing
Returns:
tuple: Tuple containing the upsampled latent
@@ -67,9 +66,16 @@ class LTXVLatentUpsampler:
return_dict = samples.copy()
return_dict["samples"] = upsampled_latents
return_dict.pop("noise_mask", None)
- return (return_dict,)
+ return IO.NodeOutput(return_dict)
+
+ upsample_latent = execute # TODO: remove
-NODE_CLASS_MAPPINGS = {
- "LTXVLatentUpsampler": LTXVLatentUpsampler,
-}
+class LTXVLatentUpsamplerExtension(ComfyExtension):  # extension wrapper for this module's node
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        return [LTXVLatentUpsampler]  # the only node class this module exports
+
+
+async def comfy_entrypoint() -> LTXVLatentUpsamplerExtension:  # builds and returns a new extension instance
+    return LTXVLatentUpsamplerExtension()
diff --git a/comfy_extras/nodes_lumina2.py b/comfy_extras/nodes_lumina2.py
index b35ab8b7d..c060a86a0 100644
--- a/comfy_extras/nodes_lumina2.py
+++ b/comfy_extras/nodes_lumina2.py
@@ -81,7 +81,7 @@ class CLIPTextEncodeLumina2(io.ComfyNode):
node_id="CLIPTextEncodeLumina2",
search_aliases=["lumina prompt"],
display_name="CLIP Text Encode for Lumina2",
- category="conditioning",
+ category="model/conditioning",
description="Encodes a system prompt and a user prompt using a CLIP model into an embedding "
"that can be used to guide the diffusion model towards generating specific images.",
inputs=[
diff --git a/comfy_extras/nodes_mahiro.py b/comfy_extras/nodes_mahiro.py
index a25226e6d..7bd5f6652 100644
--- a/comfy_extras/nodes_mahiro.py
+++ b/comfy_extras/nodes_mahiro.py
@@ -11,7 +11,7 @@ class Mahiro(io.ComfyNode):
return io.Schema(
node_id="Mahiro",
display_name="Positive-Biased Guidance",
- category="_for_testing",
+ category="experimental",
description="Modify the guidance to scale more on the 'direction' of the positive prompt rather than the difference between the negative prompt.",
inputs=[
io.Model.Input("model"),
diff --git a/comfy_extras/nodes_mask.py b/comfy_extras/nodes_mask.py
index c44602597..52484697a 100644
--- a/comfy_extras/nodes_mask.py
+++ b/comfy_extras/nodes_mask.py
@@ -2,6 +2,7 @@ import numpy as np
import scipy.ndimage
import torch
import comfy.utils
+import comfy.model_management
import node_helpers
from typing_extensions import override
from comfy_api.latest import ComfyExtension, IO, UI
@@ -45,13 +46,14 @@ def composite(destination, source, x, y, mask = None, multiplier = 8, resize_sou
destination[..., top:bottom, left:right] = source_portion + destination_portion
return destination
+
class LatentCompositeMasked(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="LatentCompositeMasked",
search_aliases=["overlay latent", "layer latent", "paste latent", "inpaint latent"],
- category="latent",
+ category="model/latent",
inputs=[
IO.Latent.Input("destination"),
IO.Latent.Input("source"),
@@ -79,8 +81,9 @@ class ImageCompositeMasked(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="ImageCompositeMasked",
- search_aliases=["paste image", "overlay", "layer"],
- category="image",
+ search_aliases=["overlay", "layer", "paste image", "images composition"],
+ display_name="Image Composite Masked",
+ category="image/compositing",
inputs=[
IO.Image.Input("destination"),
IO.Image.Input("source"),
@@ -109,7 +112,7 @@ class MaskToImage(IO.ComfyNode):
node_id="MaskToImage",
search_aliases=["convert mask"],
display_name="Convert Mask to Image",
- category="mask",
+ category="image/mask",
inputs=[
IO.Mask.Input("mask"),
],
@@ -131,7 +134,7 @@ class ImageToMask(IO.ComfyNode):
node_id="ImageToMask",
search_aliases=["extract channel", "channel to mask"],
display_name="Convert Image to Mask",
- category="mask",
+ category="image/mask",
inputs=[
IO.Image.Input("image"),
IO.Combo.Input("channel", options=["red", "green", "blue", "alpha"]),
@@ -154,7 +157,8 @@ class ImageColorToMask(IO.ComfyNode):
return IO.Schema(
node_id="ImageColorToMask",
search_aliases=["color keying", "chroma key"],
- category="mask",
+ display_name="Convert Image Color to Mask",
+ category="image/mask",
inputs=[
IO.Image.Input("image"),
IO.Int.Input("color", default=0, min=0, max=0xFFFFFF, step=1, display_mode=IO.NumberDisplay.number),
@@ -177,7 +181,8 @@ class SolidMask(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="SolidMask",
- category="mask",
+ display_name="Create Solid Mask",
+ category="image/mask",
inputs=[
IO.Float.Input("value", default=1.0, min=0.0, max=1.0, step=0.01),
IO.Int.Input("width", default=512, min=1, max=nodes.MAX_RESOLUTION, step=1),
@@ -188,7 +193,7 @@ class SolidMask(IO.ComfyNode):
@classmethod
def execute(cls, value, width, height) -> IO.NodeOutput:
- out = torch.full((1, height, width), value, dtype=torch.float32, device="cpu")
+ out = torch.full((1, height, width), value, dtype=torch.float32, device=comfy.model_management.intermediate_device())
return IO.NodeOutput(out)
solid = execute # TODO: remove
@@ -200,7 +205,8 @@ class InvertMask(IO.ComfyNode):
return IO.Schema(
node_id="InvertMask",
search_aliases=["reverse mask", "flip mask"],
- category="mask",
+ display_name="Invert Mask",
+ category="image/mask",
inputs=[
IO.Mask.Input("mask"),
],
@@ -221,7 +227,8 @@ class CropMask(IO.ComfyNode):
return IO.Schema(
node_id="CropMask",
search_aliases=["cut mask", "extract mask region", "mask slice"],
- category="mask",
+ display_name="Crop Mask",
+ category="image/mask",
inputs=[
IO.Mask.Input("mask"),
IO.Int.Input("x", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1),
@@ -246,8 +253,9 @@ class MaskComposite(IO.ComfyNode):
def define_schema(cls):
return IO.Schema(
node_id="MaskComposite",
- search_aliases=["combine masks", "blend masks", "layer masks"],
- category="mask",
+ search_aliases=["combine masks", "blend masks", "layer masks", "masks composition"],
+ display_name="Combine Masks",
+ category="image/mask",
inputs=[
IO.Mask.Input("destination"),
IO.Mask.Input("source"),
@@ -262,6 +270,7 @@ class MaskComposite(IO.ComfyNode):
def execute(cls, destination, source, x, y, operation) -> IO.NodeOutput:
output = destination.reshape((-1, destination.shape[-2], destination.shape[-1])).clone()
source = source.reshape((-1, source.shape[-2], source.shape[-1]))
+ source = source.to(output.device)
left, top = (x, y,)
right, bottom = (min(left + source.shape[-1], destination.shape[-1]), min(top + source.shape[-2], destination.shape[-2]))
@@ -296,7 +305,8 @@ class FeatherMask(IO.ComfyNode):
return IO.Schema(
node_id="FeatherMask",
search_aliases=["soft edge mask", "blur mask edges", "gradient mask edge"],
- category="mask",
+ display_name="Feather Mask",
+ category="image/mask",
inputs=[
IO.Mask.Input("mask"),
IO.Int.Input("left", default=0, min=0, max=nodes.MAX_RESOLUTION, step=1),
@@ -322,7 +332,7 @@ class FeatherMask(IO.ComfyNode):
for x in range(right):
feather_rate = (x + 1) / right
- output[:, :, -x] *= feather_rate
+ output[:, :, -(x + 1)] *= feather_rate
for y in range(top):
feather_rate = (y + 1) / top
@@ -330,7 +340,7 @@ class FeatherMask(IO.ComfyNode):
for y in range(bottom):
feather_rate = (y + 1) / bottom
- output[:, -y, :] *= feather_rate
+ output[:, -(y + 1), :] *= feather_rate
return IO.NodeOutput(output)
@@ -344,7 +354,7 @@ class GrowMask(IO.ComfyNode):
node_id="GrowMask",
search_aliases=["expand mask", "shrink mask"],
display_name="Grow Mask",
- category="mask",
+ category="image/mask",
inputs=[
IO.Mask.Input("mask"),
IO.Int.Input("expand", default=0, min=-nodes.MAX_RESOLUTION, max=nodes.MAX_RESOLUTION, step=1),
@@ -374,14 +384,14 @@ class GrowMask(IO.ComfyNode):
expand_mask = execute # TODO: remove
-
class ThresholdMask(IO.ComfyNode):
@classmethod
def define_schema(cls):
return IO.Schema(
node_id="ThresholdMask",
search_aliases=["binary mask"],
- category="mask",
+ display_name="Threshold Mask",
+ category="image/mask",
inputs=[
IO.Mask.Input("mask"),
IO.Float.Input("value", default=0.5, min=0.0, max=1.0, step=0.01),
@@ -407,7 +417,7 @@ class MaskPreview(IO.ComfyNode):
node_id="MaskPreview",
search_aliases=["show mask", "view mask", "inspect mask", "debug mask"],
display_name="Preview Mask",
- category="mask",
+ category="image/mask",
description="Saves the input images to your ComfyUI output directory.",
inputs=[
IO.Mask.Input("mask"),
diff --git a/comfy_extras/nodes_math.py b/comfy_extras/nodes_math.py
index 6417bacf1..873ee7b51 100644
--- a/comfy_extras/nodes_math.py
+++ b/comfy_extras/nodes_math.py
@@ -4,7 +4,6 @@ Provides a ComfyMathExpression node that evaluates math expressions
against dynamically-grown numeric inputs.
"""
-from __future__ import annotations
import math
import string
@@ -63,14 +62,14 @@ class MathExpressionNode(io.ComfyNode):
@classmethod
def define_schema(cls) -> io.Schema:
autogrow = io.Autogrow.TemplateNames(
- input=io.MultiType.Input("value", [io.Float, io.Int]),
+ input=io.MultiType.Input("value", [io.Float, io.Int, io.Boolean]),
names=list(string.ascii_lowercase),
min=1,
)
return io.Schema(
node_id="ComfyMathExpression",
display_name="Math Expression",
- category="math",
+ category="utilities",
search_aliases=[
"expression", "formula", "calculate", "calculator",
"eval", "math",
@@ -82,6 +81,7 @@ class MathExpressionNode(io.ComfyNode):
outputs=[
io.Float.Output(display_name="FLOAT"),
io.Int.Output(display_name="INT"),
+ io.Boolean.Output(display_name="BOOL"),
],
)
@@ -97,7 +97,7 @@ class MathExpressionNode(io.ComfyNode):
result = simple_eval(expression, names=context, functions=MATH_FUNCTIONS)
# bool check must come first because bool is a subclass of int in Python
- if isinstance(result, bool) or not isinstance(result, (int, float)):
+ if not isinstance(result, (int, float)):
raise ValueError(
f"Math Expression '{expression}' must evaluate to a numeric result, "
f"got {type(result).__name__}: {result!r}"
@@ -106,7 +106,7 @@ class MathExpressionNode(io.ComfyNode):
raise ValueError(
f"Math Expression '{expression}' produced a non-finite result: {result}"
)
- return io.NodeOutput(float(result), int(result))
+ return io.NodeOutput(float(result), int(result), bool(result))
class MathExtension(ComfyExtension):
diff --git a/comfy_extras/nodes_mediapipe.py b/comfy_extras/nodes_mediapipe.py
new file mode 100644
index 000000000..343d88dbb
--- /dev/null
+++ b/comfy_extras/nodes_mediapipe.py
@@ -0,0 +1,508 @@
+"""ComfyUI nodes for the pure-PyTorch MediaPipe Face Landmarker port.
+
+Custom IO types:
+    FACE_DETECTION_MODEL — FaceLandmarkerModel wrapper (ModelPatcher inside)
+ FACE_LANDMARKS — {"frames": List[List[face_dict]], "image_size": (H, W),
+ "connection_sets": dict[str, frozenset[(int, int)]]}
+ face_dict: bbox_xyxy, blendshapes, landmarks_xy,
+ landmarks_3d, presence, score, transformation_matrix
+
+MediaPipeFaceLandmarker also emits the core BOUNDING_BOX type — pair with DrawBBoxes.
+"""
+
+
+import numpy as np
+import torch
+from PIL import Image, ImageColor, ImageDraw
+from tqdm.auto import tqdm
+from typing_extensions import override
+
+import comfy.model_management
+import comfy.model_patcher
+import comfy.utils
+import folder_paths
+from comfy_api.latest import ComfyExtension, io
+
+from comfy_extras.mediapipe.face_landmarker import FaceLandmarker
+from comfy_extras.mediapipe.face_geometry import transformation_matrix_from_detection
+
+
+FaceDetectionType = io.Custom("FACE_DETECTION_MODEL")
+FaceLandmarksType = io.Custom("FACE_LANDMARKS")
+
+_CANONICAL_KEYS = ("canonical_vertices", "procrustes_indices", "procrustes_weights")
+_CONTOUR_PARTS = ("face_oval", "left_eye", "right_eye", "left_eyebrow", "right_eyebrow", "lips")
+
+
+class FaceLandmarkerModel:
+    """Loaded FaceLandmarker variants + ModelPatcher per variant.
+
+    Safetensors layout: `detector_short.*` / `detector_full.*` plus shared
+    `mesh.*`, `blendshapes.*`, `canonical_*`, and `topology.*`.
+    PReLU forces plain-nn / fp32 (manual_cast strands buffers across devices).
+    """
+
+    def __init__(self, state_dict: dict):  # NOTE: pops topology.* and canonical keys out of the caller's dict
+        self.load_device = comfy.model_management.text_encoder_device()
+        offload_device = comfy.model_management.text_encoder_offload_device()
+        self.dtype = torch.float32
+
+        # FACEMESH_* connection sets, embedded as int32 (N, 2) under topology.*.
+        base: dict[str, frozenset] = {}
+        for k in [k for k in state_dict if k.startswith("topology.")]:  # key list is materialised first because the loop pops
+            base[k[len("topology."):]] = frozenset(map(tuple, state_dict.pop(k).tolist()))
+        base["contours"] = frozenset().union(*(base[p] for p in _CONTOUR_PARTS))  # KeyError if any contour part is absent
+        base["all"] = base["contours"] | base["irises"] | base["nose"]  # likewise needs "irises" and "nose" entries
+
+        self.connection_sets: dict[str, frozenset] = base
+        self.canonical_data: dict[str, np.ndarray] = {k: state_dict.pop(k).numpy() for k in _CANONICAL_KEYS}
+
+        shared = {k: v for k, v in state_dict.items() if k.startswith(("mesh.", "blendshapes."))}  # given to both variants
+
+        self.models: dict[str, FaceLandmarker] = {}
+        self.patchers: dict[str, comfy.model_patcher.ModelPatcher] = {}
+        for variant in ("short", "full"):
+            prefix = f"detector_{variant}."
+            sub = dict(shared)
+            sub.update({f"detector.{k[len(prefix):]}": v for k, v in state_dict.items() if k.startswith(prefix)})  # detector_<variant>.* -> detector.*
+            fl = FaceLandmarker(device=offload_device, dtype=self.dtype, operations=None, detector_variant=variant).eval()
+            fl.load_state_dict(sub, strict=False)  # strict=False: key mismatches do not raise here
+
+            self.models[variant] = fl
+            self.patchers[variant] = comfy.model_patcher.CoreModelPatcher(
+                fl, load_device=self.load_device, offload_device=offload_device,
+                size=comfy.model_management.module_size(fl),
+            )
+
+    def detect_batch(self, images, num_faces: int, score_thresh: float, variant: str):  # variant must be "short" or "full"
+        comfy.model_management.load_model_gpu(self.patchers[variant])  # load the chosen variant before running it
+        return self.models[variant].detect_batch(images, num_faces=num_faces, score_thresh=score_thresh)
+
+
+def _image_to_uint8(image: torch.Tensor) -> np.ndarray:  # first 3 entries of the last dim, scaled by 255, as a uint8 numpy array
+    return image[..., :3].mul(255.0).add_(0.5).clamp_(0, 255).to(torch.uint8).cpu().numpy()  # mul() is out-of-place, so the in-place ops never modify `image`
+
+
+def _parse_color(color: str) -> tuple[int, int, int]:  # colour string -> (r, g, b)
+    try:
+        return ImageColor.getrgb(color)[:3]  # [:3] drops a 4th (alpha) component when one is returned
+    except ValueError:
+        return (0, 255, 0)  # string rejected by getrgb: fall back to pure green
+
+
+def _copy_face(face: dict) -> dict:
+    """Copy of a face_dict: bbox/landmark fields via .copy(), blendshapes via dict(); only the six keys below are carried over."""
+    return {
+        "bbox_xyxy": face["bbox_xyxy"].copy(),
+        "blendshapes": dict(face["blendshapes"]),
+        "landmarks_xy": face["landmarks_xy"].copy(),
+        "landmarks_3d": face["landmarks_3d"].copy(),
+        "presence": face["presence"],  # presence and score are passed through as-is
+        "score": face["score"],
+    }
+
+
+def _lerp_face(a: dict, b: dict, t: float) -> dict:  # per-field (1 - t) * a + t * b, so t=0 gives a and t=1 gives b
+    return {
+        "bbox_xyxy": (1 - t) * a["bbox_xyxy"] + t * b["bbox_xyxy"],
+        "blendshapes": {k: (1 - t) * a["blendshapes"][k] + t * b["blendshapes"][k] for k in a["blendshapes"]},  # iterates a's keys; b must have each of them
+        "landmarks_xy": (1 - t) * a["landmarks_xy"] + t * b["landmarks_xy"],
+        "landmarks_3d": (1 - t) * a["landmarks_3d"] + t * b["landmarks_3d"],
+        "presence": (1 - t) * a["presence"] + t * b["presence"],
+        "score": (1 - t) * a["score"] + t * b["score"],
+    }
+
+
+def _match_faces(a: list[dict], b: list[dict]) -> list[tuple[int, int]]:
+    """Greedy nearest-neighbour pairing of faces between two frames by bbox
+    centre distance. Returns (index_in_a, index_in_b) pairs; unmatched faces are dropped."""
+    if not a or not b:
+        return []  # nothing to pair when either frame has no faces
+    centers_a = np.array([(0.5 * (f["bbox_xyxy"][0] + f["bbox_xyxy"][2]),
+                           0.5 * (f["bbox_xyxy"][1] + f["bbox_xyxy"][3])) for f in a])
+    centers_b = np.array([(0.5 * (f["bbox_xyxy"][0] + f["bbox_xyxy"][2]),
+                           0.5 * (f["bbox_xyxy"][1] + f["bbox_xyxy"][3])) for f in b])
+    dists = np.linalg.norm(centers_a[:, None] - centers_b[None], axis=-1)  # dists[ia, ib] = centre distance between a[ia] and b[ib]
+    pairs: list[tuple[int, int]] = []
+    used_a: set[int] = set()
+    used_b: set[int] = set()
+    candidates = sorted((dists[ia, ib], ia, ib) for ia in range(len(a)) for ib in range(len(b)))  # closest first; equal distances order by ia, then ib
+    for _, ia, ib in candidates:
+        if ia in used_a or ib in used_b:  # each face is paired at most once
+            continue
+        pairs.append((ia, ib))
+        used_a.add(ia)
+        used_b.add(ib)
+    return pairs
+
+
+def _fill_missing_frames(frames: list[list[dict]], mode: str) -> None:
+    """In-place fill empty frame slots from neighbouring detections. Multi-face
+    aware: pairs faces across bracketing frames by greedy bbox-centre NN.
+    When counts differ, unmatched faces are dropped from the synthesised frame."""
+    if mode == "empty":
+        return
+    valid = [i for i, fr in enumerate(frames) if fr]  # computed once, so filled-in frames are never used as sources
+    if not valid:
+        return  # nothing to fill from
+    if mode == "previous":
+        last: list[dict] = []
+        for i, fr in enumerate(frames):
+            if fr:
+                last = fr
+            elif last:  # empty frames before the first detection stay empty
+                frames[i] = [_copy_face(f) for f in last]
+        return
+    # interpolate: lerp between bracketing valid frames; clamp at ends.
+    for i in range(len(frames)):  # any mode other than "empty"/"previous" reaches this branch
+        if frames[i]:
+            continue
+        prev_i = max((v for v in valid if v < i), default=None)  # nearest detected frame before i
+        next_i = min((v for v in valid if v > i), default=None)  # nearest detected frame after i
+        if prev_i is None:
+            frames[i] = [_copy_face(f) for f in frames[next_i]]
+        elif next_i is None:
+            frames[i] = [_copy_face(f) for f in frames[prev_i]]
+        else:
+            t = (i - prev_i) / (next_i - prev_i)  # 0 at prev_i, 1 at next_i
+            pairs = _match_faces(frames[prev_i], frames[next_i])
+            frames[i] = [_lerp_face(frames[prev_i][a], frames[next_i][b], t) for a, b in pairs]
+
+
+def _ordered_rings(edges: frozenset[tuple[int, int]]) -> list[list[int]]:
+    """Walk an unordered edge set into vertex rings, one per loop (handles multi-loop sets like FACEMESH_LIPS:
+    outer + inner). A walk ends at a dead end or when it would re-enter itself, so branching input cannot loop forever."""
+    adj: dict[int, set[int]] = {}
+    for a, b in edges:
+        adj.setdefault(a, set()).add(b)
+        adj.setdefault(b, set()).add(a)
+    visited: set[int] = set()
+    rings: list[list[int]] = []
+    for start in adj:
+        if start in visited:
+            continue
+        ring, in_ring = [start], {start}  # in_ring mirrors ring for O(1) membership tests
+        prev, cur = -1, start  # -1 stands for "no previous vertex"
+        while True:
+            nxt = next((v for v in adj[cur] if v != prev), None)
+            if nxt is None or nxt in in_ring:  # dead end, loop closed back at start, or walk re-entered itself
+                break
+            ring.append(nxt)
+            in_ring.add(nxt)
+            prev, cur = cur, nxt
+        visited.update(in_ring)  # only consulted between walks, so one bulk update is enough
+        rings.append(ring)
+    return rings
+
+
+class LoadMediaPipeFaceLandmarker(io.ComfyNode):
+    """Load MediaPipe Face Landmarker v2 weights. Contains both detector variants
+    (short / full), shared mesh, blendshapes, and canonical geometry."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="LoadMediaPipeFaceLandmarker",
+            search_aliases=["face", "facial", "mediapipe", "face landmark", "face mesh", "blazeface", "face detection"],
+            display_name="Load Face Detection Model (MediaPipe)",
+            category="model/loaders",
+            inputs=[
+                io.Combo.Input("model_name", options=folder_paths.get_filename_list("detection"),  # choices come from the "detection" folder
+                               tooltip="Face detection model from models/detection/."),
+            ],
+            outputs=[FaceDetectionType.Output()],  # custom FACE_DETECTION_MODEL type
+        )
+
+    @classmethod
+    def execute(cls, model_name) -> io.NodeOutput:
+        sd = comfy.utils.load_torch_file(folder_paths.get_full_path_or_raise("detection", model_name), safe_load=True)
+        wrapper = FaceLandmarkerModel(sd)  # builds both detector variants; pops entries out of sd
+        return io.NodeOutput(wrapper)
+
+
+# Per-frame fallback modes for detection failures in a batch.
+_FALLBACK_MODES = ("empty", "previous", "interpolate")
+
+
+class MediaPipeFaceLandmarker(io.ComfyNode):
+    """BlazeFace → FaceMesh v2 → ARKit-52 blendshapes, batched across the
+    input. Also emits a BOUNDING_BOX list (landmark-extent bbox per face) —
+    pair with DrawBBoxes for detector-only viz or MediaPipeFaceMeshVisualize
+    for the mesh overlay."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="MediaPipeFaceLandmarker",
+            search_aliases=["face", "facial", "mediapipe", "face landmark", "face mesh", "blazeface", "face detection"],
+            display_name="Detect Face Landmarks (MediaPipe)",
+            category="image/detection",
+            description="Detects facial landmarks using MediaPipe model.",
+            inputs=[
+                FaceDetectionType.Input("face_detection_model"),
+                io.Image.Input("image"),
+                io.Combo.Input("detector_variant", options=["short", "full", "both"], default="short",
+                               tooltip="Face detector range. 'short' is tuned for close-up faces "
+                                       "(within ~2 m of the camera); 'full' covers farther / smaller "
+                                       "faces (up to ~5 m) but is slower. 'both' runs both detectors and "
+                                       "keeps whichever found more faces per frame (~2× detection cost)."),
+                io.Int.Input("num_faces", default=1, min=0, max=16, step=1,
+                             tooltip="Maximum faces to return per frame. 0 = no cap (return all detected)."),
+                io.Float.Input("min_confidence", default=0.5, min=0.0, max=1.0, step=0.01, advanced=True,
+                               tooltip="BlazeFace score threshold. Lower to catch small/occluded faces."),
+                io.Combo.Input("missing_frame_fallback", options=list(_FALLBACK_MODES), default="empty", advanced=True,
+                               tooltip="Per-frame behaviour when detection fails in a batch. "
+                                       "'empty' leaves the frame faceless. 'previous' copies the most recent successful "
+                                       "detection. 'interpolate' lerps landmarks/bbox/blendshapes between bracketing "
+                                       "successful frames. Multi-face: pairs faces across frames by greedy bbox-centre NN."),
+            ],
+            outputs=[
+                FaceLandmarksType.Output(display_name="face_landmarks"),
+                io.BoundingBox.Output("bboxes"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, face_detection_model, image, detector_variant, num_faces, min_confidence,
+                missing_frame_fallback) -> io.NodeOutput:
+        canonical = face_detection_model.canonical_data
+        img_np = _image_to_uint8(image)
+        B, H, W = img_np.shape[:3]
+        chunk = 16  # frames handed to each detect_batch call
+        is_both = detector_variant == "both"
+        total_work = 2 * B if is_both else B  # "both" runs every frame through two detectors
+        pbar = comfy.utils.ProgressBar(total_work)
+
+        def _run(variant: str) -> list[list[dict]]:  # one full pass over the batch with a single detector variant
+            res: list[list[dict]] = []
+            with tqdm(total=B, desc=f"MediaPipe Face Landmarker ({variant})") as tq:
+                for i in range(0, B, chunk):
+                    end = min(i + chunk, B)
+                    res.extend(face_detection_model.detect_batch(
+                        [img_np[bi] for bi in range(i, end)],
+                        num_faces=int(num_faces),
+                        score_thresh=float(min_confidence),
+                        variant=variant,
+                    ))
+                    pbar.update_absolute(min(pbar.current + (end - i), total_work))  # shared bar keeps counting across both passes
+                    tq.update(end - i)
+            return res
+
+        if is_both:
+            short_res = _run("short")
+            full_res = _run("full")
+            # Per-frame keep whichever found more faces (tie → short).
+            frames: list[list[dict]] = [
+                short_res[bi] if len(short_res[bi]) >= len(full_res[bi]) else full_res[bi]
+                for bi in range(B)
+            ]
+        else:
+            frames = _run(detector_variant)
+        _fill_missing_frames(frames, missing_frame_fallback)  # in place; returns immediately for "empty"
+        bboxes = []
+        for per_frame in frames:
+            per_bb = []
+            for f in per_frame:
+                f["transformation_matrix"] = transformation_matrix_from_detection(f, W, H, canonical)  # set after gap filling, so filled-in faces get one too
+                x1, y1, x2, y2 = (float(v) for v in f["bbox_xyxy"])
+                per_bb.append({"x": x1, "y": y1, "width": x2 - x1, "height": y2 - y1, "label": "face", "score": float(f["score"])})  # xyxy -> x/y/width/height
+            bboxes.append(per_bb)
+        return io.NodeOutput({"frames": frames, "image_size": (H, W),
+                              "connection_sets": face_detection_model.connection_sets}, bboxes)
+
+
+# Topology keys unioned by the 'all' connections preset (contour parts + irises + nose).
+_ALL_CONNECTION_PARTS: tuple[str, ...] = (*_CONTOUR_PARTS, "irises", "nose")
+_CUSTOM_FEATURES: tuple[tuple[str, bool], ...] = (
+ ("face_oval", True),
+ ("lips", True),
+ ("left_eye", True),
+ ("right_eye", True),
+ ("left_eyebrow", True),
+ ("right_eyebrow", True),
+ ("irises", True),
+ ("nose", True),
+ ("tesselation", False),
+)
+
+
+class MediaPipeFaceMeshVisualize(io.ComfyNode):
+    """Draw a FACEMESH_* subset over an image. Topology travels with the
+    FACE_LANDMARKS payload (set at detection time)."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="MediaPipeFaceMeshVisualize",
+            search_aliases=["face", "facial", "mediapipe", "face landmark", "face mesh", "blazeface", "face detection", "visualize"],
+            display_name="Visualize Face Landmarks (MediaPipe)",
+            category="image/detection",
+            description="Draws face landmarks mesh on the input image.",
+            inputs=[
+                FaceLandmarksType.Input("face_landmarks"),
+                io.Image.Input("image", optional=True, tooltip="If not connected, a black canvas will be used."),
+                io.DynamicCombo.Input(
+                    "connections",
+                    tooltip="'all' = oval+eyes+brows+lips+irises+nose. 'fill' = solid face_oval polygon (silhouette mask). 'custom' = toggle each feature individually (including 'tesselation', the full 2547-edge wireframe).",
+                    options=[
+                        io.DynamicCombo.Option("all", []),
+                        io.DynamicCombo.Option("fill", []),
+                        io.DynamicCombo.Option("custom", [
+                            io.Boolean.Input(feat, default=default,
+                                             tooltip=f"Draw the '{feat}' connection set.")
+                            for feat, default in _CUSTOM_FEATURES
+                        ]),
+                    ],
+                ),
+                io.Color.Input("color", default="#00ff00"),
+                io.Int.Input("thickness", default=1, min=0, max=8, step=1,
+                             tooltip="Edge line thickness in pixels. 0 disables edge drawing."),
+                io.Int.Input("point_size", default=2, min=0, max=16, step=1,  # _draw_mesh uses point_size * 0.5 as the dot's half-extent
+                             tooltip="Landmark dot diameter in pixels. 0 disables point drawing."),
+            ],
+            outputs=[io.Image.Output()],
+        )
+
+    @classmethod
+    def execute(cls, face_landmarks, connections, color, thickness, point_size, image=None) -> io.NodeOutput:
+        sets = face_landmarks["connection_sets"]
+        sel = connections["connections"]  # selected option name; the custom toggles are read from the same dict
+        fill_rings: list[list[int]] | None = None
+        if sel == "fill":
+            fill_rings = _ordered_rings(sets["face_oval"])
+            edges = frozenset()  # no edges are drawn in fill mode
+        elif sel == "custom":
+            parts = [feat for feat, _ in _CUSTOM_FEATURES if connections.get(feat, False)]
+            edges = frozenset().union(*(sets[p] for p in parts))  # empty when no feature is toggled on
+        else:  # "all"
+            edges = frozenset().union(*(sets[p] for p in _ALL_CONNECTION_PARTS))
+        rgb, thick, psize = _parse_color(color), int(thickness), int(point_size)
+        frames = face_landmarks["frames"]
+        if image is None:
+            H, W = face_landmarks["image_size"]
+            img_np = np.zeros((len(frames), H, W, 3), dtype=np.uint8)  # black canvas, one per landmark frame
+        else:
+            img_np = _image_to_uint8(image)
+        B = img_np.shape[0]
+        n_frames = len(frames)
+        pbar = comfy.utils.ProgressBar(B)
+        out = np.empty_like(img_np)
+        for bi in range(B):
+            faces = frames[bi] if bi < n_frames else []  # images beyond the landmark batch are copied through undrawn
+            out[bi] = _draw_mesh(img_np[bi], faces, edges, rgb, thick, psize, fill_rings)
+            pbar.update_absolute(bi + 1)
+        return io.NodeOutput(torch.from_numpy(out).to(
+            device=comfy.model_management.intermediate_device(),
+            dtype=comfy.model_management.intermediate_dtype(),
+        ).div_(255.0))  # back to the 0..1 range
+
+
+def _draw_mesh(image_rgb: np.ndarray, faces: list, edges,
+               rgb: tuple[int, int, int], thickness: int,
+               point_size: int, fill_rings: list[list[int]] | None = None) -> np.ndarray:  # returns a new array; image_rgb is not drawn on
+    draw_edges = thickness > 0 and edges  # only ever used for its truthiness
+    if not faces or (fill_rings is None and not draw_edges and point_size <= 0):
+        return image_rgb.copy()  # nothing to draw
+    pil = Image.fromarray(image_rgb)
+    draw = ImageDraw.Draw(pil)
+    r = point_size * 0.5  # half-extent of each landmark dot, so point_size acts as a diameter
+    if fill_rings is not None:
+        for f in faces:
+            lmks = f["landmarks_xy"]
+            for ring in fill_rings:
+                draw.polygon([(float(lmks[i, 0]), float(lmks[i, 1])) for i in ring], fill=rgb)
+        return np.asarray(pil)  # fill mode stops here: no edges or points are drawn
+    for f in faces:
+        lmks = f["landmarks_xy"]
+        n = lmks.shape[0]
+        if draw_edges:
+            for a, b in edges:
+                if a < n and b < n:  # skip edges that reference landmarks this face does not have
+                    draw.line([(float(lmks[a, 0]), float(lmks[a, 1])),
+                               (float(lmks[b, 0]), float(lmks[b, 1]))], fill=rgb, width=thickness)
+        if point_size == 1:
+            draw.point(lmks.flatten().tolist(), fill=rgb)  # single-pixel dots in one call
+        elif point_size > 1:
+            for x, y in lmks:
+                draw.ellipse((float(x) - r, float(y) - r, float(x) + r, float(y) + r), fill=rgb)
+    return np.asarray(pil)
+
+
+# Mask region presets — closed-loop topologies only.
+_MASK_REGIONS: tuple[str, ...] = ("face_oval", "lips", "left_eye", "right_eye", "irises")
+_MASK_CUSTOM_FEATURES: tuple[tuple[str, bool], ...] = (
+ ("face_oval", True),
+ ("lips", False),
+ ("left_eye", False),
+ ("right_eye", False),
+ ("irises", False),
+)
+
+
+class MediaPipeFaceMask(io.ComfyNode):
+    """Binary mask from face landmarks, filled polygon per face. One mask per
+    frame in the batch; faces in the same frame composite (union)."""
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="MediaPipeFaceMask",
+            search_aliases=["face", "facial", "mediapipe", "face mask", "blazeface", "face detection", "visualize"],
+            display_name="Draw Face Mask (MediaPipe)",
+            category="image/detection",
+            description="Draws a mask from face landmarks.",
+            inputs=[
+                FaceLandmarksType.Input("face_landmarks"),
+                io.DynamicCombo.Input(
+                    "regions",
+                    tooltip="'all' = union of face_oval+lips+eyes+irises (which collapses to face_oval since it encloses the rest). 'custom' = toggle each region individually for combos like lips+eyes.",
+                    options=[
+                        io.DynamicCombo.Option("all", []),
+                        io.DynamicCombo.Option("custom", [
+                            io.Boolean.Input(reg, default=default,
+                                             tooltip=f"Include the '{reg}' region in the mask.")
+                            for reg, default in _MASK_CUSTOM_FEATURES
+                        ]),
+                    ],
+                ),
+            ],
+            outputs=[io.Mask.Output()],
+        )
+
+    @classmethod
+    def execute(cls, face_landmarks, regions) -> io.NodeOutput:
+        sets = face_landmarks["connection_sets"]
+        sel = regions["regions"]  # selected option name; the custom toggles are read from the same dict
+        if sel == "custom":
+            picked = [reg for reg, _ in _MASK_CUSTOM_FEATURES if regions.get(reg, False)]
+        else:
+            picked = list(_MASK_REGIONS)
+        rings = [r for reg in picked for r in _ordered_rings(sets[reg])]  # computed once and reused for every face
+        frames = face_landmarks["frames"]
+        H, W = face_landmarks["image_size"]
+        masks = np.zeros((len(frames), H, W), dtype=np.uint8)  # frames without faces stay all-zero
+        pbar = comfy.utils.ProgressBar(len(frames))
+        for bi, per_frame in enumerate(frames):
+            if per_frame:
+                pil = Image.new("L", (W, H), 0)
+                draw = ImageDraw.Draw(pil)
+                for f in per_frame:
+                    lmks = f["landmarks_xy"]
+                    for ring in rings:
+                        draw.polygon([(float(lmks[i, 0]), float(lmks[i, 1])) for i in ring], fill=255)  # ring indices are not range-checked here
+                masks[bi] = np.asarray(pil)
+            pbar.update_absolute(bi + 1)
+        return io.NodeOutput(torch.from_numpy(masks).to(
+            device=comfy.model_management.intermediate_device(),
+            dtype=comfy.model_management.intermediate_dtype(),
+        ).div_(255.0))  # 0/255 -> 0.0/1.0
+
+
+class MediaPipeFaceExtension(ComfyExtension):  # extension wrapper for the four MediaPipe face nodes
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        return [LoadMediaPipeFaceLandmarker, MediaPipeFaceLandmarker, MediaPipeFaceMeshVisualize, MediaPipeFaceMask]  # loader, detector, visualizer, mask
+
+
+async def comfy_entrypoint() -> MediaPipeFaceExtension:  # builds and returns a new extension instance
+    return MediaPipeFaceExtension()
diff --git a/comfy_extras/nodes_mochi.py b/comfy_extras/nodes_mochi.py
index d750194fc..3dcea6ab3 100644
--- a/comfy_extras/nodes_mochi.py
+++ b/comfy_extras/nodes_mochi.py
@@ -10,7 +10,7 @@ class EmptyMochiLatentVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="EmptyMochiLatentVideo",
- category="latent/video",
+ category="model/latent/video",
inputs=[
io.Int.Input("width", default=848, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
diff --git a/comfy_extras/nodes_model_advanced.py b/comfy_extras/nodes_model_advanced.py
index 8bf6a1afa..b27ac1296 100644
--- a/comfy_extras/nodes_model_advanced.py
+++ b/comfy_extras/nodes_model_advanced.py
@@ -134,8 +134,11 @@ class ModelSamplingSD3:
class ModelSamplingAdvanced(sampling_base, sampling_type):
pass
+ original = m.get_model_object("model_sampling")
model_sampling = ModelSamplingAdvanced(model.model.model_config)
model_sampling.set_parameters(shift=shift, multiplier=multiplier)
+ if hasattr(original, "noise_scale"):
+ model_sampling.set_noise_scale(original.noise_scale)
m.add_object_patch("model_sampling", model_sampling)
return (m, )
@@ -300,6 +303,29 @@ class RescaleCFG:
m.set_model_sampler_cfg_function(rescale_cfg)
return (m, )
+class ModelNoiseScale:
+    """Node that rebuilds a model's sampling object with an explicit noise scale."""
+
+    RETURN_TYPES = ("MODEL",)
+    FUNCTION = "patch"
+
+    CATEGORY = "advanced/model"
+
+    @classmethod
+    def INPUT_TYPES(s):
+        """Declare the two required inputs: the model and the noise scale value."""
+        noise_scale_options = {
+            "default": 1.0, "min": 0.0, "max": 64.0, "step": 0.01,
+            "tooltip": "Absolute training noise scale. For example HiDream-O1 base: 8.0, dev: 7.5.",
+        }
+        required = {
+            "model": ("MODEL",),
+            "noise_scale": ("FLOAT", noise_scale_options),
+        }
+        return {"required": required}
+
+    def patch(self, model, noise_scale):
+        """Return a one-tuple holding a clone of model with noise_scale applied.
+
+        A new sampling object of the same class as the current "model_sampling"
+        is built from the model config, given the current object's shift and
+        multiplier, assigned noise_scale, and installed as an object patch.
+        """
+        patched = model.clone()
+        current = patched.get_model_object("model_sampling")
+        sampling_cls = type(current)
+        rebuilt = sampling_cls(patched.model.model_config)
+        # Carry over the existing shift/multiplier so only the noise scale changes.
+        rebuilt.set_parameters(shift=current.shift, multiplier=current.multiplier)
+        rebuilt.set_noise_scale(noise_scale)
+        patched.add_object_patch("model_sampling", rebuilt)
+        return (patched, )
+
+
class ModelComputeDtype:
SEARCH_ALIASES = ["model precision", "change dtype"]
@classmethod
@@ -327,6 +353,7 @@ NODE_CLASS_MAPPINGS = {
"ModelSamplingSD3": ModelSamplingSD3,
"ModelSamplingAuraFlow": ModelSamplingAuraFlow,
"ModelSamplingFlux": ModelSamplingFlux,
+ "ModelNoiseScale": ModelNoiseScale,
"RescaleCFG": RescaleCFG,
"ModelComputeDtype": ModelComputeDtype,
}
diff --git a/comfy_extras/nodes_model_downscale.py b/comfy_extras/nodes_model_downscale.py
index 24d47a903..817542452 100644
--- a/comfy_extras/nodes_model_downscale.py
+++ b/comfy_extras/nodes_model_downscale.py
@@ -10,7 +10,7 @@ class PatchModelAddDownscale(io.ComfyNode):
return io.Schema(
node_id="PatchModelAddDownscale",
display_name="PatchModelAddDownscale (Kohya Deep Shrink)",
- category="model_patches/unet",
+ category="model/patch/unet",
inputs=[
io.Model.Input("model"),
io.Int.Input("block_number", default=3, min=1, max=32, step=1, advanced=True),
diff --git a/comfy_extras/nodes_model_merging.py b/comfy_extras/nodes_model_merging.py
index 5384ed531..b6b29e34a 100644
--- a/comfy_extras/nodes_model_merging.py
+++ b/comfy_extras/nodes_model_merging.py
@@ -276,8 +276,8 @@ class CLIPSave:
for x in extra_pnginfo:
metadata[x] = json.dumps(extra_pnginfo[x])
- comfy.model_management.load_models_gpu([clip.load_model()], force_patch_weights=True)
- clip_sd = clip.get_sd()
+ clip.load_model()
+ clip_sd = clip.state_dict_for_saving()
for prefix in ["clip_l.", "clip_g.", "clip_h.", "t5xxl.", "pile_t5xl.", "mt5xl.", "umt5xxl.", "t5base.", "gemma2_2b.", "llama.", "hydit_clip.", ""]:
k = list(filter(lambda a: a.startswith(prefix), clip_sd.keys()))
diff --git a/comfy_extras/nodes_model_patch.py b/comfy_extras/nodes_model_patch.py
index 176e6bc2f..bdccbf8c4 100644
--- a/comfy_extras/nodes_model_patch.py
+++ b/comfy_extras/nodes_model_patch.py
@@ -7,7 +7,10 @@ import comfy.model_management
import comfy.ldm.common_dit
import comfy.latent_formats
import comfy.ldm.lumina.controlnet
+import comfy.ldm.supir.supir_modules
from comfy.ldm.wan.model_multitalk import WanMultiTalkAttentionBlock, MultiTalkAudioProjModel
+from comfy_api.latest import io
+from comfy.ldm.supir.supir_patch import SUPIRPatch
class BlockWiseControlBlock(torch.nn.Module):
@@ -266,6 +269,27 @@ class ModelPatchLoader:
out_dim=sd["audio_proj.norm.weight"].shape[0],
device=comfy.model_management.unet_offload_device(),
operations=comfy.ops.manual_cast)
+ elif 'model.control_model.input_hint_block.0.weight' in sd or 'control_model.input_hint_block.0.weight' in sd:
+ prefix_replace = {}
+ if 'model.control_model.input_hint_block.0.weight' in sd:
+ prefix_replace["model.control_model."] = "control_model."
+ prefix_replace["model.diffusion_model.project_modules."] = "project_modules."
+ else:
+ prefix_replace["control_model."] = "control_model."
+ prefix_replace["project_modules."] = "project_modules."
+
+ # Extract denoise_encoder weights before filter_keys discards them
+ de_prefix = "first_stage_model.denoise_encoder."
+ denoise_encoder_sd = {}
+ for k in list(sd.keys()):
+ if k.startswith(de_prefix):
+ denoise_encoder_sd[k[len(de_prefix):]] = sd.pop(k)
+
+ sd = comfy.utils.state_dict_prefix_replace(sd, prefix_replace, filter_keys=True)
+ sd.pop("control_model.mask_LQ", None)
+ model = comfy.ldm.supir.supir_modules.SUPIR(device=comfy.model_management.unet_offload_device(), dtype=dtype, operations=comfy.ops.manual_cast)
+ if denoise_encoder_sd:
+ model.denoise_encoder_sd = denoise_encoder_sd
model_patcher = comfy.model_patcher.CoreModelPatcher(model, load_device=comfy.model_management.get_torch_device(), offload_device=comfy.model_management.unet_offload_device())
model.load_state_dict(sd, assign=model_patcher.is_dynamic())
@@ -524,7 +548,7 @@ class USOStyleReference:
FUNCTION = "apply_patch"
EXPERIMENTAL = True
- CATEGORY = "advanced/model_patches/flux"
+ CATEGORY = "model/patch/flux"
def apply_patch(self, model, model_patch, clip_vision_output):
encoded_image = torch.stack((clip_vision_output.all_hidden_states[:, -20], clip_vision_output.all_hidden_states[:, -11], clip_vision_output.penultimate_hidden_states))
@@ -565,9 +589,89 @@ class MultiTalkModelPatch(torch.nn.Module):
)
+class SUPIRApply(io.ComfyNode):
+    """Experimental node that attaches a SUPIR model patch to a model.
+
+    The input image is VAE-encoded into a hint latent that drives a SUPIRPatch.
+    When restore_cfg is positive, a post-CFG function additionally pulls the
+    denoised result toward a re-encoded version of that hint.
+    """
+
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        """Declare the node id, category, inputs and the single MODEL output."""
+        return io.Schema(
+            node_id="SUPIRApply",
+            category="model/patch/supir",
+            is_experimental=True,
+            inputs=[
+                io.Model.Input("model"),
+                io.ModelPatch.Input("model_patch"),
+                io.Vae.Input("vae"),
+                io.Image.Input("image"),
+                io.Float.Input("strength_start", default=1.0, min=0.0, max=10.0, step=0.01,
+                               tooltip="Control strength at the start of sampling (high sigma)."),
+                io.Float.Input("strength_end", default=1.0, min=0.0, max=10.0, step=0.01,
+                               tooltip="Control strength at the end of sampling (low sigma). Linearly interpolated from start."),
+                io.Float.Input("restore_cfg", default=4.0, min=0.0, max=20.0, step=0.1, advanced=True,
+                               tooltip="Pulls denoised output toward the input latent. Higher = stronger fidelity to input. 0 to disable."),
+                io.Float.Input("restore_cfg_s_tmin", default=0.05, min=0.0, max=1.0, step=0.01, advanced=True,
+                               tooltip="Sigma threshold below which restore_cfg is disabled."),
+            ],
+            outputs=[io.Model.Output()],
+        )
+
+    @classmethod
+    def _encode_with_denoise_encoder(cls, vae, model_patch, image):
+        """Encode using denoise_encoder weights from SUPIR checkpoint if available.
+
+        Falls back to a plain vae.encode(image) when model_patch.model carries no
+        denoise_encoder_sd. Otherwise vae.patcher is swapped for a patched clone
+        for the duration of the encode and always restored afterwards.
+        """
+        denoise_sd = getattr(model_patch.model, 'denoise_encoder_sd', None)
+        if not denoise_sd:
+            return vae.encode(image)
+
+        # Build and patch the clone completely before touching vae, so a failure
+        # in clone()/add_patches() cannot leave the caller's VAE on the clone.
+        orig_patcher = vae.patcher
+        patched_patcher = orig_patcher.clone()
+        patches = {f"encoder.{k}": (v,) for k, v in denoise_sd.items()}
+        # NOTE(review): strength_model=0.0 with strength_patch=1.0 presumably replaces
+        # the encoder weights instead of blending them -- confirm in add_patches.
+        patched_patcher.add_patches(patches, strength_patch=1.0, strength_model=0.0)
+        vae.patcher = patched_patcher
+        try:
+            return vae.encode(image)
+        finally:
+            vae.patcher = orig_patcher
+
+    @classmethod
+    def execute(cls, *, model: io.Model.Type, model_patch: io.ModelPatch.Type, vae: io.Vae.Type, image: io.Image.Type,
+                strength_start: float, strength_end: float, restore_cfg: float, restore_cfg_s_tmin: float) -> io.NodeOutput:
+        """Clone model, register the SUPIR patch on the clone and return it.
+
+        Args:
+            model: model to clone; the input object itself is not patched.
+            model_patch: loaded SUPIR patch; its model.project_modules are used.
+            vae: VAE used to encode the hint (and for the restore round-trip).
+            image: input image; only its first three channels are encoded.
+            strength_start, strength_end: forwarded to SUPIRPatch.
+            restore_cfg: exponent of the restore pull; values <= 0 disable it.
+            restore_cfg_s_tmin: the pull is applied only while sigma is above this.
+        """
+        model_patched = model.clone()
+        latent_format = model.get_model_object("latent_format")
+        hint_latent = latent_format.process_in(
+            cls._encode_with_denoise_encoder(vae, model_patch, image[:, :, :, :3]))
+        patch = SUPIRPatch(model_patch, model_patch.model.project_modules, hint_latent, strength_start, strength_end)
+        patch.register(model_patched)
+
+        if restore_cfg > 0.0:
+            # Round-trip to match original pipeline: decode hint, re-encode with regular VAE
+            decoded = vae.decode(latent_format.process_out(hint_latent))
+            x_center = latent_format.process_in(vae.encode(decoded[:, :, :, :3]))
+            # NOTE(review): hard-coded reference sigma; presumably the base model's sigma_max -- confirm.
+            sigma_max = 14.6146
+
+            def restore_cfg_function(args):
+                """Post-CFG hook: shrink (denoised - x_center) by (sigma / sigma_max) ** restore_cfg."""
+                denoised = args["denoised"]
+                sigma = args["sigma"]
+                # The threshold test looks at the first sigma only, even for batched sigma.
+                if sigma.dim() > 0:
+                    s = sigma[0].item()
+                else:
+                    s = sigma.item()
+                if s > restore_cfg_s_tmin:
+                    ref = x_center.to(device=denoised.device, dtype=denoised.dtype)
+                    b = denoised.shape[0]
+                    if ref.shape[0] != b:
+                        # Broadcast a single reference, otherwise tile and trim to the batch size.
+                        ref = ref.expand(b, -1, -1, -1) if ref.shape[0] == 1 else ref.repeat((b + ref.shape[0] - 1) // ref.shape[0], 1, 1, 1)[:b]
+                    sigma_val = sigma.view(-1, 1, 1, 1) if sigma.dim() > 0 else sigma
+                    d_center = denoised - ref
+                    denoised = denoised - d_center * ((sigma_val / sigma_max) ** restore_cfg)
+                return denoised
+
+            model_patched.set_model_sampler_post_cfg_function(restore_cfg_function)
+
+        return io.NodeOutput(model_patched)
+
+
NODE_CLASS_MAPPINGS = {
"ModelPatchLoader": ModelPatchLoader,
"QwenImageDiffsynthControlnet": QwenImageDiffsynthControlnet,
"ZImageFunControlnet": ZImageFunControlnet,
"USOStyleReference": USOStyleReference,
+ "SUPIRApply": SUPIRApply,
}
diff --git a/comfy_extras/nodes_moge.py b/comfy_extras/nodes_moge.py
new file mode 100644
index 000000000..422949531
--- /dev/null
+++ b/comfy_extras/nodes_moge.py
@@ -0,0 +1,413 @@
+"""ComfyUI nodes for the native MoGe (Monocular Geometry Estimation) integration."""
+
+
+import torch
+
+import comfy.utils
+import folder_paths
+from comfy_api.latest import ComfyExtension, Types, io
+from typing_extensions import override
+
+from comfy.ldm.moge.model import MoGeModel
+from comfy.ldm.moge.geometry import triangulate_grid_mesh
+from comfy.ldm.moge.panorama import get_panorama_cameras, split_panorama_image, merge_panorama_depth, spherical_uv_to_directions, _uv_grid
+import comfy.model_management
+from tqdm.auto import tqdm
+
+MoGeModelType = io.Custom("MOGE_MODEL")
+MoGeGeometry = io.Custom("MOGE_GEOMETRY")
+
+
+# MOGE_GEOMETRY is a dict with these optional keys (absent when the upstream model didn't produce them):
+# "points": torch.Tensor (B, H, W, 3)
+# "depth": torch.Tensor (B, H, W)
+# "intrinsics": torch.Tensor (B, 3, 3) -- perspective only
+# "mask": torch.Tensor (B, H, W) bool
+# "normal": torch.Tensor (B, H, W, 3) -- v2 only
+# "image": torch.Tensor (B, H, W, 3) in [0, 1], CPU (always present)
+
+
+def _turbo(x: torch.Tensor) -> torch.Tensor:
+    """Map values to RGB with Anton Mikhailov's degree-5 polynomial fit of the turbo colormap.
+
+    The input is clamped to [0, 1]; the result gains a trailing axis of size 3
+    (r, g, b) and is clamped to [0, 1] as well.
+    """
+    t = x.clamp(0.0, 1.0)
+    t2 = t * t
+    t4 = t2 * t2
+    powers = (t, t2, t2 * t, t4, t4 * t)
+    # Per channel: constant term, then the coefficients of t, t^2, ..., t^5.
+    polynomials = (
+        (0.13572138, (4.61539260, -42.66032258, 132.13108234, -152.94239396, 59.28637943)),
+        (0.09140261, (2.19418839, 4.84296658, -14.18503333, 4.27729857, 2.82956604)),
+        (0.10667330, (12.64194608, -60.58204836, 110.36276771, -89.90310912, 27.34824973)),
+    )
+    channels = []
+    for constant, coefficients in polynomials:
+        # Accumulate low order first, term by term, matching a written-out sum.
+        value = constant
+        for coefficient, power in zip(coefficients, powers):
+            value = value + coefficient * power
+        channels.append(value)
+    return torch.stack(channels, dim=-1).clamp(0.0, 1.0)
+
+
+def _normals_from_points(points: torch.Tensor) -> torch.Tensor:
+    """Estimate camera-space normals from a (B, H, W, 3) point map (v1 fallback).
+
+    Non-finite points are zeroed before differencing and yield a zero normal.
+    Border pixels also yield zero normals because one of their central
+    differences is zero-padded.
+    """
+    pad = torch.nn.functional.pad
+    is_finite = torch.isfinite(points).all(dim=-1).unsqueeze(-1)
+    zeros = torch.zeros_like(points)
+    safe = torch.where(is_finite, points, zeros)
+    # Central differences along W and H, zero-padded back to the full grid.
+    # Pad pairs run from the last axis backwards: (channel, W[, H]).
+    grad_w = pad(safe[:, :, 2:, :] - safe[:, :, :-2, :], (0, 0, 1, 1))
+    grad_h = pad(safe[:, 2:, :, :] - safe[:, :-2, :, :], (0, 0, 0, 0, 1, 1))
+    # dy x dx (not dx x dy) so the result is outward-facing in OpenCV (Y-down flips the right-hand rule), matching v2's predicted normals.
+    normals = torch.nn.functional.normalize(torch.cross(grad_h, grad_w, dim=-1), dim=-1)
+    return torch.where(is_finite, normals, zeros)
+
+
+def _normalize_disparity(depth: torch.Tensor, max_quantile_samples: int = 1 << 24) -> torch.Tensor:
+    """Per-batch normalize 1/depth to [0, 1] using 0.1/99.9 percentile clipping.
+
+    Args:
+        depth: tensor whose first dimension is the batch. Non-finite and
+            non-positive entries are treated as invalid and map to 0.
+        max_quantile_samples: positive upper bound on how many valid values per
+            image are handed to torch.quantile. torch.quantile refuses inputs
+            above 2**24 elements ("input tensor is too large"), which large
+            depth maps such as 8192x4096 panoramas exceed, so bigger sets are
+            reduced with an even stride first.
+
+    Returns:
+        A tensor shaped and typed like depth with values in [0, 1].
+    """
+    out = torch.zeros_like(depth)
+    for i in range(depth.shape[0]):
+        d = depth[i]
+        valid = torch.isfinite(d) & (d > 0)
+        if not valid.any():
+            # Nothing usable in this image: its slice stays all-zero.
+            continue
+        disp = torch.where(valid, 1.0 / d.clamp_min(1e-6), torch.zeros_like(d))
+        disp_valid = disp[valid]
+        n_valid = disp_valid.numel()
+        if n_valid > max_quantile_samples:
+            # Ceil-divide so the strided subsample never exceeds the bound.
+            stride = -(-n_valid // max_quantile_samples)
+            disp_valid = disp_valid[::stride]
+        lo = torch.quantile(disp_valid, 0.001)
+        hi = torch.quantile(disp_valid, 0.999)
+        # Guard against a flat image where both percentiles coincide.
+        scale = (hi - lo).clamp_min(1e-6)
+        norm = ((disp - lo) / scale).clamp(0.0, 1.0)
+        out[i] = torch.where(valid, norm, torch.zeros_like(norm))
+    return out
+
+
+class LoadMoGeModel(io.ComfyNode):
+    """Loader node that builds a MoGeModel from a file in the "geometry_estimation" folder."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the single combo input (available files) and the MOGE_MODEL output."""
+        available = folder_paths.get_filename_list("geometry_estimation")
+        model_choice = io.Combo.Input("model_name", options=available)
+        return io.Schema(
+            node_id="LoadMoGeModel",
+            display_name="Load MoGe Model",
+            category="model/loaders",
+            inputs=[model_choice],
+            outputs=[MoGeModelType.Output()],
+        )
+
+    @classmethod
+    def execute(cls, model_name) -> io.NodeOutput:
+        """Resolve model_name to a path, load its state dict and wrap it in MoGeModel."""
+        checkpoint_path = folder_paths.get_full_path_or_raise("geometry_estimation", model_name)
+        state_dict = comfy.utils.load_torch_file(checkpoint_path, safe_load=True)
+        model = MoGeModel(state_dict)
+        return io.NodeOutput(model)
+
+
+class MoGePanoramaInference(io.ComfyNode):
+    """Equirectangular panorama inference: split into 12 perspective views, run
+    MoGe at fov_x=90 on each, merge via multi-scale Poisson + gradient solve.
+    v2's predicted normals and metric scale are ignored (per-view scales would not align across seams).
+    """
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node id, search aliases, inputs and the MOGE_GEOMETRY output."""
+        return io.Schema(
+            node_id="MoGePanoramaInference",
+            search_aliases=["moge", "panorama", "depth", "geometry", "depth estimation", "geometry estimation"],
+            display_name="Run MoGe Panorama Inference",
+            category="image/geometry estimation",
+            description="Run MoGe on an equirectangular panorama by splitting it into 12 perspective views, running inference on each, and merging the results into a single depth map.",
+            inputs=[
+                MoGeModelType.Input("moge_model"),
+                io.Image.Input("image", tooltip="Equirectangular panorama (any aspect)."),
+                io.Int.Input("resolution_level", default=9, min=0, max=9,
+                             tooltip="Per-view detail (0 = fastest, 9 = most detailed)."),
+                io.Int.Input("split_resolution", default=512, min=256, max=1024,
+                             tooltip="Resolution of each perspective split."),
+                io.Int.Input("merge_resolution", default=1920, min=256, max=8192,
+                             tooltip="Long-side resolution of the merged equirect distance map."),
+                io.Int.Input("batch_size", default=4, min=1, max=12,
+                             tooltip="Views per inference batch (12 splits total)."),
+            ],
+            outputs=[MoGeGeometry.Output(display_name="moge_geometry")],
+        )
+
+    @classmethod
+    def execute(cls, moge_model, image, resolution_level, split_resolution, merge_resolution, batch_size) -> io.NodeOutput:
+        """Run split -> per-view inference -> merge and return a MOGE_GEOMETRY dict.
+
+        The returned dict has "points", "depth", "mask" and "image" (no
+        "intrinsics" and no "normal").
+
+        Raises:
+            ValueError: if image holds more than one (or zero) batch entries.
+        """
+
+        if image.shape[0] != 1:
+            raise ValueError(f"MoGePanoramaInference takes a single image (got batch of {image.shape[0]})")
+
+        image = image[..., :3]
+        H, W = int(image.shape[1]), int(image.shape[2])
+        # Never upscale for the merge; each side is kept at 32 px or more.
+        scale = min(merge_resolution / max(H, W), 1.0)
+        merge_h, merge_w = max(int(H * scale), 32), max(int(W * scale), 32)
+
+        extrinsics, intrinsics = get_panorama_cameras()
+
+        comfy.model_management.load_model_gpu(moge_model.patcher)
+        device = moge_model.load_device
+        img_chw = image[0].movedim(-1, -3).to(device=device, dtype=moge_model.dtype)
+        splits = split_panorama_image(img_chw, extrinsics, intrinsics, split_resolution)
+
+        n_views = splits.shape[0]
+
+        # Weight each lsmr solve by 4^level so the final-resolution solve doesn't leave the bar idle.
+        merge_levels: list[tuple[int, int]] = []
+        w_, h_ = merge_w, merge_h
+        while True:
+            merge_levels.append((w_, h_))
+            if max(w_, h_) <= 256:
+                break
+            w_, h_ = w_ // 2, h_ // 2
+        merge_levels.reverse()
+
+        solve_weight = {wh: 4 ** i for i, wh in enumerate(merge_levels)}
+        n_merge_view_units = n_views * len(merge_levels)
+        n_merge_solve_units = sum(solve_weight.values())
+
+        # A single progress bar spans inference, per-view merge work and the weighted solves.
+        pbar = comfy.utils.ProgressBar(n_views + n_merge_view_units + n_merge_solve_units)
+        done = 0
+
+        distance_maps: list = []
+        masks: list = []
+        with tqdm(total=n_views, desc="MoGe panorama inference") as tq:
+            for i in range(0, n_views, batch_size):
+                batch = splits[i:i + batch_size]
+                # apply_metric_scale=False: per-view scales would not align across overlap seams.
+                result = moge_model.infer(batch, resolution_level=resolution_level,
+                                          fov_x=90.0, force_projection=True,
+                                          apply_mask=False, apply_metric_scale=False)
+                # The merge works on radial distance (norm of each point), not on z-depth.
+                distance_maps.extend(list(result["points"].float().norm(dim=-1).cpu().numpy()))
+                masks.extend(list(result["mask"].cpu().numpy()))
+                n = batch.shape[0]
+                done += n
+                pbar.update_absolute(done)
+                tq.update(n)
+
+        with tqdm(total=n_merge_view_units + n_merge_solve_units, desc="MoGe panorama merge: views") as tq:
+            def _on_merge_view():
+                nonlocal done
+                done += 1
+                pbar.update_absolute(done)
+                tq.update(1)
+
+            def _on_solve_start(w, h):
+                tq.set_description(f"MoGe panorama merge: solving {w}x{h}")
+
+            def _on_solve_end(w, h):
+                nonlocal done
+                weight = solve_weight[(w, h)]
+                done += weight
+                pbar.update_absolute(done)
+                tq.update(weight)
+                tq.set_description("MoGe panorama merge: views")
+
+            pano_depth, pano_mask = merge_panorama_depth(
+                merge_w, merge_h, distance_maps, masks, list(extrinsics), intrinsics,
+                on_view=_on_merge_view, on_solve_start=_on_solve_start, on_solve_end=_on_solve_end)
+
+        pano_depth = torch.from_numpy(pano_depth)
+        pano_mask = torch.from_numpy(pano_mask)
+
+        if (merge_h, merge_w) != (H, W):
+            # Index away only the two axes added for interpolate; a bare squeeze()
+            # would also drop a genuine size-1 height or width.
+            pano_depth = torch.nn.functional.interpolate(pano_depth[None, None], size=(H, W), mode="bilinear", align_corners=False)[0, 0]
+            pano_mask = torch.nn.functional.interpolate(pano_mask[None, None].float(), size=(H, W), mode="nearest")[0, 0] > 0
+
+        # Pixels uncovered by any view's predicted foreground are unconstrained in the lsmr solve and stay at log_depth=0 (depth=1)
+        if pano_mask.any() and not pano_mask.all():
+            covered = pano_depth[pano_mask]
+            # torch.quantile rejects inputs above 2**24 elements, which a full-size
+            # panorama (e.g. 8192x4096) exceeds; an even stride keeps the percentile stable.
+            quantile_limit = 1 << 24
+            if covered.numel() > quantile_limit:
+                covered = covered[::-(-covered.numel() // quantile_limit)]
+            far = torch.quantile(covered, 0.95) * 5.0
+            pano_depth = torch.where(pano_mask, pano_depth, far)
+
+        directions = torch.from_numpy(spherical_uv_to_directions(_uv_grid(H, W)))
+        points = (directions * pano_depth[..., None]).unsqueeze(0)
+        depth = pano_depth.unsqueeze(0)
+        mask = pano_mask.unsqueeze(0)
+
+        # Points stay in MoGe spherical coords; MoGePointMapToMesh applies the spherical->glTF rotation after triangulation
+        moge_geometry = {"points": points, "depth": depth, "mask": mask, "image": image.cpu()}
+        return io.NodeOutput(moge_geometry)
+
+
+class MoGeInference(io.ComfyNode):
+ """Run a MoGe model over an image batch in chunks and collect the outputs into a MOGE_GEOMETRY dict."""
+
+ @classmethod
+ def define_schema(cls):
+ """Declare the node id, search aliases, inputs and the MOGE_GEOMETRY output."""
+ return io.Schema(
+ node_id="MoGeInference",
+ search_aliases=["moge", "depth", "geometry", "depth estimation", "geometry estimation"],
+ display_name="Run MoGe Inference",
+ description="Run MoGe on a single image to estimate depth and geometry.",
+ category="image/geometry estimation",
+ inputs=[
+ MoGeModelType.Input("moge_model"),
+ io.Image.Input("image"),
+ io.Int.Input("resolution_level", default=9, min=0, max=9,
+ tooltip="0 = fastest, 9 = most detail."),
+ io.Float.Input("fov_x_degrees", default=0.0, min=0.0, max=170.0, step=0.1, advanced=True,
+ tooltip="Horizontal field of view of the source camera. Sets the focal length used to unproject the depth map into 3D. 0 = auto-recover from the predicted points."),
+ io.Int.Input("batch_size", default=4, min=1, max=64,
+ tooltip="Images per inference call. Lower if you OOM on a long video / image set."),
+ io.Boolean.Input("force_projection", default=True, advanced=True),
+ io.Boolean.Input("apply_mask", default=True, advanced=True,
+ tooltip="Set masked-out (sky / invalid) pixels to inf in points and depth so meshing culls them. Disable to keep the raw predicted geometry everywhere; the mask is still returned separately."),
+ ],
+ outputs=[MoGeGeometry.Output(display_name="moge_geometry")],
+ )
+
+ @classmethod
+ def execute(cls, moge_model, image, resolution_level, fov_x_degrees, batch_size, force_projection, apply_mask) -> io.NodeOutput:
+ """Infer geometry for every image in the batch, batch_size images per model call.
+
+ Returns a dict that always holds "image" (moved to CPU) plus whichever of
+ "points", "depth", "intrinsics", "mask" and "normal" the model returned.
+ """
+
+ # Keep only the first three channels, then go channels-last -> channels-first.
+ image = image[..., :3]
+ bchw = image.movedim(-1, -3).contiguous()
+ B = bchw.shape[0]
+ # A non-positive FOV is forwarded as None.
+ fov = None if fov_x_degrees <= 0 else float(fov_x_degrees)
+
+ pbar = comfy.utils.ProgressBar(B)
+ chunks: list[dict] = []
+ with tqdm(total=B, desc="MoGe inference") as tq:
+ for i in range(0, B, batch_size):
+ chunk = bchw[i:i + batch_size]
+ chunks.append(moge_model.infer(chunk, resolution_level=resolution_level, fov_x=fov,
+ force_projection=force_projection, apply_mask=apply_mask))
+ # The last chunk may be short, so clamp the reported progress to B.
+ pbar.update_absolute(min(i + batch_size, B))
+ tq.update(chunk.shape[0])
+
+ def stack(field):
+ """Concatenate one field across all chunks along dim 0; None when no chunk has it."""
+ vals = [c[field] for c in chunks if field in c]
+ return torch.cat(vals, dim=0) if vals else None
+
+ moge_geometry = {"image": image.cpu()}
+ # Fields missing from the model output are simply left out of the dict.
+ for field in ("points", "depth", "intrinsics", "mask", "normal"):
+ v = stack(field)
+ if v is not None:
+ moge_geometry[field] = v
+ return io.NodeOutput(moge_geometry)
+
+
+class MoGeRender(io.ComfyNode):
+ """Render a visualization or mask from a MOGE_GEOMETRY packet."""
+
+ @classmethod
+ def define_schema(cls):
+ """Declare the node id, search aliases, the output-mode combo and the IMAGE output."""
+ return io.Schema(
+ node_id="MoGeRender",
+ search_aliases=["moge", "render", "geometry", "depth", "normal"],
+ display_name="Render MoGe Geometry",
+ description="Render a depth map or normal map from geometry data",
+ category="image/geometry estimation",
+ inputs=[
+ MoGeGeometry.Input("moge_geometry"),
+ io.Combo.Input("output", options=["depth", "depth_colored", "normal_opengl", "normal_directx", "mask"], default="depth",
+ tooltip="DirectX vs OpenGL controls the normal-map green-channel convention. DirectX: green = -Y down (Unreal). OpenGL: green = +Y up (Blender, Substance, Unity, glTF)."),
+ ],
+ outputs=[io.Image.Output()],
+ )
+
+ @classmethod
+ def execute(cls, moge_geometry, output) -> io.NodeOutput:
+ """Render the requested mode one batch entry at a time and return a 3-channel image batch.
+
+ Raises:
+ ValueError: if the field needed for the mode is missing from
+ moge_geometry, or if output is not one of the known modes.
+ """
+ is_normal = output in ("normal_directx", "normal_opengl")
+ opengl = output.endswith("_opengl")
+
+ # Pick the input tensor for the chosen mode and validate availability.
+ if output in ("depth", "depth_colored"):
+ if "depth" not in moge_geometry:
+ raise ValueError("moge_geometry has no depth output.")
+ src = moge_geometry["depth"]
+ elif is_normal:
+ # Prefer stored normals; otherwise fall back to points and derive normals in the loop below.
+ if "normal" in moge_geometry:
+ src = moge_geometry["normal"]
+ elif "points" in moge_geometry:
+ src = moge_geometry["points"]
+ else:
+ raise ValueError("moge_geometry has neither normals nor points to derive normals from.")
+ elif output == "mask":
+ if "mask" not in moge_geometry:
+ raise ValueError("moge_geometry has no mask output.")
+ src = moge_geometry["mask"]
+ else:
+ raise ValueError(f"Unknown output mode: {output}")
+
+ B = src.shape[0]
+ pbar = comfy.utils.ProgressBar(B)
+ out: list[torch.Tensor] = []
+ with tqdm(total=B, desc=f"MoGe render: {output}") as tq:
+ for i in range(B):
+ # Slice with i:i + 1 so the batch axis is kept for the helpers and the final cat.
+ slc = src[i:i + 1].float()
+ if output in ("depth", "depth_colored"):
+ d = _normalize_disparity(slc)
+ # Plain "depth" repeats the normalized value across three channels.
+ out.append(_turbo(d) if output == "depth_colored"
+ else d.unsqueeze(-1).expand(*d.shape, 3).contiguous())
+ elif is_normal:
+ n = slc if "normal" in moge_geometry else _normals_from_points(slc)
+ # MoGe is OpenCV (Z+ into scene); normal-map convention is Z+ out of surface, so flip Z.
+ y_sign = -1.0 if opengl else 1.0
+ n = n * n.new_tensor([1.0, y_sign, -1.0])
+ # Remap components from [-1, 1] to [0, 1].
+ out.append((n * 0.5 + 0.5).clamp(0.0, 1.0))
+ elif output == "mask":
+ # Repeat the mask across three channels.
+ out.append(slc.unsqueeze(-1).expand(*slc.shape, 3).contiguous())
+ pbar.update_absolute(i + 1)
+ tq.update(1)
+ result = torch.cat(out, dim=0).to(device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
+ return io.NodeOutput(result)
+
+
+class MoGePointMapToMesh(io.ComfyNode):
+    """Triangulate one image of a MoGe point map into a Types.MESH (UVs + texture)."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node id, search aliases, meshing inputs and the MESH output."""
+        return io.Schema(
+            node_id="MoGePointMapToMesh",
+            search_aliases=["moge", "mesh", "geometry", "point map"],
+            display_name="Convert MoGe Point Map to Mesh",
+            description="Convert a MoGe point map into a 3D mesh.",
+            category="image/geometry estimation",
+            inputs=[
+                MoGeGeometry.Input("moge_geometry"),
+                io.Int.Input("batch_index", default=0, min=0, max=4096,
+                             tooltip="Which image of a batched MoGe geometry to mesh. Per-image vertex counts "
+                                     "differ, so batches can't be stacked into a single MESH."),
+                io.Int.Input("decimation", default=1, min=1, max=8,
+                             tooltip="Vertex stride; 1 = full resolution."),
+                io.Float.Input("discontinuity_threshold", default=0.04, min=0.0, max=1.0, step=0.01,
+                               tooltip="Drop pixels whose 3x3 depth span exceeds this fraction. 0 = off."),
+                io.Boolean.Input("texture", default=True,
+                                 tooltip="Carry the source image through as the baseColor texture."),
+            ],
+            outputs=[io.Mesh.Output()],
+        )
+
+    @classmethod
+    def execute(cls, moge_geometry, batch_index, decimation, discontinuity_threshold, texture) -> io.NodeOutput:
+        """Mesh the point map at batch_index and convert it to glTF axes.
+
+        Raises:
+            ValueError: if moge_geometry has no "points", if batch_index is past
+                the end of the batch, or if triangulation yields no vertices or
+                no faces.
+        """
+        if "points" not in moge_geometry:
+            raise ValueError("moge_geometry has no points output.")
+        points = moge_geometry["points"]
+        B = points.shape[0]
+        if batch_index >= B:
+            raise ValueError(f"batch_index {batch_index} out of range; moge_geometry has batch size {B}.")
+
+        # Pass depth so the rtol edge check sees radial depth -- for panoramas
+        # points[..., 2] = cos(phi)*r goes negative below the equator and the rtol clamp would drop the bottom half.
+        edge_depth = moge_geometry["depth"][batch_index] if "depth" in moge_geometry else None
+        verts, faces, uvs = triangulate_grid_mesh(
+            points[batch_index], decimation=decimation,
+            discontinuity_threshold=discontinuity_threshold, depth=edge_depth,
+        )
+        if verts.shape[0] == 0 or faces.shape[0] == 0:
+            raise ValueError("MoGe produced an empty mesh; try discontinuity_threshold=0 or apply_mask=False.")
+
+        # A missing "intrinsics" entry is what marks the geometry as a panorama here.
+        if "intrinsics" not in moge_geometry:
+            # Panorama: rotate MoGe spherical (Z up) -> glTF (Y up, Z back), correct for inside-the-sphere viewing.
+            verts = verts[:, [1, 2, 0]].contiguous()
+        else:
+            # Perspective MoGe (X right, Y down, Z forward) -> glTF; face flip keeps winding CCW after the Y/Z flip.
+            # new_tensor builds the sign vector with verts' dtype and device, so the
+            # product also works when verts is not on the CPU.
+            verts = verts * verts.new_tensor([1.0, -1.0, -1.0])
+            faces = faces[:, [0, 2, 1]].contiguous()
+
+        # Slice (not index) so the texture keeps a leading batch axis of size 1.
+        tex = moge_geometry["image"][batch_index:batch_index + 1] if texture else None
+        mesh = Types.MESH(
+            vertices=verts.unsqueeze(0),
+            faces=faces.unsqueeze(0),
+            uvs=uvs.unsqueeze(0),
+            texture=tex,
+        )
+        return io.NodeOutput(mesh)
+
+
+class MoGeExtension(ComfyExtension):
+    """Extension object that lists the MoGe nodes defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        """Return the node classes this extension contributes, in registration order."""
+        node_classes = [
+            LoadMoGeModel,
+            MoGeInference,
+            MoGePanoramaInference,
+            MoGeRender,
+            MoGePointMapToMesh,
+        ]
+        return node_classes
+
+
+async def comfy_entrypoint() -> MoGeExtension:
+    """Create and return this module's MoGeExtension instance."""
+    extension = MoGeExtension()
+    return extension
diff --git a/comfy_extras/nodes_morphology.py b/comfy_extras/nodes_morphology.py
index 4ab2fb7e8..0142040dd 100644
--- a/comfy_extras/nodes_morphology.py
+++ b/comfy_extras/nodes_morphology.py
@@ -13,8 +13,8 @@ class Morphology(io.ComfyNode):
return io.Schema(
node_id="Morphology",
search_aliases=["erode", "dilate"],
- display_name="ImageMorphology",
- category="image/postprocessing",
+ display_name="Apply Morphology",
+ category="image/filters",
inputs=[
io.Image.Input("image"),
io.Combo.Input(
@@ -59,7 +59,8 @@ class ImageRGBToYUV(io.ComfyNode):
return io.Schema(
node_id="ImageRGBToYUV",
search_aliases=["color space conversion"],
- category="image/batch",
+ display_name="Image RGB to YUV",
+ category="image/color",
inputs=[
io.Image.Input("image"),
],
@@ -81,7 +82,8 @@ class ImageYUVToRGB(io.ComfyNode):
return io.Schema(
node_id="ImageYUVToRGB",
search_aliases=["color space conversion"],
- category="image/batch",
+ display_name="Image YUV to RGB",
+ category="image/color",
inputs=[
io.Image.Input("Y"),
io.Image.Input("U"),
diff --git a/comfy_extras/nodes_multigpu.py b/comfy_extras/nodes_multigpu.py
new file mode 100644
index 000000000..d2f6fe67a
--- /dev/null
+++ b/comfy_extras/nodes_multigpu.py
@@ -0,0 +1,408 @@
+from __future__ import annotations
+
+import copy
+import logging
+from inspect import cleandoc
+from typing import TYPE_CHECKING
+from typing_extensions import override
+
+from comfy_api.latest import ComfyExtension, io
+
+if TYPE_CHECKING:
+ from comfy.model_patcher import ModelPatcher
+ from comfy.sd import CLIP, VAE
+import torch
+
+import comfy.model_management
+import comfy.multigpu
+
+
+class MultiGPUCFGSplitNode(io.ComfyNode):
+    """
+    Prepares model to have sampling accelerated via splitting work units.
+
+    Should be placed after nodes that modify the model object itself, such as compile or attention-switch nodes.
+
+    Other than those exceptions, this node can be placed in any order.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        # The class docstring doubles as the schema description, so it is runtime text.
+        node_inputs = [
+            io.Model.Input("model"),
+            io.Int.Input("max_gpus", default=2, min=1, step=1),
+        ]
+        return io.Schema(
+            node_id="MultiGPU_WorkUnits",
+            display_name="MultiGPU CFG Split",
+            category="advanced/multigpu",
+            description=cleandoc(cls.__doc__),
+            inputs=node_inputs,
+            outputs=[io.Model.Output()],
+        )
+
+    @classmethod
+    def execute(cls, model: ModelPatcher, max_gpus: int) -> io.NodeOutput:
+        split_model = comfy.multigpu.create_multigpu_deepclones(model, max_gpus, reuse_loaded=True)
+        return io.NodeOutput(split_model)
+
+
+def _force_supported_compute_dtype(patcher: ModelPatcher, device: torch.device):
+    """Switch the patcher to a manual-cast compute dtype when unet_manual_cast() returns one."""
+    weight_dtype = patcher.model_dtype()
+    cast_dtype = comfy.model_management.unet_manual_cast(weight_dtype, device)
+    if cast_dtype is not None:
+        # A None result means no manual cast was requested, so the patcher is left alone.
+        logging.info(f"Select Model Device: using {cast_dtype} compute dtype on {device} (model weight dtype was {weight_dtype}).")
+        patcher.set_model_compute_dtype(cast_dtype)
+
+
+def _remember_base_devices(patcher: ModelPatcher):
+    """Record the patcher's current load/offload devices on patcher.model, once.
+
+    The attributes live on the model object rather than on the patcher, and an
+    existing _select_base_load_device is never overwritten (first write wins).
+    """
+    base_model = patcher.model
+    if hasattr(base_model, "_select_base_load_device"):
+        return
+    base_model._select_base_load_device = patcher.load_device
+    base_model._select_base_offload_device = patcher.offload_device
+
+
+def _propagate_base_devices(src_model, dst_model):
+    """Copy the recorded base load/offload devices from src_model to dst_model unless dst_model already has them."""
+    if hasattr(src_model, "_select_base_load_device") and not hasattr(dst_model, "_select_base_load_device"):
+        for attr in ("_select_base_load_device", "_select_base_offload_device"):
+            setattr(dst_model, attr, getattr(src_model, attr))
+
+
+def _retarget_patcher(patcher: ModelPatcher, target_load_device, target_offload_device):
+    """Point *patcher* at the requested load/offload devices and return the result.
+
+    Same load device (fast path): the patcher that was passed in is reused and
+    only its ``offload_device`` attribute is rewritten.
+
+    Different load device: ``patcher.deepclone_multigpu(new_load_device=...)``
+    supplies a replacement patcher, the recorded base-device attributes are
+    copied from the old model onto the new one, and ``register_load_device``
+    is invoked when the replacement patcher provides it.
+
+    NOTE: on the fast path the returned patcher still wraps the caller's
+    model object, so whatever earlier consumers did to that model remains
+    visible through it. The original authors describe this as deliberate
+    and advise against placing a Select X Device node after a node that
+    already consumed the same model.
+    """
+    moved = patcher.load_device != target_load_device
+    if moved:
+        source_model = patcher.model
+        patcher = patcher.deepclone_multigpu(new_load_device=target_load_device)
+    # Both paths end by pointing the (possibly new) patcher at the requested offload device.
+    patcher.offload_device = target_offload_device
+    if moved:
+        _propagate_base_devices(source_model, patcher.model)
+        if hasattr(patcher, "register_load_device"):
+            patcher.register_load_device(patcher.load_device)
+    return patcher
+
+
+def _apply_patcher_device(patcher: ModelPatcher, resolved, base_offload_override=None):
+    """Route *patcher* to the device described by *resolved* and return it.
+
+    * ``resolved is None`` ("default"): retarget to the base load device
+      recorded on the model, paired with the base offload device.
+    * ``resolved.type == "cpu"``: set load and offload device to *resolved*;
+      a dynamic patcher is first replaced by ``clone(disable_dynamic=True)``.
+    * anything else: retarget to *resolved* with the base offload device.
+
+    *base_offload_override*, when not None, replaces the recorded base offload.
+    """
+    _remember_base_devices(patcher)
+    recorded = patcher.model
+    base_load = recorded._select_base_load_device
+    base_offload = base_offload_override
+    if base_offload is None:
+        base_offload = recorded._select_base_offload_device
+
+    if resolved is not None and resolved.type == "cpu":
+        # CPU: load and offload are both pinned to the CPU device.
+        if patcher.is_dynamic():
+            # Per the original note, clone(disable_dynamic=True) needs
+            # cached_patcher_init; a failure is left to propagate to the caller.
+            patcher = patcher.clone(disable_dynamic=True)
+        patcher.load_device = patcher.offload_device = resolved
+        return patcher
+    # "default" (resolved is None) falls back to the recorded base load device.
+    target_load = base_load if resolved is None else resolved
+    return _retarget_patcher(patcher, target_load, base_offload)
+
+
+def _prune_multigpu_collision(model: ModelPatcher, primary_device):
+    """Remove every "multigpu" additional model whose load_device equals *primary_device*.
+
+    Nothing is written back unless a clone was actually removed; in that case
+    the filtered list replaces the "multigpu" entry and match_multigpu_clones()
+    is called when the patcher defines it. Per the original note this covers
+    workflows that place MultiGPU CFG Split before Select Model Device.
+    """
+    clones = model.get_additional_models_with_key("multigpu")
+    if clones:
+        survivors = [c for c in clones if c.load_device != primary_device]
+        collided = len(survivors) != len(clones)
+        if collided:
+            logging.info(f"Select Model Device: pruning MultiGPU clone on {primary_device} that now collides with the primary model.")
+            model.set_additional_models("multigpu", survivors)
+            if hasattr(model, "match_multigpu_clones"):
+                model.match_multigpu_clones()
+
+
+class SelectModelDeviceNode(io.ComfyNode):
+    """
+    Place the diffusion model on a specific device (default / cpu / gpu:N).
+
+    - "default" restores the device assigned by the loader (even after a
+    prior Select Model Device call).
+    - "cpu" pins both the load and offload device to CPU.
+    - "gpu:N" pins the load device to the Nth available GPU; the offload
+    device is restored to the loader's original choice.
+
+    When the requested device differs from the device the input model is
+    already on, a fresh model is spawned via the loader's reload factory
+    (cached_patcher_init) so the new patcher owns independent weights on
+    the new device. Loaders that don't support multigpu (no factory) will
+    cause the node to pass through unchanged with a warning.
+
+    If the workflow already has MultiGPU CFG Split applied and the chosen
+    GPU collides with one of the existing multigpu clones, that clone is
+    dropped so two patchers don't end up bound to the same device.
+
+    When the selected device does not exist on the current machine
+    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
+    the node passes the model through unchanged and logs a message
+    instead of failing.
+
+    NOTE: Placing Select Model Device *after* a node that has already
+    consumed the same model (e.g. a KSampler that ran on this model on
+    the original device) is not recommended -- any state the prior
+    consumer mutated on the original model will be observed when the
+    selected device matches the original (fast path). Place Select Model
+    Device before any consumer of the model.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        # Combo choices are whatever get_gpu_device_options() returns when the schema is built.
+        device_choices = comfy.model_management.get_gpu_device_options()
+        return io.Schema(
+            node_id="SelectModelDevice",
+            display_name="Select Model Device",
+            category="advanced/multigpu",
+            description=cleandoc(cls.__doc__),
+            inputs=[
+                io.Model.Input("model"),
+                io.Combo.Input("device", options=device_choices),
+            ],
+            outputs=[io.Model.Output()],
+        )
+
+    @classmethod
+    def validate_inputs(cls, device="default"):
+        # Always accept: a gpu:N value unknown on this machine must not fail
+        # validation, because execute() falls back to a passthrough for it.
+        return True
+
+    @classmethod
+    def execute(cls, model: ModelPatcher, device: str = "default") -> io.NodeOutput:
+        model = model.clone()
+        resolved = comfy.model_management.resolve_gpu_device_option(device)
+        if resolved is None and device not in (None, "default"):
+            logging.info(f"Select Model Device: requested device '{device}' not available, passing through unchanged.")
+        else:
+            try:
+                model = _apply_patcher_device(model, resolved)
+            except RuntimeError as e:
+                logging.warning(f"Select Model Device: cannot retarget model, passing through unchanged. ({e})")
+            else:
+                if resolved is not None:
+                    _force_supported_compute_dtype(model, resolved)
+                _prune_multigpu_collision(model, model.load_device)
+        return io.NodeOutput(model)
+
+
+class SelectCLIPDeviceNode(io.ComfyNode):
+    """
+    Place the CLIP text encoder on a specific device (default / cpu / gpu:N).
+
+    - "default" restores the device assigned by the loader.
+    - "cpu" pins both the load and offload device to CPU.
+    - "gpu:N" pins the load device to the Nth available GPU.
+
+    When the selected device does not exist on the current machine
+    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
+    the node passes the CLIP through unchanged and logs a message
+    instead of failing.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        # Combo choices are whatever get_gpu_device_options() returns when the schema is built.
+        device_choices = comfy.model_management.get_gpu_device_options()
+        return io.Schema(
+            node_id="SelectCLIPDevice",
+            display_name="Select CLIP Device",
+            category="advanced/multigpu",
+            description=cleandoc(cls.__doc__),
+            inputs=[
+                io.Clip.Input("clip"),
+                io.Combo.Input("device", options=device_choices),
+            ],
+            outputs=[io.Clip.Output()],
+        )
+
+    @classmethod
+    def validate_inputs(cls, device="default"):
+        return True  # never reject here; execute() passes through when the device is unavailable
+
+    @classmethod
+    def execute(cls, clip: CLIP, device: str = "default") -> io.NodeOutput:
+        clip = clip.clone()
+        resolved = comfy.model_management.resolve_gpu_device_option(device)
+        if resolved is None and device not in (None, "default"):
+            logging.info(f"Select CLIP Device: requested device '{device}' not available, passing through unchanged.")
+        else:
+            try:
+                clip.patcher = _apply_patcher_device(clip.patcher, resolved)
+            except RuntimeError as e:
+                logging.warning(f"Select CLIP Device: cannot retarget CLIP, passing through unchanged. ({e})")
+        return io.NodeOutput(clip)
+
+
+class SelectVAEDeviceNode(io.ComfyNode):
+    """
+    Place the VAE on a specific device (default / gpu:N).
+
+    - "default" restores the device assigned by the loader.
+    - "gpu:N" pins the load device to the Nth available GPU; the offload
+    device is set to the standard VAE offload device.
+
+    CPU is intentionally not exposed in the UI for the VAE; if a workflow
+    supplies "cpu" anyway (e.g. opened from another machine), the request
+    is dropped with a log message and the VAE is passed through unchanged.
+
+    When the selected device does not exist on the current machine
+    (e.g. a workflow built on a 2-GPU box opened on a 1-GPU box),
+    the node passes the VAE through unchanged and logs a message
+    instead of failing.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        # The VAE combo is built from the option list that leaves out the CPU entry.
+        device_choices = comfy.model_management.get_gpu_device_options_no_cpu()
+        return io.Schema(
+            node_id="SelectVAEDevice",
+            display_name="Select VAE Device",
+            category="advanced/multigpu",
+            description=cleandoc(cls.__doc__),
+            inputs=[
+                io.Vae.Input("vae"),
+                io.Combo.Input("device", options=device_choices),
+            ],
+            outputs=[io.Vae.Output()],
+        )
+
+    @classmethod
+    def validate_inputs(cls, device="default"):
+        return True  # never reject here; execute() passes through when the device is unusable
+
+    @classmethod
+    def execute(cls, vae: VAE, device: str = "default") -> io.NodeOutput:
+        # Work on a shallow copy of the wrapper with its own patcher clone so the
+        # input VAE keeps its load/offload devices.
+        vae = copy.copy(vae)
+        vae.patcher = vae.patcher.clone()
+        resolved = comfy.model_management.resolve_gpu_device_option(device)
+        if resolved is None and device not in (None, "default"):
+            logging.info(f"Select VAE Device: requested device '{device}' not available, passing through unchanged.")
+            return io.NodeOutput(vae)
+        if resolved is not None and resolved.type == "cpu":
+            logging.info("Select VAE Device: CPU is not a supported choice, passing through unchanged.")
+            return io.NodeOutput(vae)
+        if not hasattr(vae, "_select_base_device"):
+            vae._select_base_device = vae.device
+        try:
+            offload = comfy.model_management.vae_offload_device()
+            vae.patcher = _apply_patcher_device(vae.patcher, resolved, base_offload_override=offload)
+        except RuntimeError as e:
+            logging.warning(f"Select VAE Device: cannot retarget VAE, passing through unchanged. ({e})")
+            return io.NodeOutput(vae)
+        # The patcher may now own a different model object; mirror it on the wrapper.
+        vae.first_stage_model = vae.patcher.model
+        if resolved is None:
+            vae.device = vae._select_base_device
+        else:
+            vae.device = resolved
+        return io.NodeOutput(vae)
+
+
+class MultiGPUOptionsNode(io.ComfyNode):
+    """
+    Select the relative speed of GPUs in the special case they have significantly different performance from one another.
+
+    NOTE (not registered yet, see MultiGPUExtension.get_node_list below):
+    The output GPUOptionsGroup is plumbed through create_multigpu_deepclones() and stored on
+    model.model_options['multigpu_options'] via GPUOptionsGroup.register(), but the cond
+    scheduler in comfy/samplers.py (calc_cond_batch_outer_multigpu) does NOT yet consult
+    relative_speed when distributing conds across devices; it uses a uniform conds_per_device
+    round-robin via next_available_device(). Before re-enabling this node, wire its
+    relative_speed into the scheduler (e.g. via comfy.multigpu.load_balance_devices(),
+    which already implements the proportional split) so the input actually affects work
+    distribution.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        node_inputs = [
+            io.Int.Input("device_index", default=0, min=0, max=64),
+            io.Float.Input("relative_speed", default=1.0, min=0.0, step=0.01),
+            io.Custom("GPU_OPTIONS").Input("gpu_options", optional=True),
+        ]
+        node_outputs = [io.Custom("GPU_OPTIONS").Output()]
+        return io.Schema(
+            node_id="MultiGPU_Options",
+            display_name="MultiGPU Options",
+            category="advanced/multigpu",
+            description=cleandoc(cls.__doc__),
+            inputs=node_inputs,
+            outputs=node_outputs,
+        )
+
+    @classmethod
+    def execute(cls, device_index: int, relative_speed: float, gpu_options: comfy.multigpu.GPUOptionsGroup = None) -> io.NodeOutput:
+        # An incoming group is cloned before the new entry is added; a missing
+        # (or falsy) one is replaced by a fresh GPUOptionsGroup.
+        if gpu_options:
+            group = gpu_options.clone()
+        else:
+            group = comfy.multigpu.GPUOptionsGroup()
+        entry = comfy.multigpu.GPUOptions(device_index=device_index, relative_speed=relative_speed)
+        group.add(entry)
+        return io.NodeOutput(group)
+
+
+class MultiGPUExtension(ComfyExtension):
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        enabled_nodes = [
+            MultiGPUCFGSplitNode,
+            SelectModelDeviceNode,
+            SelectCLIPDeviceNode,
+            SelectVAEDeviceNode,  # MultiGPUOptionsNode stays disabled; its docstring explains why
+        ]
+        return enabled_nodes
+
+
+async def comfy_entrypoint() -> MultiGPUExtension:  # module-level coroutine; builds a new MultiGPUExtension on each call
+ return MultiGPUExtension()
diff --git a/comfy_extras/nodes_nop.py b/comfy_extras/nodes_nop.py
index 953061bcb..f9c1357c3 100644
--- a/comfy_extras/nodes_nop.py
+++ b/comfy_extras/nodes_nop.py
@@ -13,7 +13,7 @@ class wanBlockSwap(io.ComfyNode):
return io.Schema(
node_id="wanBlockSwap",
category="",
- description="NOP",
+ description="Intercept wanBlockSwap custom node that causes major instability and make it no-op.",
inputs=[
io.Model.Input("model"),
],
diff --git a/comfy_extras/nodes_number_convert.py b/comfy_extras/nodes_number_convert.py
index cac7e736d..d7e557e95 100644
--- a/comfy_extras/nodes_number_convert.py
+++ b/comfy_extras/nodes_number_convert.py
@@ -4,7 +4,6 @@ Provides a single node that converts INT, FLOAT, STRING, and BOOL
inputs into FLOAT and INT outputs.
"""
-from __future__ import annotations
import math
@@ -20,8 +19,8 @@ class NumberConvertNode(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="ComfyNumberConvert",
- display_name="Number Convert",
- category="math",
+ display_name="Convert Number",
+ category="utilities",
search_aliases=[
"int to float", "float to int", "number convert",
"int2float", "float2int", "cast", "parse number",
diff --git a/comfy_extras/nodes_optimalsteps.py b/comfy_extras/nodes_optimalsteps.py
index 73f0104d8..19629790f 100644
--- a/comfy_extras/nodes_optimalsteps.py
+++ b/comfy_extras/nodes_optimalsteps.py
@@ -31,7 +31,7 @@ class OptimalStepsScheduler(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="OptimalStepsScheduler",
- category="sampling/custom_sampling/schedulers",
+ category="model/sampling/schedulers",
inputs=[
io.Combo.Input("model_type", options=["FLUX", "Wan", "Chroma"]),
io.Int.Input("steps", default=20, min=3, max=1000),
diff --git a/comfy_extras/nodes_pag.py b/comfy_extras/nodes_pag.py
index 79fea5f0c..c875e1e06 100644
--- a/comfy_extras/nodes_pag.py
+++ b/comfy_extras/nodes_pag.py
@@ -15,7 +15,7 @@ class PerturbedAttentionGuidance(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="PerturbedAttentionGuidance",
- category="model_patches/unet",
+ category="model/patch/unet",
inputs=[
io.Model.Input("model"),
io.Float.Input("scale", default=3.0, min=0.0, max=100.0, step=0.01, round=0.01),
diff --git a/comfy_extras/nodes_painter.py b/comfy_extras/nodes_painter.py
index e104c8480..df7a0b76a 100644
--- a/comfy_extras/nodes_painter.py
+++ b/comfy_extras/nodes_painter.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
import hashlib
import os
diff --git a/comfy_extras/nodes_perpneg.py b/comfy_extras/nodes_perpneg.py
index ed1467de9..a7a72d1bc 100644
--- a/comfy_extras/nodes_perpneg.py
+++ b/comfy_extras/nodes_perpneg.py
@@ -24,8 +24,8 @@ class PerpNeg(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="PerpNeg",
- display_name="Perp-Neg (DEPRECATED by PerpNegGuider)",
- category="_for_testing",
+ display_name="Perp-Neg (DEPRECATED by Perp-Neg Guider)",
+ category="experimental",
inputs=[
io.Model.Input("model"),
io.Conditioning.Input("empty_conditioning"),
@@ -127,7 +127,8 @@ class PerpNegGuider(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="PerpNegGuider",
- category="_for_testing",
+ display_name="Perp-Neg Guider",
+ category="experimental",
inputs=[
io.Model.Input("model"),
io.Conditioning.Input("positive"),
diff --git a/comfy_extras/nodes_photomaker.py b/comfy_extras/nodes_photomaker.py
index 228183c07..8a2248572 100644
--- a/comfy_extras/nodes_photomaker.py
+++ b/comfy_extras/nodes_photomaker.py
@@ -123,7 +123,7 @@ class PhotoMakerLoader(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="PhotoMakerLoader",
- category="_for_testing/photomaker",
+ category="experimental/photomaker",
inputs=[
io.Combo.Input("photomaker_model_name", options=folder_paths.get_filename_list("photomaker")),
],
@@ -149,7 +149,7 @@ class PhotoMakerEncode(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="PhotoMakerEncode",
- category="_for_testing/photomaker",
+ category="experimental/photomaker",
inputs=[
io.Photomaker.Input("photomaker"),
io.Image.Input("image"),
diff --git a/comfy_extras/nodes_pid.py b/comfy_extras/nodes_pid.py
new file mode 100644
index 000000000..811b9ae8e
--- /dev/null
+++ b/comfy_extras/nodes_pid.py
@@ -0,0 +1,55 @@
+"""PiD (Pixel Diffusion Decoder) node"""
+
+import torch
+from typing_extensions import override
+
+import node_helpers
+import comfy.latent_formats
+from comfy_api.latest import ComfyExtension, io
+
+
+class PiDConditioning(io.ComfyNode):
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        # Inputs are collected first so the Schema call below stays short.
+        node_inputs = [
+            io.Conditioning.Input("positive"),
+            io.Latent.Input("latent", tooltip="latent (from VAEEncode or a KSampler)."),
+            io.Combo.Input("latent_format", options=["flux", "sd3"], default="flux",
+                           tooltip="Flux1 and Flux2 latents auto-detected from channel dim, sd3 has to be selected manually."),
+            io.Float.Input(
+                "degrade_sigma", default=0.0, min=0.0, max=1.0, step=0.01,
+                tooltip="0 = clean latent. Increase to denoise corrupted latent outputs.",
+            ),
+        ]
+        return io.Schema(
+            node_id="PiDConditioning",
+            display_name="PiD Conditioning",
+            category="advanced/conditioning",
+            description="Attaches a latent and a degrade_sigma scalar to a CONDITIONING for PiD decoding/upscaling",
+            inputs=node_inputs,
+            outputs=[io.Conditioning.Output()],
+        )
+
+    @classmethod
+    def execute(cls, positive, latent, latent_format: str, degrade_sigma: float) -> io.NodeOutput:
+        samples = latent["samples"]
+        if latent_format != "flux":
+            latent_fmt = comfy.latent_formats.SD3()
+        elif samples.shape[1] == 128:
+            # A size of 128 on dim 1 selects the Flux2 format; any other size uses Flux.
+            latent_fmt = comfy.latent_formats.Flux2()
+        else:
+            latent_fmt = comfy.latent_formats.Flux()
+        extras = {"lq_latent": latent_fmt.process_in(samples), "degrade_sigma": torch.tensor([float(degrade_sigma)], dtype=torch.float32)}
+        return io.NodeOutput(node_helpers.conditioning_set_values(positive, extras))
+
+
+class PiDExtension(ComfyExtension):  # ComfyExtension subclass that lists the single PiD node class
+ @override
+ async def get_node_list(self) -> list[type[io.ComfyNode]]:
+ return [PiDConditioning]  # one-element list, built anew on every call
+
+
+async def comfy_entrypoint() -> PiDExtension:  # module-level coroutine; builds a new PiDExtension on each call
+ return PiDExtension()
diff --git a/comfy_extras/nodes_post_processing.py b/comfy_extras/nodes_post_processing.py
index 9037c3d20..3e440433e 100644
--- a/comfy_extras/nodes_post_processing.py
+++ b/comfy_extras/nodes_post_processing.py
@@ -6,6 +6,7 @@ from PIL import Image
import math
from enum import Enum
from typing import TypedDict, Literal
+import kornia
import comfy.utils
import comfy.model_management
@@ -19,8 +20,9 @@ class Blend(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ImageBlend",
- display_name="Image Blend",
- category="image/postprocessing",
+ search_aliases=["mix images"],
+ display_name="Blend Images",
+ category="image/filters",
essentials_category="Image Tools",
inputs=[
io.Image.Input("image1"),
@@ -78,8 +80,8 @@ class Blur(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ImageBlur",
- display_name="Image Blur",
- category="image/postprocessing",
+ display_name="Blur Image",
+ category="image/filters",
inputs=[
io.Image.Input("image"),
io.Int.Input("blur_radius", default=1, min=1, max=31, step=1),
@@ -114,7 +116,8 @@ class Quantize(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ImageQuantize",
- category="image/postprocessing",
+ display_name="Quantize Image",
+ category="image/filters",
inputs=[
io.Image.Input("image"),
io.Int.Input("colors", default=256, min=1, max=256, step=1),
@@ -179,7 +182,8 @@ class Sharpen(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ImageSharpen",
- category="image/postprocessing",
+ display_name="Sharpen Image",
+ category="image/filters",
inputs=[
io.Image.Input("image"),
io.Int.Input("sharpen_radius", default=1, min=1, max=31, step=1, advanced=True),
@@ -223,6 +227,7 @@ class ImageScaleToTotalPixels(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ImageScaleToTotalPixels",
+ display_name="Scale Image to Total Pixels",
category="image/upscaling",
inputs=[
io.Image.Input("image"),
@@ -433,7 +438,7 @@ class ResizeImageMaskNode(io.ComfyNode):
node_id="ResizeImageMaskNode",
display_name="Resize Image/Mask",
description="Resize an image or mask using various scaling methods.",
- category="transform",
+ category="image/transform",
search_aliases=["resize", "resize image", "resize mask", "scale", "scale image", "scale mask", "image resize", "change size", "dimensions", "shrink", "enlarge"],
inputs=[
io.MatchType.Input("input", template=template),
@@ -563,11 +568,11 @@ def batch_latents(latents: list[dict[str, torch.Tensor]]) -> dict[str, torch.Ten
class BatchImagesNode(io.ComfyNode):
@classmethod
def define_schema(cls):
- autogrow_template = io.Autogrow.TemplatePrefix(io.Image.Input("image"), prefix="image", min=2, max=50)
+ autogrow_template = io.Autogrow.TemplatePrefix(io.Image.Input("image"), prefix="image", min=1, max=50)
return io.Schema(
node_id="BatchImagesNode",
display_name="Batch Images",
- category="image",
+ category="image/batch",
essentials_category="Image Tools",
search_aliases=["batch", "image batch", "batch images", "combine images", "merge images", "stack images"],
inputs=[
@@ -585,12 +590,12 @@ class BatchImagesNode(io.ComfyNode):
class BatchMasksNode(io.ComfyNode):
@classmethod
def define_schema(cls):
- autogrow_template = io.Autogrow.TemplatePrefix(io.Mask.Input("mask"), prefix="mask", min=2, max=50)
+ autogrow_template = io.Autogrow.TemplatePrefix(io.Mask.Input("mask"), prefix="mask", min=1, max=50)
return io.Schema(
node_id="BatchMasksNode",
search_aliases=["combine masks", "stack masks", "merge masks"],
display_name="Batch Masks",
- category="mask",
+ category="image/mask",
inputs=[
io.Autogrow.Input("masks", template=autogrow_template)
],
@@ -606,12 +611,12 @@ class BatchMasksNode(io.ComfyNode):
class BatchLatentsNode(io.ComfyNode):
@classmethod
def define_schema(cls):
- autogrow_template = io.Autogrow.TemplatePrefix(io.Latent.Input("latent"), prefix="latent", min=2, max=50)
+ autogrow_template = io.Autogrow.TemplatePrefix(io.Latent.Input("latent"), prefix="latent", min=1, max=50)
return io.Schema(
node_id="BatchLatentsNode",
search_aliases=["combine latents", "stack latents", "merge latents"],
display_name="Batch Latents",
- category="latent",
+ category="model/latent",
inputs=[
io.Autogrow.Input("latents", template=autogrow_template)
],
@@ -660,6 +665,229 @@ class BatchImagesMasksLatentsNode(io.ComfyNode):
return io.NodeOutput(batched)
+class ColorTransfer(io.ComfyNode):
+    @classmethod
+    def define_schema(cls):
+        # Only the "target_frame" mode carries a nested widget (the frame index).
+        target_index_input = io.Int.Input(
+            "target_index", default=0, min=0, max=10000,
+            tooltip="Frame index used as the source baseline for computing the transform to image_ref",
+        )
+        stats_modes = [
+            io.DynamicCombo.Option("per_frame", []),
+            io.DynamicCombo.Option("uniform", []),
+            io.DynamicCombo.Option("target_frame", [target_index_input]),
+        ]
+        stats_tooltip = "per_frame: each frame matched to image_ref individually. uniform: pool stats across all source frames as baseline, match to image_ref. target_frame: use one chosen frame as the baseline for the transform to image_ref, applied uniformly to all frames (preserves relative differences)"
+        return io.Schema(
+            node_id="ColorTransfer",
+            display_name="Transfer Color",
+            category="image/filters",
+            description="Match the colors of one image to another using various algorithms.",
+            search_aliases=["color match", "color grading", "color correction", "match colors", "color transform", "mkl", "reinhard", "histogram"],
+            inputs=[
+                io.Image.Input("image_target", tooltip="Image(s) to apply the color transform to."),
+                io.Image.Input("image_ref", tooltip="Reference image(s) to match colors to."),
+                io.Combo.Input("method", options=['reinhard_lab', 'mkl_lab', 'histogram'],),
+                io.DynamicCombo.Input("source_stats", tooltip=stats_tooltip, options=stats_modes),
+                io.Float.Input("strength", default=1.0, min=0.0, max=10.0, step=0.01),
+            ],
+            outputs=[io.Image.Output(display_name="image")],
+        )
+
+    @staticmethod
+    def _to_lab(images, i, device):
+        frame_nchw = images[i:i+1].to(device, dtype=torch.float32).permute(0, 3, 1, 2)  # frame i kept as a batch of one, last dim moved to position 1
+        return kornia.color.rgb_to_lab(frame_nchw)
+
+    @staticmethod
+    def _pool_stats(images, device, is_reinhard, eps):
+        """Return (mean, spread) over all frames: pass 1 averages per-frame Lab means, pass 2 accumulates around that mean."""
+        num_frames, height, width, channels = images.shape[:4]
+        pixels = height * width
+        mean = torch.zeros(channels, 1, device=device, dtype=torch.float32)
+        for idx in range(num_frames):
+            mean += ColorTransfer._to_lab(images, idx, device).view(channels, -1).mean(dim=-1, keepdim=True)
+        mean /= num_frames
+        total = torch.zeros(channels, 1 if is_reinhard else channels, device=device, dtype=torch.float32)
+        for idx in range(num_frames):
+            deviation = ColorTransfer._to_lab(images, idx, device).view(channels, -1) - mean
+            if is_reinhard:
+                total += (deviation * deviation).mean(dim=-1, keepdim=True)
+            else:
+                total += deviation @ deviation.T / pixels
+        if is_reinhard:
+            return mean, torch.sqrt(total / num_frames).clamp_min_(eps)
+        return mean, total / num_frames
+
+    @staticmethod
+    def _frame_stats(lab_flat, hw, is_reinhard, eps):
+        """Return (mean, std clamped to eps) for Reinhard, else (mean, covariance) of one flattened frame."""
+        channel_mean = lab_flat.mean(dim=-1, keepdim=True)
+        if not is_reinhard:
+            deviation = lab_flat - channel_mean
+            return channel_mean, deviation @ deviation.T / hw
+        return channel_mean, lab_flat.std(dim=-1, keepdim=True, unbiased=False).clamp_min_(eps)
+
+    @staticmethod
+    def _mkl_matrix(cov_s, cov_r, eps):
+        """Build the MKL transform matrix that carries source covariance cov_s onto reference covariance cov_r."""
+        src_vals, src_vecs = torch.linalg.eigh(cov_s)
+        src_root = torch.sqrt(src_vals.clamp_min(0)).clamp_min_(eps)  # eps floor keeps the reciprocal below finite
+
+        scaled_basis = src_vecs * src_root.unsqueeze(0)
+        core = scaled_basis.T @ cov_r @ scaled_basis
+        core_vals, core_vecs = torch.linalg.eigh(core)
+        core_root = torch.sqrt(core_vals.clamp_min(0))
+        core_half = (core_vecs * core_root.unsqueeze(0)) @ core_vecs.T
+
+        inv_root = 1.0 / src_root
+        inv_scaled_basis = src_vecs * inv_root.unsqueeze(0)
+        return inv_scaled_basis @ core_half @ inv_scaled_basis.T
+
+    @staticmethod
+    def _histogram_lut(src, ref, bins=256):
+        """Return a (C, bins) lookup table in [0, 1] mapping src bin indices to ref levels; src/ref are (C, HW) in [0,1]."""
+        top = bins - 1
+
+        def normalized_cdf(values):
+            # Histogram via scatter_add_, then a cumulative sum scaled so the last bin is 1.
+            bin_idx = (values * top).long().clamp(0, top)
+            hist = torch.zeros(src.shape[0], bins, device=src.device, dtype=src.dtype)
+            hist.scatter_add_(1, bin_idx, torch.ones_like(values))
+            cdf = hist.cumsum(1)
+            return cdf / cdf[:, -1:]
+
+        src_cdf = normalized_cdf(src)
+        ref_cdf = normalized_cdf(ref)
+        return torch.searchsorted(ref_cdf, src_cdf).clamp_max_(top).float() / top
+
+    @classmethod
+    def _pooled_cdf(cls, images, device, num_bins=256):
+        """Accumulate one per-channel histogram over all frames and return its CDF scaled to end at 1."""
+        channels, top_bin = images.shape[3], num_bins - 1
+        counts = torch.zeros(channels, num_bins, device=device, dtype=torch.float32)
+        for frame_no in range(images.shape[0]):
+            flat = images[frame_no].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(channels, -1)
+            counts.scatter_add_(1, (flat * top_bin).long().clamp(0, top_bin), torch.ones_like(flat))
+        running = counts.cumsum(1)
+        # Dividing by the final column normalises every channel independently.
+        return running / running[:, -1:]
+
+    @classmethod
+    def _build_histogram_transform(cls, image_target, image_ref, device, stats_mode, target_index, B):
+        """Return one shared LUT for 'uniform'/'target_frame', or None for 'per_frame' (the caller then builds a LUT per frame)."""
+        if stats_mode == 'per_frame':
+            return None
+
+        ref_cdf = cls._pooled_cdf(image_ref, device)
+        source_frames = image_target
+        if stats_mode == 'target_frame':
+            # Indices past the end of the batch are clamped to the last frame.
+            chosen = min(target_index, B - 1)
+            source_frames = image_target[chosen:chosen+1]
+        return torch.searchsorted(ref_cdf, cls._pooled_cdf(source_frames, device)).clamp_max_(255).float() / 255.0
+
+    @classmethod
+    def _build_lab_transform(cls, image_target, image_ref, device, stats_mode, target_index, is_reinhard):
+        """Return a callable f(src_flat, frame_idx=...) that colour-matches one flattened (C, H*W) Lab frame."""
+        eps = 1e-6
+        B, H, W, C = image_target.shape
+        B_ref = image_ref.shape[0]
+        single_ref = B_ref == 1
+        HW = H * W
+        HW_ref = image_ref.shape[1] * image_ref.shape[2]
+
+        # Reference stats are pooled once, except in per_frame mode with a multi-frame reference.
+        if single_ref or stats_mode in ('uniform', 'target_frame'):
+            ref_mean, ref_sc = cls._pool_stats(image_ref, device, is_reinhard, eps)
+
+        # uniform / target_frame: a single affine transform shared by every frame.
+        if stats_mode in ('uniform', 'target_frame'):
+            if stats_mode == 'target_frame':
+                ti = min(target_index, B - 1)  # indices past the end fall back to the last frame
+                s_lab = cls._to_lab(image_target, ti, device).view(C, -1)
+                s_mean, s_sc = cls._frame_stats(s_lab, HW, is_reinhard, eps)
+            else:
+                s_mean, s_sc = cls._pool_stats(image_target, device, is_reinhard, eps)
+
+            if is_reinhard:
+                scale = ref_sc / s_sc
+                offset = ref_mean - scale * s_mean
+                return lambda src_flat, **_: src_flat * scale + offset
+            T = cls._mkl_matrix(s_sc, ref_sc, eps)
+            offset = ref_mean - T @ s_mean
+            return lambda src_flat, **_: T @ src_flat + offset
+
+        # per_frame: source stats are recomputed for each frame handed to the closure.
+        def per_frame_transform(src_flat, frame_idx):
+            s_mean, s_sc = cls._frame_stats(src_flat, HW, is_reinhard, eps)
+            if single_ref:
+                r_mean, r_sc = ref_mean, ref_sc
+            else:
+                ri = min(frame_idx, B_ref - 1)  # surplus target frames reuse the last reference frame
+                r_mean, r_sc = cls._frame_stats(cls._to_lab(image_ref, ri, device).view(C, -1), HW_ref, is_reinhard, eps)
+
+            centered = src_flat - s_mean
+            if is_reinhard:
+                return centered * (r_sc / s_sc) + r_mean
+            # In MKL mode _frame_stats already returned this frame's covariance as s_sc; reuse it.
+            T = cls._mkl_matrix(s_sc, r_sc, eps)
+            return T @ centered + r_mean
+
+        return per_frame_transform
+
+    @classmethod
+    def execute(cls, image_target, image_ref, method, source_stats, strength=1.0) -> io.NodeOutput:
+        # source_stats arrives as a mapping: the chosen mode under its own key plus any nested widget value.
+        stats_mode = source_stats["source_stats"]
+        target_index = source_stats.get("target_index", 0)
+
+        if strength == 0 or image_ref is None:
+            return io.NodeOutput(image_target)  # nothing to blend in: hand the input back as-is
+
+        device = comfy.model_management.get_torch_device()
+        intermediate_device = comfy.model_management.intermediate_device()
+        intermediate_dtype = comfy.model_management.intermediate_dtype()
+
+        B, H, W, C = image_target.shape
+        B_ref = image_ref.shape[0]
+        pbar = comfy.utils.ProgressBar(B)
+        out = torch.empty(B, H, W, C, device=intermediate_device, dtype=intermediate_dtype)
+
+        def store(index, chw):
+            # Write one (C, H, W) frame back as (H, W, C), clamped to [0, 1], then tick the progress bar.
+            out[index] = chw.permute(1, 2, 0).clamp_(0, 1).to(device=intermediate_device, dtype=intermediate_dtype)
+            pbar.update(1)
+
+        if method != 'histogram':
+            # Lab-based methods: transform in Lab, blend in Lab, convert back to RGB.
+            transform = cls._build_lab_transform(image_target, image_ref, device, stats_mode, target_index, is_reinhard=method == "reinhard_lab")
+            for i in range(B):
+                src_frame = cls._to_lab(image_target, i, device)
+                corrected = transform(src_frame.view(C, -1), frame_idx=i).view(1, C, H, W)
+                blended = corrected if strength == 1.0 else torch.lerp(src_frame, corrected, strength)
+                store(i, kornia.color.lab_to_rgb(blended).squeeze(0))
+            return io.NodeOutput(out)
+
+        # Histogram matching operates on the channel values as given (no Lab conversion).
+        uniform_lut = cls._build_histogram_transform(
+            image_target, image_ref, device, stats_mode, target_index, B)
+        for i in range(B):
+            src = image_target[i].to(device, dtype=torch.float32).permute(2, 0, 1)
+            src_flat = src.reshape(C, -1)
+            lut = uniform_lut
+            if lut is None:
+                # per_frame: build the LUT against the matching (or last available) reference frame.
+                ri = min(i, B_ref - 1)
+                ref = image_ref[ri].to(device, dtype=torch.float32).permute(2, 0, 1).reshape(C, -1)
+                lut = cls._histogram_lut(src_flat, ref)
+            bin_idx = (src_flat * 255).long().clamp(0, 255)
+            matched = lut.gather(1, bin_idx).view(C, H, W)
+            store(i, matched if strength == 1.0 else torch.lerp(src, matched, strength))
+        return io.NodeOutput(out)
+
+
class PostProcessingExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
@@ -673,6 +901,7 @@ class PostProcessingExtension(ComfyExtension):
BatchImagesNode,
BatchMasksNode,
BatchLatentsNode,
+ ColorTransfer,
# BatchImagesMasksLatentsNode,
]
diff --git a/comfy_extras/nodes_preview_any.py b/comfy_extras/nodes_preview_any.py
index b0a6f279d..1070a69d0 100644
--- a/comfy_extras/nodes_preview_any.py
+++ b/comfy_extras/nodes_preview_any.py
@@ -1,5 +1,6 @@
import json
from comfy.comfy_types.node_typing import IO
+import torch
# Preview Any - original implement from
# https://github.com/rgthree/rgthree-comfy/blob/main/py/display_any.py
@@ -11,14 +12,15 @@ class PreviewAny():
"required": {"source": (IO.ANY, {})},
}
- RETURN_TYPES = ()
+ RETURN_TYPES = (IO.STRING,)
FUNCTION = "main"
OUTPUT_NODE = True
- CATEGORY = "utils"
+ CATEGORY = "utilities"
SEARCH_ALIASES = ["show output", "inspect", "debug", "print value", "show text"]
def main(self, source=None):
+ torch.set_printoptions(edgeitems=6)
value = 'None'
if isinstance(source, str):
value = source
@@ -33,7 +35,8 @@ class PreviewAny():
except Exception:
value = 'source exists, but could not be serialized.'
- return {"ui": {"text": (value,)}}
+ torch.set_printoptions()
+ return {"ui": {"text": (value,)}, "result": (value,)}
NODE_CLASS_MAPPINGS = {
"PreviewAny": PreviewAny,
diff --git a/comfy_extras/nodes_primitive.py b/comfy_extras/nodes_primitive.py
index 9c2e98758..c44b09098 100644
--- a/comfy_extras/nodes_primitive.py
+++ b/comfy_extras/nodes_primitive.py
@@ -9,8 +9,9 @@ class String(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="PrimitiveString",
- display_name="String",
- category="utils/primitive",
+ search_aliases=["text", "string", "text box", "prompt"],
+ display_name="Text String",
+ category="utilities/primitive",
inputs=[
io.String.Input("value"),
],
@@ -27,8 +28,9 @@ class StringMultiline(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="PrimitiveStringMultiline",
- display_name="String (Multiline)",
- category="utils/primitive",
+ search_aliases=["text", "string", "text multiline", "string multiline", "text box", "prompt"],
+ display_name="Text String (Multiline)",
+ category="utilities/primitive",
essentials_category="Basics",
inputs=[
io.String.Input("value", multiline=True),
@@ -47,9 +49,9 @@ class Int(io.ComfyNode):
return io.Schema(
node_id="PrimitiveInt",
display_name="Int",
- category="utils/primitive",
+ category="utilities/primitive",
inputs=[
- io.Int.Input("value", min=-sys.maxsize, max=sys.maxsize, control_after_generate=True),
+ io.Int.Input("value", min=-sys.maxsize, max=sys.maxsize, control_after_generate=io.ControlAfterGenerate.fixed),
],
outputs=[io.Int.Output()],
)
@@ -65,7 +67,7 @@ class Float(io.ComfyNode):
return io.Schema(
node_id="PrimitiveFloat",
display_name="Float",
- category="utils/primitive",
+ category="utilities/primitive",
inputs=[
io.Float.Input("value", min=-sys.maxsize, max=sys.maxsize, step=0.1),
],
@@ -83,7 +85,7 @@ class Boolean(io.ComfyNode):
return io.Schema(
node_id="PrimitiveBoolean",
display_name="Boolean",
- category="utils/primitive",
+ category="utilities/primitive",
inputs=[
io.Boolean.Input("value"),
],
diff --git a/comfy_extras/nodes_qwen.py b/comfy_extras/nodes_qwen.py
index 6894367be..5b92814a4 100644
--- a/comfy_extras/nodes_qwen.py
+++ b/comfy_extras/nodes_qwen.py
@@ -112,11 +112,11 @@ class EmptyQwenImageLayeredLatentImage(io.ComfyNode):
return io.Schema(
node_id="EmptyQwenImageLayeredLatentImage",
display_name="Empty Qwen Image Layered Latent",
- category="latent/qwen",
+ category="model/latent/qwen",
inputs=[
io.Int.Input("width", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=640, min=16, max=nodes.MAX_RESOLUTION, step=16),
- io.Int.Input("layers", default=3, min=0, max=nodes.MAX_RESOLUTION, step=1, advanced=True),
+ io.Int.Input("layers", default=3, min=0, max=nodes.MAX_RESOLUTION, step=1),
io.Int.Input("batch_size", default=1, min=1, max=4096),
],
outputs=[
diff --git a/comfy_extras/nodes_rebatch.py b/comfy_extras/nodes_rebatch.py
index 5f4e82aef..2185385f0 100644
--- a/comfy_extras/nodes_rebatch.py
+++ b/comfy_extras/nodes_rebatch.py
@@ -10,7 +10,7 @@ class LatentRebatch(io.ComfyNode):
return io.Schema(
node_id="RebatchLatents",
display_name="Rebatch Latents",
- category="latent/batch",
+ category="model/latent/batch",
is_input_list=True,
inputs=[
io.Latent.Input("latents"),
diff --git a/comfy_extras/nodes_resolution.py b/comfy_extras/nodes_resolution.py
index 520b4067e..dc405291c 100644
--- a/comfy_extras/nodes_resolution.py
+++ b/comfy_extras/nodes_resolution.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
import math
from enum import Enum
from typing_extensions import override
@@ -36,7 +35,7 @@ class ResolutionSelector(io.ComfyNode):
return io.Schema(
node_id="ResolutionSelector",
display_name="Resolution Selector",
- category="utils",
+ category="utilities",
description="Calculate width and height from aspect ratio and megapixel target. Useful for setting up Empty Latent Image dimensions.",
inputs=[
io.Combo.Input(
diff --git a/comfy_extras/nodes_rope.py b/comfy_extras/nodes_rope.py
index 918ddc02b..808eee29b 100644
--- a/comfy_extras/nodes_rope.py
+++ b/comfy_extras/nodes_rope.py
@@ -7,7 +7,7 @@ class ScaleROPE(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="ScaleROPE",
- category="advanced/model_patches",
+ category="model/patch",
description="Scale and shift the ROPE of the model.",
is_experimental=True,
inputs=[
diff --git a/comfy_extras/nodes_rtdetr.py b/comfy_extras/nodes_rtdetr.py
index 7feaf3ab3..e5a9b3902 100644
--- a/comfy_extras/nodes_rtdetr.py
+++ b/comfy_extras/nodes_rtdetr.py
@@ -15,7 +15,7 @@ class RTDETR_detect(io.ComfyNode):
return io.Schema(
node_id="RTDETR_detect",
display_name="RT-DETR Detect",
- category="detection/",
+ category="image/detection",
search_aliases=["bbox", "bounding box", "object detection", "coco"],
inputs=[
io.Model.Input("model", display_name="model"),
@@ -71,7 +71,7 @@ class DrawBBoxes(io.ComfyNode):
return io.Schema(
node_id="DrawBBoxes",
display_name="Draw BBoxes",
- category="detection/",
+ category="image/detection",
search_aliases=["bbox", "bounding box", "object detection", "rt_detr", "visualize detections", "coco"],
inputs=[
io.Image.Input("image", optional=True),
diff --git a/comfy_extras/nodes_sag.py b/comfy_extras/nodes_sag.py
index d9c47851c..9dbf1b6f9 100644
--- a/comfy_extras/nodes_sag.py
+++ b/comfy_extras/nodes_sag.py
@@ -113,7 +113,7 @@ class SelfAttentionGuidance(io.ComfyNode):
return io.Schema(
node_id="SelfAttentionGuidance",
display_name="Self-Attention Guidance",
- category="_for_testing",
+ category="experimental",
inputs=[
io.Model.Input("model"),
io.Float.Input("scale", default=0.5, min=-2.0, max=5.0, step=0.01),
diff --git a/comfy_extras/nodes_sam3.py b/comfy_extras/nodes_sam3.py
new file mode 100644
index 000000000..daac52f9b
--- /dev/null
+++ b/comfy_extras/nodes_sam3.py
@@ -0,0 +1,533 @@
+"""
+SAM3 (Segment Anything 3) nodes for detection, segmentation, and video tracking.
+"""
+
+from typing_extensions import override
+
+import json
+import os
+import torch
+import torch.nn.functional as F
+import comfy.model_management
+import comfy.utils
+import folder_paths
+from comfy_api.latest import ComfyExtension, io, ui
+import av
+from fractions import Fraction
+
+
+def _extract_text_prompts(conditioning, device, dtype):
+ """Extract list of (text_embeddings, text_mask) from conditioning."""
+ cond_meta = conditioning[0][1]
+ multi = cond_meta.get("sam3_multi_cond")
+ prompts = []
+ if multi is not None:
+ for entry in multi:
+ emb = entry["cond"].to(device=device, dtype=dtype)
+ mask = entry["attention_mask"].to(device) if entry["attention_mask"] is not None else None
+ if mask is None:
+ mask = torch.ones(emb.shape[0], emb.shape[1], dtype=torch.int64, device=device)
+ prompts.append((emb, mask, entry.get("max_detections", 1)))
+ else:
+ emb = conditioning[0][0].to(device=device, dtype=dtype)
+ mask = cond_meta.get("attention_mask")
+ if mask is not None:
+ mask = mask.to(device)
+ else:
+ mask = torch.ones(emb.shape[0], emb.shape[1], dtype=torch.int64, device=device)
+ prompts.append((emb, mask, 1))
+ return prompts
+
+
+def _refine_mask(sam3_model, orig_image_hwc, coarse_mask, box_xyxy, H, W, device, dtype, iterations):
+ """Refine a coarse detector mask via SAM decoder, cropping to the detection box.
+
+ Returns: [1, H, W] binary mask
+ """
+ def _coarse_fallback():
+ return (F.interpolate(coarse_mask.unsqueeze(0).unsqueeze(0), size=(H, W),
+ mode="bilinear", align_corners=False)[0] > 0).float()
+
+ if iterations <= 0:
+ return _coarse_fallback()
+
+ pad_frac = 0.1
+ x1, y1, x2, y2 = box_xyxy.tolist()
+ bw, bh = x2 - x1, y2 - y1
+ cx1 = max(0, int(x1 - bw * pad_frac))
+ cy1 = max(0, int(y1 - bh * pad_frac))
+ cx2 = min(W, int(x2 + bw * pad_frac))
+ cy2 = min(H, int(y2 + bh * pad_frac))
+ if cx2 <= cx1 or cy2 <= cy1:
+ return _coarse_fallback()
+
+ crop = orig_image_hwc[cy1:cy2, cx1:cx2, :3]
+ crop_1008 = comfy.utils.common_upscale(crop.unsqueeze(0).movedim(-1, 1), 1008, 1008, "bilinear", crop="disabled")
+ crop_frame = crop_1008.to(device=device, dtype=dtype)
+ crop_h, crop_w = cy2 - cy1, cx2 - cx1
+
+ # Crop coarse mask and refine via SAM on the cropped image
+ mask_h, mask_w = coarse_mask.shape[-2:]
+ mx1, my1 = int(cx1 / W * mask_w), int(cy1 / H * mask_h)
+ mx2, my2 = int(cx2 / W * mask_w), int(cy2 / H * mask_h)
+ if mx2 <= mx1 or my2 <= my1:
+ return _coarse_fallback()
+ mask_logit = coarse_mask[..., my1:my2, mx1:mx2].unsqueeze(0).unsqueeze(0)
+ for _ in range(iterations):
+ coarse_input = F.interpolate(mask_logit, size=(1008, 1008), mode="bilinear", align_corners=False)
+ mask_logit = sam3_model.forward_segment(crop_frame, mask_inputs=coarse_input)
+
+ refined_crop = F.interpolate(mask_logit, size=(crop_h, crop_w), mode="bilinear", align_corners=False)
+ full_mask = torch.zeros(1, 1, H, W, device=device, dtype=dtype)
+ full_mask[:, :, cy1:cy2, cx1:cx2] = refined_crop
+ coarse_full = F.interpolate(coarse_mask.unsqueeze(0).unsqueeze(0), size=(H, W), mode="bilinear", align_corners=False)
+ return ((full_mask[0] > 0) | (coarse_full[0] > 0)).float()
+
+
+
+class SAM3_Detect(io.ComfyNode):
+ """Open-vocabulary detection and segmentation using text, box, or point prompts."""
+
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="SAM3_Detect",
+ display_name="SAM3 Detect",
+ category="image/detection",
+ search_aliases=["sam3", "segment anything", "open vocabulary", "text detection", "segment"],
+ inputs=[
+ io.Model.Input("model", display_name="model"),
+ io.Image.Input("image", display_name="image"),
+ io.Conditioning.Input("conditioning", display_name="conditioning", optional=True, tooltip="Text conditioning from CLIPTextEncode"),
+ io.BoundingBox.Input("bboxes", display_name="bboxes", force_input=True, optional=True, tooltip="Bounding boxes to segment within"),
+ io.String.Input("positive_coords", display_name="positive_coords", force_input=True, optional=True, tooltip="Positive point prompts as JSON [{\"x\": int, \"y\": int}, ...] (pixel coords)"),
+ io.String.Input("negative_coords", display_name="negative_coords", force_input=True, optional=True, tooltip="Negative point prompts as JSON [{\"x\": int, \"y\": int}, ...] (pixel coords)"),
+ io.Float.Input("threshold", display_name="threshold", default=0.5, min=0.0, max=1.0, step=0.01),
+ io.Int.Input("refine_iterations", display_name="refine_iterations", default=2, min=0, max=5, tooltip="SAM decoder refinement passes (0=use raw detector masks)"),
+ io.Boolean.Input("individual_masks", display_name="individual_masks", default=False, tooltip="Output per-object masks instead of union"),
+ ],
+ outputs=[
+ io.Mask.Output("masks"),
+ io.BoundingBox.Output("bboxes"),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, model, image, conditioning=None, bboxes=None, positive_coords=None, negative_coords=None, threshold=0.5, refine_iterations=2, individual_masks=False) -> io.NodeOutput:
+ B, H, W, C = image.shape
+ image_in = comfy.utils.common_upscale(image[..., :3].movedim(-1, 1), 1008, 1008, "bilinear", crop="disabled")
+
+ # Convert bboxes to normalized cxcywh format, per-frame list of [1, N, 4] tensors.
+ # Supports: single dict (all frames), list[dict] (all frames), list[list[dict]] (per-frame).
+ def _boxes_to_tensor(box_list):
+ coords = []
+ for d in box_list:
+ cx = (d["x"] + d["width"] / 2) / W
+ cy = (d["y"] + d["height"] / 2) / H
+ coords.append([cx, cy, d["width"] / W, d["height"] / H])
+ return torch.tensor([coords], dtype=torch.float32) # [1, N, 4]
+
+ per_frame_boxes = None
+ if bboxes is not None:
+ if isinstance(bboxes, dict):
+ # Single box → same for all frames
+ shared = _boxes_to_tensor([bboxes])
+ per_frame_boxes = [shared] * B
+ elif isinstance(bboxes, list) and len(bboxes) > 0 and isinstance(bboxes[0], list):
+ # list[list[dict]] → per-frame boxes
+ per_frame_boxes = [_boxes_to_tensor(frame_boxes) if frame_boxes else None for frame_boxes in bboxes]
+ # Pad to B if fewer frames provided
+ while len(per_frame_boxes) < B:
+ per_frame_boxes.append(per_frame_boxes[-1] if per_frame_boxes else None)
+ elif isinstance(bboxes, list) and len(bboxes) > 0:
+ # list[dict] → same boxes for all frames
+ shared = _boxes_to_tensor(bboxes)
+ per_frame_boxes = [shared] * B
+
+ # Parse point prompts from JSON (KJNodes PointsEditor format: [{"x": int, "y": int}, ...])
+ pos_pts = json.loads(positive_coords) if positive_coords else []
+ neg_pts = json.loads(negative_coords) if negative_coords else []
+ has_points = len(pos_pts) > 0 or len(neg_pts) > 0
+
+ comfy.model_management.load_model_gpu(model)
+ device = comfy.model_management.get_torch_device()
+ dtype = model.model.get_dtype()
+ sam3_model = model.model.diffusion_model
+
+ # Build point inputs for tracker SAM decoder path
+ point_inputs = None
+ if has_points:
+ all_coords = [[p["x"] / W * 1008, p["y"] / H * 1008] for p in pos_pts] + \
+ [[p["x"] / W * 1008, p["y"] / H * 1008] for p in neg_pts]
+ all_labels = [1] * len(pos_pts) + [0] * len(neg_pts)
+ point_inputs = {
+ "point_coords": torch.tensor([all_coords], dtype=dtype, device=device),
+ "point_labels": torch.tensor([all_labels], dtype=torch.int32, device=device),
+ }
+
+ cond_list = _extract_text_prompts(conditioning, device, dtype) if conditioning is not None and len(conditioning) > 0 else []
+ has_text = len(cond_list) > 0
+
+ # Run per-image through detector (text/boxes) and/or tracker (points)
+ all_bbox_dicts = []
+ all_masks = []
+ pbar = comfy.utils.ProgressBar(B)
+
+ for b in range(B):
+ frame = image_in[b:b+1].to(device=device, dtype=dtype)
+ b_boxes = None
+ if per_frame_boxes is not None and per_frame_boxes[b] is not None:
+ b_boxes = per_frame_boxes[b].to(device=device, dtype=dtype)
+
+ frame_bbox_dicts = []
+ frame_masks = []
+
+ # Point prompts: tracker SAM decoder path with iterative refinement
+ if point_inputs is not None:
+ mask_logit = sam3_model.forward_segment(frame, point_inputs=point_inputs)
+ for _ in range(max(0, refine_iterations - 1)):
+ mask_logit = sam3_model.forward_segment(frame, mask_inputs=mask_logit)
+ mask = F.interpolate(mask_logit, size=(H, W), mode="bilinear", align_corners=False)
+ frame_masks.append((mask[0] > 0).float())
+
+ # Box prompts: SAM decoder path (segment inside each box)
+ if b_boxes is not None and not has_text:
+ for box_cxcywh in b_boxes[0]:
+ cx, cy, bw, bh = box_cxcywh.tolist()
+ # Convert cxcywh normalized → xyxy in 1008 space → [1, 2, 2] corners
+ sam_box = torch.tensor([[[(cx - bw/2) * 1008, (cy - bh/2) * 1008],
+ [(cx + bw/2) * 1008, (cy + bh/2) * 1008]]],
+ device=device, dtype=dtype)
+ mask_logit = sam3_model.forward_segment(frame, box_inputs=sam_box)
+ for _ in range(max(0, refine_iterations - 1)):
+ mask_logit = sam3_model.forward_segment(frame, mask_inputs=mask_logit)
+ mask = F.interpolate(mask_logit, size=(H, W), mode="bilinear", align_corners=False)
+ frame_masks.append((mask[0] > 0).float())
+
+ # Text prompts: run detector per text prompt (each detects one category)
+ for text_embeddings, text_mask, max_det in cond_list:
+ results = sam3_model(
+ frame, text_embeddings=text_embeddings, text_mask=text_mask,
+ boxes=b_boxes, threshold=threshold, orig_size=(H, W))
+
+ pred_boxes = results["boxes"][0]
+ scores = results["scores"][0]
+ masks = results["masks"][0]
+
+ probs = scores.sigmoid()
+ keep = probs > threshold
+ kept_boxes = pred_boxes[keep].cpu()
+ kept_scores = probs[keep].cpu()
+ kept_masks = masks[keep]
+
+ order = kept_scores.argsort(descending=True)[:max_det]
+ kept_boxes = kept_boxes[order]
+ kept_scores = kept_scores[order]
+ kept_masks = kept_masks[order]
+
+ for box, score in zip(kept_boxes, kept_scores):
+ frame_bbox_dicts.append({
+ "x": float(box[0]), "y": float(box[1]),
+ "width": float(box[2] - box[0]), "height": float(box[3] - box[1]),
+ "score": float(score),
+ })
+ for m, box in zip(kept_masks, kept_boxes):
+ frame_masks.append(_refine_mask(
+ sam3_model, image[b], m, box, H, W, device, dtype, refine_iterations))
+
+ all_bbox_dicts.append(frame_bbox_dicts)
+ if len(frame_masks) > 0:
+ combined = torch.cat(frame_masks, dim=0) # [N_obj, H, W]
+ if individual_masks:
+ all_masks.append(combined)
+ else:
+ all_masks.append((combined > 0).any(dim=0).float())
+ else:
+ if individual_masks:
+ all_masks.append(torch.zeros(0, H, W, device=comfy.model_management.intermediate_device()))
+ else:
+ all_masks.append(torch.zeros(H, W, device=comfy.model_management.intermediate_device()))
+ pbar.update(1)
+
+ idev = comfy.model_management.intermediate_device()
+ all_masks = [m.to(idev) for m in all_masks]
+ mask_out = torch.cat(all_masks, dim=0) if individual_masks else torch.stack(all_masks)
+ return io.NodeOutput(mask_out, all_bbox_dicts)
+
+
+# Custom socket type carrying the tracking result: produced by SAM3_VideoTrack
+# and consumed by SAM3_TrackPreview and SAM3_TrackToMask.
+SAM3TrackData = io.Custom("SAM3_TRACK_DATA")
+
+class SAM3_VideoTrack(io.ComfyNode):
+ """Track objects across video frames using SAM3's memory-based tracker."""
+
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="SAM3_VideoTrack",
+ display_name="SAM3 Video Track",
+ category="image/detection",
+ search_aliases=["sam3", "video", "track", "propagate"],
+ inputs=[
+ io.Image.Input("images", display_name="images", tooltip="Video frames as batched images"),
+ io.Model.Input("model", display_name="model"),
+ io.Mask.Input("initial_mask", display_name="initial_mask", optional=True, tooltip="Mask(s) for the first frame to track (one per object)"),
+ io.Conditioning.Input("conditioning", display_name="conditioning", optional=True, tooltip="Text conditioning for detecting new objects during tracking"),
+ io.Float.Input("detection_threshold", display_name="detection_threshold", default=0.5, min=0.0, max=1.0, step=0.01, tooltip="Score threshold for text-prompted detection."),
+ io.Int.Input("max_objects", display_name="max_objects", default=4, min=0, max=64, tooltip="Max tracked objects. Initial masks count toward this limit. 0 uses the internal cap of 64."),
+ io.Int.Input("detect_interval", display_name="detect_interval", default=1, min=1, tooltip="Run detection every N frames (1=every frame). Higher values save compute."),
+ ],
+ outputs=[
+ SAM3TrackData.Output("track_data", display_name="track_data"),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, images, model, initial_mask=None, conditioning=None, detection_threshold=0.5, max_objects=0, detect_interval=1) -> io.NodeOutput:
+ N, H, W, C = images.shape
+
+ comfy.model_management.load_model_gpu(model)
+ device = comfy.model_management.get_torch_device()
+ dtype = model.model.get_dtype()
+ sam3_model = model.model.diffusion_model
+
+ frames_in = images[..., :3].movedim(-1, 1)
+
+ init_masks = None
+ if initial_mask is not None:
+ init_masks = initial_mask.unsqueeze(1).to(device=device, dtype=dtype)
+
+ pbar = comfy.utils.ProgressBar(N)
+
+ text_prompts = None
+ if conditioning is not None and len(conditioning) > 0:
+ text_prompts = [(emb, mask) for emb, mask, _ in _extract_text_prompts(conditioning, device, dtype)]
+ elif initial_mask is None:
+ raise ValueError("Either initial_mask or conditioning must be provided")
+
+ result = sam3_model.forward_video(
+ images=frames_in, initial_masks=init_masks, pbar=pbar, text_prompts=text_prompts,
+ new_det_thresh=detection_threshold, max_objects=max_objects,
+ detect_interval=detect_interval, target_device=device, target_dtype=dtype)
+ result["orig_size"] = (H, W)
+ return io.NodeOutput(result)
+
+
+class SAM3_TrackPreview(io.ComfyNode):
+ """Visualize tracked objects with distinct colors as a video preview. No tensor output — saves to temp video."""
+
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="SAM3_TrackPreview",
+ display_name="SAM3 Track Preview",
+ category="image/detection",
+ inputs=[
+ SAM3TrackData.Input("track_data", display_name="track_data"),
+ io.Image.Input("images", display_name="images", optional=True),
+ io.Float.Input("opacity", display_name="opacity", default=0.5, min=0.0, max=1.0, step=0.05),
+ io.Float.Input("fps", display_name="fps", default=24.0, min=1.0, max=120.0, step=1.0),
+ ],
+ is_output_node=True,
+ )
+
+ COLORS = [
+ (0.12, 0.47, 0.71), (1.0, 0.5, 0.05), (0.17, 0.63, 0.17), (0.84, 0.15, 0.16),
+ (0.58, 0.4, 0.74), (0.55, 0.34, 0.29), (0.89, 0.47, 0.76), (0.5, 0.5, 0.5),
+ (0.74, 0.74, 0.13), (0.09, 0.75, 0.81), (0.94, 0.76, 0.06), (0.42, 0.68, 0.84),
+ ]
+
+ # 5x3 bitmap font atlas for digits 0-9 [10, 5, 3]
+ _glyph_cache = {} # (device, scale) -> (glyphs, outlines, gh, gw, oh, ow)
+
+ @staticmethod
+ def _get_glyphs(device, scale=3):
+ key = (device, scale)
+ if key in SAM3_TrackPreview._glyph_cache:
+ return SAM3_TrackPreview._glyph_cache[key]
+ atlas = torch.tensor([
+ [[1,1,1],[1,0,1],[1,0,1],[1,0,1],[1,1,1]],
+ [[0,1,0],[1,1,0],[0,1,0],[0,1,0],[1,1,1]],
+ [[1,1,1],[0,0,1],[1,1,1],[1,0,0],[1,1,1]],
+ [[1,1,1],[0,0,1],[1,1,1],[0,0,1],[1,1,1]],
+ [[1,0,1],[1,0,1],[1,1,1],[0,0,1],[0,0,1]],
+ [[1,1,1],[1,0,0],[1,1,1],[0,0,1],[1,1,1]],
+ [[1,1,1],[1,0,0],[1,1,1],[1,0,1],[1,1,1]],
+ [[1,1,1],[0,0,1],[0,0,1],[0,0,1],[0,0,1]],
+ [[1,1,1],[1,0,1],[1,1,1],[1,0,1],[1,1,1]],
+ [[1,1,1],[1,0,1],[1,1,1],[0,0,1],[1,1,1]],
+ ], dtype=torch.bool)
+ glyphs, outlines = [], []
+ for d in range(10):
+ g = atlas[d].repeat_interleave(scale, 0).repeat_interleave(scale, 1)
+ padded = F.pad(g.float().unsqueeze(0).unsqueeze(0), (1,1,1,1))
+ o = (F.max_pool2d(padded, 3, stride=1, padding=1)[0, 0] > 0)
+ glyphs.append(g.to(device))
+ outlines.append(o.to(device))
+ gh, gw = glyphs[0].shape
+ oh, ow = outlines[0].shape
+ SAM3_TrackPreview._glyph_cache[key] = (glyphs, outlines, gh, gw, oh, ow)
+ return SAM3_TrackPreview._glyph_cache[key]
+
+ @staticmethod
+ def _draw_number_gpu(frame, number, cx, cy, color, scale=3):
+ """Draw a number on a GPU tensor [H, W, 3] float 0-1 at (cx, cy) with outline."""
+ H, W = frame.shape[:2]
+ device = frame.device
+ glyphs, outlines, gh, gw, oh, ow = SAM3_TrackPreview._get_glyphs(device, scale)
+ color_t = torch.tensor(color, device=device, dtype=frame.dtype)
+ digs = [int(d) for d in str(number)]
+ total_w = len(digs) * (gw + scale) - scale
+ x0 = cx - total_w // 2
+ y0 = cy - gh // 2
+ for i, d in enumerate(digs):
+ dx = x0 + i * (gw + scale)
+ # Black outline
+ oy0, ox0 = y0 - 1, dx - 1
+ osy1, osx1 = max(0, -oy0), max(0, -ox0)
+ osy2, osx2 = min(oh, H - oy0), min(ow, W - ox0)
+ if osy2 > osy1 and osx2 > osx1:
+ fy1, fx1 = oy0 + osy1, ox0 + osx1
+ frame[fy1:fy1+(osy2-osy1), fx1:fx1+(osx2-osx1)][outlines[d][osy1:osy2, osx1:osx2]] = 0
+ # Colored fill
+ sy1, sx1 = max(0, -y0), max(0, -dx)
+ sy2, sx2 = min(gh, H - y0), min(gw, W - dx)
+ if sy2 > sy1 and sx2 > sx1:
+ fy1, fx1 = y0 + sy1, dx + sx1
+ frame[fy1:fy1+(sy2-sy1), fx1:fx1+(sx2-sx1)][glyphs[d][sy1:sy2, sx1:sx2]] = color_t
+
+ @classmethod
+ def execute(cls, track_data, images=None, opacity=0.5, fps=24.0) -> io.NodeOutput:
+
+ from comfy.ldm.sam3.tracker import unpack_masks
+ packed = track_data["packed_masks"]
+ H, W = track_data["orig_size"]
+ if images is not None:
+ H, W = images.shape[1], images.shape[2]
+ if packed is None:
+ N, N_obj = track_data["n_frames"], 0
+ else:
+ N, N_obj = packed.shape[0], packed.shape[1]
+
+ import uuid
+ gpu = comfy.model_management.get_torch_device()
+ temp_dir = folder_paths.get_temp_directory()
+ filename = f"sam3_track_preview_{uuid.uuid4().hex[:8]}.mp4"
+ filepath = os.path.join(temp_dir, filename)
+ with av.open(filepath, mode='w') as output:
+ stream = output.add_stream('h264', rate=Fraction(round(fps * 1000), 1000))
+ stream.width = W
+ stream.height = H
+ stream.pix_fmt = 'yuv420p'
+
+ frame_cpu = torch.empty(H, W, 3, dtype=torch.uint8)
+ frame_np = frame_cpu.numpy()
+ if N_obj > 0:
+ colors_t = torch.tensor([cls.COLORS[i % len(cls.COLORS)] for i in range(N_obj)],
+ device=gpu, dtype=torch.float32)
+ grid_y = torch.arange(H, device=gpu).view(1, H, 1)
+ grid_x = torch.arange(W, device=gpu).view(1, 1, W)
+ for t in range(N):
+ if images is not None and t < images.shape[0]:
+ frame = images[t].clone()
+ else:
+ frame = torch.zeros(H, W, 3)
+
+ if N_obj > 0:
+ frame_binary = unpack_masks(packed[t:t+1].to(gpu)) # [1, N_obj, H, W] bool
+ frame_masks = F.interpolate(frame_binary.float(), size=(H, W), mode="nearest")[0]
+ frame_gpu = frame.to(gpu)
+ bool_masks = frame_masks > 0.5
+ any_mask = bool_masks.any(dim=0)
+ if any_mask.any():
+ obj_idx_map = bool_masks.to(torch.uint8).argmax(dim=0)
+ color_overlay = colors_t[obj_idx_map]
+ mask_3d = any_mask.unsqueeze(-1)
+ frame_gpu = torch.where(mask_3d, frame_gpu * (1 - opacity) + color_overlay * opacity, frame_gpu)
+ area = bool_masks.sum(dim=(-1, -2)).clamp_(min=1)
+ cy = (bool_masks * grid_y).sum(dim=(-1, -2)) // area
+ cx = (bool_masks * grid_x).sum(dim=(-1, -2)) // area
+ has = area > 1
+ scores = track_data.get("scores", [])
+ label_scale = max(3, H // 240) # Scale font with resolutio
+ size_caps = (area.float().sqrt() / 15).clamp_(min=1).long().tolist() #cap per-object so the number doesn't dwarf small masks
+ for obj_idx in range(N_obj):
+ if has[obj_idx]:
+ _cx, _cy = int(cx[obj_idx]), int(cy[obj_idx])
+ color = cls.COLORS[obj_idx % len(cls.COLORS)]
+ obj_scale = min(label_scale, size_caps[obj_idx])
+ score_scale = max(1, obj_scale * 2 // 3)
+ SAM3_TrackPreview._draw_number_gpu(frame_gpu, obj_idx, _cx, _cy, color, scale=obj_scale)
+ if obj_idx < len(scores) and scores[obj_idx] < 1.0:
+ SAM3_TrackPreview._draw_number_gpu(frame_gpu, int(scores[obj_idx] * 100),
+ _cx, _cy + 5 * obj_scale + 3, color, scale=score_scale)
+ frame_cpu.copy_(frame_gpu.clamp_(0, 1).mul_(255).byte())
+ else:
+ frame_cpu.copy_(frame.clamp_(0, 1).mul_(255).byte())
+
+ vframe = av.VideoFrame.from_ndarray(frame_np, format='rgb24')
+ output.mux(stream.encode(vframe.reformat(format='yuv420p')))
+ output.mux(stream.encode(None))
+ return io.NodeOutput(ui=ui.PreviewVideo([ui.SavedResult(filename, "", io.FolderType.temp)]))
+
+
+class SAM3_TrackToMask(io.ComfyNode):
+ """Select tracked objects by index and output as mask."""
+
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="SAM3_TrackToMask",
+ display_name="SAM3 Track to Mask",
+ category="image/detection",
+ inputs=[
+ SAM3TrackData.Input("track_data", display_name="track_data"),
+ io.String.Input("object_indices", display_name="object_indices", default="",
+ tooltip="Comma-separated object indices to include (e.g. '0,2,3'). Empty = all objects."),
+ ],
+ outputs=[
+ io.Mask.Output("masks", display_name="masks"),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, track_data, object_indices="") -> io.NodeOutput:
+ from comfy.ldm.sam3.tracker import unpack_masks
+ packed = track_data["packed_masks"]
+ H, W = track_data["orig_size"]
+
+ if packed is None:
+ N = track_data["n_frames"]
+ return io.NodeOutput(torch.zeros(N, H, W, device=comfy.model_management.intermediate_device()))
+
+ N, N_obj = packed.shape[0], packed.shape[1]
+
+ if object_indices.strip():
+ indices = [int(i.strip()) for i in object_indices.split(",") if i.strip().isdigit()]
+ indices = [i for i in indices if 0 <= i < N_obj]
+ else:
+ indices = list(range(N_obj))
+
+ if not indices:
+ return io.NodeOutput(torch.zeros(N, H, W, device=comfy.model_management.intermediate_device()))
+
+ union_packed = packed[:, indices[0]].clone()
+ for i in indices[1:]:
+ union_packed |= packed[:, i]
+ union = unpack_masks(union_packed).unsqueeze(1).float() # [N, 1, Hm, Wm]
+ mask_out = F.interpolate(union, size=(H, W), mode="bilinear", align_corners=False)[:, 0]
+ return io.NodeOutput(mask_out)
+
+
+class SAM3Extension(ComfyExtension):
+ @override
+ async def get_node_list(self) -> list[type[io.ComfyNode]]:
+ return [
+ SAM3_Detect,
+ SAM3_VideoTrack,
+ SAM3_TrackPreview,
+ SAM3_TrackToMask,
+ ]
+
+
+async def comfy_entrypoint() -> SAM3Extension:
+ return SAM3Extension()
diff --git a/comfy_extras/nodes_save_3d.py b/comfy_extras/nodes_save_3d.py
new file mode 100644
index 000000000..c03524246
--- /dev/null
+++ b/comfy_extras/nodes_save_3d.py
@@ -0,0 +1,396 @@
+"""Save-side 3D nodes: mesh packing/slicing helpers + GLB writer + SaveGLB node."""
+
+import json
+import logging
+import os
+import struct
+from io import BytesIO
+
+import numpy as np
+from PIL import Image
+import torch
+from typing_extensions import override
+
+import folder_paths
+from comfy.cli_args import args
+from comfy_api.latest import ComfyExtension, IO, Types
+
+
+def pack_variable_mesh_batch(vertices, faces, colors=None, uvs=None, texture=None):
+    """Pack per-item mesh tensors of differing lengths into zero-padded batched tensors.
+
+    Per-item lengths are handed to Types.MESH as vertex_counts / face_counts so
+    consumers can recover the real slice of each padded row.
+
+    Parameters:
+        vertices: non-empty sequence of (N_i, C) tensors, one per batch item.
+        faces: sequence of (M_i, K) tensors, one per batch item.
+        colors: optional sequence of per-vertex tensors, 1:1 with vertices
+            (padded to max_vertices and read back with vertex_counts).
+        uvs: optional sequence of per-vertex tensors, 1:1 with vertices.
+        texture: (B, H, W, 3) — passed through unchanged.
+
+    Returns:
+        Types.MESH built from the padded tensors plus the per-item counts.
+
+    Raises:
+        ValueError: if vertices is empty, or if faces / colors / uvs do not
+            have exactly one entry per batch item.
+        AssertionError: if a colors / uvs entry is not 1:1 with its vertices.
+    """
+    batch_size = len(vertices)
+    if batch_size == 0:
+        raise ValueError("pack_variable_mesh_batch: vertices is empty")
+    if len(faces) != batch_size:
+        # zip() below would otherwise silently drop the unmatched items.
+        raise ValueError(
+            f"pack_variable_mesh_batch: got {batch_size} vertex tensors but {len(faces)} face tensors"
+        )
+
+    max_vertices = max(v.shape[0] for v in vertices)
+    max_faces = max(f.shape[0] for f in faces)
+
+    packed_vertices = vertices[0].new_zeros((batch_size, max_vertices, vertices[0].shape[1]))
+    packed_faces = faces[0].new_zeros((batch_size, max_faces, faces[0].shape[1]))
+    vertex_counts = torch.tensor([v.shape[0] for v in vertices], device=vertices[0].device, dtype=torch.int64)
+    face_counts = torch.tensor([f.shape[0] for f in faces], device=faces[0].device, dtype=torch.int64)
+
+    for i, (v, f) in enumerate(zip(vertices, faces)):
+        packed_vertices[i, :v.shape[0]] = v
+        packed_faces[i, :f.shape[0]] = f
+
+    def _pad_per_vertex(items, label):
+        # Zero-pad one per-vertex attribute list to (batch_size, max_vertices, C).
+        if items is None:
+            return None
+        if len(items) != batch_size:
+            raise ValueError(
+                f"pack_variable_mesh_batch: {label} has {len(items)} entries, expected {batch_size}"
+            )
+        packed = items[0].new_zeros((batch_size, max_vertices, items[0].shape[1]))
+        for i, item in enumerate(items):
+            assert item.shape[0] == vertices[i].shape[0], (
+                f"{label}[{i}] has {item.shape[0]} entries, expected {vertices[i].shape[0]} (1:1 with vertices)"
+            )
+            packed[i, :item.shape[0]] = item
+        return packed
+
+    packed_colors = _pad_per_vertex(colors, "vertex_colors")
+    packed_uvs = _pad_per_vertex(uvs, "uvs")
+
+    return Types.MESH(packed_vertices, packed_faces,
+                      uvs=packed_uvs, vertex_colors=packed_colors, texture=texture,
+                      vertex_counts=vertex_counts, face_counts=face_counts)
+
+
+def get_mesh_batch_item(mesh, index):
+    """Return (vertices, faces, colors, uvs) for one batch entry of mesh.
+
+    When the mesh carries vertex_counts (variable-size batch), every tensor is
+    cut back to that item's real length; otherwise the full rows are returned.
+    colors / uvs are None when the mesh has no such attribute or it is None.
+    """
+    all_colors = getattr(mesh, "vertex_colors", None)
+    all_uvs = getattr(mesh, "uvs", None)
+    counts = getattr(mesh, "vertex_counts", None)
+
+    if counts is None:
+        # Fixed-size batch: nothing to trim.
+        colors = None if all_colors is None else all_colors[index]
+        uvs = None if all_uvs is None else all_uvs[index]
+        return mesh.vertices[index], mesh.faces[index], colors, uvs
+
+    n_vertices = int(counts[index].item())
+    n_faces = int(mesh.face_counts[index].item())
+    # Per-vertex attributes share the vertex count.
+    colors = None if all_colors is None else all_colors[index, :n_vertices]
+    uvs = None if all_uvs is None else all_uvs[index, :n_vertices]
+    return mesh.vertices[index, :n_vertices], mesh.faces[index, :n_faces], colors, uvs
+
+
+def save_glb(vertices, faces, filepath, metadata=None,
+ uvs=None, vertex_colors=None, texture_image=None):
+ """
+ Save PyTorch tensor vertices and faces as a GLB file without external dependencies.
+
+ Parameters:
+ vertices: torch.Tensor of shape (N, 3) - The vertex coordinates
+ faces: torch.Tensor of shape (M, 3) - The face indices (triangle faces)
+ filepath: str - Output filepath (should end with .glb)
+ metadata: dict - Optional asset.extras metadata
+ uvs: torch.Tensor of shape (N, 2) - Optional per-vertex texture coordinates
+ vertex_colors: torch.Tensor of shape (N, 3) or (N, 4) - Optional per-vertex colors in [0, 1]
+ texture_image: PIL.Image - Optional baseColor texture, embedded as PNG
+ """
+
+ # Convert tensors to numpy arrays
+ vertices_np = vertices.cpu().numpy().astype(np.float32)
+ faces_signed = faces.cpu().numpy().astype(np.int64)
+ uvs_np = uvs.cpu().numpy().astype(np.float32) if uvs is not None else None
+ colors_np = vertex_colors.cpu().numpy().astype(np.float32) if vertex_colors is not None else None
+ if colors_np is not None:
+ colors_np = np.clip(colors_np, 0.0, 1.0)
+
+ n_verts = vertices_np.shape[0]
+ if n_verts == 0:
+ raise ValueError("save_glb: vertices is empty")
+ if faces_signed.size > 0:
+ fmin = int(faces_signed.min())
+ fmax = int(faces_signed.max())
+ if fmin < 0 or fmax >= n_verts:
+ raise ValueError(
+ f"save_glb: face index out of range [0, {n_verts}): min={fmin}, max={fmax}"
+ )
+ if uvs_np is not None and uvs_np.shape[0] != n_verts:
+ raise ValueError(
+ f"save_glb: uvs has {uvs_np.shape[0]} entries but vertex count is {n_verts}"
+ )
+ if colors_np is not None and colors_np.shape[0] != n_verts:
+ raise ValueError(
+ f"save_glb: vertex_colors has {colors_np.shape[0]} entries but vertex count is {n_verts}"
+ )
+ faces_np = faces_signed.astype(np.uint32)
+ texture_png_bytes = None
+ if texture_image is not None:
+ buf = BytesIO()
+ texture_image.save(buf, format="PNG")
+ texture_png_bytes = buf.getvalue()
+
+ vertices_buffer = vertices_np.tobytes()
+ indices_buffer = faces_np.tobytes()
+ uvs_buffer = uvs_np.tobytes() if uvs_np is not None else b""
+ colors_buffer = colors_np.tobytes() if colors_np is not None else b""
+ texture_buffer = texture_png_bytes if texture_png_bytes is not None else b""
+
+ def pad_to_4_bytes(buffer):
+ padding_length = (4 - (len(buffer) % 4)) % 4
+ return buffer + b'\x00' * padding_length
+
+ vertices_buffer_padded = pad_to_4_bytes(vertices_buffer)
+ indices_buffer_padded = pad_to_4_bytes(indices_buffer)
+ uvs_buffer_padded = pad_to_4_bytes(uvs_buffer)
+ colors_buffer_padded = pad_to_4_bytes(colors_buffer)
+ texture_buffer_padded = pad_to_4_bytes(texture_buffer)
+
+ buffer_data = b"".join([
+ vertices_buffer_padded,
+ indices_buffer_padded,
+ uvs_buffer_padded,
+ colors_buffer_padded,
+ texture_buffer_padded,
+ ])
+
+ vertices_byte_length = len(vertices_buffer)
+ vertices_byte_offset = 0
+ indices_byte_length = len(indices_buffer)
+ indices_byte_offset = len(vertices_buffer_padded)
+ uvs_byte_offset = indices_byte_offset + len(indices_buffer_padded)
+ colors_byte_offset = uvs_byte_offset + len(uvs_buffer_padded)
+ texture_byte_offset = colors_byte_offset + len(colors_buffer_padded)
+
+ buffer_views = [
+ {
+ "buffer": 0,
+ "byteOffset": vertices_byte_offset,
+ "byteLength": vertices_byte_length,
+ "target": 34962 # ARRAY_BUFFER
+ },
+ {
+ "buffer": 0,
+ "byteOffset": indices_byte_offset,
+ "byteLength": indices_byte_length,
+ "target": 34963 # ELEMENT_ARRAY_BUFFER
+ }
+ ]
+ accessors = [
+ {
+ "bufferView": 0,
+ "byteOffset": 0,
+ "componentType": 5126, # FLOAT
+ "count": len(vertices_np),
+ "type": "VEC3",
+ "max": vertices_np.max(axis=0).tolist(),
+ "min": vertices_np.min(axis=0).tolist()
+ },
+ {
+ "bufferView": 1,
+ "byteOffset": 0,
+ "componentType": 5125, # UNSIGNED_INT
+ "count": faces_np.size,
+ "type": "SCALAR"
+ }
+ ]
+ primitive_attributes = {"POSITION": 0}
+
+ if uvs_np is not None and len(uvs_np) > 0:
+ buffer_views.append({
+ "buffer": 0,
+ "byteOffset": uvs_byte_offset,
+ "byteLength": len(uvs_buffer),
+ "target": 34962
+ })
+ accessor_idx = len(accessors)
+ accessors.append({
+ "bufferView": len(buffer_views) - 1,
+ "byteOffset": 0,
+ "componentType": 5126,
+ "count": len(uvs_np),
+ "type": "VEC2",
+ })
+ primitive_attributes["TEXCOORD_0"] = accessor_idx
+
+ if colors_np is not None and len(colors_np) > 0:
+ buffer_views.append({
+ "buffer": 0,
+ "byteOffset": colors_byte_offset,
+ "byteLength": len(colors_buffer),
+ "target": 34962
+ })
+ accessor_idx = len(accessors)
+ accessors.append({
+ "bufferView": len(buffer_views) - 1,
+ "byteOffset": 0,
+ "componentType": 5126,
+ "count": len(colors_np),
+ "type": "VEC3" if colors_np.shape[1] == 3 else "VEC4",
+ })
+ primitive_attributes["COLOR_0"] = accessor_idx
+
+ primitive = {
+ "attributes": primitive_attributes,
+ "indices": 1,
+ "mode": 4 # TRIANGLES
+ }
+
+ images = []
+ textures = []
+ samplers = []
+ materials = []
+ if texture_png_bytes is not None and "TEXCOORD_0" in primitive_attributes:
+ buffer_views.append({
+ "buffer": 0,
+ "byteOffset": texture_byte_offset,
+ "byteLength": len(texture_buffer),
+ })
+ images.append({"bufferView": len(buffer_views) - 1, "mimeType": "image/png"})
+ samplers.append({"magFilter": 9729, "minFilter": 9729, "wrapS": 33071, "wrapT": 33071})
+ textures.append({"source": 0, "sampler": 0})
+ materials.append({
+ "pbrMetallicRoughness": {
+ "baseColorTexture": {"index": 0, "texCoord": 0},
+ "metallicFactor": 0.0,
+ "roughnessFactor": 1.0,
+ },
+ "doubleSided": True,
+ })
+ primitive["material"] = 0
+
+ gltf = {
+ "asset": {"version": "2.0", "generator": "ComfyUI"},
+ "buffers": [{"byteLength": len(buffer_data)}],
+ "bufferViews": buffer_views,
+ "accessors": accessors,
+ "meshes": [{"primitives": [primitive]}],
+ "nodes": [{"mesh": 0}],
+ "scenes": [{"nodes": [0]}],
+ "scene": 0,
+ }
+ if images:
+ gltf["images"] = images
+ if samplers:
+ gltf["samplers"] = samplers
+ if textures:
+ gltf["textures"] = textures
+ if materials:
+ gltf["materials"] = materials
+
+ if metadata:
+ gltf["asset"]["extras"] = metadata
+
+ # Convert the JSON to bytes
+ gltf_json = json.dumps(gltf).encode('utf8')
+
+ def pad_json_to_4_bytes(buffer):
+ padding_length = (4 - (len(buffer) % 4)) % 4
+ return buffer + b' ' * padding_length
+
+ gltf_json_padded = pad_json_to_4_bytes(gltf_json)
+
+ # Create the GLB header (a 4-byte ASCII magic identifier glTF)
+ glb_header = struct.pack('<4sII', b'glTF', 2, 12 + 8 + len(gltf_json_padded) + 8 + len(buffer_data))
+
+ # Create JSON chunk header (chunk type 0)
+ json_chunk_header = struct.pack(' IO.NodeOutput:
+ full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, folder_paths.get_output_directory())
+ results = []
+
+ metadata = {}
+ if not args.disable_metadata:
+ if cls.hidden.prompt is not None:
+ metadata["prompt"] = json.dumps(cls.hidden.prompt)
+ if cls.hidden.extra_pnginfo is not None:
+ for x in cls.hidden.extra_pnginfo:
+ metadata[x] = json.dumps(cls.hidden.extra_pnginfo[x])
+
+ if isinstance(mesh, Types.File3D):
+ # Handle File3D input - save BytesIO data to output folder
+ ext = mesh.format or "glb"
+ f = f"{filename}_{counter:05}_.{ext}"
+ mesh.save_to(os.path.join(full_output_folder, f))
+ results.append({
+ "filename": f,
+ "subfolder": subfolder,
+ "type": "output"
+ })
+ counter += 1
+ else:
+ # Handle Mesh input - save vertices and faces as GLB; carry optional UVs / colors / texture.
+ texture_b = getattr(mesh, "texture", None)
+ texture_np = None
+ if texture_b is not None:
+ texture_np = (texture_b.clamp(0.0, 1.0).cpu().numpy() * 255).astype(np.uint8)
+ assert texture_np.ndim == 4 and texture_np.shape[-1] == 3, (
+ f"texture must be (B, H, W, 3) RGB, got shape {tuple(texture_np.shape)}"
+ )
+ for i in range(mesh.vertices.shape[0]):
+ vertices_i, faces_i, v_colors, uvs_i = get_mesh_batch_item(mesh, i)
+ if vertices_i.shape[0] == 0 or faces_i.shape[0] == 0:
+ logging.warning(f"SaveGLB: skipping empty mesh at batch index {i}")
+ continue
+ tex_img = Image.fromarray(texture_np[i], mode="RGB") if texture_np is not None else None
+ f = f"{filename}_{counter:05}_.glb"
+ save_glb(vertices_i, faces_i, os.path.join(full_output_folder, f), metadata,
+ uvs=uvs_i,
+ vertex_colors=v_colors,
+ texture_image=tex_img)
+ results.append({
+ "filename": f,
+ "subfolder": subfolder,
+ "type": "output"
+ })
+ counter += 1
+ return IO.NodeOutput(ui={"3d": results})
+
+
+class Save3DExtension(ComfyExtension):
+    """Extension whose node list contains only SaveGLB."""
+
+    @override
+    async def get_node_list(self) -> list[type[IO.ComfyNode]]:
+        """Return the node classes provided by this extension."""
+        node_types = [SaveGLB]
+        return node_types
+
+
+async def comfy_entrypoint() -> Save3DExtension:
+    """Create and return a Save3DExtension instance."""
+    extension = Save3DExtension()
+    return extension
diff --git a/comfy_extras/nodes_sd3.py b/comfy_extras/nodes_sd3.py
index c43844a1a..38cbf117b 100644
--- a/comfy_extras/nodes_sd3.py
+++ b/comfy_extras/nodes_sd3.py
@@ -41,7 +41,7 @@ class EmptySD3LatentImage(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="EmptySD3LatentImage",
- category="latent/sd3",
+ category="model/latent/sd3",
inputs=[
io.Int.Input("width", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
io.Int.Input("height", default=1024, min=16, max=nodes.MAX_RESOLUTION, step=16),
@@ -54,7 +54,7 @@ class EmptySD3LatentImage(io.ComfyNode):
@classmethod
def execute(cls, width, height, batch_size=1) -> io.NodeOutput:
- latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+ latent = torch.zeros([batch_size, 16, height // 8, width // 8], device=comfy.model_management.intermediate_device(), dtype=comfy.model_management.intermediate_dtype())
return io.NodeOutput({"samples": latent, "downscale_ratio_spacial": 8})
generate = execute # TODO: remove
@@ -113,7 +113,7 @@ class ControlNetApplySD3(io.ComfyNode):
return io.Schema(
node_id="ControlNetApplySD3",
display_name="Apply Controlnet with VAE",
- category="conditioning/controlnet",
+ category="model/conditioning/controlnet",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
diff --git a/comfy_extras/nodes_sdpose.py b/comfy_extras/nodes_sdpose.py
index 7d54967d5..20d459b00 100644
--- a/comfy_extras/nodes_sdpose.py
+++ b/comfy_extras/nodes_sdpose.py
@@ -353,7 +353,8 @@ class SDPoseDrawKeypoints(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SDPoseDrawKeypoints",
- category="image/preprocessors",
+ display_name="SDPose Draw Keypoints",
+ category="image/detection",
search_aliases=["openpose", "pose detection", "preprocessor", "keypoints", "pose"],
inputs=[
io.Custom("POSE_KEYPOINT").Input("keypoints"),
@@ -421,7 +422,8 @@ class SDPoseKeypointExtractor(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SDPoseKeypointExtractor",
- category="image/preprocessors",
+ display_name="SDPose Keypoint Extractor",
+ category="image/detection",
search_aliases=["openpose", "pose detection", "preprocessor", "keypoints", "sdpose"],
description="Extract pose keypoints from images using the SDPose model: https://huggingface.co/Comfy-Org/SDPose/tree/main/checkpoints",
inputs=[
@@ -459,27 +461,23 @@ class SDPoseKeypointExtractor(io.ComfyNode):
total_images = image.shape[0]
captured_feat = None
- model_h = int(head.heatmap_size[0]) * 4 # e.g. 192 * 4 = 768
- model_w = int(head.heatmap_size[1]) * 4 # e.g. 256 * 4 = 1024
+ model_w = int(head.heatmap_size[0]) * 4 # 192 * 4 = 768
+ model_h = int(head.heatmap_size[1]) * 4 # 256 * 4 = 1024
def _resize_to_model(imgs):
- """Aspect-preserving resize + zero-pad BHWC images to (model_h, model_w). Returns (resized_bhwc, scale, pad_top, pad_left)."""
+ """Stretch BHWC images to (model_h, model_w), model expects no aspect preservation."""
h, w = imgs.shape[-3], imgs.shape[-2]
- scale = min(model_h / h, model_w / w)
- sh, sw = int(round(h * scale)), int(round(w * scale))
- pt, pl = (model_h - sh) // 2, (model_w - sw) // 2
+ method = "area" if (model_h <= h and model_w <= w) else "bilinear"
chw = imgs.permute(0, 3, 1, 2).float()
- scaled = comfy.utils.common_upscale(chw, sw, sh, upscale_method="bilinear", crop="disabled")
- padded = torch.zeros(scaled.shape[0], scaled.shape[1], model_h, model_w, dtype=scaled.dtype, device=scaled.device)
- padded[:, :, pt:pt + sh, pl:pl + sw] = scaled
- return padded.permute(0, 2, 3, 1), scale, pt, pl
+ scaled = comfy.utils.common_upscale(chw, model_w, model_h, upscale_method=method, crop="disabled")
+ return scaled.permute(0, 2, 3, 1), model_w / w, model_h / h
- def _remap_keypoints(kp, scale, pad_top, pad_left, offset_x=0, offset_y=0):
+ def _remap_keypoints(kp, scale_x, scale_y, offset_x=0, offset_y=0):
"""Remap keypoints from model space back to original image space."""
kp = kp.copy() if isinstance(kp, np.ndarray) else np.array(kp, dtype=np.float32)
invalid = kp[..., 0] < 0
- kp[..., 0] = (kp[..., 0] - pad_left) / scale + offset_x
- kp[..., 1] = (kp[..., 1] - pad_top) / scale + offset_y
+ kp[..., 0] = kp[..., 0] / scale_x + offset_x
+ kp[..., 1] = kp[..., 1] / scale_y + offset_y
kp[invalid] = -1
return kp
@@ -529,18 +527,18 @@ class SDPoseKeypointExtractor(io.ComfyNode):
continue
crop = img[:, y1:y2, x1:x2, :] # (1, crop_h, crop_w, C)
- crop_resized, scale, pad_top, pad_left = _resize_to_model(crop)
+ crop_resized, sx, sy = _resize_to_model(crop)
latent_crop = vae.encode(crop_resized)
kp_batch, sc_batch = _run_on_latent(latent_crop)
- kp = _remap_keypoints(kp_batch[0], scale, pad_top, pad_left, x1, y1)
+ kp = _remap_keypoints(kp_batch[0], sx, sy, x1, y1)
img_keypoints.append(kp)
img_scores.append(sc_batch[0])
else:
- img_resized, scale, pad_top, pad_left = _resize_to_model(img)
+ img_resized, sx, sy = _resize_to_model(img)
latent_img = vae.encode(img_resized)
kp_batch, sc_batch = _run_on_latent(latent_img)
- img_keypoints.append(_remap_keypoints(kp_batch[0], scale, pad_top, pad_left))
+ img_keypoints.append(_remap_keypoints(kp_batch[0], sx, sy))
img_scores.append(sc_batch[0])
all_keypoints.append(img_keypoints)
@@ -549,12 +547,12 @@ class SDPoseKeypointExtractor(io.ComfyNode):
else: # full-image mode, batched
for batch_start in tqdm(range(0, total_images, batch_size), desc="Extracting keypoints"):
- batch_resized, scale, pad_top, pad_left = _resize_to_model(image[batch_start:batch_start + batch_size])
+ batch_resized, sx, sy = _resize_to_model(image[batch_start:batch_start + batch_size])
latent_batch = vae.encode(batch_resized)
kp_batch, sc_batch = _run_on_latent(latent_batch)
for kp, sc in zip(kp_batch, sc_batch):
- all_keypoints.append([_remap_keypoints(kp, scale, pad_top, pad_left)])
+ all_keypoints.append([_remap_keypoints(kp, sx, sy)])
all_scores.append([sc])
pbar.update(len(kp_batch))
@@ -599,7 +597,8 @@ class SDPoseFaceBBoxes(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SDPoseFaceBBoxes",
- category="image/preprocessors",
+ display_name="SDPose Face Bounding Boxes",
+ category="image/detection",
search_aliases=["face bbox", "face bounding box", "pose", "keypoints"],
inputs=[
io.Custom("POSE_KEYPOINT").Input("keypoints"),
@@ -656,7 +655,8 @@ class CropByBBoxes(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CropByBBoxes",
- category="image/preprocessors",
+ display_name="Crop By Bounding Boxes",
+ category="image/transform",
search_aliases=["crop", "face crop", "bbox crop", "pose", "bounding box"],
description="Crop and resize regions from the input image batch based on provided bounding boxes.",
inputs=[
@@ -727,13 +727,13 @@ class CropByBBoxes(io.ComfyNode):
scale = min(output_width / crop_w, output_height / crop_h)
scaled_w = int(round(crop_w * scale))
scaled_h = int(round(crop_h * scale))
- scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="bilinear", crop="disabled")
+ scaled = comfy.utils.common_upscale(crop_chw, scaled_w, scaled_h, upscale_method="area", crop="disabled")
pad_left = (output_width - scaled_w) // 2
pad_top = (output_height - scaled_h) // 2
resized = torch.zeros(1, num_ch, output_height, output_width, dtype=image.dtype, device=image.device)
resized[:, :, pad_top:pad_top + scaled_h, pad_left:pad_left + scaled_w] = scaled
else: # "stretch"
- resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="bilinear", crop="disabled")
+ resized = comfy.utils.common_upscale(crop_chw, output_width, output_height, upscale_method="area", crop="disabled")
crops.append(resized)
if not crops:
diff --git a/comfy_extras/nodes_sdupscale.py b/comfy_extras/nodes_sdupscale.py
index 5877719d3..ea283e971 100644
--- a/comfy_extras/nodes_sdupscale.py
+++ b/comfy_extras/nodes_sdupscale.py
@@ -9,7 +9,7 @@ class SD_4XUpscale_Conditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SD_4XUpscale_Conditioning",
- category="conditioning/upscale_diffusion",
+ category="model/conditioning/upscale_diffusion",
inputs=[
io.Image.Input("images"),
io.Conditioning.Input("positive"),
diff --git a/comfy_extras/nodes_stable3d.py b/comfy_extras/nodes_stable3d.py
index 829c837a1..8a6e5b726 100644
--- a/comfy_extras/nodes_stable3d.py
+++ b/comfy_extras/nodes_stable3d.py
@@ -27,7 +27,7 @@ class StableZero123_Conditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableZero123_Conditioning",
- category="conditioning/3d_models",
+ category="model/conditioning/3d_models",
inputs=[
io.ClipVision.Input("clip_vision"),
io.Image.Input("init_image"),
@@ -65,7 +65,7 @@ class StableZero123_Conditioning_Batched(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableZero123_Conditioning_Batched",
- category="conditioning/3d_models",
+ category="model/conditioning/3d_models",
inputs=[
io.ClipVision.Input("clip_vision"),
io.Image.Input("init_image"),
@@ -112,7 +112,7 @@ class SV3D_Conditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="SV3D_Conditioning",
- category="conditioning/3d_models",
+ category="model/conditioning/3d_models",
inputs=[
io.ClipVision.Input("clip_vision"),
io.Image.Input("init_image"),
diff --git a/comfy_extras/nodes_stable_cascade.py b/comfy_extras/nodes_stable_cascade.py
index 8c1aebca9..e55f248ae 100644
--- a/comfy_extras/nodes_stable_cascade.py
+++ b/comfy_extras/nodes_stable_cascade.py
@@ -29,7 +29,7 @@ class StableCascade_EmptyLatentImage(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableCascade_EmptyLatentImage",
- category="latent/stable_cascade",
+ category="model/latent/stable_cascade",
inputs=[
io.Int.Input("width", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8),
io.Int.Input("height", default=1024, min=256, max=nodes.MAX_RESOLUTION, step=8),
@@ -58,7 +58,7 @@ class StableCascade_StageC_VAEEncode(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableCascade_StageC_VAEEncode",
- category="latent/stable_cascade",
+ category="model/latent/stable_cascade",
inputs=[
io.Image.Input("image"),
io.Vae.Input("vae"),
@@ -93,7 +93,7 @@ class StableCascade_StageB_Conditioning(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableCascade_StageB_Conditioning",
- category="conditioning/stable_cascade",
+ category="model/conditioning/stable_cascade",
inputs=[
io.Conditioning.Input("conditioning"),
io.Latent.Input("stage_c"),
@@ -119,7 +119,7 @@ class StableCascade_SuperResolutionControlnet(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StableCascade_SuperResolutionControlnet",
- category="_for_testing/stable_cascade",
+ category="experimental/stable_cascade",
is_experimental=True,
inputs=[
io.Image.Input("image"),
diff --git a/comfy_extras/nodes_string.py b/comfy_extras/nodes_string.py
index 75a8bb4ee..97485c8c5 100644
--- a/comfy_extras/nodes_string.py
+++ b/comfy_extras/nodes_string.py
@@ -1,17 +1,49 @@
import re
+import json
+import string
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
+class StringFormat(io.ComfyNode):
+    """Node that fills a str.format template from a growable set of named inputs."""
+
+    @classmethod
+    def define_schema(cls) -> io.Schema:
+        """Declare the node: autogrow inputs named "a".."z" plus the template string."""
+        # One input name per lowercase ASCII letter; min=0 is passed through as given.
+        slot_names = list(string.ascii_lowercase)
+        value_template = io.Autogrow.TemplateNames(
+            input=io.AnyType.Input("value"),
+            names=slot_names,
+            min=0,
+        )
+        node_inputs = [
+            io.Autogrow.Input("values", template=value_template),
+            io.String.Input("f_string", default="{a}", multiline=True),
+        ]
+        node_outputs = [
+            io.String.Output(),
+        ]
+        return io.Schema(
+            node_id="StringFormat",
+            display_name="Format Text",
+            category="text",
+            search_aliases=["string", "format"],
+            description="Same as Python's string format method. Supports all of Python's format options and features.",
+            inputs=node_inputs,
+            outputs=node_outputs,
+        )
+
+    @classmethod
+    def execute(
+        cls, values: io.Autogrow.Type, f_string: str
+    ) -> io.NodeOutput:
+        """Expand f_string with values supplied as keyword arguments to str.format."""
+        rendered = f_string.format(**values)
+        return io.NodeOutput(rendered)
+
+
class StringConcatenate(io.ComfyNode):
@classmethod
def define_schema(cls):
return io.Schema(
node_id="StringConcatenate",
- display_name="Text Concatenate",
- category="utils/string",
- search_aliases=["Concatenate", "text concat", "join text", "merge text", "combine strings", "concat", "concatenate", "append text", "combine text", "string"],
+ search_aliases=["concatenate", "text concat", "join text", "merge text", "combine strings", "string concat", "append text", "combine text"],
+ display_name="Concatenate Text",
+ category="text",
inputs=[
io.String.Input("string_a", multiline=True),
io.String.Input("string_b", multiline=True),
@@ -32,9 +64,9 @@ class StringSubstring(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StringSubstring",
- search_aliases=["Substring", "extract text", "text portion"],
- display_name="Text Substring",
- category="utils/string",
+ search_aliases=["substring", "extract text", "text portion"],
+ display_name="Substring",
+ category="text",
inputs=[
io.String.Input("string", multiline=True),
io.Int.Input("start"),
@@ -57,7 +89,7 @@ class StringLength(io.ComfyNode):
node_id="StringLength",
search_aliases=["character count", "text size", "string length"],
display_name="Text Length",
- category="utils/string",
+ category="text",
inputs=[
io.String.Input("string", multiline=True),
],
@@ -76,9 +108,9 @@ class CaseConverter(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="CaseConverter",
- search_aliases=["Case Converter", "text case", "uppercase", "lowercase", "capitalize"],
- display_name="Text Case Converter",
- category="utils/string",
+ search_aliases=["case converter", "text case", "uppercase", "lowercase", "capitalize"],
+ display_name="Convert Text Case",
+ category="text",
inputs=[
io.String.Input("string", multiline=True),
io.Combo.Input("mode", options=["UPPERCASE", "lowercase", "Capitalize", "Title Case"]),
@@ -109,9 +141,9 @@ class StringTrim(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StringTrim",
- search_aliases=["Trim", "clean whitespace", "remove whitespace", "strip"],
- display_name="Text Trim",
- category="utils/string",
+ search_aliases=["trim", "clean whitespace", "remove whitespace", "remove spaces","strip"],
+ display_name="Trim Text",
+ category="text",
inputs=[
io.String.Input("string", multiline=True),
io.Combo.Input("mode", options=["Both", "Left", "Right"]),
@@ -140,9 +172,9 @@ class StringReplace(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StringReplace",
- search_aliases=["Replace", "find and replace", "substitute", "swap text"],
- display_name="Text Replace",
- category="utils/string",
+ search_aliases=["replace", "find and replace", "substitute", "swap text"],
+ display_name="Replace Text",
+ category="text",
inputs=[
io.String.Input("string", multiline=True),
io.String.Input("find", multiline=True),
@@ -163,9 +195,9 @@ class StringContains(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StringContains",
- search_aliases=["Contains", "text includes", "string includes"],
- display_name="Text Contains",
- category="utils/string",
+ search_aliases=["contains", "text includes", "string includes"],
+ display_name="Contains Text",
+ category="text",
inputs=[
io.String.Input("string", multiline=True),
io.String.Input("substring", multiline=True),
@@ -191,9 +223,9 @@ class StringCompare(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="StringCompare",
- search_aliases=["Compare", "text match", "string equals", "starts with", "ends with"],
- display_name="Text Compare",
- category="utils/string",
+ search_aliases=["compare", "text match", "string equals", "starts with", "ends with"],
+ display_name="Compare Text",
+ category="text",
inputs=[
io.String.Input("string_a", multiline=True),
io.String.Input("string_b", multiline=True),
@@ -227,9 +259,9 @@ class RegexMatch(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="RegexMatch",
- search_aliases=["Regex Match", "regex", "pattern match", "text contains", "string match"],
- display_name="Text Match",
- category="utils/string",
+ search_aliases=["regex match", "regex", "pattern match", "text contains", "string match"],
+ display_name="Match Text",
+ category="text",
inputs=[
io.String.Input("string", multiline=True),
io.String.Input("regex_pattern", multiline=True),
@@ -268,9 +300,9 @@ class RegexExtract(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="RegexExtract",
- search_aliases=["Regex Extract", "regex", "pattern extract", "text parser", "parse text"],
- display_name="Text Extract Substring",
- category="utils/string",
+ search_aliases=["regex extract", "regex", "pattern extract", "text parser", "parse text"],
+ display_name="Extract Text",
+ category="text",
inputs=[
io.String.Input("string", multiline=True),
io.String.Input("regex_pattern", multiline=True),
@@ -343,9 +375,9 @@ class RegexReplace(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="RegexReplace",
- search_aliases=["Regex Replace", "regex", "pattern replace", "regex replace", "substitution"],
- display_name="Text Replace (Regex)",
- category="utils/string",
+ search_aliases=["regex replace", "regex", "pattern replace", "substitution"],
+ display_name="Replace Text (Regex)",
+ category="text",
description="Find and replace text using regex patterns.",
inputs=[
io.String.Input("string", multiline=True),
@@ -375,10 +407,44 @@ class RegexReplace(io.ComfyNode):
return io.NodeOutput(result)
+class JsonExtractString(io.ComfyNode):
+    """Node that looks up one top-level key in a JSON document and outputs it as text."""
+
+    @classmethod
+    def define_schema(cls):
+        """Declare the node: a multiline JSON string, a single-line key and one string output."""
+        json_input = io.String.Input("json_string", multiline=True)
+        key_input = io.String.Input("key", multiline=False)
+        return io.Schema(
+            node_id="JsonExtractString",
+            display_name="Extract Text from JSON",
+            category="text",
+            search_aliases=["json", "extract json", "parse json", "json value", "read json"],
+            inputs=[json_input, key_input],
+            outputs=[io.String.Output()],
+        )
+
+    @classmethod
+    def execute(cls, json_string, key):
+        """Return str() of the value stored under key, or "" when it cannot be read.
+
+        "" is produced when json_string fails to parse, the parsed document is
+        not a dict, the key is absent, or the stored value is null.
+        """
+        try:
+            parsed = json.loads(json_string)
+            # dict.get folds "key missing" and "value is null" into one None check.
+            found = parsed.get(key) if isinstance(parsed, dict) else None
+            return io.NodeOutput("" if found is None else str(found))
+        except (json.JSONDecodeError, TypeError):
+            return io.NodeOutput("")
+
class StringExtension(ComfyExtension):
@override
async def get_node_list(self) -> list[type[io.ComfyNode]]:
return [
+ StringFormat,
StringConcatenate,
StringSubstring,
StringLength,
@@ -390,6 +456,7 @@ class StringExtension(ComfyExtension):
RegexMatch,
RegexExtract,
RegexReplace,
+ JsonExtractString,
]
async def comfy_entrypoint() -> StringExtension:
diff --git a/comfy_extras/nodes_textgen.py b/comfy_extras/nodes_textgen.py
index f1aeb63fa..d52faf815 100644
--- a/comfy_extras/nodes_textgen.py
+++ b/comfy_extras/nodes_textgen.py
@@ -26,15 +26,19 @@ class TextGenerate(io.ComfyNode):
return io.Schema(
node_id="TextGenerate",
- category="textgen",
+ display_name="Generate Text",
+ category="text",
search_aliases=["LLM", "gemma"],
inputs=[
io.Clip.Input("clip"),
io.String.Input("prompt", multiline=True, dynamic_prompts=True, default=""),
io.Image.Input("image", optional=True),
+ io.Image.Input("video", optional=True, tooltip="Video frames as image batch. Assumed to be 24 FPS; subsampled to 1 FPS internally."),
+ io.Audio.Input("audio", optional=True),
io.Int.Input("max_length", default=256, min=1, max=2048),
io.DynamicCombo.Input("sampling_mode", options=sampling_options, display_name="Sampling Mode"),
io.Boolean.Input("thinking", optional=True, default=False, tooltip="Operate in thinking mode if the model supports it."),
+ io.Boolean.Input("use_default_template", optional=True, default=True, tooltip="Use the built in system prompt/template if the model has one.", advanced=True),
],
outputs=[
io.String.Output(display_name="generated_text"),
@@ -42,9 +46,9 @@ class TextGenerate(io.ComfyNode):
)
@classmethod
- def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
+ def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True, video=None, audio=None) -> io.NodeOutput:
- tokens = clip.tokenize(prompt, image=image, skip_template=False, min_length=1, thinking=thinking)
+ tokens = clip.tokenize(prompt, image=image, skip_template=not use_default_template, min_length=1, thinking=thinking, video=video, audio=audio)
# Get sampling parameters from dynamic combo
do_sample = sampling_mode.get("sampling_mode") == "on"
@@ -69,7 +73,8 @@ class TextGenerate(io.ComfyNode):
seed=seed
)
- generated_text = clip.decode(generated_ids, skip_special_tokens=True)
+ generated_text = clip.decode(generated_ids)
+
return io.NodeOutput(generated_text)
@@ -153,6 +158,7 @@ class TextGenerateLTX2Prompt(TextGenerate):
parent_schema = super().define_schema()
return io.Schema(
node_id="TextGenerateLTX2Prompt",
+ display_name="Generate LTX2 Prompt",
category=parent_schema.category,
inputs=parent_schema.inputs,
outputs=parent_schema.outputs,
@@ -160,12 +166,12 @@ class TextGenerateLTX2Prompt(TextGenerate):
)
@classmethod
- def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False) -> io.NodeOutput:
+ def execute(cls, clip, prompt, max_length, sampling_mode, image=None, thinking=False, use_default_template=True, video=None, audio=None) -> io.NodeOutput:
if image is None:
formatted_prompt = f"system\n{LTX2_T2V_SYSTEM_PROMPT.strip()}\nuser\nUser Raw Input Prompt: {prompt}.\nmodel\n"
else:
formatted_prompt = f"system\n{LTX2_I2V_SYSTEM_PROMPT.strip()}\nuser\n\n\n\nUser Raw Input Prompt: {prompt}.\nmodel\n"
- return super().execute(clip, formatted_prompt, max_length, sampling_mode, image, thinking)
+ return super().execute(clip, formatted_prompt, max_length, sampling_mode, image=image, thinking=thinking, use_default_template=use_default_template, video=video, audio=audio)
class TextgenExtension(ComfyExtension):
diff --git a/comfy_extras/nodes_tomesd.py b/comfy_extras/nodes_tomesd.py
index 87bf29b8f..3667fac3a 100644
--- a/comfy_extras/nodes_tomesd.py
+++ b/comfy_extras/nodes_tomesd.py
@@ -151,7 +151,7 @@ class TomePatchModel(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="TomePatchModel",
- category="model_patches/unet",
+ category="model/patch/unet",
inputs=[
io.Model.Input("model"),
io.Float.Input("ratio", default=0.3, min=0.0, max=1.0, step=0.01),
diff --git a/comfy_extras/nodes_toolkit.py b/comfy_extras/nodes_toolkit.py
index 71faf7226..9f709bbe3 100644
--- a/comfy_extras/nodes_toolkit.py
+++ b/comfy_extras/nodes_toolkit.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
from typing_extensions import override
from comfy_api.latest import ComfyExtension, io
@@ -14,7 +13,7 @@ class CreateList(io.ComfyNode):
return io.Schema(
node_id="CreateList",
display_name="Create List",
- category="logic",
+ category="utilities",
is_input_list=True,
search_aliases=["Image Iterator", "Text Iterator", "Iterator"],
inputs=[io.Autogrow.Input("inputs", template=template_autogrow)],
diff --git a/comfy_extras/nodes_torch_compile.py b/comfy_extras/nodes_torch_compile.py
index c9e2e0026..d4506b1a9 100644
--- a/comfy_extras/nodes_torch_compile.py
+++ b/comfy_extras/nodes_torch_compile.py
@@ -10,7 +10,7 @@ class TorchCompileModel(io.ComfyNode):
def define_schema(cls) -> io.Schema:
return io.Schema(
node_id="TorchCompileModel",
- category="_for_testing",
+ category="experimental",
inputs=[
io.Model.Input("model"),
io.Combo.Input(
diff --git a/comfy_extras/nodes_train.py b/comfy_extras/nodes_train.py
index 0616dfc2d..046eeaaf5 100644
--- a/comfy_extras/nodes_train.py
+++ b/comfy_extras/nodes_train.py
@@ -951,7 +951,7 @@ class TrainLoraNode(io.ComfyNode):
return io.Schema(
node_id="TrainLoraNode",
display_name="Train LoRA",
- category="training",
+ category="model/training",
is_experimental=True,
is_input_list=True, # All inputs become lists
inputs=[
@@ -1309,7 +1309,7 @@ class LoraModelLoader(io.ComfyNode):
return io.Schema(
node_id="LoraModelLoader",
display_name="Load LoRA Model",
- category="loaders",
+ category="model/loaders",
is_experimental=True,
inputs=[
io.Model.Input(
@@ -1361,7 +1361,7 @@ class SaveLoRA(io.ComfyNode):
node_id="SaveLoRA",
search_aliases=["export lora"],
display_name="Save LoRA Weights",
- category="loaders",
+ category="advanced/model_merging",
is_experimental=True,
is_output_node=True,
inputs=[
@@ -1405,7 +1405,7 @@ class LossGraphNode(io.ComfyNode):
node_id="LossGraphNode",
search_aliases=["training chart", "training visualization", "plot loss"],
display_name="Plot Loss Graph",
- category="training",
+ category="model/training",
is_experimental=True,
is_output_node=True,
inputs=[
diff --git a/comfy_extras/nodes_upscale_model.py b/comfy_extras/nodes_upscale_model.py
index d3ee3f1c1..1cf5a5d01 100644
--- a/comfy_extras/nodes_upscale_model.py
+++ b/comfy_extras/nodes_upscale_model.py
@@ -22,7 +22,7 @@ class UpscaleModelLoader(io.ComfyNode):
return io.Schema(
node_id="UpscaleModelLoader",
display_name="Load Upscale Model",
- category="loaders",
+ category="model/loaders",
inputs=[
io.Combo.Input("model_name", options=folder_paths.get_filename_list("upscale_models")),
],
diff --git a/comfy_extras/nodes_video.py b/comfy_extras/nodes_video.py
index 5c096c232..ae1d826d5 100644
--- a/comfy_extras/nodes_video.py
+++ b/comfy_extras/nodes_video.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
import os
import av
import torch
@@ -17,7 +15,8 @@ class SaveWEBM(io.ComfyNode):
return io.Schema(
node_id="SaveWEBM",
search_aliases=["export webm"],
- category="image/video",
+ display_name="Save WEBM",
+ category="video",
is_experimental=True,
inputs=[
io.Image.Input("images"),
@@ -72,7 +71,7 @@ class SaveVideo(io.ComfyNode):
node_id="SaveVideo",
search_aliases=["export video"],
display_name="Save Video",
- category="image/video",
+ category="video",
essentials_category="Basics",
description="Saves the input images to your ComfyUI output directory.",
inputs=[
@@ -121,7 +120,8 @@ class CreateVideo(io.ComfyNode):
node_id="CreateVideo",
search_aliases=["images to video"],
display_name="Create Video",
- category="image/video",
+ category="video",
+ essentials_category="Video Tools",
description="Create a video from images.",
inputs=[
io.Image.Input("images", tooltip="The images to create a video from."),
@@ -146,7 +146,7 @@ class GetVideoComponents(io.ComfyNode):
node_id="GetVideoComponents",
search_aliases=["extract frames", "split video", "video to images", "demux"],
display_name="Get Video Components",
- category="image/video",
+ category="video",
description="Extracts all components from a video: frames, audio, and framerate.",
inputs=[
io.Video.Input("video", tooltip="The video to extract components from."),
@@ -174,7 +174,7 @@ class LoadVideo(io.ComfyNode):
node_id="LoadVideo",
search_aliases=["import video", "open video", "video file"],
display_name="Load Video",
- category="image/video",
+ category="video",
essentials_category="Basics",
inputs=[
io.Combo.Input("file", options=sorted(files), upload=io.UploadType.video),
@@ -216,7 +216,7 @@ class VideoSlice(io.ComfyNode):
"frame load cap",
"start time",
],
- category="image/video",
+ category="video",
essentials_category="Video Tools",
inputs=[
io.Video.Input("video"),
diff --git a/comfy_extras/nodes_video_model.py b/comfy_extras/nodes_video_model.py
index bf98e6b82..0d6cae6a8 100644
--- a/comfy_extras/nodes_video_model.py
+++ b/comfy_extras/nodes_video_model.py
@@ -15,7 +15,7 @@ class ImageOnlyCheckpointLoader:
RETURN_TYPES = ("MODEL", "CLIP_VISION", "VAE")
FUNCTION = "load_checkpoint"
- CATEGORY = "loaders/video_models"
+ CATEGORY = "model/loaders"
def load_checkpoint(self, ckpt_name, output_vae=True, output_clip=True):
ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
@@ -41,7 +41,7 @@ class SVD_img2vid_Conditioning:
FUNCTION = "encode"
- CATEGORY = "conditioning/video_models"
+ CATEGORY = "model/conditioning/video_models"
def encode(self, clip_vision, init_image, vae, width, height, video_frames, motion_bucket_id, fps, augmentation_level):
output = clip_vision.encode_image(init_image)
@@ -65,7 +65,7 @@ class VideoLinearCFGGuidance:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
- CATEGORY = "sampling/video_models"
+ CATEGORY = "model/sampling/guiders"
def patch(self, model, min_cfg):
def linear_cfg(args):
@@ -89,7 +89,7 @@ class VideoTriangleCFGGuidance:
RETURN_TYPES = ("MODEL",)
FUNCTION = "patch"
- CATEGORY = "sampling/video_models"
+ CATEGORY = "model/sampling/guiders"
def patch(self, model, min_cfg):
def linear_cfg(args):
@@ -138,7 +138,7 @@ class ConditioningSetAreaPercentageVideo:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def append(self, conditioning, width, height, temporal, x, y, z, strength):
c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", temporal, height, width, z, y, x),
@@ -157,5 +157,7 @@ NODE_CLASS_MAPPINGS = {
}
NODE_DISPLAY_NAME_MAPPINGS = {
- "ImageOnlyCheckpointLoader": "Image Only Checkpoint Loader (img2vid model)",
+ "ImageOnlyCheckpointLoader": "Load Checkpoint Image Only (img2vid model)",
+ "VideoLinearCFGGuidance": "Video Linear CFG Guidance",
+ "VideoTriangleCFGGuidance": "Video Triangle CFG Guidance",
}
diff --git a/comfy_extras/nodes_void.py b/comfy_extras/nodes_void.py
new file mode 100644
index 000000000..b43154b8d
--- /dev/null
+++ b/comfy_extras/nodes_void.py
@@ -0,0 +1,484 @@
+import logging
+
+import torch
+
+import comfy
+import comfy.model_management
+import comfy.model_patcher
+import comfy.samplers
+import comfy.utils
+import folder_paths
+import node_helpers
+import nodes
+from comfy.utils import model_trange as trange
+from comfy_api.latest import ComfyExtension, io
+from torchvision.models.optical_flow import raft_large
+from typing_extensions import override
+
+
+from comfy_extras.void_noise_warp import RaftOpticalFlow, get_noise_from_video
+
+# Custom "OPTICAL_FLOW" socket type: produced by OpticalFlowLoader as its
+# output and consumed by VOIDWarpedNoise as its "optical_flow" input.
+OpticalFlow = io.Custom("OPTICAL_FLOW")
+
+# Temporal compression used to derive the latent frame count from the pixel
+# frame count: latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1.
+TEMPORAL_COMPRESSION = 4
+# Temporal patch size; latent_t has to be a multiple of this value.
+PATCH_SIZE_T = 2
+
+
+def _valid_void_length(length: int) -> int:
+    """Snap ``length`` to a frame count whose latent_t is a multiple of PATCH_SIZE_T.
+
+    VOID / CogVideoX-Fun-V1.5 uses patch_size_t=2, so the VAE-encoded latent
+    must have an even temporal dimension. If latent_t is odd, the transformer
+    pad_to_patch_size circular-wraps an extra latent frame onto the end; after
+    the post-transformer crop the last real latent frame has been influenced
+    by the wrapped phantom frame, producing visible jitter and "disappearing"
+    subjects near the end of the decoded video.
+
+    Args:
+        length: Requested number of pixel frames. The node schemas in this
+            file only allow values >= 1.
+
+    Returns:
+        ``length`` itself when its latent_t is already a multiple of
+        PATCH_SIZE_T. Otherwise latent_t is lowered to the previous multiple
+        and the smallest frame count producing that latent_t is returned,
+        which is smaller than the input (e.g. 49 -> 45, 9 -> 5).
+        Exception: for ``length`` in 1..4 latent_t is 1 and the smallest
+        usable latent_t is PATCH_SIZE_T, so the result is 5, i.e. *larger*
+        than the input.
+    """
+    latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1
+    remainder = latent_t % PATCH_SIZE_T
+    if remainder == 0:
+        return length
+    # Drop the remainder to reach the previous multiple of PATCH_SIZE_T, but
+    # never go below one full temporal patch (latent_t == PATCH_SIZE_T); this
+    # clamp is what turns lengths 1..4 into 5 instead of a non-positive value.
+    target_latent_t = max(PATCH_SIZE_T, latent_t - remainder)
+    # Invert latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1, taking the
+    # smallest length that maps to target_latent_t.
+    return (target_latent_t - 1) * TEMPORAL_COMPRESSION + 1
+
+
+class OpticalFlowLoader(io.ComfyNode):
+    """Node that loads an optical-flow checkpoint from the ``optical_flow`` model folder.
+
+    The only accepted format is a torchvision RAFT-large state dict (the model
+    consumed by VOIDWarpedNoise). The file is resolved through ``folder_paths``
+    and read from disk; nothing in this node downloads weights.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        # Single combo listing every file registered under "optical_flow".
+        model_name_input = io.Combo.Input(
+            "model_name",
+            options=folder_paths.get_filename_list("optical_flow"),
+            tooltip=(
+                "Optical flow model to load. Files must be placed in the "
+                "'optical_flow' folder. Today only torchvision's "
+                "raft_large.pth is supported."
+            ),
+        )
+        return io.Schema(
+            node_id="OpticalFlowLoader",
+            display_name="Load Optical Flow Model",
+            category="model/loaders",
+            inputs=[model_name_input],
+            outputs=[OpticalFlow.Output()],
+        )
+
+    @classmethod
+    def execute(cls, model_name) -> io.NodeOutput:
+        """Load ``model_name`` and wrap it in a ModelPatcher.
+
+        Raises:
+            ValueError: if the state dict lacks any of the three key prefixes
+                that identify a torchvision RAFT-large checkpoint.
+        """
+        checkpoint_path = folder_paths.get_full_path_or_raise("optical_flow", model_name)
+        state_dict = comfy.utils.load_torch_file(checkpoint_path, safe_load=True)
+
+        # A RAFT-large checkpoint must contain at least one key under each prefix.
+        required_prefixes = ("feature_encoder.", "context_encoder.", "update_block.")
+        looks_like_raft = all(
+            any(key.startswith(prefix) for key in state_dict)
+            for prefix in required_prefixes
+        )
+        if not looks_like_raft:
+            raise ValueError(
+                "Unrecognized optical flow model format: expected a torchvision "
+                "RAFT-large state dict with 'feature_encoder.', 'context_encoder.' "
+                "and 'update_block.' prefixes."
+            )
+
+        # Build the architecture without pretrained weights, then fill it from
+        # the checkpoint and switch to float32 inference mode.
+        raft = raft_large(weights=None, progress=False)
+        raft.load_state_dict(state_dict)
+        raft.eval()
+        raft.to(torch.float32)
+
+        return io.NodeOutput(
+            comfy.model_patcher.ModelPatcher(
+                raft,
+                load_device=comfy.model_management.get_torch_device(),
+                offload_device=comfy.model_management.unet_offload_device(),
+            )
+        )
+
+
+class VOIDQuadmaskPreprocess(io.ComfyNode):
+    """Preprocess a quadmask video for VOID inpainting.
+
+    Quantizes mask values to four semantic levels, inverts, and normalizes:
+      0   -> primary object to remove
+      63  -> overlap of primary + affected
+      127 -> affected region (interactions)
+      255 -> background (keep)
+
+    After inversion and normalization, the output mask has values in [0, 1]
+    with four discrete levels: 1.0 (remove), ~0.75, ~0.50, 0.0 (keep).
+    """
+
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="VOIDQuadmaskPreprocess",
+            display_name="VOID Quadmask Preprocessor",
+            category="image/mask",
+            inputs=[
+                io.Mask.Input("mask"),
+                io.Int.Input("dilate_width", default=0, min=0, max=50, step=1,
+                             tooltip="Dilation radius for the primary mask region (0 = no dilation)"),
+            ],
+            outputs=[
+                io.Mask.Output(display_name="quadmask"),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, mask, dilate_width=0) -> io.NodeOutput:
+        """Quantize ``mask`` to the four quadmask levels and return it inverted in [0, 1].
+
+        Args:
+            mask: Mask tensor. If its maximum is <= 1.0 it is treated as
+                normalized and rescaled by 255 before thresholding.
+            dilate_width: Radius of the optional dilation; 0 disables it.
+
+        Returns:
+            NodeOutput holding ``(255 - quantized) / 255``.
+        """
+        m = mask.clone()
+
+        # Work in the 0..255 range regardless of how the input was scaled.
+        if m.max() <= 1.0:
+            m = m * 255.0
+
+        if dilate_width > 0 and m.ndim >= 3:
+            # NOTE(review): "< 128" selects every pixel that will quantize to
+            # 0, 63 or 127, not only the primary (0) level -- confirm this
+            # matches the intended "primary mask region" dilation.
+            binary = (m < 128).float()
+            kernel_size = dilate_width * 2 + 1
+            # max_pool2d wants a channel dim. Remember whether we added one so
+            # that only our own dim is removed afterwards; squeezing a 4-D
+            # input's existing size-1 channel would change the rank and make
+            # the torch.where below broadcast to a wrong shape.
+            added_channel_dim = binary.ndim == 3
+            if added_channel_dim:
+                binary = binary.unsqueeze(1)
+            # Stride 1 with padding == radius keeps the spatial size unchanged.
+            dilated = torch.nn.functional.max_pool2d(
+                binary, kernel_size=kernel_size, stride=1, padding=dilate_width
+            )
+            if added_channel_dim:
+                dilated = dilated.squeeze(1)
+            # Every pixel reached by the dilation is forced to level 0.
+            m = torch.where(dilated > 0.5, torch.zeros_like(m), m)
+
+        # Snap to the four levels; the bins are (-inf, 31], (31, 95],
+        # (95, 191] and (191, inf).
+        m = torch.where(m <= 31, torch.zeros_like(m), m)
+        m = torch.where((m > 31) & (m <= 95), torch.full_like(m, 63), m)
+        m = torch.where((m > 95) & (m <= 191), torch.full_like(m, 127), m)
+        m = torch.where(m > 191, torch.full_like(m, 255), m)
+
+        # Invert and normalize: level 0 becomes 1.0, level 255 becomes 0.0.
+        m = (255.0 - m) / 255.0
+
+        return io.NodeOutput(m)
+
+
+class VOIDInpaintConditioning(io.ComfyNode):
+    """Assemble the concat conditioning used for VOID inpainting with CogVideoX.
+
+    The inverted quadmask and the masked source video are each run through
+    ``vae.encode``; both results are concatenated along dim 1 and stored as
+    ``concat_latent_image`` on the positive and negative conditioning. The
+    original author describes this as 16 mask + 16 masked-video channels that
+    the model joins with the 16-channel noise latent. A zero latent with 16
+    channels is returned alongside the conditionings.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        node_inputs = [
+            io.Conditioning.Input("positive"),
+            io.Conditioning.Input("negative"),
+            io.Vae.Input("vae"),
+            io.Image.Input("video", tooltip="Source video frames [T, H, W, 3]"),
+            io.Mask.Input("quadmask", tooltip="Preprocessed quadmask from VOIDQuadmaskPreprocess [T, H, W]"),
+            io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8),
+            io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8),
+            io.Int.Input("length", default=45, min=1, max=nodes.MAX_RESOLUTION, step=1,
+                         tooltip="Number of pixel frames to process. For CogVideoX-Fun-V1.5 "
+                                 "(patch_size_t=2), latent_t must be even — lengths that "
+                                 "produce odd latent_t are rounded down (e.g. 49 → 45)."),
+            io.Int.Input("batch_size", default=1, min=1, max=64),
+        ]
+        node_outputs = [
+            io.Conditioning.Output(display_name="positive"),
+            io.Conditioning.Output(display_name="negative"),
+            io.Latent.Output(display_name="latent"),
+        ]
+        return io.Schema(
+            node_id="VOIDInpaintConditioning",
+            category="model/conditioning/video_models",
+            inputs=node_inputs,
+            outputs=node_outputs,
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, video, quadmask,
+                width, height, length, batch_size) -> io.NodeOutput:
+        """Encode mask + masked video and attach them to both conditionings.
+
+        Returns:
+            NodeOutput of (positive, negative, {"samples": zero latent}).
+        """
+        requested_length = length
+        length = _valid_void_length(requested_length)
+        if length != requested_length:
+            logging.warning(
+                "VOIDInpaintConditioning: rounding length %d down to %d so that "
+                "latent_t is even (required by CogVideoX-Fun-V1.5 patch_size_t=2). "
+                "Using odd latent_t causes the last frame to be corrupted by "
+                "circular padding.", requested_length, length,
+            )
+
+        latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1
+        latent_h, latent_w = height // 8, width // 8
+
+        def _resize_channels_last(frames):
+            # common_upscale works channels-first, so move the last dim to
+            # position 1 for the call and back afterwards.
+            resized = comfy.utils.common_upscale(
+                frames.movedim(-1, 1), width, height, "bilinear", "center"
+            )
+            return resized.movedim(1, -1)
+
+        def _fit_latent_frames(lat, target_t):
+            # Trim surplus frames on dim 2, or pad by repeating the last one.
+            current_t = lat.shape[2]
+            if current_t > target_t:
+                return lat[:, :, :target_t]
+            if current_t < target_t:
+                tail = lat[:, :, -1:].repeat(1, 1, target_t - current_t, 1, 1)
+                return torch.cat([lat, tail], dim=2)
+            return lat
+
+        vid = _resize_channels_last(video[:length])
+
+        # A 3-D mask gets a trailing channel dim for the resize; a trailing
+        # size-1 channel is dropped again afterwards.
+        qm = quadmask[:length]
+        if qm.ndim == 3:
+            qm = qm.unsqueeze(-1)
+        qm = _resize_channels_last(qm)
+        if qm.ndim == 4 and qm.shape[-1] == 1:
+            qm = qm.squeeze(-1)
+
+        # Broadcast a 3-D mask to three channels so it lines up with RGB.
+        mask_3ch = qm.unsqueeze(-1).expand(-1, -1, -1, 3) if qm.ndim == 3 else qm
+
+        # 1 - mask serves both as the encoded mask image and as the per-pixel
+        # weight that blanks out the masked area of the video.
+        inverted_mask_3ch = 1.0 - mask_3ch
+        masked_video = vid[:, :, :, :3] * inverted_mask_3ch
+
+        mask_latents = _fit_latent_frames(vae.encode(inverted_mask_3ch), latent_t)
+        masked_video_latents = _fit_latent_frames(vae.encode(masked_video), latent_t)
+
+        inpaint_latents = torch.cat([mask_latents, masked_video_latents], dim=1)
+
+        # No explicit scaling needed here: the model's CogVideoX.concat_cond()
+        # applies process_latent_in (×latent_format.scale_factor) to each 16-ch
+        # block of the stored conditioning. For 5b-class checkpoints (incl. the
+        # VOID/CogVideoX-Fun-V1.5 inpainting model) that scale_factor is auto-
+        # selected as 0.7 in supported_models.CogVideoX_T2V, which matches the
+        # diffusers vae/config.json scaling_factor VOID was trained with.
+        concat_values = {"concat_latent_image": inpaint_latents}
+        positive = node_helpers.conditioning_set_values(positive, concat_values)
+        negative = node_helpers.conditioning_set_values(
+            negative, {"concat_latent_image": inpaint_latents}
+        )
+
+        empty_latent = torch.zeros(
+            [batch_size, 16, latent_t, latent_h, latent_w],
+            device=comfy.model_management.intermediate_device()
+        )
+
+        return io.NodeOutput(positive, negative, {"samples": empty_latent})
+
+
+class VOIDWarpedNoise(io.ComfyNode):
+ """Generate optical-flow warped noise for VOID Pass 2 refinement.
+
+ Takes the Pass 1 output video and produces temporally-correlated noise
+ by warping Gaussian noise along optical flow vectors. This noise is used
+ as the initial latent for Pass 2, resulting in better temporal consistency.
+ """
+
+ @classmethod
+ def define_schema(cls):
+ return io.Schema(
+ node_id="VOIDWarpedNoise",
+ category="model/latent/video",
+ inputs=[
+ OpticalFlow.Input(
+ "optical_flow",
+ tooltip="Optical flow model from OpticalFlowLoader (RAFT-large).",
+ ),
+ io.Image.Input("video", tooltip="Pass 1 output video frames [T, H, W, 3]"),
+ io.Int.Input("width", default=672, min=16, max=nodes.MAX_RESOLUTION, step=8),
+ io.Int.Input("height", default=384, min=16, max=nodes.MAX_RESOLUTION, step=8),
+ io.Int.Input("length", default=45, min=1, max=nodes.MAX_RESOLUTION, step=1,
+ tooltip="Number of pixel frames. Rounded down to make latent_t "
+ "even (patch_size_t=2 requirement), e.g. 49 → 45."),
+ io.Int.Input("batch_size", default=1, min=1, max=64),
+ ],
+ outputs=[
+ io.Latent.Output(display_name="warped_noise"),
+ ],
+ )
+
+ @classmethod
+ def execute(cls, optical_flow, video, width, height, length, batch_size) -> io.NodeOutput:
+
+ # Snap the requested frame count with _valid_void_length and log a
+ # warning whenever the value actually used differs from the request.
+ adjusted_length = _valid_void_length(length)
+ if adjusted_length != length:
+ logging.warning(
+ "VOIDWarpedNoise: rounding length %d down to %d so that "
+ "latent_t is even (required by CogVideoX-Fun-V1.5 patch_size_t=2).",
+ length, adjusted_length,
+ )
+ length = adjusted_length
+
+ # Target latent grid: frame count from the temporal compression formula,
+ # spatial size is the pixel size integer-divided by 8.
+ latent_t = ((length - 1) // TEMPORAL_COMPRESSION) + 1
+ latent_h = height // 8
+ latent_w = width // 8
+
+ # RAFT + noise warp is real compute, not an "intermediate" buffer, so
+ # we want the actual torch device (CUDA/MPS). The final latent is
+ # moved back to intermediate_device() before returning to match the
+ # rest of the ComfyUI pipeline.
+ device = comfy.model_management.get_torch_device()
+
+ # optical_flow is the ModelPatcher produced by OpticalFlowLoader; load it
+ # before handing its wrapped model to the RAFT helper.
+ comfy.model_management.load_model_gpu(optical_flow)
+ raft = RaftOpticalFlow(optical_flow.model, device=device)
+
+ # Slicing keeps at most `length` frames (fewer if the video is shorter);
+ # the frames are then resized to (width, height) via common_upscale.
+ vid = video[:length].to(device)
+ vid = comfy.utils.common_upscale(
+ vid.movedim(-1, 1), width, height, "bilinear", "center"
+ ).movedim(1, -1)
+ # Clamp to [0, 1], scale to 0..255 and truncate to uint8.
+ vid_uint8 = (vid.clamp(0, 1) * 255).to(torch.uint8)
+
+ # FRAME = 0.5 and FLOW = 8, so the downscale_factor passed below is
+ # round(0.5 * 8) * 8 = 32.
+ # NOTE(review): the exact meaning of resize_frames / resize_flow is
+ # defined by get_noise_from_video, which is not visible here.
+ FRAME = 2**-1
+ FLOW = 2**3
+ LATENT_SCALE = 8
+
+ warped = get_noise_from_video(
+ vid_uint8,
+ raft,
+ noise_channels=16,
+ resize_frames=FRAME,
+ resize_flow=FLOW,
+ downscale_factor=round(FRAME * FLOW) * LATENT_SCALE,
+ device=device,
+ )
+
+ # If the first axis does not already have latent_t entries, pick latent_t
+ # evenly spaced indices along it (linspace truncated to integers).
+ if warped.shape[0] != latent_t:
+ indices = torch.linspace(0, warped.shape[0] - 1, latent_t,
+ device=device).long()
+ warped = warped[indices]
+
+ # Resize spatially only when the helper's output does not match the
+ # latent grid computed above.
+ if warped.shape[1] != latent_h or warped.shape[2] != latent_w:
+ # (T, H, W, C) → (T, C, H, W) → bilinear resize → back
+ warped = warped.permute(0, 3, 1, 2)
+ warped = torch.nn.functional.interpolate(
+ warped, size=(latent_h, latent_w),
+ mode="bilinear", align_corners=False,
+ )
+ warped = warped.permute(0, 2, 3, 1)
+
+ # (T, H, W, C) → (B, C, T, H, W)
+ warped_tensor = warped.permute(3, 0, 1, 2).unsqueeze(0)
+ # Every batch element receives a copy of the same warped noise.
+ if batch_size > 1:
+ warped_tensor = warped_tensor.repeat(batch_size, 1, 1, 1, 1)
+
+ warped_tensor = warped_tensor.to(comfy.model_management.intermediate_device())
+ return io.NodeOutput({"samples": warped_tensor})
+
+
+class Noise_FromLatent:
+    """NOISE-source adapter around an already computed LATENT dict.
+
+    Exposes the ``seed`` attribute and ``generate_noise`` method so that a
+    fixed tensor can be supplied where a noise source is expected.
+    """
+
+    def __init__(self, latent_dict):
+        # Only the "samples" entry of the LATENT dict is retained.
+        self._samples = latent_dict["samples"]
+        # The stored tensor is returned as-is, so the seed is a constant 0.
+        self.seed = 0
+
+    def generate_noise(self, input_latent):
+        """Return a CPU copy of the stored samples; ``input_latent`` is ignored."""
+        noise_copy = self._samples.clone()
+        return noise_copy.cpu()
+
+
+class VOIDWarpedNoiseSource(io.ComfyNode):
+    """Expose a LATENT (e.g. the output of VOIDWarpedNoise) as a NOISE source.
+
+    Intended for samplers that take a NOISE input, such as
+    SamplerCustomAdvanced.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        latent_input = io.Latent.Input(
+            "warped_noise",
+            tooltip="Warped noise latent from VOIDWarpedNoise",
+        )
+        return io.Schema(
+            node_id="VOIDWarpedNoiseSource",
+            category="model/sampling/noise",
+            inputs=[latent_input],
+            outputs=[io.Noise.Output()],
+        )
+
+    @classmethod
+    def execute(cls, warped_noise) -> io.NodeOutput:
+        # Wrap the latent dict; its "samples" tensor is replayed as the noise.
+        noise_source = Noise_FromLatent(warped_noise)
+        return io.NodeOutput(noise_source)
+
+
+class VOID_DDIM(comfy.samplers.Sampler):
+    """DDIM sampling loop for VOID inpainting models.
+
+    Per the original author's notes, VOID was trained with the diffusers
+    CogVideoXDDIMScheduler, which works in alpha-space (input std ≈ 1), and
+    the noise_scaling step of the standard KSampler path is incompatible with
+    that. This sampler therefore feeds ``noise`` to the model unscaled and
+    applies the DDIM update itself, deriving alpha from sigma as
+    ``alpha = 1 / (1 + sigma^2)``.
+
+    ``latent_image`` and ``denoise_mask`` are accepted but not used.
+    """
+
+    def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
+        model_options = extra_args.get("model_options", {})
+        seed = extra_args.get("seed", None)
+
+        x = noise.to(torch.float32)
+        # Vector of ones with one entry per batch element; broadcasts sigma.
+        s_in = x.new_ones([x.shape[0]])
+        total_steps = len(sigmas) - 1
+
+        for step in trange(total_steps, disable=disable_pbar):
+            sigma, sigma_next = sigmas[step], sigmas[step + 1]
+
+            denoised = model_wrap(x, sigma * s_in, model_options=model_options, seed=seed)
+
+            if callback is not None:
+                callback(step, denoised, x, total_steps)
+
+            # Reaching sigma 0 means the prediction itself is the result.
+            if sigma_next == 0:
+                x = denoised
+                continue
+
+            alpha_t = 1.0 / (1.0 + sigma ** 2)
+            alpha_prev = 1.0 / (1.0 + sigma_next ** 2)
+
+            # Recover the noise implied by x and the prediction, then re-mix
+            # prediction and noise at the next step's alpha.
+            pred_eps = (x - (alpha_t ** 0.5) * denoised) / (1.0 - alpha_t) ** 0.5
+            x = (alpha_prev ** 0.5) * denoised + (1.0 - alpha_prev) ** 0.5 * pred_eps
+
+        return x
+
+
+class VOIDSampler(io.ComfyNode):
+    """Node that outputs the VOID DDIM sampler as a SAMPLER.
+
+    Meant for SamplerCustom / SamplerCustomAdvanced together with VOID
+    inpainting models: the returned VOID_DDIM runs its own DDIM loop and
+    skips the noise_scaling of the standard KSampler path. Pair it with
+    RandomNoise or VOIDWarpedNoiseSource.
+    """
+
+    @classmethod
+    def define_schema(cls):
+        # No inputs: the sampler has nothing to configure.
+        schema = io.Schema(
+            node_id="VOIDSampler",
+            category="model/sampling/samplers",
+            inputs=[],
+            outputs=[io.Sampler.Output()],
+        )
+        return schema
+
+    @classmethod
+    def execute(cls) -> io.NodeOutput:
+        sampler = VOID_DDIM()
+        return io.NodeOutput(sampler)
+
+    # Alias under which the same classmethod is also reachable.
+    get_sampler = execute
+
+
+class VOIDExtension(ComfyExtension):
+    """Extension that registers every node defined in this module."""
+
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        # Same order as the definitions above.
+        node_classes: list[type[io.ComfyNode]] = [
+            OpticalFlowLoader,
+            VOIDQuadmaskPreprocess,
+            VOIDInpaintConditioning,
+            VOIDWarpedNoise,
+            VOIDWarpedNoiseSource,
+            VOIDSampler,
+        ]
+        return node_classes
+
+
+async def comfy_entrypoint() -> VOIDExtension:
+    """Build and return the extension object exposing the VOID nodes."""
+    extension = VOIDExtension()
+    return extension
diff --git a/comfy_extras/nodes_wan.py b/comfy_extras/nodes_wan.py
index e50bfcd2c..67d3a8443 100644
--- a/comfy_extras/nodes_wan.py
+++ b/comfy_extras/nodes_wan.py
@@ -18,7 +18,7 @@ class WanImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanImageToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -66,7 +66,7 @@ class WanFunControlToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanFunControlToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -119,7 +119,7 @@ class Wan22FunControlToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Wan22FunControlToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -184,7 +184,7 @@ class WanFirstLastFrameToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanFirstLastFrameToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -256,7 +256,7 @@ class WanFunInpaintToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanFunInpaintToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -288,7 +288,7 @@ class WanVaceToVideo(io.ComfyNode):
return io.Schema(
node_id="WanVaceToVideo",
search_aliases=["video conditioning", "video control"],
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -375,7 +375,7 @@ class TrimVideoLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="TrimVideoLatent",
- category="latent/video",
+ category="model/latent/video",
inputs=[
io.Latent.Input("samples"),
io.Int.Input("trim_amount", default=0, min=0, max=99999),
@@ -398,7 +398,7 @@ class WanCameraImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanCameraImageToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -452,7 +452,7 @@ class WanPhantomSubjectToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanPhantomSubjectToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -707,7 +707,7 @@ class WanTrackToVideo(io.ComfyNode):
return io.Schema(
node_id="WanTrackToVideo",
search_aliases=["motion tracking", "trajectory video", "point tracking", "keypoint animation"],
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -951,7 +951,7 @@ class WanSoundImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanSoundImageToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -984,7 +984,7 @@ class WanSoundImageToVideoExtend(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanSoundImageToVideoExtend",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -1046,7 +1046,7 @@ class WanHuMoImageToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanHuMoImageToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -1112,7 +1112,7 @@ class WanAnimateToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanAnimateToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
@@ -1252,7 +1252,7 @@ class Wan22ImageToVideoLatent(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="Wan22ImageToVideoLatent",
- category="conditioning/inpaint",
+ category="model/conditioning/inpaint",
inputs=[
io.Vae.Input("vae"),
io.Int.Input("width", default=1280, min=32, max=nodes.MAX_RESOLUTION, step=32),
@@ -1302,7 +1302,7 @@ class WanInfiniteTalkToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanInfiniteTalkToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.DynamicCombo.Input("mode", options=[
io.DynamicCombo.Option("single_speaker", []),
@@ -1461,7 +1461,7 @@ class WanSCAILToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanSCAILToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
diff --git a/comfy_extras/nodes_wandancer.py b/comfy_extras/nodes_wandancer.py
new file mode 100644
index 000000000..a96885745
--- /dev/null
+++ b/comfy_extras/nodes_wandancer.py
@@ -0,0 +1,971 @@
+import math
+import nodes
+import node_helpers
+import torch
+import torchaudio
+import comfy.model_management
+import comfy.utils
+import numpy as np
+import logging
+from typing_extensions import override
+from comfy_api.latest import ComfyExtension, io
+
+import scipy.signal
+import scipy.ndimage
+import scipy.fft
+import scipy.sparse
+
+# Audio Processing Functions - Derived from librosa (https://github.com/librosa/librosa)
+# Copyright (c) 2013--2023, librosa development team.
+
+def mel_to_hz(mels, htk=False):
+    """Map mel values (scalar or array) to Hz using the Slaney scale, or the HTK formula when ``htk`` is true."""
+    mel_vals = np.asanyarray(mels)
+    if htk:
+        return 700.0 * (10.0 ** (mel_vals / 2595.0) - 1.0)
+    base_hz, hz_per_mel = 0.0, 200.0 / 3
+    knee_hz = 1000.0  # the scale is linear below this frequency and logarithmic above it
+    knee_mel = (knee_hz - base_hz) / hz_per_mel
+    log_step = np.log(6.4) / 27.0
+    hz = base_hz + hz_per_mel * mel_vals  # linear mapping, overwritten above the knee
+    if mel_vals.ndim == 0:
+        if mel_vals >= knee_mel:
+            hz = knee_hz * np.exp(log_step * (mel_vals - knee_mel))
+        return hz
+    upper = mel_vals >= knee_mel
+    hz[upper] = knee_hz * np.exp(log_step * (mel_vals[upper] - knee_mel))
+    return hz
+
+def hz_to_mel(frequencies, htk=False):
+    """Map frequencies in Hz (scalar or array) to mel values using the Slaney scale, or the HTK formula when ``htk`` is true."""
+    hz = np.asanyarray(frequencies)
+    if htk:
+        return 2595.0 * np.log10(1.0 + hz / 700.0)
+    base_hz, hz_per_mel = 0.0, 200.0 / 3
+    knee_hz = 1000.0  # the scale is linear below this frequency and logarithmic above it
+    knee_mel = (knee_hz - base_hz) / hz_per_mel
+    log_step = np.log(6.4) / 27.0
+    mel_vals = (hz - base_hz) / hz_per_mel  # linear mapping, overwritten above the knee
+    if hz.ndim == 0:
+        if hz >= knee_hz:
+            mel_vals = knee_mel + np.log(hz / knee_hz) / log_step
+        return mel_vals
+    upper = hz >= knee_hz
+    mel_vals[upper] = knee_mel + np.log(hz[upper] / knee_hz) / log_step
+    return mel_vals
+
+def compute_cqt(y, sr=22050, hop_length=512, fmin=None, n_bins=84, bins_per_octave=12, tuning=0.0):
+    """Compute the magnitude Constant-Q Transform (CQT) of the 1-D signal ``y``; returns a float32 array of shape (n_bins, n_frames)."""
+
+    def _relative_bandwidth(freqs):  # per-bin relative bandwidth from the log2 spacing of neighbouring centre frequencies
+        bpo = np.empty_like(freqs)
+        logf = np.log2(freqs)
+        bpo[0] = 1.0 / (logf[1] - logf[0])  # one-sided difference at both ends
+        bpo[-1] = 1.0 / (logf[-1] - logf[-2])
+        bpo[1:-1] = 2.0 / (logf[2:] - logf[:-2])  # centred difference in the interior
+        return (2.0 ** (2.0 / bpo) - 1.0) / (2.0 ** (2.0 / bpo) + 1.0)
+
+    def _wavelet_lengths(freqs, sr, filter_scale, alpha):  # filter length in samples for every centre frequency
+        Q = float(filter_scale) / alpha
+        return Q * sr / freqs  # shape (n_bins,) floats
+
+    def _build_wavelet(freqs_oct, sr, filter_scale, alpha_oct):  # time-domain filters for one octave, zero-padded to a common power-of-two length
+        lengths = _wavelet_lengths(freqs_oct, sr, filter_scale, alpha_oct)
+        filters = []
+        for ilen, freq in zip(lengths, freqs_oct):
+            t = np.arange(int(-ilen // 2), int(ilen // 2), dtype=float)
+            sig = (np.cos(t * 2 * np.pi * freq / sr)
+                   + 1j * np.sin(t * 2 * np.pi * freq / sr)).astype(np.complex64)
+            sig *= scipy.signal.get_window('hann', len(sig), fftbins=True)
+            l1 = np.sum(np.abs(sig))
+            tiny = np.finfo(np.float32).tiny
+            sig /= max(l1, tiny)  # L1-normalise, guarding against division by zero
+            filters.append(sig)
+        max_len = max(lengths)
+        n_fft = int(2.0 ** np.ceil(np.log2(max_len)))  # smallest power of two that holds the longest filter
+        out = np.zeros((len(filters), n_fft), dtype=np.complex64)
+        for k, f in enumerate(filters):
+            lpad = int((n_fft - len(f)) // 2)  # centre each filter inside its zero-padded row
+            out[k, lpad: lpad + len(f)] = f
+        return out, lengths
+
+    def _resample_half(y):  # decimate by two and rescale the result by 1 / sqrt(0.5)
+        ratio = 0.5
+        n_samples = int(np.ceil(len(y) * ratio))
+        # Kaiser-windowed FIR matches librosa/soxr more closely than scipy's default Hamming filter
+        L = 2
+        h = scipy.signal.firwin(160 * L + 1, 0.96 / L, window=('kaiser', 6.5))
+        y_hat = scipy.signal.resample_poly(y.astype(np.float32), 1, 2, window=h)
+        if len(y_hat) > n_samples:  # force the output length to exactly ceil(len(y) / 2)
+            y_hat = y_hat[:n_samples]
+        elif len(y_hat) < n_samples:
+            y_hat = np.pad(y_hat, (0, n_samples - len(y_hat)))
+        y_hat /= np.sqrt(ratio)
+        return y_hat.astype(np.float32)
+
+    def _sparsify_rows(x, quantile=0.01):  # per row, drop the smallest entries that together hold less than `quantile` of the row's magnitude
+        mags = np.abs(x)
+        norms = np.sum(mags, axis=1, keepdims=True)
+        norms = np.where(norms == 0, 1.0, norms)  # all-zero rows: avoid dividing by zero
+        mag_sort = np.sort(mags, axis=1)
+        cumulative_mag = np.cumsum(mag_sort / norms, axis=1)
+        threshold_idx = np.argmin(cumulative_mag < quantile, axis=1)  # first sorted index whose cumulative share reaches `quantile`
+        x_sparse = scipy.sparse.lil_matrix(x.shape, dtype=x.dtype)
+        for i, j in enumerate(threshold_idx):
+            idx = np.where(mags[i] >= mag_sort[i, j])
+            x_sparse[i, idx] = x[i, idx]
+        return x_sparse.tocsr()
+
+    if fmin is None:
+        fmin = 32.70319566257483  # C1 note frequency
+
+    fmin = fmin * (2.0 ** (tuning / bins_per_octave))  # shift the lowest bin by `tuning` fractional bins
+    freqs = fmin * (2.0 ** (np.arange(n_bins) / bins_per_octave))  # geometrically spaced centre frequencies
+
+    alpha = _relative_bandwidth(freqs)
+    lengths = _wavelet_lengths(freqs, float(sr), 1, alpha)
+
+    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
+    n_filters = min(bins_per_octave, n_bins)
+
+    cqt_resp = []
+    my_y = y.astype(np.float32)
+    my_sr = float(sr)
+    my_hop = int(hop_length)
+
+    for i in range(n_octaves):  # the slices below walk from the highest octave down to the lowest
+        if i == 0:
+            sl = slice(-n_filters, None)
+        else:
+            sl = slice(-n_filters * (i + 1), -n_filters * i)
+
+        freqs_oct = freqs[sl]
+        alpha_oct = alpha[sl]
+
+        basis, basis_lengths = _build_wavelet(freqs_oct, my_sr, 1, alpha_oct)
+        n_fft_oct = basis.shape[1]
+
+        # Frequency-domain normalisation
+        basis = basis.astype(np.complex64)
+        basis *= basis_lengths[:, np.newaxis] / float(n_fft_oct)
+        fft_basis = scipy.fft.fft(basis, n=n_fft_oct, axis=1)[:, :(n_fft_oct // 2) + 1]  # keep the non-negative frequencies only
+        fft_basis = _sparsify_rows(fft_basis, quantile=0.01)
+        fft_basis = fft_basis * np.sqrt(sr / my_sr)  # scale by sqrt(original rate / current decimated rate)
+
+        y_pad = np.pad(my_y, int(n_fft_oct // 2), mode='constant')  # zero-pad half an FFT length on both sides
+        n_frames = 1 + (len(y_pad) - n_fft_oct) // my_hop
+        frames = np.lib.stride_tricks.as_strided(  # overlapping frames as a zero-copy (n_fft_oct, n_frames) view
+            y_pad,
+            shape=(n_fft_oct, n_frames),
+            strides=(y_pad.strides[0], y_pad.strides[0] * my_hop),
+        )
+        stft_result = scipy.fft.rfft(frames, axis=0)  # no window applied here; the Hann window is applied to the filters in _build_wavelet
+        cqt_resp.append(fft_basis.dot(stft_result))
+
+        if my_hop % 2 == 0:  # only while the hop is even: halve hop and rate, decimate the signal for the next octave
+            my_hop //= 2
+            my_sr /= 2.0
+            my_y = _resample_half(my_y)
+
+    max_col = min(c.shape[-1] for c in cqt_resp)  # octaves may differ in frame count; truncate to the shortest
+    cqt_out = np.empty((n_bins, max_col), dtype=np.complex64)
+    end = n_bins
+    for c_i in cqt_resp:  # responses are ordered high to low, so the output is filled from the top rows down
+        n_oct = c_i.shape[0]
+        if end < n_oct:  # fewer rows left than this octave provides: keep only its top `end` bins
+            cqt_out[:end, :] = c_i[-end:, :max_col]
+        else:
+            cqt_out[end - n_oct:end, :] = c_i[:, :max_col]
+        end -= n_oct
+
+    cqt_out /= np.sqrt(lengths)[:, np.newaxis]  # scale each bin by 1 / sqrt(its filter length)
+    return np.abs(cqt_out).astype(np.float32)
+
+
+def cq_to_chroma_mapping(n_input, bins_per_octave=12, n_chroma=12, fmin=None):
+    """Build the (n_chroma, n_input) float32 matrix that sums CQT bins into chroma bins."""
+    # fmin defaults to the frequency of the note C1
+    base_freq = 32.70319566257483 if fmin is None else fmin
+
+    # One octave: each chroma row covers ``bins_per_octave / n_chroma`` adjacent CQT bins, centred by a half-width roll.
+    bins_per_chroma = bins_per_octave / n_chroma
+    one_octave = np.roll(
+        np.repeat(np.eye(n_chroma), int(bins_per_chroma), axis=1), -int(bins_per_chroma // 2), axis=1
+    )
+    octave_count = int(np.ceil(n_input / bins_per_octave))
+    mapping = np.tile(one_octave, octave_count)[:, :n_input]
+
+    # Rotate the rows by the pitch class (MIDI note number modulo 12) of the lowest bin.
+    first_bin_pitch_class = np.mod(12 * np.log2(base_freq / 440.0) + 69, 12)
+    row_shift = int(np.round(first_bin_pitch_class * (n_chroma / 12.0)))
+    return np.roll(mapping, row_shift, axis=0).astype(np.float32)
+
+
+def _parabolic_interpolation(S, axis=-2):
+    """Return, per element of ``S``, the sub-bin offset of the vertex of the parabola through it and its two neighbours along ``axis`` (0 at both ends)."""
+    S_next = np.roll(S, -1, axis=axis)
+    S_prev = np.roll(S, 1, axis=axis)
+
+    a = S_next + S_prev - 2 * S  # second difference (curvature)
+    b = (S_next - S_prev) / 2.0  # centred first difference (slope)
+
+    shifts = np.zeros_like(S)
+    # Only accept offsets with |b / a| < 1; the strict inequality also skips a == 0, so no division by zero.
+    valid = np.abs(b) < np.abs(a)
+    shifts[valid] = -b[valid] / a[valid]
+
+    # np.roll wraps around, so the first and last positions along ``axis`` were computed with a bogus
+    # neighbour: zero them along the requested axis (previously only axis 0 was handled correctly).
+    edge = [slice(None)] * S.ndim
+    for position in (0, -1):
+        edge[axis] = position
+        shifts[tuple(edge)] = 0
+    return shifts
+
+
+def _localmax(S, axis=-2):
+    """Boolean mask of elements strictly greater than their predecessor and not smaller than their successor along ``axis``."""
+    S_prev = np.roll(S, 1, axis=axis)
+    S_next = np.roll(S, -1, axis=axis)
+    local_max = (S > S_prev) & (S >= S_next)
+
+    # np.roll wraps around, so both ends were compared against the opposite end; recompute them along
+    # the requested axis (previously only axis 0 was handled correctly).
+    def _at(position):
+        index = [slice(None)] * S.ndim
+        index[axis] = position
+        return tuple(index)
+
+    # The first element has no predecessor, so it is never a local max; the last only has to beat its predecessor.
+    local_max[_at(0)] = False
+    local_max[_at(-1)] = S[_at(-1)] > S[_at(-2)]
+    return local_max
+
+
+def piptrack(y=None, sr=22050, S=None, n_fft=2048, hop_length=512,
+             fmin=150.0, fmax=4000.0, threshold=0.1):
+    """Pitch tracking on a thresholded, parabolically-interpolated STFT; returns (pitches, mags), both shaped like the magnitude spectrogram and zero wherever no peak was picked."""
+
+    # Compute STFT if not provided
+    if S is None:
+        if y is None:
+            raise ValueError("Either y or S must be provided")
+
+        fft_window = scipy.signal.get_window('hann', n_fft, fftbins=True)
+        if len(fft_window) < n_fft:  # centre-pad a window shorter than n_fft (the window above is requested with n_fft samples)
+            lpad = int((n_fft - len(fft_window)) // 2)
+            fft_window = np.pad(fft_window, (lpad, int(n_fft - len(fft_window) - lpad)), mode='constant')
+        fft_window = fft_window.reshape((-1, 1))  # column vector so it broadcasts over frames
+
+        y_pad = np.pad(y, int(n_fft // 2), mode='constant')  # zero-pad half a window on both sides
+        n_frames = 1 + (len(y_pad) - n_fft) // hop_length
+        frames = np.lib.stride_tricks.as_strided(  # overlapping frames as a zero-copy (n_fft, n_frames) view
+            y_pad,
+            shape=(n_fft, n_frames),
+            strides=(y_pad.strides[0], y_pad.strides[0] * hop_length)
+        )
+
+        S = scipy.fft.rfft((fft_window * frames).astype(np.float32), axis=0)
+
+    S = np.abs(S)  # work on magnitudes, whether S was supplied or computed above
+
+    fmin = max(fmin, 0)
+    fmax = min(fmax, float(sr) / 2)  # never search above half the sample rate
+
+    fft_freqs = np.fft.rfftfreq(S.shape[0] * 2 - 2, 1.0 / sr)  # assumes S has n_fft // 2 + 1 rows for an even n_fft
+    if len(fft_freqs) > S.shape[0]:
+        fft_freqs = fft_freqs[:S.shape[0]]
+
+    shift = _parabolic_interpolation(S, axis=0)  # sub-bin peak offset for every cell
+    avg = np.gradient(S, axis=0)
+    dskew = 0.5 * avg * shift  # magnitude correction for the interpolated peak position
+
+    pitches = np.zeros_like(S)
+    mags = np.zeros_like(S)
+
+    freq_mask = (fmin <= fft_freqs) & (fft_freqs < fmax)
+    freq_mask = freq_mask.reshape(-1, 1)  # column vector so it broadcasts over frames
+
+    ref_value = threshold * np.max(S, axis=0, keepdims=True)  # per-frame threshold relative to that frame's maximum
+    local_max = _localmax(S * (S > ref_value), axis=0)  # peaks along frequency after zeroing sub-threshold bins
+    idx = np.nonzero(freq_mask & local_max)
+
+    pitches[idx] = (idx[0] + shift[idx]) * float(sr) / (S.shape[0] * 2 - 2)  # fractional bin index -> Hz
+    mags[idx] = S[idx] + dskew[idx]
+
+    return pitches, mags
+
+
+def hz_to_octs(frequencies, tuning=0.0, bins_per_octave=12):
+    """Express frequencies (Hz) as octaves above ``A440 / 16``, where A440 is first shifted by ``tuning`` fractional bins."""
+    reference_a = 440.0 * 2.0 ** (tuning / bins_per_octave)
+    # Octave 0 sits four octaves below the (tuned) A440, i.e. at reference_a / 16.
+    octave_zero_hz = float(reference_a) / 16
+    return np.log2(np.asanyarray(frequencies) / octave_zero_hz)
+
+
+def pitch_tuning(frequencies, resolution=0.01, bins_per_octave=12):
+    """Estimate the tuning offset (fractions of a bin, in [-0.5, 0.5)) shared by a set of pitches; 0.0 when none is positive."""
+    positive = np.atleast_1d(frequencies)
+    positive = positive[positive > 0]
+    if not np.any(positive):
+        return 0.0
+
+    # Fractional part of every pitch's bin position, wrapped from [0, 1) into [-0.5, 0.5).
+    bin_positions = bins_per_octave * hz_to_octs(positive, tuning=0.0, bins_per_octave=bins_per_octave)
+    deviation = np.mod(bin_positions, 1.0)
+    deviation[deviation >= 0.5] -= 1.0
+
+    # Histogram the deviations and report the left edge of the most populated bin.
+    edge_count = int(np.ceil(1.0 / resolution)) + 1
+    counts, edges = np.histogram(deviation, np.linspace(-0.5, 0.5, edge_count))
+    most_common = np.argmax(counts)
+    return edges[most_common]
+
+
+def estimate_tuning(y, sr=22050, bins_per_octave=12):
+    """Estimate the global tuning offset of signal ``y`` in fractions of a bin; returns 0.0 whenever no estimate is possible."""
+    frame_size = 2048
+    frame_step = 512
+
+    # Too short for even one analysis frame.
+    if len(y) < frame_size:
+        return 0.0
+
+    pitches, magnitudes = piptrack(
+        y=y, sr=sr, n_fft=frame_size, hop_length=frame_step, fmin=150.0, fmax=4000.0, threshold=0.1
+    )
+
+    detected = pitches > 0
+    if not detected.any():
+        return 0.0
+
+    # Keep only the detections whose magnitude reaches the median magnitude of all detections.
+    strong_enough = magnitudes >= np.median(magnitudes[detected])
+    candidates = pitches[strong_enough & detected]
+    if len(candidates) == 0:
+        return 0.0
+
+    offset = pitch_tuning(candidates, resolution=0.01, bins_per_octave=bins_per_octave)
+    return float(offset)
+
+
+def compute_chroma_cens(y, sr=22050, hop_length=512, n_chroma=12,
+                        n_octaves=7, bins_per_octave=36,
+                        win_len_smooth=41, norm=2):
+    """Compute Chroma Energy Normalized Statistics (CENS) features of signal ``y``; returns an array of shape (n_chroma, n_frames)."""
+
+    tuning = estimate_tuning(y, sr, bins_per_octave=bins_per_octave)
+
+    fmin = 32.70319566257483  # C1 note frequency
+    n_bins = n_octaves * bins_per_octave
+    cqt_mag = compute_cqt(y, sr=sr, hop_length=hop_length,
+                          fmin=fmin, n_bins=n_bins,
+                          bins_per_octave=bins_per_octave,
+                          tuning=tuning)
+
+    chroma_map = cq_to_chroma_mapping(n_bins, bins_per_octave=bins_per_octave,
+                                      n_chroma=n_chroma, fmin=fmin)
+    chroma = np.dot(chroma_map, cqt_mag)  # fold the CQT bins into chroma bins
+
+    threshold = np.finfo(chroma.dtype).tiny
+    chroma_sum = np.sum(np.abs(chroma), axis=0, keepdims=True)
+    chroma_sum = np.maximum(chroma_sum, threshold)  # floor the norm so all-zero frames do not divide by zero
+    chroma = chroma / chroma_sum  # L1-normalise every frame
+
+    quant_steps = [0.4, 0.2, 0.1, 0.05]
+    quant_weights = [0.25, 0.25, 0.25, 0.25]
+    chroma_quant = np.zeros_like(chroma)
+    for step, weight in zip(quant_steps, quant_weights):  # every threshold a value exceeds adds that threshold's weight
+        chroma_quant += (chroma > step) * weight
+
+    if win_len_smooth is not None and win_len_smooth > 0:
+        win = scipy.signal.get_window('hann', win_len_smooth + 2, fftbins=False)  # symmetric Hann window, two samples longer than win_len_smooth
+        win /= np.sum(win)
+        win = win.reshape(1, -1)  # row vector: the convolution smooths along the frame axis only
+        chroma_smooth = scipy.ndimage.convolve(chroma_quant, win, mode='constant')
+    else:
+        chroma_smooth = chroma_quant
+
+    if norm == 2:  # per-frame Euclidean normalisation
+        threshold = np.finfo(chroma_smooth.dtype).tiny
+        chroma_norm = np.sqrt(np.sum(chroma_smooth ** 2, axis=0, keepdims=True))
+        chroma_norm = np.maximum(chroma_norm, threshold)
+        chroma_smooth = chroma_smooth / chroma_norm
+    elif norm == np.inf:  # per-frame max-abs normalisation; any other `norm` leaves the features unnormalised
+        threshold = np.finfo(chroma_smooth.dtype).tiny
+        chroma_norm = np.max(np.abs(chroma_smooth), axis=0, keepdims=True)
+        chroma_norm = np.maximum(chroma_norm, threshold)
+        chroma_smooth = chroma_smooth / chroma_norm
+
+    return chroma_smooth
+
+
+def _create_mel_filterbank(sr, n_fft, n_mels=128, fmin=0.0, fmax=None):
+    """Build an (n_mels, 1 + n_fft // 2) float32 matrix of triangular mel filters, each scaled by 2 / bandwidth."""
+    top_hz = sr / 2.0 if fmax is None else fmax
+    bin_hz = np.fft.rfftfreq(n=n_fft, d=1.0 / sr)
+    # n_mels + 2 band edges, evenly spaced on the mel scale: filter k spans edges k .. k + 2.
+    edge_mels = np.linspace(hz_to_mel(fmin), hz_to_mel(top_hz), n_mels + 2)
+    edge_hz = mel_to_hz(edge_mels)
+    edge_gap = np.diff(edge_hz)
+    offset = np.subtract.outer(edge_hz, bin_hz)
+
+    filters = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=np.float32)
+    for band in range(n_mels):
+        rising = -offset[band] / edge_gap[band]
+        falling = offset[band + 2] / edge_gap[band + 1]
+        # A triangle is the positive part of the lower of its two slopes.
+        filters[band] = np.maximum(0, np.minimum(rising, falling))
+
+    # Scale every filter by 2 / (upper edge - lower edge).
+    band_scale = 2.0 / (edge_hz[2:n_mels + 2] - edge_hz[:n_mels])
+    filters *= band_scale[:, np.newaxis]
+    return filters
+
+
+def _compute_mel_spectrogram(data, sr, n_fft=2048, hop_length=512, n_mels=128):
+    """Compute the power mel spectrogram of the 1-D signal ``data``; returns a float32 array of shape (n_mels, n_frames)."""
+    fft_window = scipy.signal.get_window('hann', n_fft, fftbins=True)
+    if len(fft_window) < n_fft:  # centre-pad a window shorter than n_fft (the window above is requested with n_fft samples)
+        lpad = int((n_fft - len(fft_window)) // 2)
+        fft_window = np.pad(fft_window, (lpad, int(n_fft - len(fft_window) - lpad)), mode='constant')
+
+    fft_window = fft_window.reshape((-1, 1))  # column vector so it broadcasts over frames
+    data_padded = np.pad(data, int(n_fft // 2), mode='constant')  # zero-pad half a window on both sides
+    n_frames = 1 + (len(data_padded) - n_fft) // hop_length
+    shape = (n_fft, n_frames)
+    strides = (data_padded.strides[0], data_padded.strides[0] * hop_length)
+    frames = np.lib.stride_tricks.as_strided(data_padded, shape=shape, strides=strides)  # zero-copy view of overlapping frames
+
+    stft_result = scipy.fft.rfft(fft_window * frames, axis=0).astype(np.complex64)
+    power_spec = np.abs(stft_result) ** 2
+
+    mel_basis = _create_mel_filterbank(sr, n_fft, n_mels=n_mels, fmin=0.0, fmax=sr / 2.0)  # filters span 0 Hz to half the sample rate
+    mel_spec = np.dot(mel_basis, power_spec)
+    return mel_spec.astype(np.float32)
+
+
+def quick_tempo_estimate(audio_np, sr, start_bpm=120.0, std_bpm=1.0, hop_length=512):
+    """Estimate the tempo in BPM of a 1-D audio signal from the autocorrelation of its onset strength; returns 120.0 when the signal is shorter than ten hops."""
+
+    if len(audio_np) < hop_length * 10:
+        logging.warning("Audio too short for tempo estimation, returning default BPM of 120.0")
+        return 120.0
+
+    n_fft = 2048
+    mel_S = _compute_mel_spectrogram(audio_np, sr, n_fft=n_fft, hop_length=hop_length, n_mels=128)
+    # Plain dB conversion floored at 1e-10; unlike power_to_db there is no top_db clipping here.
+    log_mel_S = 10.0 * np.log10(np.maximum(1e-10, mel_S))
+
+    # The diff / rectify / mean / pad steps that used to be spelled out here are exactly what
+    # compute_onset_envelope does, so delegate to it and keep a single implementation.
+    onset_env = compute_onset_envelope(log_mel_S, n_fft=n_fft, hop_length=hop_length)
+
+    # max_tempo=320.0 excludes lags faster than 320 BPM from the search.
+    tempo = estimate_tempo_from_onset(onset_env, sr, hop_length, start_bpm, std_bpm, max_tempo=320.0)
+
+    return tempo
+
+
+def estimate_tempo_from_onset(onset_env, sr, hop_length, start_bpm=120.0, std_bpm=1.0, max_tempo=320.0):
+    """Estimate tempo (BPM) from an onset strength envelope via an autocorrelation tempogram weighted by a log-normal prior around ``start_bpm``; returns 120.0 for fewer than 20 frames."""
+    if len(onset_env) < 20:
+        return 120.0
+
+    ac_size = 8.0  # autocorrelation window length in seconds
+    win_length = int(np.round(ac_size * sr / hop_length))
+    win_length = min(win_length, len(onset_env))
+
+    pad_width = win_length // 2
+    onset_padded = np.pad(onset_env, (pad_width, pad_width), mode='linear_ramp', end_values=(0, 0))
+
+    n_frames = len(onset_env)
+    shape = (win_length, n_frames)
+    strides = (onset_padded.strides[0], onset_padded.strides[0])
+    frames = np.lib.stride_tricks.as_strided(onset_padded, shape=shape, strides=strides)  # hop-1 sliding windows, zero-copy
+
+    hann_window = scipy.signal.get_window('hann', win_length, fftbins=True)
+    # Every frame has win_length samples, so the padded FFT size is loop-invariant: compute it once.
+    n_pad = scipy.fft.next_fast_len(2 * win_length - 1)
+    tempogram = np.zeros((win_length, n_frames))
+    for i in range(n_frames):
+        # Window one column at a time instead of materialising a windowed copy of the whole strided view.
+        frame = frames[:, i] * hann_window
+        fft_result = scipy.fft.rfft(frame, n=n_pad)
+        powspec = np.abs(fft_result) ** 2
+        ac = scipy.fft.irfft(powspec, n=n_pad)  # autocorrelation = inverse FFT of the power spectrum
+        tempogram[:, i] = ac[:win_length]
+
+    ac_max = np.max(np.abs(tempogram), axis=0)
+    mask = ac_max > 0
+    tempogram[:, mask] /= ac_max[mask]  # normalise every non-silent frame to a peak of 1
+
+    tempogram_mean = np.mean(tempogram, axis=1)
+    tempogram_mean = np.maximum(tempogram_mean, 0)
+
+    bpms = np.zeros(win_length, dtype=np.float64)
+    bpms[0] = np.inf  # lag 0 corresponds to an infinite tempo
+    bpms[1:] = 60.0 * sr / (hop_length * np.arange(1.0, win_length))
+
+    logprior = -0.5 * ((np.log2(bpms) - np.log2(start_bpm)) / std_bpm) ** 2
+
+    if max_tempo is not None:
+        max_idx = int(np.argmax(bpms < max_tempo))  # first lag slower than max_tempo
+        if max_idx > 0:
+            logprior[:max_idx] = -np.inf
+
+    weighted = np.log1p(1e6 * tempogram_mean) + logprior
+    best_idx = int(np.argmax(weighted[1:])) + 1  # skip lag 0
+    tempo = bpms[best_idx]
+
+    return tempo
+
+
+def detect_onset_peaks(onset_env, sr=22050, hop_length=512, pre_max=0.03, post_max=0.0,
+                       pre_avg=0.10, post_avg=0.10, wait=0.03, delta=0.07):
+    """Pick onset peaks from an onset envelope; the time arguments are in seconds. Returns int32 frame indices (empty for an empty envelope)."""
+    # An empty envelope has no minimum to normalise against (np.min would raise), so there is nothing to pick.
+    if len(onset_env) == 0:
+        return np.array([], dtype=np.int32)
+
+    onset_normalized = onset_env - np.min(onset_env)
+    onset_max = np.max(onset_normalized)
+    if onset_max > 0:
+        onset_normalized = onset_normalized / onset_max
+
+    pre_max_frames = int(pre_max * sr / hop_length)
+    post_max_frames = int(post_max * sr / hop_length) + 1
+    pre_avg_frames = int(pre_avg * sr / hop_length)
+    post_avg_frames = int(post_avg * sr / hop_length) + 1
+    wait_frames = int(wait * sr / hop_length)
+
+    peaks = np.zeros(len(onset_normalized), dtype=bool)
+    peaks[0] = (onset_normalized[0] >= np.max(onset_normalized[:min(post_max_frames, len(onset_normalized))]))
+    peaks[0] &= (onset_normalized[0] >= np.mean(onset_normalized[:min(post_avg_frames, len(onset_normalized))]) + delta)
+
+    n = wait_frames + 1 if peaks[0] else 1  # after a peak, skip the `wait` frames that follow it
+
+    while n < len(onset_normalized):
+        maxn = np.max(onset_normalized[max(0, n - pre_max_frames):min(n + post_max_frames, len(onset_normalized))])
+        peaks[n] = (onset_normalized[n] == maxn)  # must be the maximum of its local window
+
+        if not peaks[n]:
+            n += 1
+            continue
+
+        avgn = np.mean(onset_normalized[max(0, n - pre_avg_frames):min(n + post_avg_frames, len(onset_normalized))])
+        peaks[n] &= (onset_normalized[n] >= avgn + delta)  # and exceed the local mean by at least `delta`
+
+        if not peaks[n]:
+            n += 1
+            continue
+
+        n += wait_frames + 1
+
+    return np.flatnonzero(peaks).astype(np.int32)
+
+
+def track_beats(onset_env, tempo, sr, hop_length, tightness=100, trim=True):
+    """Track beats with dynamic programming over the onset envelope; returns the int32 frame indices of the beats (empty when the beat period is not positive or the envelope has fewer than 2 frames)."""
+
+    frame_rate = sr / hop_length
+    frames_per_beat = np.round(frame_rate * 60.0 / tempo)  # beat period in frames
+
+    if frames_per_beat <= 0 or len(onset_env) < 2:
+        return np.array([], dtype=np.int32)
+
+    onset_std = np.std(onset_env, ddof=1)
+    if onset_std > 0:
+        onset_normalized = onset_env / onset_std
+    else:
+        onset_normalized = onset_env
+
+    window_range = np.arange(-frames_per_beat, frames_per_beat + 1)
+    window = np.exp(-0.5 * (window_range * 32.0 / frames_per_beat) ** 2)  # Gaussian spanning one beat period either side
+
+    localscore = scipy.signal.convolve(onset_normalized, window, mode='same')
+
+    backlink = np.full(len(localscore), -1, dtype=np.int32)  # predecessor beat of every frame, -1 = none
+    cumscore = np.zeros(len(localscore), dtype=np.float64)  # best cumulative score of a beat sequence ending at each frame
+
+    score_thresh = 0.01 * localscore.max()
+    first_beat = True
+
+    backlink[0] = -1
+    cumscore[0] = localscore[0]
+
+    fpb = int(frames_per_beat)
+
+    for i in range(1, len(localscore)):
+        score_i = localscore[i]
+        best_score = -np.inf
+        beat_location = -1
+
+        search_start = int(i - np.round(fpb / 2.0))  # candidate predecessors lie between half a period ...
+        search_end = int(i - 2 * fpb - 1)  # ... and two periods before frame i
+
+        for loc in range(search_start, search_end, -1):
+            if loc < 0:
+                break
+
+            score = cumscore[loc] - tightness * (np.log(i - loc) - np.log(fpb)) ** 2  # penalise log-deviation from the beat period
+
+            if score > best_score:
+                best_score = score
+                beat_location = loc
+
+        if beat_location >= 0:
+            cumscore[i] = score_i + best_score
+        else:
+            cumscore[i] = score_i
+
+        if first_beat and score_i < score_thresh:  # no links until a frame reaches 1% of the maximum local score
+            backlink[i] = -1
+        else:
+            backlink[i] = beat_location
+            first_beat = False
+
+    local_max_mask = np.zeros(len(cumscore), dtype=bool)
+
+    local_max_mask[0] = False
+
+    for i in range(1, len(cumscore) - 1):
+        local_max_mask[i] = (cumscore[i] > cumscore[i-1]) and (cumscore[i] >= cumscore[i+1])
+
+    if len(cumscore) > 1:
+        local_max_mask[-1] = cumscore[-1] > cumscore[-2]
+
+    if np.any(local_max_mask):
+        median_max = np.median(cumscore[local_max_mask])
+        threshold = 0.5 * median_max
+
+        tail = -1
+        for i in range(len(cumscore) - 1, -1, -1):  # last local maximum reaching half the median peak score
+            if local_max_mask[i] and cumscore[i] >= threshold:
+                tail = i
+                break
+    else:  # NOTE(review): reconstructed as pairing with `if np.any(...)` (no local maxima: fall back to the last frame) - confirm
+        tail = len(cumscore) - 1
+
+    beats = np.zeros(len(localscore), dtype=bool)
+    n = tail
+    visited = set()
+    while n >= 0 and n not in visited:  # follow the predecessor links back from the last beat
+        beats[n] = True
+        visited.add(n)
+        n = backlink[n]
+
+    if trim and np.any(beats):
+        beat_positions = np.flatnonzero(beats)
+
+        beat_localscores = localscore[beat_positions]
+
+        w = np.hanning(5)
+        smooth_boe_full = np.convolve(beat_localscores, w)
+        smooth_boe = smooth_boe_full[len(w)//2 : len(localscore) + len(w)//2]
+
+        threshold = 0.5 * np.sqrt(np.mean(smooth_boe ** 2))  # half the RMS of the smoothed beat scores
+
+        start_frame = 0
+        while start_frame < len(localscore) and localscore[start_frame] <= threshold:  # drop beats in the weak leading stretch
+            beats[start_frame] = False
+            start_frame += 1
+
+        end_frame = len(localscore) - 1
+        while end_frame >= 0 and localscore[end_frame] <= threshold:  # and in the weak trailing stretch
+            beats[end_frame] = False
+            end_frame -= 1
+
+    return np.flatnonzero(beats).astype(np.int32)
+
+def compute_onset_envelope(mel_spec_db, n_fft=2048, hop_length=512):
+    """Onset strength per frame: the mean positive frame-to-frame increase of a (bands, frames) log-mel spectrogram, zero-prefixed and cropped to ``frames`` values."""
+    frame_lag = 1
+    n_frames = mel_spec_db.shape[1]
+    # Half-wave rectified first difference along time, averaged over the band axis.
+    rise = mel_spec_db[:, frame_lag:] - mel_spec_db[:, :-frame_lag]
+    strength = np.mean(np.maximum(0.0, rise), axis=0)
+
+    # Prepend zeros for the lag plus n_fft // (2 * hop_length) frames (presumably compensating for the
+    # half-window centring of the STFT frames), then crop to the number of spectrogram frames.
+    leading_zeros = frame_lag + n_fft // (2 * hop_length)
+    return np.pad(strength, (leading_zeros, 0), mode='constant')[:n_frames]
+
+def compute_mfcc(mel_spec_db, n_mfcc=20):
+    """Return the first ``n_mfcc`` orthonormal DCT-II coefficients of every frame of a (bands, frames) log-mel spectrogram as a float32 (frames, n_mfcc) array."""
+    cepstrum = scipy.fft.dct(mel_spec_db, axis=0, type=2, norm='ortho')
+    return cepstrum[:n_mfcc].T.astype(np.float32)
+
+
+def power_to_db(S, amin=1e-10, top_db=80.0, ref=1.0):
+    """Convert power values to dB relative to ``ref``, flooring inputs at ``amin`` and, unless ``top_db`` is None, clipping to ``top_db`` below the maximum."""
+    power = np.asarray(S)
+    decibels = 10.0 * np.log10(np.maximum(amin, power))
+    decibels -= 10.0 * np.log10(np.maximum(amin, ref))
+    if top_db is None:
+        return decibels
+    return np.maximum(decibels, decibels.max() - top_db)
+
+
+class WanDancerEncodeAudio(io.ComfyNode):
+    # Turns an audio clip into per-frame features (onset envelope, MFCCs, chroma, peak and beat flags) plus an fps prompt string.
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="WanDancerEncodeAudio",
+            category="model/conditioning/video_models",
+            inputs=[
+                io.Audio.Input("audio"),
+                io.Int.Input("video_frames", default=149, min=1, max=nodes.MAX_RESOLUTION, step=4),
+                io.Float.Input("audio_inject_scale", default=1.0, min=0.0, max=10.0, step=0.01, tooltip="The scale for the audio features when injected into the video model."),
+            ],
+            outputs=[
+                io.AudioEncoderOutput.Output(display_name="audio_encoder_output"),
+                io.String.Output(display_name="fps_string", tooltip="The calculated fps based on the audio length and the number of video frames. Used in the prompt."),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, video_frames, audio_inject_scale, audio) -> io.NodeOutput:
+        """Build a (1, frames, 35) audio feature tensor at 30 feature frames per second of audio and derive the video fps and its prompt string."""
+        waveform = audio["waveform"][0]  # first batch entry, presumably (channels, samples)
+        sample_rate = audio["sample_rate"]
+        base_fps = 30
+        hop_length = 512
+        model_sr = 22050
+        n_fft = 2048
+        # start tempo from original audio (not the resampled one) to match the reference pipeline
+        if waveform.shape[0] > 1:
+            waveform = waveform.mean(dim=0, keepdim=False)  # downmix to mono
+        start_bpm = quick_tempo_estimate(waveform.squeeze().cpu().numpy(), sample_rate, hop_length=hop_length)
+
+        # resample so that one hop is one 30 fps frame; NOTE(review): features below are still given model_sr - presumably to match the reference pipeline, confirm
+        resample_sr = base_fps * hop_length
+        waveform = torchaudio.functional.resample(waveform, sample_rate, resample_sr)
+        waveform_np = waveform.cpu().numpy().squeeze()
+        mel_spec = _compute_mel_spectrogram(waveform_np, model_sr, n_fft, hop_length, n_mels=128)
+        mel_spec_db = power_to_db(mel_spec, amin=1e-10, top_db=80.0, ref=1.0)
+        envelope = compute_onset_envelope(mel_spec_db, n_fft, hop_length)
+        mfcc = compute_mfcc(mel_spec_db, n_mfcc=20)
+        chroma = compute_chroma_cens(y=waveform_np, sr=model_sr, hop_length=hop_length).T
+        # detect peaks
+        peak_idxs = detect_onset_peaks(envelope, sr=model_sr, hop_length=hop_length)
+        peak_onehot = np.zeros_like(envelope, dtype=np.float32)
+        peak_onehot[peak_idxs] = 1.0
+        # detect beats
+        beat_tracking_tempo = estimate_tempo_from_onset(envelope, sr=model_sr, hop_length=hop_length, start_bpm=start_bpm)
+        beat_idxs = track_beats(envelope, beat_tracking_tempo, model_sr, hop_length, tightness=100, trim=True)
+        beat_onehot = np.zeros_like(envelope, dtype=np.float32)
+        beat_onehot[beat_idxs] = 1.0
+
+        audio_feature = np.concatenate(
+            [envelope[:, None], mfcc, chroma, peak_onehot[:, None], beat_onehot[:, None]],
+            axis=-1,
+        )
+        audio_feature = torch.from_numpy(audio_feature).unsqueeze(0).to(comfy.model_management.intermediate_device())
+
+        # Rounded number of audio feature frames per video frame. Audio with fewer than half as many feature
+        # frames as video_frames rounds to 0 and used to raise ZeroDivisionError, so clamp to 1 (30 fps).
+        feature_frames_per_video_frame = max(1, int(audio_feature.shape[1] / video_frames + 0.5))
+        fps = float(base_fps / feature_frames_per_video_frame)
+
+        audio_encoder_output = {
+            "audio_feature": audio_feature,
+            "fps": fps,
+            "audio_inject_scale": audio_inject_scale,
+        }
+        if int(fps + 0.5) != 30:
+            fps_string = " 帧率是{:.4f}".format(fps)  # "frame rate is" in Chinese, as it was in the original pipeline
+        else:
+            fps_string = ", 帧率是30fps。"  # to match the reference pipeline when the fps is 30
+        return io.NodeOutput(audio_encoder_output, fps_string)
+
+
+class WanDancerVideo(io.ComfyNode):
+    # Attaches start-image, CLIP-vision and audio conditioning for WanDancer and emits an empty video latent.
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="WanDancerVideo",
+            category="model/conditioning/video_models",
+            inputs=[
+                io.Conditioning.Input("positive"),
+                io.Conditioning.Input("negative"),
+                io.Vae.Input("vae"),
+                io.Int.Input("width", default=480, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("height", default=832, min=16, max=nodes.MAX_RESOLUTION, step=16),
+                io.Int.Input("length", default=149, min=1, max=nodes.MAX_RESOLUTION, step=4, tooltip="The number of frames in the generated video. Should stay 149 for WanDancer."),
+                io.ClipVisionOutput.Input("clip_vision_output", optional=True, tooltip="The CLIP vision embeds for the first frame."),
+                io.ClipVisionOutput.Input("clip_vision_output_ref", optional=True, tooltip="The CLIP vision embeds for the reference image."),
+                io.Image.Input("start_image", optional=True, tooltip="The initial image(s) to be encoded, can be any number of frames."),
+                io.Mask.Input("mask", optional=True, tooltip="Image conditioning mask for the start image(s). White is kept, black is generated. Used for the local generations."),
+                io.AudioEncoderOutput.Input("audio_encoder_output", optional=True),
+            ],
+            outputs=[
+                io.Conditioning.Output(display_name="positive"),
+                io.Conditioning.Output(display_name="negative"),
+                io.Latent.Output(display_name="latent", tooltip="Empty latent."),
+            ],
+        )
+
+    @classmethod
+    def execute(cls, positive, negative, vae, width, height, length, start_image=None, mask=None, clip_vision_output=None, clip_vision_output_ref=None, audio_encoder_output=None) -> io.NodeOutput:
+        """Return (positive, negative, latent): both conditionings with the optional extras attached, and a zero latent of shape [1, 16, (length - 1) // 4 + 1, height // 8, width // 8]."""
+        latent = torch.zeros([1, 16, ((length - 1) // 4) + 1, height // 8, width // 8], device=comfy.model_management.intermediate_device())
+        if start_image is not None:
+            # Resize the given frames to the target size and place them at the start of an otherwise zero-filled clip.
+            start_image = comfy.utils.common_upscale(start_image[:length].movedim(-1, 1), width, height, "bilinear", "center").movedim(1, -1)
+            image = torch.zeros((length, height, width, start_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
+            image[:start_image.shape[0]] = start_image
+            concat_latent_image = vae.encode(image[:, :, :, :3])
+            if mask is None:
+                concat_mask = torch.ones((1, 1, latent.shape[2], concat_latent_image.shape[-2], concat_latent_image.shape[-1]), device=start_image.device, dtype=start_image.dtype)
+                concat_mask[:, :, :((start_image.shape[0] - 1) // 4) + 1] = 0.0  # 0 over the latent frames covered by the start image(s)
+            else:
+                concat_mask = 1 - mask[:length].unsqueeze(0)  # invert the input mask
+                # common_upscale takes (samples, width, height, ...): width is the latent's last dim, height its second-to-last (they were swapped).
+                concat_mask = comfy.utils.common_upscale(concat_mask, concat_latent_image.shape[-1], concat_latent_image.shape[-2], "nearest-exact", "disabled")
+                # Repeat the first frame 4x so the frame count splits into groups of 4, one group per latent frame.
+                concat_mask = torch.cat([torch.repeat_interleave(concat_mask[:, 0:1], repeats=4, dim=1), concat_mask[:, 1:]], dim=1)
+                concat_mask = concat_mask.view(1, concat_mask.shape[1] // 4, 4, concat_latent_image.shape[-2], concat_latent_image.shape[-1]).transpose(1, 2)
+            positive = node_helpers.conditioning_set_values(positive, {"concat_latent_image": concat_latent_image, "concat_mask": concat_mask})
+            negative = node_helpers.conditioning_set_values(negative, {"concat_latent_image": concat_latent_image, "concat_mask": concat_mask})
+
+        if clip_vision_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"clip_vision_output": clip_vision_output, "clip_vision_output_ref": clip_vision_output_ref})
+            negative = node_helpers.conditioning_set_values(negative, {"clip_vision_output": clip_vision_output, "clip_vision_output_ref": clip_vision_output_ref})
+
+        if audio_encoder_output is not None:
+            positive = node_helpers.conditioning_set_values(positive, {"audio_embed": audio_encoder_output["audio_feature"], "fps": audio_encoder_output["fps"], "audio_inject_scale": audio_encoder_output.get("audio_inject_scale", 1.0)})
+            negative = node_helpers.conditioning_set_values(negative, {"audio_embed": audio_encoder_output["audio_feature"], "fps": audio_encoder_output["fps"], "audio_inject_scale": audio_encoder_output.get("audio_inject_scale", 1.0)})
+        return io.NodeOutput(positive, negative, {"samples": latent})
+
+
+class WanDancerPadKeyframes(io.ComfyNode):  # spreads keyframe images over one fixed-length segment and cuts the matching audio slice
+    @classmethod
+    def define_schema(cls):
+        return io.Schema(
+            node_id="WanDancerPadKeyframes",
+            category="image/video",
+            inputs=[
+                io.Image.Input("images",),
+                io.Int.Input("segment_length", default=149, min=1, max=10000, tooltip="Length of this segment (usually 149 frames)"),
+                io.Int.Input("segment_index", default=0, min=0, max=100, tooltip="Which segment this is (0 for first, 1 for second, etc.)"),
+                io.Audio.Input("audio", tooltip="Audio to calculate total output frames from and extract segment audio."),
+            ],
+            outputs=[
+                io.Image.Output(display_name="keyframes_sequence", tooltip="Padded keyframe sequence"),
+                io.Mask.Output(display_name="keyframes_mask", tooltip="Mask indicating valid frames"),
+                io.Audio.Output(display_name="audio_segment", tooltip="Audio segment for this video segment"),
+            ],
+        )
+
+    @classmethod
+    def do_execute(cls, images, segment_length, segment_index, audio):  # returns (keyframes, mask, audio_segment) for one segment
+        B, H, W, C = images.shape
+        fps = 30
+
+        # calculate total frames
+        audio_duration = audio["waveform"].shape[-1] / audio["sample_rate"]  # seconds
+        segment_duration = segment_length / fps
+        buffer = 0.2  # seconds subtracted from the audio duration before counting segments
+        num_segments = int((audio_duration - buffer) / segment_duration) + 1 if audio_duration > buffer else 0
+        total_frames = num_segments * segment_length
+
+        mask = torch.zeros((segment_length, H, W), device=images.device, dtype=images.dtype)
+        keyframes = torch.zeros((segment_length, H, W, C), dtype=images.dtype, device=images.device)
+
+        # guard: with no audio or no images, nothing to place — leave keyframes/mask zeroed
+        if total_frames > 0 and B > 0:
+            frame_interval = float(total_frames) / B  # frame spacing between consecutive keyframes
+            seg_num = int(math.ceil(total_frames / segment_length))
+            is_last_segment = (segment_index == seg_num - 1)
+
+            positions = []  # (frame position inside this segment, index into `images`)
+            images_before_this_segment = 0
+
+            # count images consumed by previous segments
+            for seg_idx in range(segment_index):
+                end_idx = (total_frames - segment_length * seg_idx - 1) if seg_idx == seg_num - 1 else (segment_length - 1)
+                cnt = 0
+                while cnt * frame_interval < end_idx - frame_interval:
+                    cnt += 1
+                images_before_this_segment += cnt
+
+            # positions for current segment
+            end_index = (total_frames - segment_length * segment_index - 1) if is_last_segment else (segment_length - 1)
+            cnt = 0
+            while cnt * frame_interval < end_index - frame_interval:
+                pos = int(math.ceil(frame_interval * cnt))
+                positions.append((pos, images_before_this_segment + cnt))
+                cnt += 1
+            positions.append((end_index, images_before_this_segment + cnt))  # the segment's final frame always gets the next image
+
+            valid_positions = [(pos, idx) for pos, idx in positions if idx < B and pos < segment_length]  # drop out-of-range entries
+
+            if valid_positions:
+                seg_positions, img_indices = zip(*valid_positions)
+                seg_positions = torch.tensor(seg_positions, dtype=torch.long, device=images.device)
+                img_indices = torch.tensor(img_indices, dtype=torch.long, device=images.device)
+                mask[seg_positions] = 1  # whole frame set to 1 where a keyframe was placed
+                keyframes[seg_positions] = images[img_indices]
+
+        # extract audio segment
+        segment_duration = segment_length / fps
+        start_time = segment_index * segment_duration
+        end_time = min(start_time + segment_duration, audio_duration)  # never read past the end of the audio
+
+        sample_rate = audio["sample_rate"]
+        start_sample = int(start_time * sample_rate)
+        end_sample = int(end_time * sample_rate)
+
+        audio_segment_waveform = audio["waveform"][:, :, start_sample:end_sample]  # slice along the last (sample) axis
+        audio_segment = {
+            "waveform": audio_segment_waveform,
+            "sample_rate": sample_rate
+        }
+
+        return keyframes, mask, audio_segment
+
+    @classmethod
+    def execute(cls, images, segment_length, segment_index, audio=None) -> io.NodeOutput:  # NOTE(review): audio defaults to None here, but do_execute indexes it unconditionally
+        return io.NodeOutput(*cls.do_execute(images, segment_length, segment_index, audio))
+
+class WanDancerPadKeyframesList(io.ComfyNode):
+    # List variant of WanDancerPadKeyframes: emits the padded keyframes, mask and audio slice of the first `num_segments` segments.
+    @classmethod
+    def define_schema(cls):
+        inputs = [
+            io.Image.Input("images"),
+            io.Int.Input("segment_length", default=149, min=1, max=10000, tooltip="Length of each segment (usually 149 frames)"),
+            io.Int.Input("num_segments", default=1, min=1, max=100, tooltip="How many padded segments to emit as lists."),
+            io.Audio.Input("audio", tooltip="Audio to slice for each emitted segment."),
+        ]
+        outputs = [
+            io.Image.Output(display_name="keyframes_sequence", tooltip="Padded keyframe sequences", is_output_list=True),
+            io.Mask.Output(display_name="keyframes_mask", tooltip="Masks indicating valid frames", is_output_list=True),
+            io.Audio.Output(display_name="audio_segment", tooltip="Audio segment for each video segment", is_output_list=True),
+        ]
+        return io.Schema(node_id="WanDancerPadKeyframesList", category="image/video", inputs=inputs, outputs=outputs)
+
+    @classmethod
+    def execute(cls, images, segment_length, num_segments, audio=None) -> io.NodeOutput:
+        """Run WanDancerPadKeyframes.do_execute for segment indices 0 .. num_segments - 1 and return three parallel lists."""
+        per_segment = [WanDancerPadKeyframes.do_execute(images, segment_length, index, audio) for index in range(num_segments)]
+        # Transpose [(keyframes, mask, audio), ...] into one list per output.
+        keyframes, masks, audio_segments = (list(column) for column in zip(*per_segment))
+        return io.NodeOutput(keyframes, masks, audio_segments)
+
+class WanDancerExtension(ComfyExtension):
+    # Extension object that exposes the WanDancer nodes defined in this module.
+    @override
+    async def get_node_list(self) -> list[type[io.ComfyNode]]:
+        """Return the node classes provided by this module."""
+        wan_dancer_nodes = [
+            WanDancerVideo, WanDancerEncodeAudio, WanDancerPadKeyframes, WanDancerPadKeyframesList,
+        ]
+        return wan_dancer_nodes
+
+async def comfy_entrypoint() -> WanDancerExtension:  # module entry point: hands back the extension that lists the WanDancer nodes
+    return WanDancerExtension()  # a fresh instance on every call
diff --git a/comfy_extras/nodes_wanmove.py b/comfy_extras/nodes_wanmove.py
index 5acae03eb..2db064922 100644
--- a/comfy_extras/nodes_wanmove.py
+++ b/comfy_extras/nodes_wanmove.py
@@ -247,7 +247,7 @@ class WanMoveVisualizeTracks(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanMoveVisualizeTracks",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Image.Input("images"),
io.Tracks.Input("tracks", optional=True),
@@ -283,7 +283,7 @@ class WanMoveTracksFromCoords(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanMoveTracksFromCoords",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.String.Input("track_coords", force_input=True, default="[]", optional=True),
io.Mask.Input("track_mask", optional=True),
@@ -325,7 +325,7 @@ class GenerateTracks(io.ComfyNode):
return io.Schema(
node_id="GenerateTracks",
search_aliases=["motion paths", "camera movement", "trajectory"],
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Int.Input("width", default=832, min=16, max=4096, step=16),
io.Int.Input("height", default=480, min=16, max=4096, step=16),
@@ -434,7 +434,7 @@ class WanMoveConcatTrack(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanMoveConcatTrack",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Tracks.Input("tracks_1"),
io.Tracks.Input("tracks_2", optional=True),
@@ -463,7 +463,7 @@ class WanMoveTrackToVideo(io.ComfyNode):
def define_schema(cls):
return io.Schema(
node_id="WanMoveTrackToVideo",
- category="conditioning/video_models",
+ category="model/conditioning/video_models",
inputs=[
io.Conditioning.Input("positive"),
io.Conditioning.Input("negative"),
diff --git a/comfy_extras/void_noise_warp.py b/comfy_extras/void_noise_warp.py
new file mode 100644
index 000000000..fcc9a5f8b
--- /dev/null
+++ b/comfy_extras/void_noise_warp.py
@@ -0,0 +1,494 @@
+"""
+Optical-flow-warped noise for VOID Pass 2 refinement.
+
+Adapted from RyannDaGreat/CommonSource (MIT License, Ryan Burgert):
+ https://github.com/RyannDaGreat/CommonSource
+ - noise_warp.py (NoiseWarper / warp_xyωc / regaussianize / get_noise_from_video)
+ - raft.py (RaftOpticalFlow)
+
+Only the code paths that ``comfy_extras/nodes_void.py::VOIDWarpedNoise`` actually
+uses (torch THWC uint8 input, no background removal, no visualization, no disk
+I/O, default warp/noise params) have been inlined. External ``rp`` utilities
+have been replaced with equivalents from torch.nn.functional / einops. The
+RAFT optical-flow model itself is loaded offline via ``OpticalFlowLoader`` in
+``nodes_void.py`` and passed into ``get_noise_from_video`` by the caller; this
+module never downloads weights at runtime.
+"""
+
+import logging
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+
+import comfy.model_management
+
+
+# ---------------------------------------------------------------------------
+# Low-level torch image helpers (drop-in replacements for rp.torch_* primitives)
+# ---------------------------------------------------------------------------
+
+def _torch_resize_chw(image, size, interp, copy=True):
+    """Resample a single CHW tensor to a new spatial size.
+
+    ``size`` may be a numeric scale factor (int/float, not bool) or an
+    explicit ``(h, w)`` pair; ``interp`` is forwarded to ``F.interpolate`` as
+    its ``mode``. If the target equals the input size no resampling happens:
+    a clone is returned when ``copy`` is True, otherwise the input itself.
+    """
+    if image.ndim != 3:
+        raise ValueError(
+            f"_torch_resize_chw expects a 3D CHW tensor, got shape {tuple(image.shape)}"
+        )
+    src_h, src_w = image.shape[1], image.shape[2]
+    is_factor = isinstance(size, (int, float)) and not isinstance(size, bool)
+    if not is_factor:
+        dst_h, dst_w = size
+    else:
+        # Scale factor: truncate toward zero but never drop below one pixel.
+        dst_h, dst_w = max(1, int(src_h * size)), max(1, int(src_w * size))
+    if (dst_h, dst_w) == (src_h, src_w):
+        return image if not copy else image.clone()
+
+    # align_corners is only passed for the bilinear/bicubic modes.
+    extra = {"align_corners": False} if interp in ("bilinear", "bicubic") else {}
+    batched = F.interpolate(image[None], size=(dst_h, dst_w), mode=interp, **extra)
+    # Drop the temporary batch dimension added for F.interpolate.
+    return batched[0]
+
+
+def _torch_remap_relative(image, dx, dy, interp="bilinear"):
+    """Relative remap of a CHW image via ``F.grid_sample``.
+
+    Equivalent to ``rp.torch_remap_image(image, dx, dy, relative=True, interp=interp)``
+    for ``interp`` in {"bilinear", "nearest"}. Out-of-bounds samples are 0.
+    """
+    if image.ndim != 3:
+        raise ValueError(
+            f"_torch_remap_relative expects a 3D CHW tensor, got shape {tuple(image.shape)}"
+        )
+    if dx.shape != dy.shape:
+        raise ValueError(
+            f"_torch_remap_relative: dx and dy must match, got {tuple(dx.shape)} vs {tuple(dy.shape)}"
+        )
+    _, h, w = image.shape
+    # Absolute source coordinates: per-pixel offsets added to the pixel grid.
+    x_abs = dx + torch.arange(w, device=dx.device, dtype=dx.dtype)
+    y_abs = dy + torch.arange(h, device=dy.device, dtype=dy.dtype)[:, None]
+    # Normalize to grid_sample's [-1, 1] range (align_corners=True). max(..., 1)
+    # avoids 0/0 -> NaN on a size-1 axis, where every sample reads index 0.
+    x_norm = (x_abs / max(w - 1, 1)) * 2 - 1
+    y_norm = (y_abs / max(h - 1, 1)) * 2 - 1
+    grid = torch.stack([x_norm, y_norm], dim=-1)[None].to(image.dtype)
+    out = F.grid_sample(
+        image[None], grid, mode=interp, align_corners=True, padding_mode="zeros"
+    )[0]
+    return out
+
+
+def _torch_scatter_add_relative(image, dx, dy):
+    """Scatter-add a CHW image using relative floor-rounded (dx, dy) offsets.
+
+    Equivalent to ``rp.torch_scatter_add_image(image, dx, dy, relative=True,
+    interp='floor')``. Out-of-bounds targets are dropped.
+    """
+    if image.ndim != 3:
+        raise ValueError(
+            f"_torch_scatter_add_relative expects a 3D CHW tensor, got shape {tuple(image.shape)}"
+        )
+    in_c, in_h, in_w = image.shape
+    if dx.shape != (in_h, in_w) or dy.shape != (in_h, in_w):
+        raise ValueError(
+            f"_torch_scatter_add_relative: dx/dy must be ({in_h}, {in_w}), "
+            f"got dx={tuple(dx.shape)} dy={tuple(dy.shape)}"
+        )
+    # floor() first: .long() alone truncates toward zero (-0.5 -> 0, not -1).
+    x = dx.floor().long() + torch.arange(in_w, device=dx.device, dtype=torch.long)
+    y = dy.floor().long() + torch.arange(in_h, device=dy.device, dtype=torch.long)[:, None]
+    # Keep only source pixels whose target lies inside the image.
+    valid = ((y >= 0) & (y < in_h) & (x >= 0) & (x < in_w)).reshape(-1)
+    indices = (y * in_w + x).reshape(-1)[valid]
+    # index_add_ accumulates sources that land on the same target pixel.
+    flat_image = rearrange(image, "c h w -> (h w) c")[valid]
+    out = torch.zeros((in_h * in_w, in_c), dtype=image.dtype, device=image.device)
+    out.index_add_(0, indices, flat_image)
+    return rearrange(out, "(h w) c -> c h w", h=in_h, w=in_w)
+
+
+# ---------------------------------------------------------------------------
+# Noise warping primitives (ported from noise_warp.py)
+# ---------------------------------------------------------------------------
+
+def unique_pixels(image):
+    """Deduplicate the per-pixel channel vectors of a CHW tensor.
+
+    Returns a 3-tuple: the distinct pixel vectors ``[U, C]``, how many pixels
+    share each one ``[U]``, and an ``[H, W]`` map from pixel to vector index.
+    """
+    height, width = image.shape[1:]
+    pixels = rearrange(image, "c h w -> (h w) c")
+    # dim=0 deduplicates whole rows, i.e. complete pixel vectors.
+    colors, inverse, counts = torch.unique(
+        pixels, dim=0, return_inverse=True, return_counts=True, sorted=False)
+    index_matrix = rearrange(inverse, "(h w) -> h w", h=height, w=width)
+    return colors, counts, index_matrix
+
+
+def sum_indexed_values(image, index_matrix):
+    """Sum the CHW ``image`` pixel vectors that share an index; returns ``[U, C]``.
+
+    ``U`` is ``index_matrix.max() + 1``; rows for indices never referenced stay zero.
+    """
+    flat = rearrange(image, "c h w -> (h w) c")
+    out = flat.new_zeros((int(index_matrix.max().item()) + 1, flat.shape[1]))
+    return out.index_add_(0, index_matrix.reshape(-1), flat)  # reshape (not view) also accepts non-contiguous indices
+
+
+def indexed_to_image(index_matrix, unique_colors):
+    """Look up every ``[H, W]`` index in the ``[U, C]`` table and return the CHW result."""
+    rows, cols = index_matrix.shape
+    gathered = unique_colors[index_matrix.view(-1)]  # (H*W, C)
+    return rearrange(gathered, "(h w) c -> c h w", h=rows, w=cols)
+
+
+def regaussianize(noise):
+    """Restore unit-variance gaussian statistics to a CHW noise tensor.
+
+    Pixels are grouped by identical values in channel 0 (duplicates appear
+    after e.g. a nearest-neighbor warp). Each pixel is divided by the square
+    root of its group size and receives fresh noise whose per-group mean has
+    been removed. Returns ``(new_noise, counts_image)`` where ``counts_image``
+    is a ``[1, H, W]`` tensor holding every pixel's group size.
+    """
+    # Grouping looks at the first channel only.
+    _, group_sizes, group_ids = unique_pixels(noise[:1])
+    sizes_col = rearrange(group_sizes, "u -> u 1")
+
+    fresh = torch.randn_like(noise)
+    group_mean = sum_indexed_values(fresh, group_ids) / sizes_col
+    # Subtract each group's mean so the added noise sums to zero per group.
+    centered = fresh - indexed_to_image(group_ids, group_mean)
+
+    counts_image = indexed_to_image(group_ids, sizes_col)
+    return noise / counts_image ** 0.5 + centered, counts_image
+
+
+def xy_meshgrid_like_image(image):
+    """Pixel-coordinate grid for a CHW ``image``: ``out[0]`` holds x, ``out[1]`` holds y."""
+    rows, cols = image.shape[1:]
+    # Coordinates share the image's device and dtype.
+    ys = torch.arange(rows, device=image.device, dtype=image.dtype)
+    xs = torch.arange(cols, device=image.device, dtype=image.dtype)
+    # "ij" indexing: first output varies along rows, second along columns.
+    yy, xx = torch.meshgrid(ys, xs, indexing="ij")
+    return torch.stack([xx, yy])
+
+
+def noise_to_state(noise):
+    """Prepend ``[dx, dy, ω] = [0, 0, 1]`` planes to ``(C, H, W)`` noise, giving ``(3+C, H, W)``."""
+    plane = noise[:1]
+    offsets, weight = torch.zeros_like(plane), torch.ones_like(plane)
+    return torch.cat([offsets, offsets, weight, noise])
+
+
+def state_to_noise(state):
+    """Return the noise channels of a state tensor, i.e. everything after ``[dx, dy, ω]``."""
+    return state[3:]
+
+
+def warp_state(state, flow):
+    """Warp a noise-warper state tensor along the given optical flow.
+
+    ``state`` has shape ``(3+c, h, w)`` (= dx, dy, ω, c noise channels).
+    ``flow`` has shape ``(2, h, w)`` (= dx, dy).
+    """
+    if flow.device != state.device:
+        raise ValueError(
+            f"warp_state: flow and state must be on the same device, "
+            f"got flow={flow.device} state={state.device}"
+        )
+    if state.ndim != 3:
+        raise ValueError(
+            f"warp_state: state must be 3D (3+C, H, W), got shape {tuple(state.shape)}"
+        )
+    xyoc, h, w = state.shape
+    if flow.shape != (2, h, w):
+        raise ValueError(
+            f"warp_state: flow must have shape (2, {h}, {w}), got {tuple(flow.shape)}"
+        )
+    device = state.device
+
+    x_ch, y_ch = 0, 1
+    xy = 2  # state[:xy] = [dx, dy]
+    xyw = 3  # state[:xyw] = [dx, dy, ω]
+    w_ch = 2  # state[w_ch] = ω
+    c = xyoc - xyw  # number of noise channels
+    oc = xyoc - xy  # number of ω + noise channels
+    if c <= 0:
+        raise ValueError(
+            f"warp_state: state has no noise channels (expected 3+C with C>0, got {xyoc} channels)"
+        )
+    if not (state[w_ch] > 0).all():
+        raise ValueError("warp_state: all weights in state[2] must be > 0")
+
+    grid = xy_meshgrid_like_image(state)
+    # Blank state (zero offsets, unit weight, zero noise) substituted for dropped pixels below.
+    init = torch.empty_like(state)
+    init[:xy] = 0
+    init[w_ch] = 1
+    init[-c:] = 0
+
+    # --- Expansion branch: nearest-neighbor remap with negated flow ---
+    pre_expand = torch.empty_like(state)
+    pre_expand[:xy] = _torch_remap_relative(state[:xy], -flow[0], -flow[1], "nearest")
+    pre_expand[-oc:] = _torch_remap_relative(state[-oc:], -flow[0], -flow[1], "nearest")
+    pre_expand[w_ch][pre_expand[w_ch] == 0] = 1  # input weights are > 0, so 0 here comes from the remap's zero padding
+
+    # --- Shrink branch: scatter-add state into new positions ---
+    pre_shrink = state.clone()
+    pre_shrink[:xy] += flow
+    # Pixels whose accumulated offset points outside the frame are reset to `init`.
+    pos = (grid + pre_shrink[:xy]).round()
+    in_bounds = (pos[x_ch] >= 0) & (pos[x_ch] < w) & (pos[y_ch] >= 0) & (pos[y_ch] < h)
+    pre_shrink = torch.where(~in_bounds[None], init, pre_shrink)
+    # The rounded offset is the relative displacement each pixel is scattered by.
+    scat_xy = pre_shrink[:xy].round()
+    pre_shrink[:xy] -= scat_xy  # sub-pixel remainder; overwritten by the next line
+    pre_shrink[:xy] = 0  # xy_mode='none' in upstream
+
+    def scat(tensor):
+        return _torch_scatter_add_relative(tensor, scat_xy[0], scat_xy[1])
+
+    # rp.torch_scatter_add_image on a bool tensor errors on modern torch;
+    # scatter-sum a float ones tensor and threshold to get the mask instead.
+    shrink_mask = scat(torch.ones(1, h, w, dtype=state.dtype, device=device)) > 0
+
+    # Drop expansion samples at positions that will be filled by shrink.
+    pre_expand = torch.where(shrink_mask, init, pre_expand)
+
+    # Regaussianize both branches together so duplicated-source groups are
+    # counted globally, then split back apart.
+    concat = torch.cat([pre_shrink, pre_expand], dim=2)  # along width
+    concat[-c:], counts_image = regaussianize(concat[-c:])
+    concat[w_ch] = concat[w_ch] / counts_image[0]  # divide each weight by its duplicate-group size
+    concat[w_ch] = concat[w_ch].nan_to_num()
+    pre_shrink, expand = torch.chunk(concat, chunks=2, dim=2)
+
+    shrink = torch.empty_like(pre_shrink)
+    shrink[w_ch] = scat(pre_shrink[w_ch][None])[0]
+    shrink[:xy] = scat(pre_shrink[:xy] * pre_shrink[w_ch][None]) / shrink[w_ch][None]
+    shrink[-c:] = scat(pre_shrink[-c:] * pre_shrink[w_ch][None]) / scat(
+        pre_shrink[w_ch][None] ** 2
+    ).sqrt()
+    # Targets that received no source divide 0 by 0 above; torch.where below takes `expand` there.
+    output = torch.where(shrink_mask, shrink, expand)
+    output[w_ch] = output[w_ch] / output[w_ch].mean()  # renormalize weights to mean 1
+    output[w_ch] += 1e-5  # small positive offset; presumably keeps weights > 0 for the next call's check
+    output[w_ch] **= 0.9999  # NOTE(review): exponent just below 1; looks like a mild pull toward 1 - confirm against upstream
+    return output
+
+
+class NoiseWarper:
+    """Maintain a warpable noise state and emit gaussian noise per frame.
+
+    Simplified from RyannDaGreat/CommonSource/noise_warp.py::NoiseWarper:
+    ``scale_factor``, ``post_noise_alpha``, ``progressive_noise_alpha``, and
+    ``warp_kwargs`` are all dropped since VOIDWarpedNoise always uses defaults.
+    """
+
+    def __init__(self, c, h, w, device, dtype=torch.float32):
+        """Start from fresh ``(c, h, w)`` gaussian noise; raises ValueError on non-positive sizes."""
+        if c <= 0 or h <= 0 or w <= 0:
+            raise ValueError(
+                f"NoiseWarper: c/h/w must all be positive, got c={c} h={h} w={w}"
+            )
+        self.c = c
+        self.h = h
+        self.w = w
+        self.device = device
+        self.dtype = dtype
+        noise = torch.randn(c, h, w, dtype=dtype, device=device)
+        self._state = noise_to_state(noise)
+
+    @property
+    def noise(self):
+        # With scale_factor=1 the "downsample to respect weights" step is a
+        # size-preserving no-op; the weight-variance correction math still
+        # runs to stay faithful to upstream.
+        n = state_to_noise(self._state)
+        weights = self._state[2:3]
+        return n * weights / (weights ** 2).sqrt()
+
+    def __call__(self, dx, dy):
+        """Warp the state by the flow field ``(dx, dy)`` (equal-shape 2D tensors); returns ``self``."""
+        if dx.shape != dy.shape:
+            raise ValueError(
+                f"NoiseWarper: dx and dy must match, got {tuple(dx.shape)} vs {tuple(dy.shape)}"
+            )
+        flow = torch.stack([dx, dy]).to(self.device, self.dtype)
+        _, oflowh, ofloww = flow.shape
+        flow = _torch_resize_chw(flow, (self.h, self.w), "bilinear", copy=True)
+        flowh, floww = flow.shape[-2:]
+
+        # flow[0] is the horizontal displacement (dx), so it scales with the
+        # width ratio; flow[1] (dy) scales with the height ratio. Upstream has
+        # the two swapped, which only agrees when both ratios are equal.
+        flow[0] *= floww / ofloww
+        flow[1] *= flowh / oflowh
+
+        self._state = warp_state(self._state, flow)
+        return self
+
+
+# ---------------------------------------------------------------------------
+# RAFT optical flow wrapper (ported from raft.py)
+# ---------------------------------------------------------------------------
+
+class RaftOpticalFlow:
+    """Callable optical-flow estimator backed by an already-loaded RAFT model.
+
+    The ``model`` passed in is moved to ``device`` and switched to eval mode;
+    this class never loads weights itself (``OpticalFlowLoader`` in
+    ``nodes_void.py`` is the documented provider). Calling the instance with
+    two CHW frames yields a ``(2, H, W)`` flow at the frames' resolution.
+    """
+
+    def __init__(self, model, device=None):
+        if device is None:
+            device = comfy.model_management.get_torch_device()
+        if not isinstance(device, torch.device):
+            device = torch.device(device)
+        self.device = device
+        # Keep whatever ``model.to(device)`` returns, then put it in eval mode.
+        self.model = model.to(device)
+        self.model.eval()
+
+    def _preprocess(self, image_chw):
+        """Float32 on-device copy, sides rounded down to multiples of 8, [0, 1] -> [-1, 1], batched."""
+        frame = image_chw.to(self.device, torch.float32)
+        height, width = frame.shape[1:]
+        target = ((height // 8) * 8, (width // 8) * 8)
+        frame = _torch_resize_chw(frame, target, "bilinear", copy=False)
+        return (frame * 2 - 1)[None]
+
+    def __call__(self, from_image, to_image):
+        """Estimate flow between two equally shaped CHW float frames in [0, 1]."""
+        if from_image.shape != to_image.shape:
+            raise ValueError(
+                f"RaftOpticalFlow: from_image and to_image must match, "
+                f"got {tuple(from_image.shape)} vs {tuple(to_image.shape)}"
+            )
+        _, out_h, out_w = from_image.shape
+        with torch.no_grad():
+            first = self._preprocess(from_image)
+            second = self._preprocess(to_image)
+            # The model returns a sequence of flow estimates; use the last one, batch item 0.
+            flow = self.model(first, second)[-1][0]
+            # NOTE(review): only the grid is resampled back; flow magnitudes are not rescaled.
+            if flow.shape[-2:] != (out_h, out_w):
+                flow = _torch_resize_chw(flow, (out_h, out_w), "bilinear", copy=False)
+        return flow
+
+
+# ---------------------------------------------------------------------------
+# Narrow entry point used by VOIDWarpedNoise
+# ---------------------------------------------------------------------------
+
+def get_noise_from_video(
+    video_frames: torch.Tensor,
+    raft: RaftOpticalFlow,
+    *,
+    noise_channels: int = 16,
+    resize_frames: float = 0.5,
+    resize_flow: int = 8,
+    downscale_factor: int = 32,
+    device: Optional[torch.device] = None,
+) -> torch.Tensor:
+    """Produce optical-flow-warped gaussian noise from a video.
+
+    Args:
+        video_frames: ``(T, H, W, 3)`` uint8 torch tensor with ``T >= 1``.
+        raft: Pre-loaded RAFT optical-flow wrapper (see ``RaftOpticalFlow``).
+        noise_channels: Channels in the output noise.
+        resize_frames: Pre-RAFT frame scale factor.
+        resize_flow: Post-flow up-scale factor applied to the optical flow;
+            the internal noise state is allocated at
+            ``(resize_flow * resize_frames * H, resize_flow * resize_frames * W)``.
+        downscale_factor: Area-pool factor (>= 1) applied to the noise before
+            return; should evenly divide the internal noise resolution.
+        device: Target device. Defaults to ``comfy.model_management.get_torch_device()``.
+
+    Returns:
+        ``(T, H', W', noise_channels)`` float32 noise tensor on ``device``.
+
+    Raises:
+        ValueError: bad ``resize_flow``/``downscale_factor``/shape or no frames; TypeError: non-uint8 frames.
+    """
+    if not isinstance(resize_flow, int) or resize_flow < 1:
+        raise ValueError(
+            f"get_noise_from_video: resize_flow must be a positive int, got {resize_flow!r}"
+        )
+    if downscale_factor < 1:
+        raise ValueError(
+            f"get_noise_from_video: downscale_factor must be >= 1, got {downscale_factor!r}"
+        )
+    if video_frames.ndim != 4 or video_frames.shape[-1] != 3:
+        raise ValueError(
+            "get_noise_from_video: video_frames must have shape (T, H, W, 3), "
+            f"got {tuple(video_frames.shape)}"
+        )
+    if video_frames.dtype != torch.uint8:
+        raise TypeError(
+            "get_noise_from_video: video_frames must be uint8 in [0, 255], "
+            f"got dtype {video_frames.dtype}"
+        )
+    T = video_frames.shape[0]
+    if T < 1:  # output[0] below would otherwise fail with an IndexError
+        raise ValueError("get_noise_from_video: video_frames must contain at least one frame")
+    if device is None:
+        device = comfy.model_management.get_torch_device()
+    device = torch.device(device) if not isinstance(device, torch.device) else device
+    if device.type == "cpu":
+        logging.warning(
+            "VOIDWarpedNoise: running get_noise_from_video on CPU; this will be "
+            "slow (minutes for ~45 frames). Use CUDA for interactive use."
+        )
+
+    frames = video_frames.to(device).permute(0, 3, 1, 2).to(torch.float32) / 255.0
+    if resize_frames != 1.0:
+        new_h = max(1, int(frames.shape[2] * resize_frames))
+        new_w = max(1, int(frames.shape[3] * resize_frames))
+        frames = F.interpolate(frames, size=(new_h, new_w), mode="area")
+
+    _, _, H, W = frames.shape
+    internal_h, internal_w = resize_flow * H, resize_flow * W
+    # Integer output size, computed once so buffer allocation and pooling always agree.
+    down_h, down_w = internal_h // downscale_factor, internal_w // downscale_factor
+    if down_h < 1 or down_w < 1:
+        raise ValueError(
+            f"get_noise_from_video: downscale_factor {downscale_factor} exceeds the internal noise size {internal_h}x{internal_w}"
+        )
+    if internal_h % downscale_factor or internal_w % downscale_factor:
+        logging.warning(
+            "VOIDWarpedNoise: internal noise size %dx%d is not divisible by "
+            "downscale_factor %d; output noise may have artifacts.",
+            internal_h, internal_w, downscale_factor,
+        )
+
+    def downscale(noise_chw):
+        # Area-pool to (down_h, down_w); multiply by downscale_factor (sqrt of the pool area) to restore std.
+        return _torch_resize_chw(noise_chw, (down_h, down_w), "area", copy=False) * downscale_factor
+
+    with torch.no_grad():
+        warper = NoiseWarper(c=noise_channels, h=internal_h, w=internal_w, device=device)
+        output = torch.empty((T, down_h, down_w, noise_channels), dtype=torch.float32, device=device)
+        output[0] = downscale(warper.noise).permute(1, 2, 0)
+        prev = frames[0]
+        for i in range(1, T):
+            curr = frames[i]
+            flow = raft(prev, curr).to(device)
+            warper(flow[0], flow[1])
+            output[i] = downscale(warper.noise).permute(1, 2, 0)
+            prev = curr
+    return output
diff --git a/comfyui_version.py b/comfyui_version.py
index 61d7672ca..0bb0f780c 100644
--- a/comfyui_version.py
+++ b/comfyui_version.py
@@ -1,3 +1,3 @@
# This file is automatically generated by the build process when version is
# updated in pyproject.toml.
-__version__ = "0.18.1"
+__version__ = "0.22.0"
diff --git a/custom_nodes/websocket_image_save.py b/custom_nodes/websocket_image_save.py
index 15f87f9f5..6a8646d0e 100644
--- a/custom_nodes/websocket_image_save.py
+++ b/custom_nodes/websocket_image_save.py
@@ -22,7 +22,7 @@ class SaveImageWebsocket:
OUTPUT_NODE = True
- CATEGORY = "api/image"
+ CATEGORY = "image"
def save_images(self, images):
pbar = comfy.utils.ProgressBar(images.shape[0])
@@ -42,3 +42,7 @@ class SaveImageWebsocket:
NODE_CLASS_MAPPINGS = {
"SaveImageWebsocket": SaveImageWebsocket,
}
+
+NODE_DISPLAY_NAME_MAPPINGS = {
+ "SaveImageWebsocket": "Save Image (Websocket)",
+}
\ No newline at end of file
diff --git a/execution.py b/execution.py
index 5e02dffb2..5246d651c 100644
--- a/execution.py
+++ b/execution.py
@@ -2,6 +2,7 @@ import copy
import heapq
import inspect
import logging
+import psutil
import sys
import threading
import time
@@ -15,6 +16,7 @@ import torch
from comfy.cli_args import args
import comfy.memory_management
import comfy.model_management
+import comfy.model_prefetch
import comfy_aimdo.model_vbar
from latent_preview import set_preview_method
@@ -537,6 +539,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
if args.verbose == "DEBUG":
comfy_aimdo.control.analyze()
comfy.model_management.reset_cast_buffers()
+ comfy.model_prefetch.cleanup_prefetch_queues()
comfy_aimdo.model_vbar.vbars_reset_watermark_limits()
if has_pending_tasks:
@@ -624,7 +627,7 @@ async def execute(server, dynprompt, caches, current_item, extra_data, executed,
if comfy.model_management.is_oom(ex):
tips = "This error means you ran out of memory on your GPU.\n\nTIPS: If the workflow worked before you might have accidentally set the batch_size to a large number."
- logging.info("Memory summary: {}".format(comfy.model_management.debug_memory_summary()))
+ logging.info("Memory summary:\n{}".format(comfy.model_management.debug_memory_summary()))
logging.error("Got an OOM, unloading all loaded models.")
comfy.model_management.unload_all_models()
elif isinstance(ex, RuntimeError) and ("mat1 and mat2 shapes" in str(ex)) and "Sampler" in class_type:
@@ -725,6 +728,7 @@ class PromptExecutor:
self._notify_prompt_lifecycle("start", prompt_id)
ram_headroom = int(self.cache_args["ram"] * (1024 ** 3))
+ ram_inactive_headroom = int(self.cache_args["ram_inactive"] * (1024 ** 3))
ram_release_callback = self.caches.outputs.ram_release if self.cache_type == CacheType.RAM_PRESSURE else None
comfy.memory_management.set_ram_cache_release_state(ram_release_callback, ram_headroom)
@@ -778,8 +782,14 @@ class PromptExecutor:
execution_list.complete_node_execution()
if self.cache_type == CacheType.RAM_PRESSURE:
- comfy.model_management.free_memory(0, None, pins_required=ram_headroom, ram_required=ram_headroom)
- comfy.memory_management.extra_ram_release(ram_headroom)
+ ram_release_callback(ram_inactive_headroom)
+ ram_shortfall = ram_headroom - psutil.virtual_memory().available
+ freed = comfy.model_management.free_pins(ram_shortfall + 512 * (1024 ** 2))
+ if freed < ram_shortfall:
+ if freed > 64 * (1024 ** 2):
+ # AIMDO MEM_DECOMMIT can outrun psutil.available catching up.
+ time.sleep(0.05)
+ ram_release_callback(ram_headroom, free_active=True)
else:
# Only execute when the while-loop ends without break
# Send cached UI for intermediate output nodes that weren't executed
@@ -811,11 +821,30 @@ class PromptExecutor:
self._notify_prompt_lifecycle("end", prompt_id)
-async def validate_inputs(prompt_id, prompt, item, validated):
+async def validate_inputs(prompt_id, prompt, item, validated, visiting=None):
+ if visiting is None:
+ visiting = []
+
unique_id = item
if unique_id in validated:
return validated[unique_id]
+ if unique_id in visiting:
+ cycle_path_nodes = visiting[visiting.index(unique_id):] + [unique_id]
+ cycle_nodes = list(dict.fromkeys(cycle_path_nodes))
+ cycle_path = " -> ".join(f"{node_id} ({prompt[node_id]['class_type']})" for node_id in cycle_path_nodes)
+ for node_id in cycle_nodes:
+ validated[node_id] = (False, [{
+ "type": "dependency_cycle",
+ "message": "Dependency cycle detected",
+ "details": cycle_path,
+ "extra_info": {
+ "node_id": node_id,
+ "cycle_nodes": cycle_nodes,
+ }
+ }], node_id)
+ return validated[unique_id]
+
inputs = prompt[unique_id]['inputs']
class_type = prompt[unique_id]['class_type']
obj_class = nodes.NODE_CLASS_MAPPINGS[class_type]
@@ -899,7 +928,11 @@ async def validate_inputs(prompt_id, prompt, item, validated):
errors.append(error)
continue
try:
- r = await validate_inputs(prompt_id, prompt, o_id, validated)
+ visiting.append(unique_id)
+ try:
+ r = await validate_inputs(prompt_id, prompt, o_id, validated, visiting)
+ finally:
+ visiting.pop()
if r[0] is False:
# `r` will be set in `validated[o_id]` already
valid = False
@@ -994,7 +1027,12 @@ async def validate_inputs(prompt_id, prompt, item, validated):
combo_options = extra_info.get("options", [])
else:
combo_options = input_type
- if val not in combo_options:
+ is_multiselect = extra_info.get("multiselect", False)
+ if is_multiselect and isinstance(val, list):
+ invalid_vals = [v for v in val if v not in combo_options]
+ else:
+ invalid_vals = [val] if val not in combo_options else []
+ if invalid_vals:
input_config = info
list_info = ""
@@ -1009,7 +1047,7 @@ async def validate_inputs(prompt_id, prompt, item, validated):
error = {
"type": "value_not_in_list",
"message": "Value not in list",
- "details": f"{x}: '{val}' not in {list_info}",
+ "details": f"{x}: {', '.join(repr(v) for v in invalid_vals)} not in {list_info}",
"extra_info": {
"input_name": x,
"input_config": input_config,
@@ -1048,10 +1086,13 @@ async def validate_inputs(prompt_id, prompt, item, validated):
errors.append(error)
continue
- if len(errors) > 0 or valid is not True:
- ret = (False, errors, unique_id)
- else:
- ret = (True, [], unique_id)
+ ret = validated.get(unique_id, (True, [], unique_id))
+ # Recursive cycle detection may have already populated an error on us. Join it.
+ ret = (
+ ret[0] and valid is True and not errors,
+ ret[1] + [error for error in errors if error not in ret[1]],
+ unique_id,
+ )
validated[unique_id] = ret
return ret
diff --git a/extra_model_paths.yaml.example b/extra_model_paths.yaml.example
index 34df01681..9c395c0b2 100644
--- a/extra_model_paths.yaml.example
+++ b/extra_model_paths.yaml.example
@@ -28,7 +28,7 @@
#config for a1111 ui
#all you have to do is uncomment this (remove the #) and change the base_path to where yours is installed
-#a111:
+#a1111:
# base_path: path/to/stable-diffusion-webui/
# checkpoints: models/Stable-diffusion
# configs: models/Stable-diffusion
diff --git a/folder_paths.py b/folder_paths.py
index 9c96540e3..7304e1b73 100644
--- a/folder_paths.py
+++ b/folder_paths.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
import os
import time
import mimetypes
@@ -52,6 +50,16 @@ folder_names_and_paths["model_patches"] = ([os.path.join(models_dir, "model_patc
folder_names_and_paths["audio_encoders"] = ([os.path.join(models_dir, "audio_encoders")], supported_pt_extensions)
+folder_names_and_paths["background_removal"] = ([os.path.join(models_dir, "background_removal")], supported_pt_extensions)
+
+folder_names_and_paths["frame_interpolation"] = ([os.path.join(models_dir, "frame_interpolation")], supported_pt_extensions)
+
+folder_names_and_paths["geometry_estimation"] = ([os.path.join(models_dir, "geometry_estimation")], supported_pt_extensions)
+
+folder_names_and_paths["optical_flow"] = ([os.path.join(models_dir, "optical_flow")], supported_pt_extensions)
+
+folder_names_and_paths["detection"] = ([os.path.join(models_dir, "detection")], supported_pt_extensions)
+
output_directory = os.path.join(base_path, "output")
temp_directory = os.path.join(base_path, "temp")
input_directory = os.path.join(base_path, "input")
@@ -430,7 +438,9 @@ def get_save_image_path(filename_prefix: str, output_dir: str, image_width=0, im
prefix_len = len(os.path.basename(filename_prefix))
prefix = filename[:prefix_len + 1]
try:
- digits = int(filename[prefix_len + 1:].split('_')[0])
+ remainder = filename[prefix_len + 1:]
+ base_remainder = remainder.split('.')[0]
+ digits = int(base_remainder.split('_')[0])
except:
digits = 0
return digits, prefix
diff --git a/main.py b/main.py
index 12b04719d..bce451a83 100644
--- a/main.py
+++ b/main.py
@@ -1,14 +1,24 @@
import comfy.options
comfy.options.enable_args_parsing()
+from comfy.cli_args import args
+
+if args.list_feature_flags:
+ import json
+ from comfy_api.feature_flags import CLI_FEATURE_FLAG_REGISTRY
+ print(json.dumps(CLI_FEATURE_FLAG_REGISTRY, indent=2)) # noqa: T201
+ raise SystemExit(0)
+
import os
import importlib.util
import shutil
import importlib.metadata
import folder_paths
import time
-from comfy.cli_args import args, enables_dynamic_vram
+from comfy.cli_args import enables_dynamic_vram
from app.logger import setup_logger
+setup_logger(log_level=args.verbose, use_stdout=args.log_stdout)
+
from app.assets.seeder import asset_seeder
from app.assets.services import register_output_files
import itertools
@@ -27,8 +37,6 @@ if __name__ == "__main__":
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'
os.environ['DO_NOT_TRACK'] = '1'
-setup_logger(log_level=args.verbose, use_stdout=args.log_stdout)
-
faulthandler.enable(file=sys.stderr, all_threads=False)
import comfy_aimdo.control
@@ -210,7 +218,7 @@ import comfy.model_patcher
if args.enable_dynamic_vram or (enables_dynamic_vram() and comfy.model_management.is_nvidia() and not comfy.model_management.is_wsl()):
if (not args.enable_dynamic_vram) and (comfy.model_management.torch_version_numeric < (2, 8)):
logging.warning("Unsupported Pytorch detected. DynamicVRAM support requires Pytorch version 2.8 or later. Falling back to legacy ModelPatcher. VRAM estimates may be unreliable especially on Windows")
- elif comfy_aimdo.control.init_device(comfy.model_management.get_torch_device().index):
+ elif comfy_aimdo.control.init_devices(d.index for d in comfy.model_management.get_all_torch_devices()):
if args.verbose == 'DEBUG':
comfy_aimdo.control.set_log_debug()
elif args.verbose == 'CRITICAL':
@@ -275,19 +283,25 @@ def _collect_output_absolute_paths(history_result: dict) -> list[str]:
def prompt_worker(q, server_instance):
current_time: float = 0.0
- cache_ram = args.cache_ram
- if cache_ram < 0:
- cache_ram = min(32.0, max(4.0, comfy.model_management.total_ram * 0.25 / 1024.0))
+ cache_ram = 0
+ cache_ram_inactive = 0
+ if not args.cache_classic and not args.cache_none and args.cache_lru <= 0:
+ cache_ram = min(10.0, max(2.0, comfy.model_management.total_ram * 0.10 / 1024.0))
+ cache_ram_inactive = min(96.0, comfy.model_management.total_ram / 1024.0)
+ if len(args.cache_ram) > 0:
+ cache_ram = args.cache_ram[0]
+ if len(args.cache_ram) > 1:
+ cache_ram_inactive = args.cache_ram[1]
- cache_type = execution.CacheType.CLASSIC
- if args.cache_lru > 0:
+ cache_type = execution.CacheType.RAM_PRESSURE
+ if args.cache_classic:
+ cache_type = execution.CacheType.CLASSIC
+ elif args.cache_lru > 0:
cache_type = execution.CacheType.LRU
- elif cache_ram > 0:
- cache_type = execution.CacheType.RAM_PRESSURE
elif args.cache_none:
cache_type = execution.CacheType.NONE
- e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram } )
+ e = execution.PromptExecutor(server_instance, cache_type=cache_type, cache_args={ "lru" : args.cache_lru, "ram" : cache_ram, "ram_inactive" : cache_ram_inactive } )
last_gc_collect = 0
need_gc = False
gc_collect_interval = 10.0
@@ -330,9 +344,9 @@ def prompt_worker(q, server_instance):
# Log Time in a more readable way after 10 minutes
if execution_time > 600:
execution_time = time.strftime("%H:%M:%S", time.gmtime(execution_time))
- logging.info(f"Prompt executed in {execution_time}")
+ logging.info(f"Prompt executed in {execution_time}", extra={'color': 'green'})
else:
- logging.info("Prompt executed in {:.2f} seconds".format(execution_time))
+ logging.info("Prompt executed in {:.2f} seconds".format(execution_time), extra={'color': 'green'})
if not asset_seeder.is_disabled():
paths = _collect_output_absolute_paths(e.history_result)
diff --git a/manager_requirements.txt b/manager_requirements.txt
index f770ec933..a079d3492 100644
--- a/manager_requirements.txt
+++ b/manager_requirements.txt
@@ -1 +1 @@
-comfyui_manager==4.1
+comfyui_manager==4.2.1
diff --git a/models/background_removal/put_background_removal_models_here b/models/background_removal/put_background_removal_models_here
new file mode 100644
index 000000000..e69de29bb
diff --git a/models/detection/put_detection_models_here b/models/detection/put_detection_models_here
new file mode 100644
index 000000000..e69de29bb
diff --git a/models/frame_interpolation/put_frame_interpolation_models_here b/models/frame_interpolation/put_frame_interpolation_models_here
new file mode 100644
index 000000000..e69de29bb
diff --git a/models/geometry_estimation/put_geometry_estimation_models_here b/models/geometry_estimation/put_geometry_estimation_models_here
new file mode 100644
index 000000000..e69de29bb
diff --git a/models/optical_flow/put_optical_flow_models_here b/models/optical_flow/put_optical_flow_models_here
new file mode 100644
index 000000000..e69de29bb
diff --git a/node_helpers.py b/node_helpers.py
index d3d834516..cac4e88dd 100644
--- a/node_helpers.py
+++ b/node_helpers.py
@@ -86,6 +86,6 @@ def image_alpha_fix(destination, source):
if destination.shape[-1] < source.shape[-1]:
source = source[...,:destination.shape[-1]]
elif destination.shape[-1] > source.shape[-1]:
- destination = torch.nn.functional.pad(destination, (0, 1))
- destination[..., -1] = 1.0
+ source = torch.nn.functional.pad(source, (0, 1))
+ source[..., -1] = 1.0
return destination, source
diff --git a/nodes.py b/nodes.py
index 299b3d758..528bf316f 100644
--- a/nodes.py
+++ b/nodes.py
@@ -1,4 +1,3 @@
-from __future__ import annotations
import torch
@@ -32,7 +31,7 @@ import comfy.controlnet
from comfy.comfy_types import IO, ComfyNodeABC, InputTypeDict, FileLocator
from comfy_api.internal import register_versions, ComfyAPIWithVersion
from comfy_api.version_list import supported_versions
-from comfy_api.latest import io, ComfyExtension
+from comfy_api.latest import io, ComfyExtension, InputImpl
import comfy.clip_vision
@@ -69,7 +68,7 @@ class CLIPTextEncode(ComfyNodeABC):
OUTPUT_TOOLTIPS = ("A conditioning containing the embedded text used to guide the diffusion model.",)
FUNCTION = "encode"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
DESCRIPTION = "Encodes a text prompt using a CLIP model into an embedding that can be used to guide the diffusion model towards generating specific images."
SEARCH_ALIASES = ["text", "prompt", "text prompt", "positive prompt", "negative prompt", "encode text", "text encoder", "encode prompt"]
@@ -88,7 +87,7 @@ class ConditioningCombine:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "combine"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
SEARCH_ALIASES = ["combine", "merge conditioning", "combine prompts", "merge prompts", "mix prompts", "add prompt"]
def combine(self, conditioning_1, conditioning_2):
@@ -105,7 +104,7 @@ class ConditioningAverage :
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "addWeighted"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def addWeighted(self, conditioning_to, conditioning_from, conditioning_to_strength):
out = []
@@ -144,7 +143,7 @@ class ConditioningConcat:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "concat"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def concat(self, conditioning_to, conditioning_from):
out = []
@@ -177,7 +176,7 @@ class ConditioningSetArea:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def append(self, conditioning, width, height, x, y, strength):
c = node_helpers.conditioning_set_values(conditioning, {"area": (height // 8, width // 8, y // 8, x // 8),
@@ -198,7 +197,7 @@ class ConditioningSetAreaPercentage:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def append(self, conditioning, width, height, x, y, strength):
c = node_helpers.conditioning_set_values(conditioning, {"area": ("percentage", height, width, y, x),
@@ -215,7 +214,7 @@ class ConditioningSetAreaStrength:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def append(self, conditioning, strength):
c = node_helpers.conditioning_set_values(conditioning, {"strength": strength})
@@ -235,7 +234,7 @@ class ConditioningSetMask:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def append(self, conditioning, mask, set_cond_area, strength):
set_area_to_bounds = False
@@ -304,7 +303,7 @@ class VAEDecode:
OUTPUT_TOOLTIPS = ("The decoded image.",)
FUNCTION = "decode"
- CATEGORY = "latent"
+ CATEGORY = "model/latent"
DESCRIPTION = "Decodes latent images back into pixel space images."
SEARCH_ALIASES = ["decode", "decode latent", "latent to image", "render latent"]
@@ -330,7 +329,7 @@ class VAEDecodeTiled:
RETURN_TYPES = ("IMAGE",)
FUNCTION = "decode"
- CATEGORY = "_for_testing"
+ CATEGORY = "experimental"
def decode(self, vae, samples, tile_size, overlap=64, temporal_size=64, temporal_overlap=8):
if tile_size < overlap * 4:
@@ -358,7 +357,7 @@ class VAEEncode:
RETURN_TYPES = ("LATENT",)
FUNCTION = "encode"
- CATEGORY = "latent"
+ CATEGORY = "model/latent"
SEARCH_ALIASES = ["encode", "encode image", "image to latent"]
def encode(self, vae, pixels):
@@ -377,7 +376,7 @@ class VAEEncodeTiled:
RETURN_TYPES = ("LATENT",)
FUNCTION = "encode"
- CATEGORY = "_for_testing"
+ CATEGORY = "experimental"
def encode(self, vae, pixels, tile_size, overlap, temporal_size=64, temporal_overlap=8):
t = vae.encode_tiled(pixels, tile_x=tile_size, tile_y=tile_size, overlap=overlap, tile_t=temporal_size, overlap_t=temporal_overlap)
@@ -390,7 +389,7 @@ class VAEEncodeForInpaint:
RETURN_TYPES = ("LATENT",)
FUNCTION = "encode"
- CATEGORY = "latent/inpaint"
+ CATEGORY = "model/latent/inpaint"
def encode(self, vae, pixels, mask, grow_mask_by=6):
downscale_ratio = vae.spacial_compression_encode()
@@ -439,7 +438,7 @@ class InpaintModelConditioning:
RETURN_NAMES = ("positive", "negative", "latent")
FUNCTION = "encode"
- CATEGORY = "conditioning/inpaint"
+ CATEGORY = "model/conditioning/inpaint"
def encode(self, positive, negative, pixels, vae, mask, noise_mask=True):
x = (pixels.shape[1] // 8) * 8
@@ -493,7 +492,7 @@ class SaveLatent:
OUTPUT_NODE = True
- CATEGORY = "_for_testing"
+ CATEGORY = "experimental"
def save(self, samples, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(filename_prefix, self.output_dir)
@@ -538,7 +537,7 @@ class LoadLatent:
files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f)) and f.endswith(".latent")]
return {"required": {"latent": [sorted(files), ]}, }
- CATEGORY = "_for_testing"
+ CATEGORY = "experimental"
RETURN_TYPES = ("LATENT", )
FUNCTION = "load"
@@ -599,7 +598,7 @@ class CheckpointLoaderSimple:
"The VAE model used for encoding and decoding images to and from latent space.")
FUNCTION = "load_checkpoint"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
DESCRIPTION = "Loads a diffusion model checkpoint, diffusion models are used to denoise latents."
SEARCH_ALIASES = ["load model", "checkpoint", "model loader", "load checkpoint", "ckpt", "model"]
@@ -645,7 +644,7 @@ class unCLIPCheckpointLoader:
RETURN_TYPES = ("MODEL", "CLIP", "VAE", "CLIP_VISION")
FUNCTION = "load_checkpoint"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
def load_checkpoint(self, ckpt_name, output_vae=True, output_clip=True):
ckpt_path = folder_paths.get_full_path_or_raise("checkpoints", ckpt_name)
@@ -661,7 +660,7 @@ class CLIPSetLastLayer:
RETURN_TYPES = ("CLIP",)
FUNCTION = "set_last_layer"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def set_last_layer(self, clip, stop_at_clip_layer):
clip = clip.clone()
@@ -690,8 +689,8 @@ class LoraLoader:
OUTPUT_TOOLTIPS = ("The modified diffusion model.", "The modified CLIP model.")
FUNCTION = "load_lora"
- CATEGORY = "loaders"
- DESCRIPTION = "LoRAs are used to modify diffusion and CLIP models, altering the way in which latents are denoised such as applying styles. Multiple LoRA nodes can be linked together."
+ CATEGORY = "model/loaders"
+ DESCRIPTION = "This LoRA loader is used to modify both diffusion and CLIP models, altering the way in which latents are denoised such as applying styles. Multiple LoRA nodes can be linked together."
SEARCH_ALIASES = ["lora", "load lora", "apply lora", "lora loader", "lora model"]
def load_lora(self, model, clip, lora_name, strength_model, strength_clip):
@@ -700,17 +699,19 @@ class LoraLoader:
lora_path = folder_paths.get_full_path_or_raise("loras", lora_name)
lora = None
+ lora_metadata = None
if self.loaded_lora is not None:
if self.loaded_lora[0] == lora_path:
lora = self.loaded_lora[1]
+ lora_metadata = self.loaded_lora[2] if len(self.loaded_lora) > 2 else None
else:
self.loaded_lora = None
if lora is None:
- lora = comfy.utils.load_torch_file(lora_path, safe_load=True)
- self.loaded_lora = (lora_path, lora)
+ lora, lora_metadata = comfy.utils.load_torch_file(lora_path, safe_load=True, return_metadata=True)
+ self.loaded_lora = (lora_path, lora, lora_metadata)
- model_lora, clip_lora = comfy.sd.load_lora_for_models(model, clip, lora, strength_model, strength_clip)
+ model_lora, clip_lora = comfy.sd.load_lora_for_models(model, clip, lora, strength_model, strength_clip, lora_metadata=lora_metadata)
return (model_lora, clip_lora)
class LoraLoaderModelOnly(LoraLoader):
@@ -721,6 +722,7 @@ class LoraLoaderModelOnly(LoraLoader):
"strength_model": ("FLOAT", {"default": 1.0, "min": -100.0, "max": 100.0, "step": 0.01}),
}}
RETURN_TYPES = ("MODEL",)
+ DESCRIPTION = "This LoRA loader is used to modify the diffusion model, altering the way in which latents are denoised such as applying styles. Multiple LoRA nodes can be linked together."
FUNCTION = "load_lora_model_only"
def load_lora_model_only(self, model, lora_name, strength_model):
@@ -728,50 +730,26 @@ class LoraLoaderModelOnly(LoraLoader):
class VAELoader:
video_taes = ["taehv", "lighttaew2_2", "lighttaew2_1", "lighttaehy1_5", "taeltx_2"]
- image_taes = ["taesd", "taesdxl", "taesd3", "taef1"]
+ image_taes = ["taesd", "taesdxl", "taesd3", "taef1", "taef2"]
+
@staticmethod
def vae_list(s):
vaes = folder_paths.get_filename_list("vae")
approx_vaes = folder_paths.get_filename_list("vae_approx")
- sdxl_taesd_enc = False
- sdxl_taesd_dec = False
- sd1_taesd_enc = False
- sd1_taesd_dec = False
- sd3_taesd_enc = False
- sd3_taesd_dec = False
- f1_taesd_enc = False
- f1_taesd_dec = False
-
+ have_img_encoder, have_img_decoder = set(), set()
for v in approx_vaes:
- if v.startswith("taesd_decoder."):
- sd1_taesd_dec = True
- elif v.startswith("taesd_encoder."):
- sd1_taesd_enc = True
- elif v.startswith("taesdxl_decoder."):
- sdxl_taesd_dec = True
- elif v.startswith("taesdxl_encoder."):
- sdxl_taesd_enc = True
- elif v.startswith("taesd3_decoder."):
- sd3_taesd_dec = True
- elif v.startswith("taesd3_encoder."):
- sd3_taesd_enc = True
- elif v.startswith("taef1_encoder."):
- f1_taesd_dec = True
- elif v.startswith("taef1_decoder."):
- f1_taesd_enc = True
- else:
+ parts = v.split("_", 1)
+ if len(parts) != 2 or parts[0] not in s.image_taes:
for tae in s.video_taes:
if v.startswith(tae):
vaes.append(v)
-
- if sd1_taesd_dec and sd1_taesd_enc:
- vaes.append("taesd")
- if sdxl_taesd_dec and sdxl_taesd_enc:
- vaes.append("taesdxl")
- if sd3_taesd_dec and sd3_taesd_enc:
- vaes.append("taesd3")
- if f1_taesd_dec and f1_taesd_enc:
- vaes.append("taef1")
+ break
+ continue
+ if parts[1].startswith("encoder."):
+ have_img_encoder.add(parts[0])
+ elif parts[1].startswith("decoder."):
+ have_img_decoder.add(parts[0])
+ vaes += [k for k in have_img_decoder if k in have_img_encoder]
vaes.append("pixel_space")
return vaes
@@ -811,11 +789,12 @@ class VAELoader:
RETURN_TYPES = ("VAE",)
FUNCTION = "load_vae"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
#TODO: scale factor?
def load_vae(self, vae_name):
metadata = None
+ vae_path = None
if vae_name == "pixel_space":
sd = {}
sd["pixel_space_vae"] = torch.tensor(1.0)
@@ -827,8 +806,21 @@ class VAELoader:
else:
vae_path = folder_paths.get_full_path_or_raise("vae", vae_name)
sd, metadata = comfy.utils.load_torch_file(vae_path, return_metadata=True)
+ if vae_name == "taef2":
+ if metadata is None:
+ metadata = {"tae_latent_channels": 128}
+ else:
+ metadata["tae_latent_channels"] = 128
vae = comfy.sd.VAE(sd=sd, metadata=metadata)
vae.throw_exception_if_invalid()
+ # Register a reload factory on the patcher so multigpu deepclones
+ # (Select VAE Device, future MultiGPU VAE work-units) can produce
+ # per-device clones from the same loader context. Only set when we
+ # actually have a single backing file -- pixel_space and the
+ # image TAESDs (composed from separate encoder/decoder files via
+ # load_taesd) are not addressable by a single vae_path.
+ if vae_path is not None:
+ vae.patcher.cached_patcher_init = (comfy.sd.load_vae_patcher, (vae_path, metadata, None))
return (vae,)
class ControlNetLoader:
@@ -839,7 +831,7 @@ class ControlNetLoader:
RETURN_TYPES = ("CONTROL_NET",)
FUNCTION = "load_controlnet"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
SEARCH_ALIASES = ["controlnet", "control net", "cn", "load controlnet", "controlnet loader"]
def load_controlnet(self, control_net_name):
@@ -858,7 +850,7 @@ class DiffControlNetLoader:
RETURN_TYPES = ("CONTROL_NET",)
FUNCTION = "load_controlnet"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
def load_controlnet(self, model, control_net_name):
controlnet_path = folder_paths.get_full_path_or_raise("controlnet", control_net_name)
@@ -878,7 +870,7 @@ class ControlNetApply:
FUNCTION = "apply_controlnet"
DEPRECATED = True
- CATEGORY = "conditioning/controlnet"
+ CATEGORY = "model/conditioning/controlnet"
def apply_controlnet(self, conditioning, control_net, image, strength):
if strength == 0:
@@ -916,7 +908,7 @@ class ControlNetApplyAdvanced:
RETURN_NAMES = ("positive", "negative")
FUNCTION = "apply_controlnet"
- CATEGORY = "conditioning/controlnet"
+ CATEGORY = "model/conditioning/controlnet"
SEARCH_ALIASES = ["controlnet", "apply controlnet", "use controlnet", "control net"]
def apply_controlnet(self, positive, negative, control_net, image, strength, start_percent, end_percent, vae=None, extra_concat=[]):
@@ -977,7 +969,7 @@ class CLIPLoader:
@classmethod
def INPUT_TYPES(s):
return {"required": { "clip_name": (folder_paths.get_filename_list("text_encoders"), ),
- "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image"], ),
+ "type": (["stable_diffusion", "stable_cascade", "sd3", "stable_audio", "mochi", "ltxv", "pixart", "cosmos", "lumina2", "wan", "hidream", "chroma", "ace", "omnigen2", "qwen_image", "hunyuan_image", "flux2", "ovis", "longcat_image", "cogvideox", "lens", "pixeldit"], ),
},
"optional": {
"device": (["default", "cpu"], {"advanced": True}),
@@ -987,7 +979,7 @@ class CLIPLoader:
CATEGORY = "advanced/loaders"
- DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B"
+ DESCRIPTION = "[Recipes]\n\nstable_diffusion: clip-l\nstable_cascade: clip-g\nsd3: t5 xxl/ clip-g / clip-l\nstable_audio: t5 base\nmochi: t5 xxl\ncogvideox: t5 xxl (226-token padding)\ncosmos: old t5 xxl\nlumina2: gemma 2 2B\nwan: umt5 xxl\n hidream: llama-3.1 (Recommend) or t5\nomnigen2: qwen vl 2.5 3B\nlens: gpt-oss-20b\n pixeldit: gemma 2 2B elm"
def load_clip(self, clip_name, type="stable_diffusion", device="default"):
clip_type = getattr(comfy.sd.CLIPType, type.upper(), comfy.sd.CLIPType.STABLE_DIFFUSION)
@@ -1038,7 +1030,7 @@ class CLIPVisionLoader:
RETURN_TYPES = ("CLIP_VISION",)
FUNCTION = "load_clip"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
def load_clip(self, clip_name):
clip_path = folder_paths.get_full_path_or_raise("clip_vision", clip_name)
@@ -1057,7 +1049,7 @@ class CLIPVisionEncode:
RETURN_TYPES = ("CLIP_VISION_OUTPUT",)
FUNCTION = "encode"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def encode(self, clip_vision, image, crop):
crop_image = True
@@ -1074,7 +1066,7 @@ class StyleModelLoader:
RETURN_TYPES = ("STYLE_MODEL",)
FUNCTION = "load_style_model"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
def load_style_model(self, style_model_name):
style_model_path = folder_paths.get_full_path_or_raise("style_models", style_model_name)
@@ -1096,7 +1088,7 @@ class StyleModelApply:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "apply_stylemodel"
- CATEGORY = "conditioning/style_model"
+ CATEGORY = "model/conditioning/style_model"
def apply_stylemodel(self, conditioning, style_model, clip_vision_output, strength, strength_type):
cond = style_model.get_cond(clip_vision_output).flatten(start_dim=0, end_dim=1).unsqueeze(dim=0)
@@ -1156,7 +1148,7 @@ class unCLIPConditioning:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "apply_adm"
- CATEGORY = "conditioning"
+ CATEGORY = "model/conditioning"
def apply_adm(self, conditioning, clip_vision_output, strength, noise_augmentation):
if strength == 0:
@@ -1173,7 +1165,7 @@ class GLIGENLoader:
RETURN_TYPES = ("GLIGEN",)
FUNCTION = "load_gligen"
- CATEGORY = "loaders"
+ CATEGORY = "model/loaders"
def load_gligen(self, gligen_name):
gligen_path = folder_paths.get_full_path_or_raise("gligen", gligen_name)
@@ -1195,7 +1187,7 @@ class GLIGENTextBoxApply:
RETURN_TYPES = ("CONDITIONING",)
FUNCTION = "append"
- CATEGORY = "conditioning/gligen"
+ CATEGORY = "model/conditioning/gligen"
def append(self, conditioning_to, clip, gligen_textbox_model, text, width, height, x, y):
c = []
@@ -1225,7 +1217,7 @@ class EmptyLatentImage:
OUTPUT_TOOLTIPS = ("The empty latent image batch.",)
FUNCTION = "generate"
- CATEGORY = "latent"
+ CATEGORY = "model/latent"
DESCRIPTION = "Create a new batch of empty latent images to be denoised via sampling."
SEARCH_ALIASES = ["empty", "empty latent", "new latent", "create latent", "blank latent", "blank"]
@@ -1240,18 +1232,20 @@ class LatentFromBatch:
@classmethod
def INPUT_TYPES(s):
return {"required": { "samples": ("LATENT",),
- "batch_index": ("INT", {"default": 0, "min": 0, "max": 63}),
+ "batch_index": ("INT", {"default": 0, "min": -MAX_RESOLUTION, "max": MAX_RESOLUTION}),
"length": ("INT", {"default": 1, "min": 1, "max": 64}),
}}
RETURN_TYPES = ("LATENT",)
FUNCTION = "frombatch"
- CATEGORY = "latent/batch"
+ CATEGORY = "model/latent/batch"
def frombatch(self, samples, batch_index, length):
s = samples.copy()
s_in = samples["samples"]
- batch_index = min(s_in.shape[0] - 1, batch_index)
+ if batch_index < 0:
+ batch_index += s_in.shape[0]
+ batch_index = max(0, min(s_in.shape[0] - 1, batch_index))
length = min(s_in.shape[0] - batch_index, length)
s["samples"] = s_in[batch_index:batch_index + length].clone()
if "noise_mask" in samples:
@@ -1279,7 +1273,7 @@ class RepeatLatentBatch:
RETURN_TYPES = ("LATENT",)
FUNCTION = "repeat"
- CATEGORY = "latent/batch"
+ CATEGORY = "model/latent/batch"
def repeat(self, samples, amount):
s = samples.copy()
@@ -1311,7 +1305,7 @@ class LatentUpscale:
RETURN_TYPES = ("LATENT",)
FUNCTION = "upscale"
- CATEGORY = "latent"
+ CATEGORY = "model/latent"
def upscale(self, samples, upscale_method, width, height, crop):
if width == 0 and height == 0:
@@ -1344,7 +1338,7 @@ class LatentUpscaleBy:
RETURN_TYPES = ("LATENT",)
FUNCTION = "upscale"
- CATEGORY = "latent"
+ CATEGORY = "model/latent"
def upscale(self, samples, upscale_method, scale_by):
s = samples.copy()
@@ -1362,7 +1356,7 @@ class LatentRotate:
RETURN_TYPES = ("LATENT",)
FUNCTION = "rotate"
- CATEGORY = "latent/transform"
+ CATEGORY = "model/latent/transform"
def rotate(self, samples, rotation):
s = samples.copy()
@@ -1388,7 +1382,7 @@ class LatentFlip:
RETURN_TYPES = ("LATENT",)
FUNCTION = "flip"
- CATEGORY = "latent/transform"
+ CATEGORY = "model/latent/transform"
def flip(self, samples, flip_method):
s = samples.copy()
@@ -1413,7 +1407,7 @@ class LatentComposite:
RETURN_TYPES = ("LATENT",)
FUNCTION = "composite"
- CATEGORY = "latent"
+ CATEGORY = "model/latent"
def composite(self, samples_to, samples_from, x, y, composite_method="normal", feather=0):
x = x // 8
@@ -1462,7 +1456,7 @@ class LatentBlend:
RETURN_TYPES = ("LATENT",)
FUNCTION = "blend"
- CATEGORY = "_for_testing"
+ CATEGORY = "experimental"
def blend(self, samples1, samples2, blend_factor:float, blend_mode: str="normal"):
@@ -1500,7 +1494,7 @@ class LatentCrop:
RETURN_TYPES = ("LATENT",)
FUNCTION = "crop"
- CATEGORY = "latent/transform"
+ CATEGORY = "model/latent/transform"
def crop(self, samples, width, height, x, y):
s = samples.copy()
@@ -1530,7 +1524,7 @@ class SetLatentNoiseMask:
RETURN_TYPES = ("LATENT",)
FUNCTION = "set_mask"
- CATEGORY = "latent/inpaint"
+ CATEGORY = "model/latent/inpaint"
def set_mask(self, samples, mask):
s = samples.copy()
@@ -1539,7 +1533,7 @@ class SetLatentNoiseMask:
def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive, negative, latent, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False):
latent_image = latent["samples"]
- latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None))
+ latent_image = comfy.sample.fix_empty_latent_channels(model, latent_image, latent.get("downscale_ratio_spacial", None), latent.get("downscale_ratio_temporal", None))
if disable_noise:
noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu")
@@ -1558,6 +1552,7 @@ def common_ksampler(model, seed, steps, cfg, sampler_name, scheduler, positive,
force_full_denoise=force_full_denoise, noise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
out = latent.copy()
out.pop("downscale_ratio_spacial", None)
+ out.pop("downscale_ratio_temporal", None)
out["samples"] = samples
return (out, )
@@ -1583,7 +1578,7 @@ class KSampler:
OUTPUT_TOOLTIPS = ("The denoised latent.",)
FUNCTION = "sample"
- CATEGORY = "sampling"
+ CATEGORY = "model/sampling"
DESCRIPTION = "Uses the provided model, positive and negative conditioning to denoise the latent image."
SEARCH_ALIASES = ["sampler", "sample", "generate", "denoise", "diffuse", "txt2img", "img2img"]
@@ -1613,7 +1608,7 @@ class KSamplerAdvanced:
RETURN_TYPES = ("LATENT",)
FUNCTION = "sample"
- CATEGORY = "sampling"
+ CATEGORY = "model/sampling"
def sample(self, model, add_noise, noise_seed, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, start_at_step, end_at_step, return_with_leftover_noise, denoise=1.0):
force_full_denoise = True
@@ -1713,22 +1708,27 @@ class LoadImage:
RETURN_TYPES = ("IMAGE", "MASK")
FUNCTION = "load_image"
+
def load_image(self, image):
image_path = folder_paths.get_annotated_filepath(image)
+ dtype = comfy.model_management.intermediate_dtype()
+ device = comfy.model_management.intermediate_device()
+
+ components = InputImpl.VideoFromFile(image_path).get_components()
+ if components.images.shape[0] > 0:
+ return (components.images.to(device=device, dtype=dtype), (1.0 - components.alpha[..., -1]).to(device=device, dtype=dtype) if components.alpha is not None else torch.zeros((components.images.shape[0], 64, 64), dtype=dtype, device=device))
+
+ # This code is left here to handle animated webp which pyav does not support loading
img = node_helpers.pillow(Image.open, image_path)
output_images = []
output_masks = []
w, h = None, None
- dtype = comfy.model_management.intermediate_dtype()
-
for i in ImageSequence.Iterator(img):
i = node_helpers.pillow(ImageOps.exif_transpose, i)
- if i.mode == 'I':
- i = i.point(lambda i: i * (1 / 255))
image = i.convert("RGB")
if len(output_images) == 0:
@@ -1743,25 +1743,15 @@ class LoadImage:
if 'A' in i.getbands():
mask = np.array(i.getchannel('A')).astype(np.float32) / 255.0
mask = 1. - torch.from_numpy(mask)
- elif i.mode == 'P' and 'transparency' in i.info:
- mask = np.array(i.convert('RGBA').getchannel('A')).astype(np.float32) / 255.0
- mask = 1. - torch.from_numpy(mask)
else:
- mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
+ mask = torch.zeros((64, 64), dtype=torch.float32, device="cpu")
output_images.append(image.to(dtype=dtype))
output_masks.append(mask.unsqueeze(0).to(dtype=dtype))
- if img.format == "MPO":
- break # ignore all frames except the first one for MPO format
+ output_image = torch.cat(output_images, dim=0)
+ output_mask = torch.cat(output_masks, dim=0)
- if len(output_images) > 1:
- output_image = torch.cat(output_images, dim=0)
- output_mask = torch.cat(output_masks, dim=0)
- else:
- output_image = output_images[0]
- output_mask = output_masks[0]
-
- return (output_image, output_mask)
+ return (output_image.to(device=device, dtype=dtype), output_mask.to(device=device, dtype=dtype))
@classmethod
def IS_CHANGED(s, image):
@@ -1778,57 +1768,49 @@ class LoadImage:
return True
-class LoadImageMask:
+
+class LoadImageMask(LoadImage):
ESSENTIALS_CATEGORY = "Image Tools"
SEARCH_ALIASES = ["import mask", "alpha mask", "channel mask"]
_color_channels = ["alpha", "red", "green", "blue"]
+
@classmethod
def INPUT_TYPES(s):
- input_dir = folder_paths.get_input_directory()
- files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
- return {"required":
- {"image": (sorted(files), {"image_upload": True}),
- "channel": (s._color_channels, ), }
- }
-
- CATEGORY = "mask"
+ types = super().INPUT_TYPES()
+ return {
+ "required": {
+ **types["required"],
+ "channel": (s._color_channels, )
+ }
+ }
+ CATEGORY = "image"
RETURN_TYPES = ("MASK",)
- FUNCTION = "load_image"
- def load_image(self, image, channel):
- image_path = folder_paths.get_annotated_filepath(image)
- i = node_helpers.pillow(Image.open, image_path)
- i = node_helpers.pillow(ImageOps.exif_transpose, i)
- if i.getbands() != ("R", "G", "B", "A"):
- if i.mode == 'I':
- i = i.point(lambda i: i * (1 / 255))
- i = i.convert("RGBA")
- mask = None
+ FUNCTION = "load_image_mask"
+
+ def load_image_mask(self, image, channel):
+ image_tensor, mask_tensor = super().load_image(image)
c = channel[0].upper()
- if c in i.getbands():
- mask = np.array(i.getchannel(c)).astype(np.float32) / 255.0
- mask = torch.from_numpy(mask)
- if c == 'A':
- mask = 1. - mask
+
+ if c == 'A':
+ return (mask_tensor,)
+
+ channel_idx = {'R': 0, 'G': 1, 'B': 2}.get(c, 0)
+
+ if channel_idx < image_tensor.shape[-1]:
+ return (image_tensor[..., channel_idx].clone(),)
else:
- mask = torch.zeros((64,64), dtype=torch.float32, device="cpu")
- return (mask.unsqueeze(0),)
+ empty_mask = torch.zeros(
+ image_tensor.shape[:-1],
+ dtype=image_tensor.dtype,
+ device=image_tensor.device
+ )
+ return (empty_mask,)
@classmethod
def IS_CHANGED(s, image, channel):
- image_path = folder_paths.get_annotated_filepath(image)
- m = hashlib.sha256()
- with open(image_path, 'rb') as f:
- m.update(f.read())
- return m.digest().hex()
-
- @classmethod
- def VALIDATE_INPUTS(s, image):
- if not folder_paths.exists_annotated_filepath(image):
- return "Invalid image file: {}".format(image)
-
- return True
+ return super().IS_CHANGED(image)
class LoadImageOutput(LoadImage):
@@ -1919,7 +1901,7 @@ class ImageInvert:
RETURN_TYPES = ("IMAGE",)
FUNCTION = "invert"
- CATEGORY = "image"
+ CATEGORY = "image/color"
def invert(self, image):
s = 1.0 - image
@@ -1935,7 +1917,7 @@ class ImageBatch:
RETURN_TYPES = ("IMAGE",)
FUNCTION = "batch"
- CATEGORY = "image"
+ CATEGORY = "image/batch"
DEPRECATED = True
def batch(self, image1, image2):
@@ -1992,7 +1974,7 @@ class ImagePadForOutpaint:
RETURN_TYPES = ("IMAGE", "MASK")
FUNCTION = "expand_image"
- CATEGORY = "image"
+ CATEGORY = "image/transform"
def expand_image(self, image, left, top, right, bottom, feathering):
d1, d2, d3, d4 = image.size()
@@ -2124,6 +2106,8 @@ NODE_DISPLAY_NAME_MAPPINGS = {
"StyleModelLoader": "Load Style Model",
"CLIPVisionLoader": "Load CLIP Vision",
"UNETLoader": "Load Diffusion Model",
+ "unCLIPCheckpointLoader": "Load unCLIP Checkpoint",
+ "GLIGENLoader": "Load GLIGEN Model",
# Conditioning
"CLIPVisionEncode": "CLIP Vision Encode",
"StyleModelApply": "Apply Style Model",
@@ -2135,7 +2119,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
"ConditioningSetArea": "Conditioning (Set Area)",
"ConditioningSetAreaPercentage": "Conditioning (Set Area with Percentage)",
"ConditioningSetMask": "Conditioning (Set Mask)",
- "ControlNetApply": "Apply ControlNet (OLD)",
+ "ControlNetApply": "Apply ControlNet (DEPRECATED)",
"ControlNetApplyAdvanced": "Apply ControlNet",
# Latent
"VAEEncodeForInpaint": "VAE Encode (for Inpainting)",
@@ -2153,6 +2137,7 @@ NODE_DISPLAY_NAME_MAPPINGS = {
"LatentFromBatch" : "Latent From Batch",
"RepeatLatentBatch": "Repeat Latent Batch",
# Image
+ "EmptyImage": "Empty Image",
"SaveImage": "Save Image",
"PreviewImage": "Preview Image",
"LoadImage": "Load Image",
@@ -2160,18 +2145,18 @@ NODE_DISPLAY_NAME_MAPPINGS = {
"LoadImageOutput": "Load Image (from Outputs)",
"ImageScale": "Upscale Image",
"ImageScaleBy": "Upscale Image By",
- "ImageInvert": "Invert Image",
+ "ImageInvert": "Invert Image Colors",
"ImagePadForOutpaint": "Pad Image for Outpainting",
- "ImageBatch": "Batch Images",
- "ImageCrop": "Image Crop",
- "ImageStitch": "Image Stitch",
- "ImageBlend": "Image Blend",
- "ImageBlur": "Image Blur",
- "ImageQuantize": "Image Quantize",
- "ImageSharpen": "Image Sharpen",
+ "ImageBatch": "Batch Images (DEPRECATED)",
+ "ImageCrop": "Crop Image",
+ "ImageStitch": "Stitch Images",
+ "ImageBlend": "Blend Images",
+ "ImageBlur": "Blur Image",
+ "ImageQuantize": "Quantize Image",
+ "ImageSharpen": "Sharpen Image",
"ImageScaleToTotalPixels": "Scale Image to Total Pixels",
"GetImageSize": "Get Image Size",
- # _for_testing
+ # experimental
"VAEDecodeTiled": "VAE Decode (Tiled)",
"VAEEncodeTiled": "VAE Encode (Tiled)",
}
@@ -2293,7 +2278,7 @@ async def load_custom_node(module_path: str, ignore=set(), module_parent="custom
logging.warning(f"Error while calling comfy_entrypoint in {module_path}: {e}")
return False
else:
- logging.warning(f"Skip {module_path} module for custom nodes due to the lack of NODE_CLASS_MAPPINGS or NODES_LIST (need one).")
+ logging.warning(f"Skip {module_path} module for custom nodes due to the lack of NODE_CLASS_MAPPINGS or comfy_entrypoint (need one).")
return False
except Exception as e:
logging.warning(traceback.format_exc())
@@ -2412,6 +2397,7 @@ async def init_builtin_extra_nodes():
"nodes_lt_audio.py",
"nodes_lt.py",
"nodes_hooks.py",
+ "nodes_multigpu.py",
"nodes_load_3d.py",
"nodes_cosmos.py",
"nodes_video.py",
@@ -2434,6 +2420,7 @@ async def init_builtin_extra_nodes():
"nodes_context_windows.py",
"nodes_qwen.py",
"nodes_chroma_radiance.py",
+ "nodes_pid.py",
"nodes_model_patch.py",
"nodes_easycache.py",
"nodes_audio_encoder.py",
@@ -2443,6 +2430,7 @@ async def init_builtin_extra_nodes():
"nodes_nop.py",
"nodes_kandinsky5.py",
"nodes_wanmove.py",
+ "nodes_ar_video.py",
"nodes_image_compare.py",
"nodes_zimage.py",
"nodes_glsl.py",
@@ -2457,7 +2445,16 @@ async def init_builtin_extra_nodes():
"nodes_number_convert.py",
"nodes_painter.py",
"nodes_curve.py",
- "nodes_rtdetr.py"
+ "nodes_bg_removal.py",
+ "nodes_rtdetr.py",
+ "nodes_frame_interpolation.py",
+ "nodes_sam3.py",
+ "nodes_void.py",
+ "nodes_wandancer.py",
+ "nodes_hidream_o1.py",
+ "nodes_save_3d.py",
+ "nodes_moge.py",
+ "nodes_mediapipe.py",
]
import_failed = []
diff --git a/openapi.yaml b/openapi.yaml
new file mode 100644
index 000000000..f801a39d9
--- /dev/null
+++ b/openapi.yaml
@@ -0,0 +1,11749 @@
+openapi: 3.1.0
+info:
+ title: ComfyUI API
+ description: |
+ API for ComfyUI - A powerful and modular stable diffusion GUI and backend.
+
+ This API allows you to interact with ComfyUI programmatically, including:
+ - Submitting and managing workflow executions
+ - Querying node/object information
+ - Uploading and viewing files
+ - Managing user settings and data
+ - Asset management (feature-gated)
+
+ ## Dual-path routing
+ Every route registered via `self.routes` in the ComfyUI server is available at
+ both its bare path (e.g. `/prompt`) and an `/api`-prefixed path (e.g. `/api/prompt`).
+ This spec uses the `/api`-prefixed versions as canonical.
+
+ ## Multi-user mode
+ When ComfyUI is started with `--multi-user`, the `Comfy-User` header identifies
+ the active user for settings, userdata, and history isolation. This is **not** a
+ security mechanism — it is an organisational convenience with no authentication
+ or authorisation behind it.
+ version: 1.0.0
+ license:
+ name: GNU General Public License v3.0
+ url: https://github.com/comfyanonymous/ComfyUI/blob/master/LICENSE
+
+servers:
+ - url: /
+ description: Default ComfyUI server (typically http://127.0.0.1:8188)
+
+tags:
+ - name: prompt
+ description: Workflow submission and prompt info
+ - name: queue
+ description: Queue inspection and management
+ - name: history
+ description: Execution history
+ - name: upload
+ description: File upload endpoints
+ - name: view
+ description: File viewing / download
+ - name: system
+ description: System stats and feature flags
+ - name: node
+ description: Node / object_info definitions
+ - name: model
+ description: Model folder and file listing
+ - name: user
+ description: User management (multi-user mode)
+ - name: userdata
+ description: Per-user file storage
+ - name: settings
+ description: Per-user settings
+ - name: extensions
+ description: Frontend extension JS files
+ - name: subgraph
+ description: Global subgraph blueprints
+ - name: internal
+ description: Internal / debug endpoints
+ - name: assets
+ description: Asset management (feature-gated behind enable-assets)
+
+ - name: auth
+ description: Authentication and session management (cloud-only)
+ - name: billing
+ description: Billing, subscriptions, and payment management (cloud-only)
+ - name: workspace
+ description: Workspace and team management (cloud-only)
+ - name: hub
+ description: "ComfyUI Hub: profiles, shared workflows, and labels (cloud-only)"
+ - name: workflows
+ description: Cloud workflow management and versioning (cloud-only)
+ - name: task
+ description: Background task management (cloud-only)
+ - name: runtime-only
+ description: Operations served exclusively by the cloud runtime with no local equivalent
+
+paths:
+ # ---------------------------------------------------------------------------
+ # WebSocket
+ # ---------------------------------------------------------------------------
+  /ws:
+    get:
+      operationId: connectWebSocket
+      tags: [system]
+      summary: WebSocket connection for real-time updates
+      description: |
+        Upgrades to a WebSocket connection that streams execution progress,
+        node status, and output messages. The server sends an initial `status`
+        message with the session ID (SID) on connect.
+
+        ## Message types (server → client)
+        The server sends JSON messages with a `type` field. See the
+        `x-websocket-messages` list below for the schema of each message type.
+      parameters:
+        - name: clientId
+          in: query
+          required: false
+          schema:
+            type: string
+          description: Client identifier. If omitted the server assigns one.
+      responses:
+        '101':
+          description: WebSocket upgrade successful
+        '401':
+          description: Unauthorized
+      x-websocket-messages:
+        - type: status
+          schema:
+            $ref: '#/components/schemas/StatusWsMessage'
+        - type: progress
+          schema:
+            $ref: '#/components/schemas/ProgressWsMessage'
+        - type: progress_text
+          schema:
+            $ref: '#/components/schemas/ProgressTextWsMessage'
+        - type: progress_state
+          schema:
+            $ref: '#/components/schemas/ProgressStateWsMessage'
+        - type: executing
+          schema:
+            $ref: '#/components/schemas/ExecutingWsMessage'
+        - type: executed
+          schema:
+            $ref: '#/components/schemas/ExecutedWsMessage'
+        - type: execution_start
+          schema:
+            $ref: '#/components/schemas/ExecutionStartWsMessage'
+        - type: execution_success
+          schema:
+            $ref: '#/components/schemas/ExecutionSuccessWsMessage'
+        - type: execution_cached
+          schema:
+            $ref: '#/components/schemas/ExecutionCachedWsMessage'
+        - type: execution_interrupted
+          schema:
+            $ref: '#/components/schemas/ExecutionInterruptedWsMessage'
+        - type: execution_error
+          schema:
+            $ref: '#/components/schemas/ExecutionErrorWsMessage'
+        - type: logs
+          schema:
+            $ref: '#/components/schemas/LogsWsMessage'
+        - type: notification
+          schema:
+            $ref: '#/components/schemas/NotificationWsMessage'
+        - type: feature_flags
+          schema:
+            $ref: '#/components/schemas/FeatureFlagsWsMessage'
+        - type: asset_download
+          schema:
+            $ref: '#/components/schemas/AssetDownloadWsMessage'
+        - type: asset_export
+          schema:
+            $ref: '#/components/schemas/AssetExportWsMessage'
+
+ # ---------------------------------------------------------------------------
+ # Prompt
+ # ---------------------------------------------------------------------------
+  /api/prompt:
+    get:
+      operationId: getPromptInfo
+      tags: [prompt]
+      summary: Get queue status
+      description: Returns how many items remain in the execution queue.
+      responses:
+        '200':
+          description: Queue info
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PromptInfo'
+        '401':
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+    post:
+      operationId: executePrompt
+      tags: [prompt]
+      summary: Submit a workflow for execution
+      description: Submits a workflow for execution. The server validates the graph, assigns a `prompt_id`, and enqueues it. Clients listen on `/ws` for execution progress and output messages.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/PromptRequest'
+      responses:
+        '200':
+          description: Prompt accepted
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PromptResponse'
+        '400':
+          description: Validation or node errors
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PromptErrorResponse'
+        '402':
+          description: Payment required - Insufficient credits
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PromptErrorResponse'
+        '429':
+          description: Payment required - User has not paid
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PromptErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PromptErrorResponse'
+        '503':
+          description: Service unavailable
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/PromptErrorResponse'
+
+ # ---------------------------------------------------------------------------
+ # Queue
+ # ---------------------------------------------------------------------------
+  /api/queue:
+    get:
+      operationId: getQueueInfo
+      tags: [queue]
+      summary: Get running and pending queue items
+      description: Returns the server's current execution queue, split into the currently-running prompt and the list of pending prompts.
+      responses:
+        '200':
+          description: Queue contents
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/QueueInfo'
+        '400':
+          description: Invalid request parameters
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+    post:
+      operationId: manageQueue
+      tags: [queue]
+      summary: Clear or delete items from the queue
+      description: Mutates the execution queue. Supports clearing all queued prompts or deleting individual prompts by ID.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/QueueManageRequest'
+      responses:
+        '200':
+          description: Queue updated
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/QueueManageResponse'
+        '400':
+          description: Invalid request parameters
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '401':
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+  /api/interrupt:
+    post:
+      operationId: interruptJob
+      tags: [queue]
+      summary: Interrupt current execution
+      description: Interrupts the prompt that is currently executing. The next queued prompt (if any) will start immediately after.
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                prompt_id:
+                  type: string
+                  format: uuid
+                  description: 'If provided, only interrupts this specific running prompt. Otherwise interrupts all.'
+      responses:
+        '200':
+          description: Interrupt signal sent
+        '401':
+          description: Unauthorized - Authentication required
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+  /api/free:
+    post:
+      operationId: freeMemory
+      tags: [queue]
+      summary: Free GPU memory and/or unload models
+      description: Frees GPU memory by unloading models and/or freeing the resident model cache, controlled by the request flags.
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                unload_models:
+                  type: boolean
+                  description: Unload all models from VRAM/RAM
+                free_memory:
+                  type: boolean
+                  description: Run garbage collection and free cached memory
+      responses:
+        '200':
+          description: Memory freed
+
+ # ---------------------------------------------------------------------------
+ # Jobs
+ # ---------------------------------------------------------------------------
+  /api/jobs:
+    get:
+      operationId: listJobs
+      tags: [queue]
+      summary: List jobs with filtering and pagination
+      description: Returns a paginated list of completed prompt executions, newest first.
+      parameters:
+        - name: status
+          in: query
+          schema:
+            type: string
+          description: Filter by job status
+        - name: workflow_id
+          in: query
+          schema:
+            type: string
+          description: Filter by workflow ID
+        - name: sort_by
+          in: query
+          schema:
+            type: string
+          description: Field to sort by
+        - name: sort_order
+          in: query
+          schema:
+            type: string
+            enum: [asc, desc]
+          description: Sort direction
+        - name: limit
+          in: query
+          schema:
+            type: integer
+          description: Maximum number of results (default is unlimited/None)
+        - name: offset
+          in: query
+          schema:
+            type: integer
+            default: 0
+          description: Pagination offset
+      responses:
+        '200':
+          description: Jobs list
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  jobs:
+                    type: array
+                    items:
+                      $ref: '#/components/schemas/JobEntry'
+                  pagination:
+                    $ref: '#/components/schemas/PaginationInfo'
+        '401':
+          description: Unauthorized - Authentication required
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+  /api/jobs/{job_id}:
+    get:
+      operationId: getJobDetail
+      tags: [queue]
+      summary: Get a single job by ID
+      description: Returns the full record for a single completed prompt execution, including its outputs, status, and metadata.
+      parameters:
+        - name: job_id
+          in: path
+          description: The job (prompt) ID to fetch.
+          required: true
+          schema:
+            type: string
+            format: uuid
+      responses:
+        '200':
+          description: Job detail
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/JobDetailResponse'
+        '401':
+          description: Unauthorized - Authentication required
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '403':
+          description: Forbidden - Job does not belong to user
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '404':
+          description: Job not found
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+ # ---------------------------------------------------------------------------
+ # History
+ # ---------------------------------------------------------------------------
+  /api/history:
+    get:
+      operationId: getPromptHistory
+      tags: [history]
+      summary: Get execution history
+      deprecated: true
+      description: |
+        **Deprecated.** Superseded by `GET /api/jobs`, which returns the same
+        execution records in a paginated, filterable format. Planned for removal
+        no earlier than a future major release; sunset timeline TBD.
+
+        Returns a dictionary keyed by prompt_id. Each value is a HistoryEntry
+        containing prompt metadata, outputs, status, and node meta.
+      parameters:
+        - $ref: '#/components/parameters/ComfyUserHeader'
+        - name: max_items
+          in: query
+          schema:
+            type: integer
+          description: Maximum number of history entries to return
+        - name: offset
+          in: query
+          schema:
+            type: integer
+          description: Pagination offset (number of entries to skip)
+      responses:
+        '200':
+          description: History dictionary keyed by prompt_id
+          content:
+            application/json:
+              schema:
+                type: object
+                additionalProperties:
+                  $ref: '#/components/schemas/HistoryEntry'
+        '404':
+          description: "Not Found \u2014 use /api/history_v2 instead"
+    post:
+      operationId: manageHistory
+      tags: [history]
+      summary: Clear or delete history entries
+      deprecated: true
+      description: |
+        **Deprecated.** Superseded by the forthcoming job-management endpoints
+        under `/api/jobs`. Planned for removal no earlier than a future major
+        release; sunset timeline TBD.
+      parameters:
+        - $ref: '#/components/parameters/ComfyUserHeader'
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/HistoryManageRequest'
+      responses:
+        '200':
+          description: History updated
+        '400':
+          description: Invalid request parameters
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '401':
+          description: Unauthorized - Authentication required
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+  /api/history/{prompt_id}:
+    get:
+      operationId: getHistoryByPromptId
+      tags: [history]
+      summary: Get history for a specific prompt
+      deprecated: true
+      description: |
+        **Deprecated.** Superseded by `GET /api/jobs/{job_id}`, which returns
+        the same execution record. Planned for removal no earlier than a future
+        major release; sunset timeline TBD.
+      parameters:
+        - $ref: '#/components/parameters/ComfyUserHeader'
+        - name: prompt_id
+          in: path
+          description: The prompt ID to fetch history for.
+          required: true
+          schema:
+            type: string
+            format: uuid
+      responses:
+        '200':
+          description: Single-entry history dictionary. Returns an empty object `{}` if the prompt_id is not found.
+          content:
+            application/json:
+              schema:
+                type: object
+                additionalProperties:
+                  $ref: '#/components/schemas/HistoryEntry'
+        '404':
+          description: "Not Found \u2014 use /api/jobs/{prompt_id} instead"
+
+ # ---------------------------------------------------------------------------
+ # Upload
+ # ---------------------------------------------------------------------------
+  /api/upload/image:
+    post:
+      operationId: uploadImage
+      tags: [upload]
+      summary: Upload an image file
+      description: Uploads an image file into one of the input/output/temp directories so it can be referenced by workflow nodes.
+      requestBody:
+        required: true
+        content:
+          multipart/form-data:
+            schema:
+              type: object
+              required:
+                - image
+              properties:
+                image:
+                  type: string
+                  format: binary
+                  description: Image file to upload
+                type:
+                  type: string
+                  enum: [input, temp, output]
+                  default: input
+                  description: Target directory type
+                overwrite:
+                  type: string
+                  description: 'Set to "true" to overwrite existing files'
+                subfolder:
+                  type: string
+                  description: Subfolder within the target directory
+      responses:
+        '200':
+          description: Upload result
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/UploadResult'
+        '400':
+          description: No file provided or invalid request
+        '401':
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+  /api/upload/mask:
+    post:
+      operationId: uploadMask
+      tags: [upload]
+      deprecated: true
+      summary: Upload a mask image (deprecated)
+      description: |
+        Deprecated. Clients should composite the mask onto the source image
+        client-side and upload the resulting image via POST /api/upload/image
+        instead. This endpoint will continue to function for older clients,
+        but will not receive new features.
+
+        Uploads a mask image associated with a previously-uploaded reference image.
+      requestBody:
+        required: true
+        content:
+          multipart/form-data:
+            schema:
+              type: object
+              required:
+                - image
+                - original_ref
+              properties:
+                image:
+                  type: string
+                  format: binary
+                  description: Mask image (alpha channel is used)
+                original_ref:
+                  type: object
+                  description: Reference to the original image file
+                  required:
+                    - filename
+                  properties:
+                    filename:
+                      type: string
+                      description: Filename of the original image
+                  additionalProperties: true
+                type:
+                  type: string
+                  enum: [input, temp, output]
+                  default: input
+                  description: Target directory type
+                overwrite:
+                  type: string
+                  description: 'Set to "true" to overwrite existing files'
+                subfolder:
+                  type: string
+                  description: Subfolder within the target directory
+      responses:
+        '200':
+          description: Upload result
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/UploadResult'
+        '400':
+          description: No file provided or invalid request
+        '401':
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+ # ---------------------------------------------------------------------------
+ # View
+ # ---------------------------------------------------------------------------
+  /api/view:
+    get:
+      operationId: viewFile
+      tags: [view]
+      summary: View or download a file
+      description: Serves a file (image, audio, or video) from the input/output/temp directory identified by the query parameters.
+      parameters:
+        - name: filename
+          in: query
+          required: true
+          schema:
+            type: string
+          description: Name of the file to view
+        - name: type
+          in: query
+          schema:
+            type: string
+            enum: [input, output, temp]
+            default: output
+          description: Directory type
+        - name: subfolder
+          in: query
+          schema:
+            type: string
+          description: Subfolder within the directory
+        - name: preview
+          in: query
+          schema:
+            type: string
+          description: Preview format hint (e.g. "webp;90")
+        - name: channel
+          in: query
+          schema:
+            type: string
+            enum: [rgba, rgb, a]
+          description: Channel extraction mode
+      responses:
+        '200':
+          description: File content
+          content:
+            image/*:
+              schema:
+                type: string
+                format: binary
+            video/*:
+              schema:
+                type: string
+                format: binary
+            audio/*:
+              schema:
+                type: string
+                format: binary
+            application/octet-stream:
+              schema:
+                type: string
+                format: binary
+        '302':
+          description: Redirect to GCS signed URL
+          headers:
+            Location:
+              description: Signed URL to access the file in GCS
+              schema:
+                type: string
+            Cache-Control:
+              description: Cache directive for the redirect response
+              schema:
+                type: string
+            Vary:
+              description: Headers that affect response caching
+              schema:
+                type: string
+        '400':
+          description: Invalid request parameters
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '404':
+          description: File not found
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+  /api/view_metadata/{folder_name}:
+    get:
+      operationId: viewMetadata
+      tags: [view]
+      summary: Get metadata for a file (e.g. safetensors header)
+      description: Returns embedded metadata parsed from a file in the given folder — for example, the header of a safetensors model.
+      parameters:
+        - name: folder_name
+          in: path
+          required: true
+          schema:
+            type: string
+          description: Folder type (output, input, temp, etc.)
+        - name: filename
+          in: query
+          required: true
+          schema:
+            type: string
+          description: Filename to read metadata from
+      responses:
+        '200':
+          description: File metadata
+          content:
+            application/json:
+              schema:
+                type: object
+                additionalProperties: true
+        '404':
+          description: File or metadata not found
+
+ # ---------------------------------------------------------------------------
+ # System
+ # ---------------------------------------------------------------------------
+  /api/system_stats:
+    get:
+      operationId: getSystemStats
+      tags: [system]
+      summary: Get system statistics
+      description: Returns hardware, Python, VRAM, and runtime statistics for the running ComfyUI process.
+      responses:
+        '200':
+          description: System stats
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/SystemStatsResponse'
+        '401':
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+  /api/features:
+    get:
+      operationId: getFeatures
+      tags: [system]
+      summary: Get enabled feature flags
+      description: Returns a dictionary of feature flag names to their enabled state. Cloud deployments may include additional typed fields alongside the boolean flags.
+      responses:
+        '200':
+          description: Feature flags
+          content:
+            application/json:
+              schema:
+                # The named `properties` below are the typed, non-boolean fields;
+                # `additionalProperties` constrains only the remaining keys, i.e.
+                # the boolean feature flags themselves.
+                # OpenAPI 3.1 dropped the 3.0 `nullable` keyword, so fields that
+                # may be null use a JSON Schema type union including 'null'.
+                type: object
+                additionalProperties:
+                  type: boolean
+                properties:
+                  max_upload_size:
+                    type: integer
+                    format: int64
+                    minimum: 0
+                    description: 'Maximum file upload size in bytes.'
+                  free_tier_credits:
+                    type: [integer, 'null']
+                    format: int32
+                    minimum: 0
+                    x-runtime: [cloud]
+                    description: '[cloud-only] Credits available to free-tier users. Local ComfyUI returns null.'
+                  posthog_api_host:
+                    type: [string, 'null']
+                    format: uri
+                    x-runtime: [cloud]
+                    description: '[cloud-only] PostHog analytics proxy URL for frontend telemetry. Local ComfyUI returns null.'
+                  max_concurrent_jobs:
+                    type: [integer, 'null']
+                    format: int32
+                    minimum: 0
+                    x-runtime: [cloud]
+                    description: '[cloud-only] Maximum concurrent jobs the authenticated user can run. Local ComfyUI returns null.'
+                  workflow_templates_version:
+                    type: [string, 'null']
+                    x-runtime: [cloud]
+                    description: '[cloud-only] Version identifier for the workflow templates bundle. Local ComfyUI returns null.'
+                  workflow_templates_source:
+                    type: [string, 'null']
+                    enum: [dynamic_config_override, workflow_templates_version_json, null]
+                    x-runtime: [cloud]
+                    description: '[cloud-only] How the templates version was resolved. Local ComfyUI returns null.'
+
+ # ---------------------------------------------------------------------------
+ # Node / Object Info
+ # ---------------------------------------------------------------------------
+  /api/object_info:
+    get:
+      operationId: getNodeInfo
+      tags: [node]
+      summary: Get all node definitions
+      description: |
+        Returns a dictionary of every registered node class, keyed by class name.
+        Each value is a NodeInfo object describing inputs, outputs, category, etc.
+      responses:
+        '200':
+          description: All node definitions
+          content:
+            application/json:
+              schema:
+                type: object
+                additionalProperties:
+                  $ref: '#/components/schemas/NodeInfo'
+
+  /api/object_info/{node_class}:
+    get:
+      operationId: getObjectInfoByClass
+      tags: [node]
+      summary: Get a single node definition
+      description: Returns the `NodeInfo` definition for a single registered node class.
+      parameters:
+        - name: node_class
+          in: path
+          required: true
+          schema:
+            type: string
+          description: Node class name (e.g. "KSampler")
+      responses:
+        '200':
+          description: Single node definition
+          content:
+            application/json:
+              schema:
+                type: object
+                additionalProperties:
+                  $ref: '#/components/schemas/NodeInfo'
+        '404':
+          description: Node class not found
+
+  /api/embeddings:
+    get:
+      operationId: getEmbeddings
+      tags: [node]
+      summary: List available embedding names
+      description: Returns the list of text-encoder embeddings available on disk.
+      responses:
+        '200':
+          description: Embedding names
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  type: string
+
+ # ---------------------------------------------------------------------------
+ # Models
+ # ---------------------------------------------------------------------------
+  /api/models:
+    get:
+      operationId: getModelTypes
+      tags: [model]
+      summary: List model folder type names
+      description: Returns an array of model type names (e.g. checkpoints, loras, vae).
+      responses:
+        '200':
+          description: Model type names
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  type: string
+        '404':
+          description: "Not Found \u2014 use /api/experiment/models instead"
+
+  /api/models/{folder}:
+    get:
+      operationId: getModelsByFolder
+      tags: [model]
+      summary: List model filenames in a folder
+      description: Returns the names of model files in the given folder. This endpoint predates `/api/experiment/models/{folder}` and returns names only — prefer the experiment endpoint for new integrations.
+      parameters:
+        - name: folder
+          in: path
+          required: true
+          schema:
+            type: string
+          description: Model folder type name
+      responses:
+        '200':
+          description: Model filenames
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  type: string
+        '404':
+          description: Unknown folder type
+
+  /api/experiment/models:
+    get:
+      operationId: getModelFolders
+      tags: [model]
+      summary: List model folders with paths
+      description: Returns an array of model folder objects with name and folder paths.
+      responses:
+        '200':
+          description: Model folders
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: '#/components/schemas/ModelFolder'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+  /api/experiment/models/{folder}:
+    get:
+      operationId: getModelsInFolder
+      tags: [model]
+      summary: List model files with metadata
+      description: Returns the model files in the given folder with richer metadata (path index, mtime, size) than the legacy `/api/models/{folder}` endpoint.
+      parameters:
+        - name: folder
+          in: path
+          required: true
+          schema:
+            type: string
+          description: Model folder type name
+      responses:
+        '200':
+          description: Model files with metadata
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: '#/components/schemas/ModelFile'
+        '404':
+          description: Unknown folder type
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+  /api/experiment/models/preview/{folder}/{path_index}/{filename}:
+    get:
+      operationId: getModelPreview
+      tags: [model]
+      summary: Get model preview image
+      description: Returns the preview image associated with a model file, if one exists alongside the model on disk.
+      parameters:
+        - name: folder
+          in: path
+          required: true
+          schema:
+            type: string
+          description: Model folder type name
+        - name: path_index
+          in: path
+          required: true
+          schema:
+            type: integer
+          description: Path index within the folder
+        - name: filename
+          in: path
+          required: true
+          schema:
+            type: string
+          description: Model filename
+      responses:
+        '200':
+          description: Preview image (WebP)
+          content:
+            image/webp:
+              schema:
+                type: string
+                format: binary
+        '404':
+          description: Preview not found
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
+ # ---------------------------------------------------------------------------
+ # Users
+ # ---------------------------------------------------------------------------
+  /api/users:
+    get:
+      operationId: getUsersInfo
+      tags: [user]
+      summary: Get user storage info
+      description: |
+        Returns user storage configuration. In single-user mode returns
+        `{"storage": "server", "migrated": true/false}`. In multi-user mode
+        returns `{"storage": "server", "users": {"user_id": "user_dir", ...}}`.
+      parameters:
+        - $ref: '#/components/parameters/ComfyUserHeader'
+      responses:
+        '200':
+          description: User info
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  storage:
+                    type: string
+                    description: Storage backend type (always "server")
+                  migrated:
+                    type: boolean
+                    description: Whether migration from browser storage is complete (single-user)
+                  users:
+                    type: object
+                    additionalProperties:
+                      type: string
+                    description: Map of user_id to directory name (multi-user)
+        '401':
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+    post:
+      operationId: createUser
+      tags: [user]
+      summary: Create a new user (multi-user mode)
+      description: Creates a new user entry. Only meaningful when ComfyUI is running in multi-user mode.
+      parameters:
+        - $ref: '#/components/parameters/ComfyUserHeader'
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - username
+              properties:
+                username:
+                  type: string
+                  description: Username for the new user
+      responses:
+        '200':
+          description: Created user ID
+          content:
+            application/json:
+              schema:
+                type: string
+                description: The generated user_id
+        '400':
+          description: Username already exists or invalid
+
+ # ---------------------------------------------------------------------------
+ # Userdata
+ # ---------------------------------------------------------------------------
+  /api/userdata:
+    get:
+      operationId: getUserdata
+      tags: [userdata]
+      summary: List files in a userdata directory
+      description: Lists files in the authenticated user's data directory. Returns either filename strings or full objects depending on the `full_info` query parameter.
+      parameters:
+        - $ref: '#/components/parameters/ComfyUserHeader'
+        - name: dir
+          in: query
+          required: true
+          schema:
+            type: string
+          description: Directory path relative to the user's data folder
+        - name: recurse
+          in: query
+          schema:
+            type: boolean
+          description: Recurse into subdirectories
+        - name: full_info
+          in: query
+          schema:
+            type: boolean
+          description: Return full file info objects instead of just names
+        - name: split
+          in: query
+          schema:
+            type: boolean
+          description: Split paths into directory components
+      responses:
+        '200':
+          description: File listing
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/GetUserDataResponseFull'
+        '400':
+          description: Bad request (e.g., invalid filename).
+          content:
+            text/plain:
+              schema:
+                type: string
+        '401':
+          description: Unauthorized.
+          content:
+            text/plain:
+              schema:
+                type: string
+        '404':
+          description: Directory not found
+        '500':
+          description: General error
+          content:
+            text/plain:
+              schema:
+                type: string
+
+  /api/v2/userdata:
+    get:
+      operationId: listUserdataV2
+      tags: [userdata]
+      summary: List files in userdata (v2 format)
+      description: Lists files in the authenticated user's data directory using the v2 response shape, which always returns full objects.
+      parameters:
+        - $ref: '#/components/parameters/ComfyUserHeader'
+        - name: path
+          in: query
+          schema:
+            type: string
+          description: Directory path relative to user data root
+      responses:
+        '200':
+          description: File listing with metadata
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  type: object
+                  properties:
+                    name:
+                      type: string
+                    path:
+                      type: string
+                    type:
+                      type: string
+                      enum: [file, directory]
+                    size:
+                      type: integer
+                    modified:
+                      type: number
+                      description: Unix timestamp
+        '404':
+          description: "Not Found \u2014 use /api/userdata instead"
+
+ /api/userdata/{file}:
+ get:
+ operationId: getUserdataFile
+ tags: [userdata]
+ summary: Read a userdata file
+ description: Reads the contents of a file from the authenticated user's data directory.
+ parameters:
+ - $ref: "#/components/parameters/ComfyUserHeader"
+ - name: file
+ in: path
+ required: true
+ schema:
+ type: string
+ description: File path relative to user data directory
+ responses:
+ "200":
+ description: File content
+ content:
+ application/octet-stream:
+ schema:
+ type: string
+ format: binary
+ "404":
+ description: File not found
+ '400':
+ description: Bad request (e.g., invalid filename).
+ content:
+ text/plain:
+ schema:
+ type: string
+ '401':
+ description: Unauthorized.
+ content:
+ text/plain:
+ schema:
+ type: string
+ '500':
+ description: General error
+ content:
+ text/plain:
+ schema:
+ type: string
+ post:
+ operationId: postUserdataFile
+ tags: [userdata]
+ summary: Write or create a userdata file
+ description: Writes (creates or replaces) a file in the authenticated user's data directory.
+ parameters:
+ - $ref: "#/components/parameters/ComfyUserHeader"
+ - name: file
+ in: path
+ required: true
+ schema:
+ type: string
+ description: File path relative to user data directory
+ - name: overwrite
+ in: query
+ schema:
+ type: boolean
+ description: Allow overwriting existing files
+ - name: full_info
+ in: query
+ schema:
+ type: boolean
+ description: Return full file info in response
+ requestBody:
+ required: true
+ content:
+ application/octet-stream:
+ schema:
+ type: string
+ format: binary
+ application/json:
+ schema: {}
+ responses:
+ "200":
+ description: File written
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/UserDataResponseFull"
+ "409":
+ description: File exists and overwrite not set
+ '400':
+ description: Missing or invalid 'file' parameter.
+ content:
+ text/plain:
+ schema:
+ type: string
+ '401':
+ description: Unauthorized.
+ content:
+ text/plain:
+ schema:
+ type: string
+ '403':
+ description: The requested path is not allowed.
+ content:
+ text/plain:
+ schema:
+ type: string
+ '500':
+ description: General error
+ content:
+ text/plain:
+ schema:
+ type: string
+ delete:
+ operationId: deleteUserdataFile
+ tags: [userdata]
+ summary: Delete a userdata file
+ description: Deletes a file from the authenticated user's data directory.
+ parameters:
+ - $ref: "#/components/parameters/ComfyUserHeader"
+ - name: file
+ in: path
+ required: true
+ schema:
+ type: string
+ description: File path relative to user data directory
+ responses:
+ "204":
+ description: File deleted
+ "404":
+ description: File not found
+
+ '401':
+ description: Unauthorized.
+ content:
+ text/plain:
+ schema:
+ type: string
+ '500':
+ description: Internal server error.
+ content:
+ text/plain:
+ schema:
+ type: string
+ /api/userdata/{file}/move/{dest}:
+ post:
+ operationId: moveUserdataFile
+ tags: [userdata]
+ summary: Move or rename a userdata file
+ description: Renames or moves a file within the authenticated user's data directory.
+ parameters:
+ - $ref: "#/components/parameters/ComfyUserHeader"
+ - name: file
+ in: path
+ required: true
+ schema:
+ type: string
+ description: Source file path
+ - name: dest
+ in: path
+ required: true
+ schema:
+ type: string
+ description: Destination file path
+ - name: overwrite
+ in: query
+ schema:
+ type: boolean
+ description: Allow overwriting at destination
+ - name: full_info
+ in: query
+ schema:
+ type: boolean
+ description: Return full file info in response
+ responses:
+ "200":
+ description: File moved
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/UserDataResponseFull"
+ "404":
+ description: Source file not found
+ "409":
+ description: Destination exists and overwrite not set
+
+ '400':
+ description: Missing or invalid parameters.
+ content:
+ text/plain:
+ schema:
+ type: string
+ '401':
+ description: Unauthorized.
+ content:
+ text/plain:
+ schema:
+ type: string
+ '500':
+ description: General error
+ content:
+ text/plain:
+ schema:
+ type: string
+ # ---------------------------------------------------------------------------
+ # Settings
+ # ---------------------------------------------------------------------------
+ /api/settings:
+ get:
+ operationId: getAllSettings
+ tags: [settings]
+ summary: Get all user settings
+ description: Returns all settings for the authenticated user.
+ parameters:
+ - $ref: "#/components/parameters/ComfyUserHeader"
+ responses:
+ "200":
+ description: Settings object
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ post:
+ operationId: updateMultipleSettings
+ tags: [settings]
+ summary: Update user settings (partial merge)
+ description: Merges the provided object into the authenticated user's settings; keys not included in the request are left unchanged.
+ parameters:
+ - $ref: "#/components/parameters/ComfyUserHeader"
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true
+ description: Partial settings to merge
+ responses:
+ "200":
+ description: Settings updated
+
+ '400':
+ description: Invalid request
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/settings/{id}:
+ get:
+ operationId: getSettingById
+ tags: [settings]
+ summary: Get a single setting by key
+ description: Returns the value of a single setting, identified by key.
+ parameters:
+ - $ref: "#/components/parameters/ComfyUserHeader"
+ - name: id
+ in: path
+ required: true
+ schema:
+ type: string
+ description: Setting key
+ responses:
+ "200":
+ description: Setting value (null if the setting does not exist)
+ content:
+ application/json:
+ schema:
+ nullable: true
+ description: The setting value (any JSON type), or null if not set
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Setting not found
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ post:
+ operationId: updateSettingById
+ tags: [settings]
+ summary: Set a single setting value
+ description: Sets the value of a single setting, identified by key.
+ parameters:
+ - $ref: "#/components/parameters/ComfyUserHeader"
+ - name: id
+ in: path
+ required: true
+ schema:
+ type: string
+ description: Setting key
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ description: The setting value (any JSON type)
+ responses:
+ "200":
+ description: Setting updated
+
+ '400':
+ description: Invalid request
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ # ---------------------------------------------------------------------------
+ # Extensions / Templates / i18n
+ # ---------------------------------------------------------------------------
+ /api/extensions:
+ get:
+ operationId: getExtensions
+ tags: [extensions]
+ summary: List frontend extension JS file paths
+ description: Returns the list of frontend extension JS URLs registered by custom nodes, to be loaded by the frontend on startup.
+ responses:
+ "200":
+ description: Array of JS file paths
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ type: string
+ description: Relative path to extension JS file
+
+ /api/workflow_templates:
+ get:
+ operationId: getWorkflowTemplates
+ tags: [extensions]
+ summary: Get workflow template mappings
+ description: Returns a map of custom node names to their provided workflow template names.
+ responses:
+ "200":
+ description: Template mappings
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties:
+ type: array
+ items:
+ type: string
+ description: Map of node pack name to array of template names
+
+ /api/i18n:
+ get:
+ operationId: getI18n
+ tags: [extensions]
+ summary: Get internationalisation translation strings
+ description: Returns the translation strings contributed by custom nodes, as a nested map keyed by locale.
+ responses:
+ "200":
+ description: Translation map
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true
+ description: Nested map of locale to translation key-value pairs
+
+ # ---------------------------------------------------------------------------
+ # Subgraphs
+ # ---------------------------------------------------------------------------
+ /api/global_subgraphs:
+ get:
+ operationId: getGlobalSubgraphs
+ tags: [subgraph]
+ summary: List global subgraph blueprints
+ description: Returns a dictionary of subgraph IDs to their metadata.
+ responses:
+ "200":
+ description: Subgraph metadata dictionary
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties:
+ $ref: "#/components/schemas/GlobalSubgraphInfo"
+
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/global_subgraphs/{id}:
+ get:
+ operationId: getGlobalSubgraph
+ tags: [subgraph]
+ summary: Get a global subgraph with full data
+ description: Returns the blueprint for a globally-registered subgraph, used by the frontend to materialize the subgraph node.
+ parameters:
+ - name: id
+ in: path
+ required: true
+ schema:
+ type: string
+ description: Subgraph identifier
+ responses:
+ "200":
+ description: Full subgraph data
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/GlobalSubgraphData"
+ "404":
+ description: Subgraph not found
+
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ # ---------------------------------------------------------------------------
+ # Node Replacements
+ # ---------------------------------------------------------------------------
+ /api/node_replacements:
+ get:
+ operationId: getNodeReplacements
+ tags: [node]
+ summary: Get node replacement mappings
+ description: |
+ Returns a dictionary mapping deprecated or replaced node class names
+ to their replacement node information.
+ responses:
+ "200":
+ description: Replacement mappings
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true
+
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ # ---------------------------------------------------------------------------
+ # Internal (x-internal: true)
+ # ---------------------------------------------------------------------------
+ /internal/logs:
+ get:
+ operationId: getInternalLogs
+ tags: [internal]
+ summary: Get server logs as text
+ description: Returns the ComfyUI log entries from the in-memory log buffer, formatted as plain text.
+ x-internal: true
+ responses:
+ "200":
+ description: Log text
+ content:
+ text/plain:
+ schema:
+ type: string
+
+ /internal/logs/raw:
+ get:
+ operationId: getInternalLogsRaw
+ tags: [internal]
+ summary: Get raw structured log entries
+ description: Returns the raw structured log entries from the ComfyUI log buffer as JSON, together with a `size` object (`cols`, `rows`).
+ x-internal: true
+ responses:
+ "200":
+ description: Structured log data
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ entries:
+ type: array
+ items:
+ type: object
+ properties:
+ t:
+ type: number
+ description: Timestamp
+ m:
+ type: string
+ description: Message
+ size:
+ type: object
+ properties:
+ cols:
+ type: integer
+ rows:
+ type: integer
+
+ /internal/logs/subscribe:
+ patch:
+ operationId: subscribeToLogs
+ tags: [internal]
+ summary: Subscribe or unsubscribe a WebSocket client to log streaming
+ description: Subscribes or unsubscribes the current client from live log streaming over the WebSocket.
+ x-internal: true
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ required:
+ - clientId
+ - enabled
+ properties:
+ clientId:
+ type: string
+ description: WebSocket client ID
+ enabled:
+ type: boolean
+ description: Enable or disable log streaming for this client
+ responses:
+ "200":
+ description: Subscription updated
+
+ /internal/folder_paths:
+ get:
+ operationId: getInternalFolderPaths
+ tags: [internal]
+ summary: Get configured folder paths
+ description: Returns the filesystem paths ComfyUI is configured to load models and other assets from, keyed by folder type.
+ x-internal: true
+ responses:
+ "200":
+ description: Dictionary of folder type to paths
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties:
+ type: array
+ items:
+ type: array
+ items:
+ type: string
+ description: Map of folder type name to list of [path, ...] entries
+
+ /internal/files/{directory_type}:
+ get:
+ operationId: getFiles
+ tags: [internal]
+ summary: List files in a directory type
+ description: Lists the files present in one of ComfyUI's known directories (input, output, or temp).
+ x-internal: true
+ parameters:
+ - name: directory_type
+ in: path
+ required: true
+ schema:
+ type: string
+ description: Directory type (e.g. output, input, temp)
+ responses:
+ "200":
+ description: Array of filenames
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ type: string
+
+ '400':
+ description: Invalid directory type
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ # ---------------------------------------------------------------------------
+ # Assets (x-feature-gate: enable-assets)
+ # ---------------------------------------------------------------------------
+ /api/assets/hash/{hash}:
+ head:
+ operationId: checkAssetByHash
+ tags: [assets]
+ summary: Check if an asset with the given hash exists
+ description: Returns 200 if an asset with the given content hash already exists, 404 otherwise. Used by clients to deduplicate uploads before transferring bytes.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: hash
+ in: path
+ required: true
+ schema:
+ type: string
+ description: "Blake3 hash of the asset (e.g. blake3:abc123...)"
+ responses:
+ "200":
+ description: Asset exists
+ "404":
+ description: No asset with this hash
+
+ '400':
+ description: Invalid hash format
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/assets:
+ get:
+ operationId: listAssets
+ tags: [assets]
+ summary: List assets with filtering and pagination
+ description: Returns a paginated list of assets, optionally filtered by tags, name, or other query parameters.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: limit
+ in: query
+ schema:
+ type: integer
+ default: 50
+ - name: offset
+ in: query
+ schema:
+ type: integer
+ default: 0
+ - name: include_tags
+ in: query
+ schema:
+ type: array
+ items:
+ type: string
+ style: form
+ explode: true
+ description: Tags that assets must have (AND logic)
+ - name: exclude_tags
+ in: query
+ schema:
+ type: array
+ items:
+ type: string
+ style: form
+ explode: true
+ description: Tags that assets must not have
+ - name: name_contains
+ in: query
+ schema:
+ type: string
+ description: Filter assets whose name contains this substring
+ - name: metadata_filter
+ in: query
+ schema:
+ type: string
+ description: JSON-encoded metadata key/value filter
+ - name: sort
+ in: query
+ schema:
+ type: string
+ description: Field to sort by
+ - name: order
+ in: query
+ schema:
+ type: string
+ enum: [asc, desc]
+ description: Sort direction
+ - name: include_public
+ in: query
+ schema:
+ type: boolean
+ x-runtime: [cloud]
+ description: "[cloud-only] Include workspace-public assets in addition to the caller's own."
+ - name: asset_hash
+ in: query
+ schema:
+ type: string
+ x-runtime: [cloud]
+ description: "[cloud-only] Filter by exact content hash."
+ responses:
+ "200":
+ description: Asset list
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ListAssetsResponse"
+ '400':
+ description: Invalid request parameters
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ post:
+ operationId: uploadAsset
+ tags: [assets]
+ summary: Upload a new asset
+ description: Uploads a new asset (binary content plus metadata) and registers it in the asset database.
+ x-feature-gate: enable-assets
+ requestBody:
+ required: true
+ content:
+ multipart/form-data:
+ schema:
+ type: object
+ required:
+ - file
+ properties:
+ file:
+ type: string
+ format: binary
+ description: Asset file to upload
+ name:
+ type: string
+ description: Display name for the asset
+ tags:
+ type: string
+ description: Comma-separated tags
+ user_metadata:
+ type: string
+ description: JSON-encoded user metadata
+ hash:
+ type: string
+ description: "Blake3 hash of the file content (e.g. blake3:abc123...)"
+ mime_type:
+ type: string
+ description: MIME type of the file (overrides auto-detected type)
+ preview_id:
+ type: string
+ format: uuid
+ description: ID of an existing asset to use as the preview image
+ id:
+ type: string
+ format: uuid
+ nullable: true
+ x-runtime: [cloud]
+ description: "[cloud-only] Client-supplied asset ID for idempotent creation. If an asset with this ID already exists, the existing asset is returned."
+ application/json:
+ schema:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] URL-based asset upload. Caller supplies a URL instead of a file body; the server fetches the content."
+ required:
+ - url
+ properties:
+ url:
+ type: string
+ format: uri
+ description: "[cloud-only] URL of the file to import as an asset"
+ name:
+ type: string
+ description: Display name for the asset
+ tags:
+ type: string
+ description: Comma-separated tags
+ user_metadata:
+ type: string
+ description: JSON-encoded user metadata
+ hash:
+ type: string
+ description: "Blake3 hash of the file content (e.g. blake3:abc123...)"
+ mime_type:
+ type: string
+ description: MIME type of the file (overrides auto-detected type)
+ preview_id:
+ type: string
+ format: uuid
+ description: ID of an existing asset to use as the preview image
+ id:
+ type: string
+ format: uuid
+ nullable: true
+ x-runtime: [cloud]
+ description: "[cloud-only] Client-supplied asset ID for idempotent creation. If an asset with this ID already exists, the existing asset is returned."
+ responses:
+ "201":
+ description: Asset created
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/AssetCreated"
+
+ '200':
+ description: Asset already exists (returned existing asset)
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/AssetCreated'
+ '400':
+ description: Invalid request (bad file, invalid URL, invalid content type, etc.)
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '403':
+ description: Source URL requires authentication or access denied
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Source URL not found
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '413':
+ description: File too large
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '415':
+ description: Unsupported media type
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '422':
+ description: Download failed due to network error or timeout
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/from-hash:
+ post:
+ operationId: createAssetFromHash
+ tags: [assets]
+ summary: Create an asset reference from an existing hash
+ description: Registers a new asset that references existing content by hash, without re-uploading the bytes.
+ x-feature-gate: enable-assets
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ required:
+ - hash
+ - name
+ properties:
+ hash:
+ type: string
+ description: Blake3 hash of existing content
+ name:
+ type: string
+ description: Display name
+ tags:
+ type: array
+ items:
+ type: string
+ user_metadata:
+ type: object
+ additionalProperties: true
+ mime_type:
+ type: string
+ nullable: true
+ x-runtime: [cloud]
+ description: "[cloud-only] MIME type of the content, so the type is preserved without re-inspecting content. Ignored by local ComfyUI."
+ responses:
+ "201":
+ description: Asset created from hash
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/AssetCreated"
+
+ '200':
+ description: Asset reference already exists (returned existing)
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/AssetCreated'
+ '400':
+ description: Invalid request (bad hash format, invalid tags, etc.)
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Source asset with given hash not found
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/{id}:
+ get:
+ operationId: getAssetById
+ tags: [assets]
+ summary: Get asset metadata
+ description: Returns the metadata for a single asset.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: id
+ in: path
+ description: The asset ID.
+ required: true
+ schema:
+ type: string
+ format: uuid
+ responses:
+ "200":
+ description: Asset metadata
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/Asset"
+ "404":
+ description: Asset not found
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ put:
+ operationId: updateAsset
+ tags: [assets]
+ summary: Update asset metadata
+ description: Updates the mutable metadata of an asset (name, tags, etc.). Binary content is immutable.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: id
+ in: path
+ description: The asset ID.
+ required: true
+ schema:
+ type: string
+ format: uuid
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ name:
+ type: string
+ description: New display name for the asset
+ user_metadata:
+ type: object
+ additionalProperties: true
+ description: Custom user metadata to set
+ preview_id:
+ type: string
+ format: uuid
+ description: ID of the asset to use as the preview
+ mime_type:
+ type: string
+ nullable: true
+ x-runtime: [cloud]
+ description: "[cloud-only] MIME type override when auto-detection was wrong. Ignored by local ComfyUI."
+ responses:
+ "200":
+ description: Asset updated
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/AssetUpdated"
+ '400':
+ description: Invalid request (no fields provided)
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Asset not found
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ delete:
+ operationId: deleteAsset
+ tags: [assets]
+ summary: Delete an asset
+ description: Removes an asset entry. Depending on the server configuration, the underlying content may also be deleted.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: id
+ in: path
+ description: The asset ID.
+ required: true
+ schema:
+ type: string
+ format: uuid
+ - name: delete_content
+ in: query
+ schema:
+ type: boolean
+ description: Also delete the underlying content file
+ responses:
+ "204":
+ description: Asset deleted
+
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Asset not found
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '409':
+ description: Asset cannot be deleted because it is referenced by another resource (e.g., workflow version)
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/{id}/content:
+ get:
+ operationId: getAssetContent
+ tags: [assets]
+ summary: Download asset file content
+ description: Returns the binary content of an asset. Supports range requests.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: id
+ in: path
+ description: The asset ID.
+ required: true
+ schema:
+ type: string
+ format: uuid
+ responses:
+ "200":
+ description: Asset file content
+ content:
+ application/octet-stream:
+ schema:
+ type: string
+ format: binary
+ "404":
+ description: Asset not found
+
+ /api/assets/{id}/tags:
+ post:
+ operationId: addAssetTags
+ tags: [assets]
+ summary: Add tags to an asset
+ description: Adds one or more tags to an asset.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: id
+ in: path
+ description: The asset ID.
+ required: true
+ schema:
+ type: string
+ format: uuid
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ required:
+ - tags
+ properties:
+ tags:
+ type: array
+ items:
+ type: string
+ responses:
+ "200":
+ description: Tags added
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/TagsModificationResponse"
+ '400':
+ description: Invalid request
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Asset not found
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '422':
+ description: Validation error (e.g., reserved tag)
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ delete:
+ operationId: removeAssetTags
+ tags: [assets]
+ summary: Remove tags from an asset
+ description: Removes one or more tags from an asset.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: id
+ in: path
+ description: The asset ID.
+ required: true
+ schema:
+ type: string
+ format: uuid
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ required:
+ - tags
+ properties:
+ tags:
+ type: array
+ items:
+ type: string
+ responses:
+ "200":
+ description: Tags removed
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/TagsModificationResponse"
+
+ '400':
+ description: Invalid request
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Asset not found
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '422':
+ description: Validation error (e.g., reserved tag)
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/tags:
+ get:
+ operationId: listTags
+ tags: [assets]
+ summary: List all known tags with counts
+ description: Returns the list of all tags known to the asset database, with counts.
+ x-feature-gate: enable-assets
+ parameters:
+ - name: limit
+ in: query
+ schema:
+ type: integer
+ - name: offset
+ in: query
+ schema:
+ type: integer
+ - name: search
+ in: query
+ schema:
+ type: string
+ description: Search term for tag name
+ responses:
+ "200":
+ description: Tag list
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ListTagsResponse"
+
+ '400':
+ description: Invalid request parameters
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '401':
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/tags/refine:  # tag histogram for a filtered asset query; gated by enable-assets
+   get:
+     operationId: getAssetTagHistogram
+     tags: [assets]
+     summary: Get tag counts for assets matching current filters
+     description: Returns suggested additional tags that would refine a filtered asset query, together with the count of assets each tag would select.
+     x-feature-gate: enable-assets
+     parameters:
+       - name: include_tags  # repeated query key (style form + explode true)
+         in: query
+         schema:
+           type: array
+           items:
+             type: string
+         style: form
+         explode: true
+         description: Tags that assets must have (AND logic)
+       - name: exclude_tags  # repeated query key (style form + explode true)
+         in: query
+         schema:
+           type: array
+           items:
+             type: string
+         style: form
+         explode: true
+         description: Tags that assets must not have
+       - name: name_contains
+         in: query
+         schema:
+           type: string
+         description: Filter assets whose name contains this substring
+       - name: metadata_filter
+         in: query
+         schema:
+           type: string
+         description: JSON-encoded metadata key/value filter
+       - name: limit  # no description or bounds declared in the spec
+         in: query
+         schema:
+           type: integer
+       - name: offset  # no description or bounds declared in the spec
+         in: query
+         schema:
+           type: integer
+       - name: sort
+         in: query
+         schema:
+           type: string
+         description: Field to sort by
+       - name: order
+         in: query
+         schema:
+           type: string
+           enum: [asc, desc]
+         description: Sort direction
+     responses:
+       "200":
+         description: Tag histogram
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/AssetTagHistogramResponse"
+
+       '400':
+         description: Invalid request parameters
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '401':
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/seed:  # starts an asset scan of the configured directories; gated by enable-assets
+   post:
+     operationId: seedAssets
+     tags: [assets]
+     summary: Trigger asset scan/seed from filesystem
+     description: Starts a background job that scans the configured directories and registers any assets not yet present in the asset database.
+     x-feature-gate: enable-assets
+     requestBody:
+       required: false  # body is optional; omitting roots scans everything
+       content:
+         application/json:
+           schema:
+             type: object
+             properties:
+               roots:
+                 type: array
+                 items:
+                   type: string
+                 description: Root folder paths to scan (if omitted, scans all)
+     responses:
+       "200":  # only the success response is documented for this operation
+         description: Seed started
+         content:
+           application/json:
+             schema:
+               type: object
+               properties:
+                 status:
+                   type: string
+
+ /api/assets/seed/status:  # progress of the most recently started seed job; gated by enable-assets
+   get:
+     operationId: getAssetSeedStatus
+     tags: [assets]
+     summary: Get asset scan progress
+     description: Returns the progress and status of the most recently-started asset seed job.
+     x-feature-gate: enable-assets
+     responses:
+       "200":
+         description: Scan progress
+         content:
+           application/json:
+             schema:
+               type: object
+               additionalProperties: true  # free-form object; no fixed field list is declared
+               description: Scan progress details (files scanned, total, status, etc.)
+
+ /api/assets/seed/cancel:  # requests cancellation of the running seed job; gated by enable-assets
+   post:
+     operationId: cancelAssetSeed
+     tags: [assets]
+     summary: Cancel an in-progress asset scan
+     description: Requests cancellation of the currently-running asset seed job.
+     x-feature-gate: enable-assets
+     responses:
+       "200":  # only the success response is documented for this operation
+         description: Scan cancelled
+         content:
+           application/json:
+             schema:
+               type: object
+               properties:
+                 status:
+                   type: string
+
+ /api/assets/prune:  # marks assets with missing backing files; gated by enable-assets
+   post:
+     operationId: pruneAssets
+     tags: [assets]
+     summary: Mark assets whose backing files no longer exist on disk
+     description: Marks asset entries whose underlying content no longer exists on disk as missing and reports how many entries were marked.
+     x-feature-gate: enable-assets
+     responses:
+       "200":
+         description: Prune result
+         content:
+           application/json:
+             schema:
+               type: object
+               properties:
+                 status:
+                   type: string
+                 marked:  # count returned in the same response as status
+                   type: integer
+                   description: Number of assets marked as missing
+
+ # ===========================================================================
+ # Cloud-runtime frontend-facing operations
+ #
+ # These operations are served by the cloud runtime. The local runtime returns
+ # 404 for all of these paths. Each operation is tagged x-runtime: [cloud].
+ # ===========================================================================
+
+ # ---------------------------------------------------------------------------
+ # Jobs / prompts (cloud)
+ # ---------------------------------------------------------------------------
+ /api/jobs/{job_id}/cancel:  # cloud-only: cancel a running or pending job
+   post:
+     operationId: cancelJob
+     tags: [queue]
+     summary: Cancel a running or pending job
+     description: "[cloud-only] Requests cancellation of a job. If the job is currently executing, execution is interrupted. If it is pending in the queue, it is removed."
+     x-runtime: [cloud]
+     parameters:
+       - name: job_id
+         in: path
+         required: true
+         schema:
+           type: string
+           format: uuid
+         description: The job ID to cancel.
+     responses:
+       "200":
+         description: Cancellation accepted
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/JobCancelResponse"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '400':  # uses BindingErrorResponse, unlike the CloudError/ErrorResponse siblings
+         description: Bad Request - job_id is not a valid UUID (emitted by request validation before the handler runs)
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/BindingErrorResponse'
+       '500':
+         description: Internal server error - cancellation failed
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/job/{job_id}/status:  # deprecated cloud endpoint; note the singular "job" path segment
+   get:
+     operationId: getJobStatus
+     tags: [queue]
+     summary: Get status of a cloud job
+     deprecated: true
+     description: |
+       **Deprecated.** This endpoint is superseded by `GET /api/jobs/{job_id}`.
+       Clients should migrate; the endpoint is retained for backward
+       compatibility but will be removed in a future release.
+     x-runtime: [cloud]
+     parameters:
+       - name: job_id
+         in: path
+         required: true
+         schema:
+           type: string
+           format: uuid
+         description: The job ID to check status for.
+     responses:
+       "200":
+         description: Job status
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/JobStatusResponse"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '403':
+         description: Forbidden - job belongs to another user
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/prompt/{prompt_id}:  # cloud-only: fetch the full record of one prompt
+   get:
+     operationId: getCloudPrompt
+     tags: [prompt]
+     summary: Get a cloud prompt by ID
+     description: "[cloud-only] Returns the full prompt record for a cloud-executed prompt, including the submitted workflow graph and execution metadata."
+     x-runtime: [cloud]
+     parameters:
+       - name: prompt_id
+         in: path
+         required: true
+         schema:
+           type: string
+           format: uuid
+         description: The prompt ID to fetch.
+     responses:
+       "200":
+         description: Cloud prompt detail
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudPrompt"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+ /api/history_v2:  # deprecated cloud endpoint; description points to GET /api/jobs
+   get:
+     operationId: getHistory
+     tags: [history]
+     summary: Get paginated execution history (v2)
+     deprecated: true
+     description: |
+       **Deprecated.** This endpoint is superseded by `GET /api/jobs`.
+       Clients should migrate; the endpoint is retained for backward
+       compatibility but will be removed in a future release.
+     x-runtime: [cloud]
+     parameters:
+       - name: limit
+         in: query
+         schema:
+           type: integer
+           default: 20
+         description: Maximum number of results
+       - name: offset
+         in: query
+         schema:
+           type: integer
+           default: 0
+         description: Pagination offset
+       - name: status  # free-form string; no enum of statuses is declared
+         in: query
+         schema:
+           type: string
+         description: Filter by execution status
+     responses:
+       "200":
+         description: History list
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HistoryResponse"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/history_v2/{prompt_id}:  # deprecated cloud endpoint; description points to GET /api/jobs/{prompt_id}
+   get:
+     operationId: getHistoryForPrompt
+     tags: [history]
+     summary: Get v2 history for a specific prompt
+     deprecated: true
+     description: |
+       **Deprecated.** This endpoint is superseded by `GET /api/jobs/{prompt_id}`.
+       Clients should migrate; the endpoint is retained for backward
+       compatibility but will be removed in a future release.
+     x-runtime: [cloud]
+     parameters:
+       - name: prompt_id
+         in: path
+         required: true
+         schema:
+           type: string
+           format: uuid
+         description: The prompt ID to fetch history for.
+     responses:
+       "200":
+         description: History entry
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HistoryDetailResponse"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/logs:  # deprecated cloud endpoint; per its description it returns only a static placeholder
+   get:
+     operationId: getLogs
+     tags: [system]
+     summary: Get cloud execution logs
+     deprecated: true
+     description: |
+       **Deprecated.** This endpoint returns a static placeholder response and
+       provides no real log data. It is retained only to avoid breaking clients
+       that still call it. Clients should remove their dependency; the endpoint
+       will be removed in a future release.
+     x-runtime: [cloud]
+     parameters:
+       - name: job_id  # plain string here, while the job path parameters use format uuid
+         in: query
+         schema:
+           type: string
+         description: Filter logs by job ID
+       - name: limit
+         in: query
+         schema:
+           type: integer
+           default: 100
+         description: Maximum number of log entries
+       - name: offset
+         in: query
+         schema:
+           type: integer
+           default: 0
+         description: Pagination offset
+     responses:
+       "200":
+         description: Log entries
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/LogsResponse"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+ # ---------------------------------------------------------------------------
+ # Assets extensions (cloud)
+ # ---------------------------------------------------------------------------
+ /api/assets/download:  # cloud-only: 202 returns a task to track, 200 returns an existing asset
+   post:
+     operationId: createAssetDownload
+     tags: [assets]
+     summary: Download assets to cloud runtime
+     description: "[cloud-only] Initiates a download of one or more assets to the cloud runtime environment. Returns a task ID for tracking download progress via WebSocket."
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             type: object
+             required:
+               - assets
+             properties:
+               assets:
+                 type: array
+                 items:
+                   $ref: "#/components/schemas/AssetDownloadRequest"
+                 description: Assets to download
+     responses:
+       "202":
+         description: Download task accepted
+         content:
+           application/json:
+             schema:
+               type: object
+               required:
+                 - task_id
+                 - status
+               properties:
+                 task_id:
+                   type: string
+                   format: uuid
+                   description: ID of the download task; use to poll status.
+                 status:
+                   type: string
+                   enum: [created, running, completed, failed]
+                   description: Current task status (typically `created` on initial creation).
+                 message:
+                   type: string
+                   description: Human-readable task message.
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '200':  # synchronous shortcut when nothing needs downloading
+         description: File already exists in storage - asset created/returned immediately
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/AssetCreated'
+       '422':
+         description: Validation errors
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/export:  # cloud-only: start a bulk export task; archive is fetched via the exports endpoint
+   post:
+     operationId: createAssetExport
+     tags: [assets]
+     summary: Export assets as a downloadable archive
+     description: "[cloud-only] Initiates a bulk export of assets. Returns a task ID for tracking progress via WebSocket. When complete, the export can be downloaded via the exports endpoint."
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             type: object  # no property is marked required by the schema
+             properties:
+               job_ids:
+                 type: array
+                 items:
+                   type: string
+                 description: Job IDs whose associated assets should all be included in the ZIP bundle.
+               asset_ids:
+                 type: array
+                 items:
+                   type: string
+                   format: uuid
+                 description: Asset IDs to include in the ZIP bundle. Additive to assets associated with provided job IDs.
+               export_name:
+                 type: string
+                 description: Name for the export archive
+               naming_strategy:
+                 type: string
+                 enum: [group_by_job_id, preserve, asset_id, group_by_job_time]
+                 default: group_by_job_time
+                 description: "Strategy for naming files in the ZIP: group by job ID, preserve original names, use the asset ID, or group by job creation time."
+               job_asset_name_filters:  # map of job ID -> non-empty list of asset names
+                 type: object
+                 additionalProperties:
+                   type: array
+                   minItems: 1
+                   items:
+                     type: string
+                 description: Optional per-job asset name filters. When provided for a job ID, only assets whose name matches one of the listed names are included.
+     responses:
+       "202":
+         description: Export task accepted
+         content:
+           application/json:
+             schema:
+               type: object
+               required:
+                 - task_id
+                 - status
+               properties:
+                 task_id:
+                   type: string
+                   format: uuid
+                   description: ID of the export task; use to poll status.
+                 status:
+                   type: string
+                   enum: [created, running, completed, failed]
+                   description: Current task status (typically `created` on initial creation).
+                 message:
+                   type: string
+                   description: Human-readable task message.
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/exports/{exportName}:  # cloud-only: download a finished export as application/zip
+   get:
+     operationId: getAssetExport
+     tags: [assets]
+     summary: Download a completed asset export
+     description: "[cloud-only] Returns the archive file for a completed asset export."
+     x-runtime: [cloud]
+     parameters:
+       - name: exportName  # camelCase, unlike the snake_case path parameters elsewhere
+         in: path
+         required: true
+         schema:
+           type: string
+         description: Name of the export to download
+     responses:
+       "200":
+         description: Export archive file
+         content:
+           application/zip:
+             schema:
+               type: string
+               format: binary
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '400':
+         description: Invalid export name
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/from-workflow:  # cloud-only: register a prompt's output files as assets
+   post:
+     operationId: postAssetsFromWorkflow
+     tags: [assets]
+     summary: Create asset records from a workflow execution
+     description: "[cloud-only] Registers output files from a workflow execution as assets in the asset database."
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             type: object
+             required:
+               - prompt_id
+             properties:
+               prompt_id:
+                 type: string
+                 format: uuid
+                 description: Prompt ID whose outputs should be registered as assets
+               tags:
+                 type: array
+                 items:
+                   type: string
+                 description: Tags to apply to the created assets
+     responses:
+       "200":
+         description: Assets created or referenced
+         content:
+           application/json:
+             schema:
+               type: object
+               properties:
+                 assets:
+                   type: array
+                   items:
+                     $ref: "#/components/schemas/Asset"
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/import:  # cloud-only: import published assets into the caller's library
+   post:
+     operationId: importPublishedAssets
+     tags: [assets]
+     summary: "Import published assets into the caller's library"
+     description: |
+       [cloud-only] Imports the specified published assets into the caller's asset library. New DB records reference the same storage objects; no file copying occurs. Assets the caller already owns (by hash) are deduplicated. The `id` field on each returned `AssetInfo` is the caller's newly-created private asset ID, not the published asset ID supplied in the request.
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             $ref: "#/components/schemas/ImportPublishedAssetsRequest"
+     responses:
+       "200":
+         description: Successfully imported assets
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/ImportPublishedAssetsResponse"
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/assets/remote-metadata:  # cloud-only: inspect a remote URL without downloading its content
+   get:
+     operationId: getRemoteAssetMetadata
+     tags: [assets]
+     summary: Fetch metadata for a remote asset URL
+     description: "[cloud-only] Fetches and returns metadata (content type, size, filename) for a remote URL without downloading the full content."
+     x-runtime: [cloud]
+     parameters:
+       - name: url
+         in: query
+         required: true
+         schema:
+           type: string
+           format: uri
+         description: URL to inspect
+     responses:
+       "200":
+         description: Remote metadata
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/AssetMetadataResponse"
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '422':  # the remote source could not be inspected
+         description: Failed to retrieve metadata from source
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ # ---------------------------------------------------------------------------
+ # Custom nodes / hub (cloud)
+ # ---------------------------------------------------------------------------
+ /api/experiment/nodes:  # cloud-only: GET serves the node schema, POST installs a node package
+   get:
+     operationId: getNodeInfoSchema
+     tags: [runtime-only]
+     summary: Get pre-rendered node info schema
+     description: "[cloud-only] Returns the static ComfyUI object_info schema, identical for every caller, rendered once at startup with empty model/user-file context. Served by a raw HTTP handler that writes pre-rendered bytes with ETag + Cache-Control validators for RFC 7232 conditional GETs."
+     x-runtime: [cloud]
+     parameters:
+       - name: If-None-Match  # conditional-GET validator; pairs with the ETag response header
+         in: header
+         required: false
+         schema:
+           type: string
+         description: Entity tag previously returned by this endpoint. When present and matching, the server returns 304 Not Modified.
+     responses:
+       "200":
+         description: Node info schema
+         headers:
+           ETag:
+             schema:
+               type: string
+             description: Entity tag for conditional request validation
+           Cache-Control:
+             schema:
+               type: string
+             description: Cache directives for the response
+         content:
+           application/json:
+             schema:
+               type: object  # map whose values are NodeInfo objects
+               additionalProperties:
+                 $ref: "#/components/schemas/NodeInfo"
+       "304":
+         description: Not Modified — returned when the client sends a matching If-None-Match header
+   post:
+     operationId: installCloudNode
+     tags: [node]
+     summary: Install a custom node package
+     description: "[cloud-only] Installs a custom node package in the cloud runtime by ID or repository URL."
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             type: object
+             required:
+               - id
+             properties:
+               id:
+                 type: string
+                 description: Node package ID or repository URL
+               version:
+                 type: string
+                 description: Specific version to install
+     responses:
+       "200":
+         description: Node installed
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudNode"
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+ /api/experiment/nodes/{id}:  # cloud-only: GET reads one node definition, DELETE uninstalls a package
+   get:
+     operationId: getNodeByID
+     tags: [runtime-only]
+     summary: Get a single node definition by ID
+     description: "[cloud-only] Returns one node's definition from the pre-indexed object_info schema. Served by a raw HTTP handler that writes pre-rendered bytes with ETag + Cache-Control validators for RFC 7232 conditional GETs."
+     x-runtime: [cloud]
+     parameters:
+       - name: id  # described as a node class identifier for GET
+         in: path
+         required: true
+         schema:
+           type: string
+         description: Node class identifier
+       - name: If-None-Match  # conditional-GET validator; pairs with the ETag response header
+         in: header
+         required: false
+         schema:
+           type: string
+         description: Entity tag previously returned by this endpoint. When present and matching, the server returns 304 Not Modified.
+     responses:
+       "200":
+         description: Single node definition
+         headers:
+           ETag:
+             schema:
+               type: string
+             description: Entity tag for conditional request validation
+           Cache-Control:
+             schema:
+               type: string
+             description: Cache directives for the response
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/NodeInfo"
+       "304":
+         description: Not Modified — returned when the client sends a matching If-None-Match header
+       "404":  # no response body is declared for this status
+         description: Node not found
+   delete:
+     operationId: uninstallCloudNode
+     tags: [node]
+     summary: Uninstall a custom node package
+     description: "[cloud-only] Removes a custom node package from the cloud runtime."
+     x-runtime: [cloud]
+     parameters:
+       - name: id  # described as a custom node package ID for DELETE
+         in: path
+         required: true
+         schema:
+           type: string
+         description: Custom node package ID
+     responses:
+       "204":
+         description: Node uninstalled
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+ /api/hub/assets/upload-url:  # cloud-only: obtain a pre-signed URL for uploading a hub asset
+   post:
+     operationId: createHubAssetUploadUrl
+     tags: [hub]
+     summary: Get a pre-signed upload URL for a hub asset
+     description: "[cloud-only] Returns a pre-signed URL that can be used to upload an asset file directly to storage."
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             type: object
+             required:
+               - filename
+               - content_type
+             properties:
+               filename:
+                 type: string
+                 description: Name of the file to upload
+               content_type:
+                 type: string
+                 description: MIME type of the file
+               size:  # optional; not listed under required
+                 type: integer
+                 format: int64
+                 description: File size in bytes
+     responses:
+       "200":
+         description: Upload URL
+         content:
+           application/json:
+             schema:
+               type: object
+               properties:
+                 upload_url:
+                   type: string
+                   format: uri
+                   description: Pre-signed upload URL
+                 asset_url:
+                   type: string
+                   format: uri
+                   description: Public URL after upload completes
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '404':
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/hub/labels:  # cloud-only: list labels available for tagging hub content
+   get:
+     operationId: listHubLabels
+     tags: [hub]
+     summary: List available hub labels
+     description: "[cloud-only] Returns the list of labels/categories available for tagging hub content."
+     x-runtime: [cloud]
+     responses:
+       "200":
+         description: Label list
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HubLabelListResponse"
+       '400':  # NOTE(review): mentions a "type" parameter, but this operation declares no parameters - confirm
+         description: Bad request (e.g. invalid type parameter)
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/hub/profiles:  # cloud-only: GET lists public profiles, POST creates one
+   get:
+     operationId: listHubProfiles
+     tags: [hub]
+     summary: List hub user profiles
+     description: "[cloud-only] Returns a paginated list of public hub user profiles."
+     x-runtime: [cloud]
+     parameters:
+       - name: limit
+         in: query
+         schema:
+           type: integer
+         description: Maximum number of results
+       - name: offset
+         in: query
+         schema:
+           type: integer
+         description: Pagination offset
+       - name: search
+         in: query
+         schema:
+           type: string
+         description: Search by username or display name
+     responses:
+       "200":  # only the success response is documented for the list operation
+         description: Profile list
+         content:
+           application/json:
+             schema:
+               type: object
+               properties:
+                 profiles:
+                   type: array
+                   items:
+                     $ref: "#/components/schemas/HubProfile"
+                 total:
+                   type: integer
+                 has_more:
+                   type: boolean
+   post:
+     operationId: createHubProfile
+     tags: [hub]
+     summary: Create a Hub profile
+     description: "[cloud-only] Creates a hub profile for the specified workspace. Username is immutable after creation."
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             $ref: "#/components/schemas/CreateHubProfileRequest"
+     responses:
+       "201":
+         description: Hub profile created
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HubProfile"
+       "400":
+         description: Bad request (e.g. invalid username)
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "409":
+         description: Username already taken or profile already exists
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/hub/profiles/{username}:  # templated; the concrete /check and /me siblings match first per OpenAPI
+   get:
+     operationId: getHubProfile
+     tags: [hub]
+     summary: Get a hub profile by username
+     description: "[cloud-only] Returns the public hub profile for the given username."
+     x-runtime: [cloud]
+     parameters:
+       - name: username
+         in: path
+         required: true
+         schema:
+           type: string
+         description: Hub username
+     responses:
+       "200":
+         description: Profile
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HubProfile"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/hub/profiles/check:  # cloud-only: username availability lookup
+   get:
+     operationId: checkHubUsername
+     tags: [hub]
+     summary: Check if a hub username is available
+     description: "[cloud-only] Returns whether the given username is available for registration."
+     x-runtime: [cloud]
+     parameters:
+       - name: username
+         in: query
+         required: true
+         schema:
+           type: string
+         description: Username to check
+     responses:
+       "200":
+         description: Availability result
+         content:
+           application/json:
+             schema:
+               type: object
+               properties:
+                 available:
+                   type: boolean
+                 username:
+                   type: string
+
+       '401':
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '404':
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/hub/profiles/me:  # cloud-only: read or update the authenticated user's own profile
+   get:
+     operationId: getMyHubProfile
+     tags: [hub]
+     summary: Get the authenticated user's hub profile
+     description: "[cloud-only] Returns the hub profile of the currently authenticated user."
+     x-runtime: [cloud]
+     responses:
+       "200":
+         description: Profile
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HubProfile"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       '404':
+         description: No hub profile exists
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+   put:
+     operationId: updateMyHubProfile
+     tags: [hub]
+     summary: Update the authenticated user's hub profile
+     description: "[cloud-only] Updates the hub profile of the currently authenticated user."
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             type: object
+             properties:
+               username:  # NOTE(review): createHubProfile calls username immutable after creation - confirm
+                 type: string
+               display_name:
+                 type: string
+               bio:
+                 type: string
+               avatar_url:
+                 type: string
+                 format: uri
+               links:
+                 type: array
+                 items:
+                   type: string
+                   format: uri
+     responses:
+       "200":
+         description: Updated profile
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HubProfile"
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "409":
+         description: Conflict
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+ /api/hub/workflows:  # cloud-only: GET lists published workflows, POST publishes one
+   get:
+     operationId: listHubWorkflows
+     tags: [hub]
+     summary: List published hub workflows
+     description: "[cloud-only] Returns a paginated list of publicly shared workflows on the hub."
+     x-runtime: [cloud]
+     parameters:
+       - name: limit
+         in: query
+         schema:
+           type: integer
+         description: Maximum number of results
+       - name: offset
+         in: query
+         schema:
+           type: integer
+         description: Pagination offset
+       - name: sort
+         in: query
+         schema:
+           type: string
+         description: Sort field (e.g. created_at, likes)
+       - name: order
+         in: query
+         schema:
+           type: string
+           enum: [asc, desc]
+         description: Sort direction
+       - name: search
+         in: query
+         schema:
+           type: string
+         description: Search by title or description
+       - name: labels  # a single comma-separated string, not an array parameter
+         in: query
+         schema:
+           type: string
+         description: Filter by label IDs (comma-separated)
+     responses:
+       "200":
+         description: Hub workflow list
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HubWorkflowListResponse"
+       '400':  # NOTE(review): mentions a cursor, but only limit/offset are declared - confirm
+         description: Bad request (e.g. malformed pagination cursor)
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '404':  # NOTE(review): mentions a username filter that is not a declared parameter - confirm
+         description: Profile not found (when filtering by username)
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+   post:
+     operationId: publishHubWorkflow
+     tags: [hub]
+     summary: Publish a workflow to the hub
+     description: "[cloud-only] Publishes a workflow to the hub with metadata, thumbnail, and sample images."
+     x-runtime: [cloud]
+     requestBody:
+       required: true
+       content:
+         application/json:
+           schema:
+             $ref: "#/components/schemas/PublishHubWorkflowRequest"
+     responses:
+       "200":
+         description: Workflow published to hub
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HubWorkflowDetail"
+       "400":
+         description: Bad request
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Workflow or profile not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ /api/hub/workflows/{share_id}:  # templated; the concrete /index sibling matches first per OpenAPI
+   get:
+     operationId: getHubWorkflow
+     tags: [hub]
+     summary: Get a published hub workflow by share ID
+     description: "[cloud-only] Returns the full details of a published workflow on the hub."
+     x-runtime: [cloud]
+     parameters:
+       - name: share_id
+         in: path
+         required: true
+         schema:
+           type: string
+         description: Workflow share ID
+     responses:
+       "200":
+         description: Hub workflow
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/HubWorkflowDetail"
+       "404":
+         description: Not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       '413':  # NOTE(review): 413 on a GET with no request body looks misplaced - confirm
+         description: Workflow JSON too large
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+   delete:
+     operationId: deleteHubWorkflow
+     tags: [hub]
+     summary: Unpublish a workflow from the hub
+     description: "[cloud-only] Removes a workflow from the hub listing."
+     x-runtime: [cloud]
+     parameters:
+       - name: share_id
+         in: path
+         required: true
+         schema:
+           type: string
+         description: Workflow share ID
+     responses:
+       "204":
+         description: Successfully unpublished
+       "401":
+         description: Unauthorized
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+       "404":
+         description: Workflow not found
+         content:
+           application/json:
+             schema:
+               $ref: "#/components/schemas/CloudError"
+
+       '500':
+         description: Internal server error
+         content:
+           application/json:
+             schema:
+               $ref: '#/components/schemas/ErrorResponse'
+ # Hub index: the 200 body is a bare array of HubWorkflowIndexEntry items.
+ /api/hub/workflows/index:
+ get:
+ operationId: listHubWorkflowIndex
+ tags: [hub]
+ summary: Get the hub workflow index
+ description: "[cloud-only] Returns the lightweight index of all hub workflows for client-side search and navigation."
+ x-runtime: [cloud]
+ responses:
+ "200":
+ description: Workflow index
+ content:
+ application/json:
+ schema:
+ type: array
+ items:
+ $ref: "#/components/schemas/HubWorkflowIndexEntry"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ # ---------------------------------------------------------------------------
+ # Workflows (cloud)
+ # ---------------------------------------------------------------------------
+ # Cloud workflow collection: GET lists with limit/offset paging, POST creates a workflow (only name is required).
+ /api/workflows:
+ get:
+ operationId: listWorkflows
+ tags: [workflows]
+ summary: List cloud workflows
+ description: "[cloud-only] Returns a paginated list of the authenticated user's cloud workflows."
+ x-runtime: [cloud]
+ parameters:
+ - name: limit
+ in: query
+ schema:
+ type: integer
+ description: Maximum number of results
+ - name: offset
+ in: query
+ schema:
+ type: integer
+ description: Pagination offset
+ - name: sort
+ in: query
+ schema:
+ type: string
+ description: Sort field
+ - name: order
+ in: query
+ schema:
+ type: string
+ enum: [asc, desc]
+ description: Sort direction
+ - name: search
+ in: query
+ schema:
+ type: string
+ description: Search by workflow name
+ responses:
+ "200":
+ description: Workflow list
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/WorkflowListResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ post:
+ operationId: createWorkflow
+ tags: [workflows]
+ summary: Create a new cloud workflow
+ description: "[cloud-only] Creates a new cloud workflow with the provided name and optional initial content."
+ x-runtime: [cloud]
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ required:
+ - name
+ properties:
+ name:
+ type: string
+ description: Workflow name
+ description:
+ type: string
+ description: Workflow description
+ content:
+ type: object
+ additionalProperties: true
+ description: Initial workflow graph JSON
+ responses:
+ "201":
+ description: Workflow created
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/WorkflowResponse"
+ "400":
+ description: Bad request
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "422":
+ description: Validation error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ # One cloud workflow addressed by a UUID workflow_id: GET reads metadata, PATCH edits name/description, DELETE removes it.
+ /api/workflows/{workflow_id}:
+ get:
+ operationId: getWorkflow
+ tags: [workflows]
+ summary: Get a cloud workflow by ID
+ description: "[cloud-only] Returns the metadata for a cloud workflow."
+ x-runtime: [cloud]
+ parameters:
+ - name: workflow_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The workflow ID.
+ responses:
+ "200":
+ description: Workflow detail
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/WorkflowResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "403":
+ description: Forbidden
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ patch:
+ operationId: updateWorkflow
+ tags: [workflows]
+ summary: Update a cloud workflow
+ description: "[cloud-only] Updates the metadata (name, description) of an existing cloud workflow."
+ x-runtime: [cloud]
+ parameters:
+ - name: workflow_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The workflow ID.
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ name:
+ type: string
+ description:
+ type: string
+ responses:
+ "200":
+ description: Workflow updated
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/WorkflowResponse"
+ "400":
+ description: Bad request
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "422":
+ description: Validation error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ delete:
+ operationId: deleteWorkflow
+ tags: [workflows]
+ summary: Delete a cloud workflow
+ description: "[cloud-only] Deletes a cloud workflow and all its versions."
+ x-runtime: [cloud]
+ parameters:
+ - name: workflow_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The workflow ID.
+ responses:
+ "204":  # success carries no body
+ description: Workflow deleted
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ /api/workflows/{workflow_id}/content:  # graph JSON of a cloud workflow; GET reads it
+ get:
+ operationId: getWorkflowContent
+ tags: [workflows]
+ summary: Get the content of a cloud workflow
+ description: "[cloud-only] Returns the full workflow graph JSON for a cloud workflow: the latest version by default, or the version selected by the version_id query parameter."
+ x-runtime: [cloud]
+ parameters:
+ - name: workflow_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The workflow ID.
+ - name: version_id  # optional: not marked required, so omitting it is allowed
+ in: query
+ schema:
+ type: string
+ description: Specific version ID to fetch
+ responses:
+ "200":
+ description: Workflow content
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true  # free-form object: the graph JSON is not modelled here
+ description: The full workflow graph JSON
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "403":
+ description: Forbidden
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ put:  # stores the posted graph JSON; the 200 body is a CloudWorkflowVersion
+ operationId: updateCloudWorkflowContent
+ tags: [workflows]
+ summary: Update the content of a cloud workflow
+ description: "[cloud-only] Saves new workflow graph JSON as a new version of the cloud workflow."
+ x-runtime: [cloud]
+ parameters:
+ - name: workflow_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The workflow ID.
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true  # free-form object: the graph JSON is not modelled here
+ description: The workflow graph JSON to save
+ responses:  # NOTE(review): unlike the sibling GET, no 500 entry is declared here — confirm whether that is intended
+ "200":
+ description: Content updated
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudWorkflowVersion"
+ "400":
+ description: Bad request
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ # Fork: copies a workflow; the request body is optional and may carry a new name.
+ /api/workflows/{workflow_id}/fork:
+ post:
+ operationId: forkWorkflow
+ tags: [workflows]
+ summary: Fork a cloud workflow
+ description: "[cloud-only] Creates a copy of a cloud workflow under the authenticated user's account."
+ x-runtime: [cloud]
+ parameters:
+ - name: workflow_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The workflow ID to fork.
+ requestBody:
+ required: false
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ name:
+ type: string
+ description: Name for the forked workflow (defaults to original name)
+ responses:
+ "201":
+ description: Forked workflow
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/WorkflowResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "403":
+ description: Forbidden
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "422":
+ description: Validation error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ # Version history: GET pages through versions, POST appends one (409 when base_version is stale).
+ /api/workflows/{workflow_id}/versions:
+ get:
+ operationId: listCloudWorkflowVersions
+ tags: [workflows]
+ summary: List versions of a cloud workflow
+ description: "[cloud-only] Returns the version history of a cloud workflow."
+ x-runtime: [cloud]
+ parameters:
+ - name: workflow_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The workflow ID.
+ - name: limit
+ in: query
+ schema:
+ type: integer
+ description: Maximum number of results
+ - name: offset
+ in: query
+ schema:
+ type: integer
+ description: Pagination offset
+ responses:
+ "200":
+ description: Version list
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ versions:
+ type: array
+ items:
+ $ref: "#/components/schemas/CloudWorkflowVersion"
+ total:
+ type: integer
+ has_more:
+ type: boolean
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ post:
+ operationId: createWorkflowVersion
+ tags: [workflows]
+ summary: Create a new cloud workflow version
+ description: "[cloud-only] Creates a new workflow version with updated workflow JSON. Uses optimistic concurrency via base_version."
+ x-runtime: [cloud]
+ parameters:
+ - name: workflow_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The workflow ID.
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CreateWorkflowVersionRequest"
+ responses:
+ "201":
+ description: Version created
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/WorkflowVersionResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "403":
+ description: Forbidden — not the workflow owner
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "409":
+ description: Version conflict — base_version does not match latest
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "422":
+ description: Validation error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ # Published workflow lookup by share_id; the 200 body is a PublishedWorkflowDetail.
+ /api/workflows/published/{share_id}:
+ get:
+ operationId: getPublishedWorkflow
+ tags: [workflows]
+ summary: Get a published workflow by share ID
+ description: "[cloud-only] Returns a publicly published cloud workflow by its share identifier."
+ x-runtime: [cloud]
+ parameters:
+ - name: share_id
+ in: path
+ required: true
+ schema:
+ type: string
+ description: The workflow share ID.
+ responses:
+ "200":
+ description: Published workflow
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/PublishedWorkflowDetail"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "413":  # NOTE(review): this GET declares no request body, so a payload-size error looks misplaced — confirm
+ description: Workflow JSON too large
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ # ---------------------------------------------------------------------------
+ # Auth / session (cloud)
+ # ---------------------------------------------------------------------------
+ # Session lifecycle: GET reads the current session, POST creates the session cookie, DELETE clears it.
+ /api/auth/session:
+ get:
+ operationId: getAuthSession
+ tags: [auth]
+ summary: Get the current authentication session
+ description: "[cloud-only] Returns the current session state for the authenticated user, including user identity and active workspace."
+ x-runtime: [cloud]
+ responses:
+ "200":
+ description: Session info
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/AuthSession"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ post:
+ operationId: createSession
+ tags: [auth]
+ summary: Create a session cookie
+ description: "[cloud-only] Creates a session cookie from the bearer token in the Authorization header. Returns a Set-Cookie header with a secure HttpOnly session cookie. Cookie authentication is not allowed for this endpoint."
+ x-runtime: [cloud]
+ responses:
+ "200":
+ description: Session created
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CreateSessionResponse"
+ "400":
+ description: Bad request — invalid or expired ID token
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ delete:
+ operationId: deleteSession
+ tags: [auth]
+ summary: Delete session cookie (logout)
+ description: "[cloud-only] Clears the session cookie and optionally revokes the session on the server."
+ x-runtime: [cloud]
+ responses:
+ "200":
+ description: Session deleted
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/DeleteSessionResponse"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ # Token exchange: grant_type is the only required field; the other fields apply per grant (see each description).
+ /api/auth/token:
+ post:
+ operationId: exchangeToken
+ tags: [auth]
+ summary: Exchange credentials for an access token
+ description: "[cloud-only] Exchanges authentication credentials (e.g. an authorization code) for an access token."
+ x-runtime: [cloud]
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ required:
+ - grant_type
+ properties:
+ grant_type:
+ type: string
+ enum: [authorization_code, refresh_token]
+ description: OAuth2 grant type
+ code:
+ type: string
+ description: Authorization code (for authorization_code grant)
+ refresh_token:
+ type: string
+ description: Refresh token (for refresh_token grant)
+ redirect_uri:
+ type: string
+ format: uri
+ description: Redirect URI used in the authorization request
+ responses:
+ "200":
+ description: Token response
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ExchangeTokenResponse"
+ "400":
+ description: Bad request
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":  # NOTE(review): workspace wording, yet the request body declares no workspace field — confirm
+ description: Workspace not found or user not a member
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ /.well-known/jwks.json:  # key-set document; only a 200 response is declared
+ get:
+ operationId: getJwks
+ tags: [auth]
+ summary: Get JSON Web Key Set
+ description: "[cloud-only] Returns the JSON Web Key Set (JWKS) used to verify JWTs issued by the cloud authentication service."
+ x-runtime: [cloud]
+ responses:
+ "200":
+ description: JWKS
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/JwksResponse"
+
+ # ---------------------------------------------------------------------------
+ # OAuth 2.1 / RFC 7591 Dynamic Client Registration (cloud)
+ # ---------------------------------------------------------------------------
+ /.well-known/oauth-authorization-server:  # discovery document for OAuth clients
+ get:
+ operationId: getOAuthAuthorizationServer
+ tags: [auth]
+ summary: "[cloud-only] OAuth 2.1 authorization-server metadata (RFC 8414)"
+ description: "[cloud-only] Public metadata document for OAuth 2.1 clients. Cached 5 minutes."
+ x-runtime: [cloud]
+ security: []  # empty list: no security requirement is declared for this operation
+ responses:
+ "200":
+ description: Authorization-server metadata
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthAuthorizationServerMetadata"
+ "404":  # returned when OAuth is disabled, not for a missing resource
+ description: OAuth disabled
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ /.well-known/oauth-protected-resource:  # discovery document for the advertised protected resource
+ get:
+ operationId: getOAuthProtectedResource
+ tags: [auth]
+ summary: "[cloud-only] OAuth 2.1 protected-resource metadata (RFC 9728)"
+ description: "[cloud-only] Public metadata describing the currently advertised protected resource. Cached 5 minutes."
+ x-runtime: [cloud]
+ security: []  # empty list: no security requirement is declared for this operation
+ responses:
+ "200":
+ description: Protected-resource metadata
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthProtectedResourceMetadata"
+ "404":  # covers both "OAuth disabled" and "no active resource" per the description below
+ description: OAuth disabled or no active resource configured
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ /oauth/authorize:  # GET produces the consent challenge, POST submits the user's decision
+ get:
+ operationId: getOAuthAuthorize
+ tags: [auth]
+ summary: "[cloud-only] Begin or resume an OAuth 2.1 authorization request"
+ description: |
+ [cloud-only] Two modes:
+ - **Initial entry** (OAuth params present): validates client/redirect/resource/scopes, persists a server-side authorization-request row, and either redirects (no session / unverified email) to the configured frontend login URL carrying only the opaque `oauth_request_id`, or returns the JSON consent challenge for the frontend to render.
+ - **Resume** (`oauth_request_id` present): loads the server-side row, fails closed if expired/consumed/unknown, returns the JSON consent challenge. Browser-replayed OAuth params are intentionally ignored.
+
+ The frontend renders the consent UI from the JSON payload and POSTs the user's decision back to this endpoint.
+ x-runtime: [cloud]
+ security: []  # empty list: no security requirement is declared for this operation
+ parameters:  # every parameter is optional at the spec level because the operation is dual-mode
+ - { name: response_type, in: query, required: false, schema: { type: string } }
+ - { name: client_id, in: query, required: false, schema: { type: string } }
+ - { name: redirect_uri, in: query, required: false, schema: { type: string } }
+ - { name: scope, in: query, required: false, schema: { type: string } }
+ - name: state
+ in: query
+ required: false
+ schema: { type: string }
+ description: |
+ RFC 6749 §10.12 marks `state` as RECOMMENDED. Cloud hardening makes it REQUIRED on the initial-entry path (omitted only on the resume path where `oauth_request_id` is supplied instead). This parameter is `required: false` at the spec level only because the operation is dual-mode (initial entry vs. resume); the runtime rejects empty `state` on the initial-entry path with a stable `invalid_request` 400.
+ - { name: code_challenge, in: query, required: false, schema: { type: string } }
+ - { name: code_challenge_method, in: query, required: false, schema: { type: string } }
+ - { name: resource, in: query, required: false, schema: { type: string } }
+ - { name: oauth_request_id, in: query, required: false, schema: { type: string } }  # selects the resume mode
+ responses:
+ "200":
+ description: Consent challenge payload (session present, email verified). Frontend renders the consent UI from this payload and POSTs back to /oauth/authorize.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthConsentChallenge"
+ "302":  # header-only response: no body content is declared
+ description: Redirect to login (no session / unverified email) or to registered redirect_uri (pre-validated client error)
+ headers:
+ Location:
+ schema:
+ type: string
+ "400":
+ description: Invalid authorize request (pre-redirect failure — unknown client, redirect mismatch, malformed params)
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: OAuth disabled
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ post:
+ operationId: postOAuthAuthorize
+ tags: [auth]
+ summary: "[cloud-only] Submit OAuth consent decision"
+ description: |
+ [cloud-only] JSON-only consent submission. The handler verifies the per-row CSRF token, atomically marks the authorization request consumed (single-use covers both allow and deny paths), then returns the redirect URL the browser must navigate to. The URL contains either `code` + original `state` for allow, or the RFC 6749 §5.2 error and `state` for deny.
+
+ Workspace membership is re-checked at submission time. Consent is persisted keyed by `(user_id, client_id, resource_id, workspace_id)`; broadening the previously approved scope set requires a fresh consent flow.
+ x-runtime: [cloud]
+ security: []  # empty list: no security requirement is declared for this operation
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ required: [oauth_request_id, csrf_token, decision, workspace_id]  # all four fields are mandatory
+ properties:
+ oauth_request_id: { type: string, format: uuid }
+ csrf_token: { type: string }
+ decision: { type: string, enum: [allow, deny] }
+ workspace_id: { type: string }
+ responses:
+ "200":  # the redirect target is returned as JSON rather than as an HTTP redirect
+ description: Redirect URL for the frontend to navigate to (allow → with code+state; deny → with error+state)
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthAuthorizeRedirectResponse"
+ "400":
+ description: Bad request (CSRF mismatch, expired/consumed request, inaccessible workspace)
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "403":
+ description: Scope broadening on consent re-grant — fresh consent flow required
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: OAuth disabled
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ /oauth/token:  # token endpoint; request is form-encoded, responses are JSON
+ post:
+ operationId: postOAuthToken
+ tags: [auth]
+ summary: "[cloud-only] Exchange authorization code or refresh token for a resource-bound access token"
+ description: |
+ [cloud-only] OAuth 2.1 token endpoint (RFC 6749 §3.2). Public clients only — `client_secret` is rejected.
+
+ Two grant types are supported:
+ - `authorization_code` — exchanges the code minted by `/oauth/authorize` (with PKCE verifier) for an access token + first refresh token. Single-use; reuse fails closed.
+ - `refresh_token` — rotates the refresh token. Old token immediately invalid; presenting an already-rotated token revokes the entire token family and emits a security metric.
+
+ Both grant types re-validate canonical user state, current workspace membership, and the resource's active flag at every mint. A code or refresh token bound to a deactivated resource fails closed.
+
+ Errors follow RFC 6749 §5.2. Logs never contain raw codes, refresh tokens, or minted tokens.
+
+ Per RFC 6749 §5.1, every 200 and 400 response carries `Cache-Control: no-store` and `Pragma: no-cache` so intermediaries cannot cache token-bearing or state-change-reason responses.
+ x-runtime: [cloud]
+ security: []  # empty list: no security requirement is declared for this operation
+ requestBody:
+ required: true
+ content:
+ application/x-www-form-urlencoded:  # form-encoded body, not JSON
+ schema:
+ type: object
+ required: [grant_type, client_id]
+ properties:
+ grant_type: { type: string, enum: [authorization_code, refresh_token] }
+ client_id: { type: string }
+ code: { type: string }
+ redirect_uri: { type: string }
+ code_verifier: { type: string }
+ refresh_token: { type: string }
+ scope: { type: string }
+ client_secret: { type: string }  # declared in the schema, but the description above says it is rejected
+ responses:
+ "200":
+ description: New token pair
+ headers:  # same two no-cache headers are declared on the 400 response
+ Cache-Control:
+ schema:
+ type: string
+ description: 'Always "no-store" per RFC 6749 §5.1'
+ Pragma:
+ schema:
+ type: string
+ description: 'Always "no-cache" per RFC 6749 §5.1'
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthTokenResponse"
+ "400":  # uses the dedicated OAuthTokenError schema rather than CloudError
+ description: RFC 6749 §5.2 error
+ headers:
+ Cache-Control:
+ schema:
+ type: string
+ description: 'Always "no-store" per RFC 6749 §5.1'
+ Pragma:
+ schema:
+ type: string
+ description: 'Always "no-cache" per RFC 6749 §5.1'
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthTokenError"
+ "404":
+ description: OAuth disabled
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ /oauth/register:  # dynamic client registration; JSON in, JSON out
+ post:
+ operationId: postOAuthRegister
+ tags: [auth]
+ summary: "[cloud-only] Dynamic Client Registration (RFC 7591)"
+ description: |
+ [cloud-only] Public, unauthenticated, insert-only RFC 7591 §3.1 client registration. Used by MCP-spec-compliant clients to self-register a public OAuth client without operator involvement.
+
+ Policy:
+
+ - Public clients only — `token_endpoint_auth_method` is forced to `none`. Confidential-client registration is out of scope this phase.
+ - Server-owned `resource_grants`. Caller-supplied `scope` or `resource_grants` is rejected as `invalid_client_metadata` (would be a privilege-escalation surface). Dynamic clients receive the same scopes the active resource publishes.
+ - Application-type-aware redirect URI policy. `application_type=native` accepts loopback (`127.0.0.1`, `::1`, `localhost`) and reverse-DNS-shaped custom schemes; `application_type=web` accepts HTTPS to hosts in an operator-controlled allowlist only. `application_type` is REQUIRED on the request — missing or empty rejects with `invalid_client_metadata`.
+ - Anti-impersonation: reserved client names are rejected from third parties via NFKC-folded compare.
+ - Generated `client_id` carries a stable prefix to distinguish dynamic from seeded clients in audit logs.
+ - Cache-Control: `no-store` on every 201 and 400 response (the response carries fresh credentials and rejection reasons).
+ x-runtime: [cloud]
+ security: []  # empty list: no security requirement is declared for this operation
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthRegisterRequest"
+ responses:
+ "201":
+ description: Registered. Body echoes the metadata RFC 7591 §3.2.1 requires.
+ headers:  # same two no-cache headers are declared on the 400 response
+ Cache-Control:
+ schema:
+ type: string
+ description: 'Always "no-store"'
+ Pragma:
+ schema:
+ type: string
+ description: 'Always "no-cache"'
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthRegisterResponse"
+ "400":  # uses the dedicated OAuthRegisterError schema rather than CloudError
+ description: RFC 7591 §3.2.2 invalid client metadata
+ headers:
+ Cache-Control:
+ schema:
+ type: string
+ description: 'Always "no-store"'
+ Pragma:
+ schema:
+ type: string
+ description: 'Always "no-cache"'
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/OAuthRegisterError"
+ "404":
+ description: OAuth disabled
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "503":  # distinct from the 404 above: that one is documented as OAuth disabled, this one as no active resource
+ description: No active resource is configured — DCR cannot mint a usable client until an active resource row is seeded.
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ # ---------------------------------------------------------------------------
+ # Billing (cloud)
+ # ---------------------------------------------------------------------------
+ # Credit balance read; the 200 body is a BillingBalanceResponse.
+ /api/billing/balance:
+ get:
+ operationId: getBillingBalance
+ tags: [billing]
+ summary: Get current credit balance
+ description: "[cloud-only] Returns the authenticated user's current credit balance and usage summary."
+ x-runtime: [cloud]
+ responses:
+ "200":
+ description: Balance info
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/BillingBalanceResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ # Billing event listing with limit/offset paging and an optional type filter.
+ /api/billing/events:
+ get:
+ operationId: getBillingEvents
+ tags: [billing]
+ summary: List billing events
+ description: "[cloud-only] Returns a paginated list of billing events (charges, credits, refunds) for the authenticated user."
+ x-runtime: [cloud]
+ parameters:
+ - name: limit
+ in: query
+ schema:
+ type: integer
+ description: Maximum number of results
+ - name: offset
+ in: query
+ schema:
+ type: integer
+ description: Pagination offset
+ - name: type
+ in: query
+ schema:
+ type: string
+ description: Filter by event type
+ responses:
+ "200":
+ description: Billing events
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/BillingEventsResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+ # Single billing operation lookup by id; the 200 body is a BillingOpStatusResponse.
+ /api/billing/ops/{id}:
+ get:
+ operationId: getBillingOpStatus
+ tags: [billing]
+ summary: Get a billing operation by ID
+ description: "[cloud-only] Returns details of a specific billing operation."
+ x-runtime: [cloud]
+ parameters:
+ - name: id
+ in: path
+ required: true
+ schema:
+ type: string
+ description: The billing operation ID.
+ responses:
+ "200":
+ description: Billing operation
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/BillingOpStatusResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "500":
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/ErrorResponse"
+  /api/billing/payment-portal:
+    post:
+      operationId: getPaymentPortal
+      tags: [billing]
+      summary: Create a payment portal session
+      description: "[cloud-only] Creates a Stripe customer portal session for managing payment methods and invoices. Returns a URL to redirect the user to."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Portal session
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  url:
+                    type: string
+                    format: uri
+                    description: Stripe portal URL
+        # NOTE(review): the text below cites a missing return_url, but this operation declares no requestBody or parameter for it - confirm.
+        "400":
+          description: Bad request (e.g., missing return_url)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  /api/billing/plans:
+    get:
+      operationId: getBillingPlans
+      tags: [billing]
+      summary: List available billing plans
+      description: "[cloud-only] Returns the list of available subscription plans and their pricing."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Plan list
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/BillingPlan"
+        # NOTE(review): this 401 references ErrorResponse, whereas the other billing endpoints use CloudError for 401 - confirm.
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Cost preview for a subscription change; plan_id is required in the JSON body.
+  /api/billing/preview-subscribe:
+    post:
+      operationId: previewSubscribe
+      tags: [billing]
+      summary: Preview a subscription change
+      description: "[cloud-only] Returns a preview of what a subscription change would cost, including prorations."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - plan_id
+              properties:
+                plan_id:
+                  type: string
+                  description: ID of the plan to preview
+      responses:
+        "200":
+          description: Subscription preview
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PreviewSubscribeResponse"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Billing and subscription status for the authenticated user (cloud runtime only).
+  /api/billing/status:
+    get:
+      operationId: getBillingStatus
+      tags: [billing]
+      summary: Get billing status
+      description: "[cloud-only] Returns the authenticated user's current billing and subscription status."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Billing status
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BillingStatusResponse"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Workspace not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Starts a subscription; plan_id is required, payment_method_id is optional.
+  /api/billing/subscribe:
+    post:
+      operationId: subscribe
+      tags: [billing]
+      summary: Subscribe to a billing plan
+      description: "[cloud-only] Creates a new subscription to the specified billing plan."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - plan_id
+              properties:
+                plan_id:
+                  type: string
+                  description: ID of the plan to subscribe to
+                payment_method_id:
+                  type: string
+                  description: Stripe payment method ID
+      responses:
+        "200":
+          description: Subscription created
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SubscribeResponse"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Cancels the caller's active subscription; takes no request body.
+  /api/billing/subscription/cancel:
+    post:
+      operationId: cancelSubscription
+      tags: [billing]
+      summary: Cancel the active subscription
+      description: "[cloud-only] Cancels the authenticated user's active subscription. The subscription remains active until the end of the current billing period."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Subscription cancelled
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CancelSubscriptionResponse"
+        "400":
+          description: Invalid request (e.g., no active subscription)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Reactivates a cancelled-but-not-yet-expired subscription; takes no request body.
+  /api/billing/subscription/resubscribe:
+    post:
+      operationId: resubscribe
+      tags: [billing]
+      summary: Resubscribe after cancellation
+      description: "[cloud-only] Reactivates a subscription that was previously cancelled but has not yet expired."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Subscription reactivated
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ResubscribeResponse"
+        "400":
+          description: Invalid request (e.g., no active subscription, not in cancellation grace period)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # One-time credit purchase; "amount" is an integer number of credits.
+  /api/billing/topup:
+    post:
+      operationId: createTopup
+      tags: [billing]
+      summary: Purchase additional credits
+      description: "[cloud-only] Purchases a one-time credit top-up using the user's payment method on file."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - amount
+              properties:
+                amount:
+                  type: integer
+                  description: Number of credits to purchase
+      responses:
+        "200":
+          description: Top-up successful
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CreateTopupResponse"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+ # ---------------------------------------------------------------------------
+ # Workspace (cloud)
+ # ---------------------------------------------------------------------------
+  # Lists (GET) and creates (POST) API keys for the current workspace.
+  /api/workspace/api-keys:
+    get:
+      operationId: listWorkspaceAPIKeys
+      tags: [workspace]
+      summary: List workspace API keys
+      description: "[cloud-only] Returns the list of API keys for the current workspace."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: API key list
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/WorkspaceApiKey"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+    post:
+      operationId: createWorkspaceAPIKey
+      tags: [workspace]
+      summary: Create a workspace API key
+      description: "[cloud-only] Creates a new API key for the current workspace."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - name
+              properties:
+                name:
+                  type: string
+                  description: Display name for the API key
+                description:
+                  type: string
+                  description: User-provided description of the key's purpose
+                  maxLength: 5000
+      responses:
+        "201":
+          description: API key created
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CreateWorkspaceAPIKeyResponse"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Workspace not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "422":
+          description: Validation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "429":
+          description: Key limit reached
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Revokes one workspace API key by ID; success is a body-less 204.
+  /api/workspace/api-keys/{id}:
+    delete:
+      operationId: revokeWorkspaceAPIKey
+      tags: [workspace]
+      summary: Delete a workspace API key
+      description: "[cloud-only] Revokes and deletes a workspace API key."
+      x-runtime: [cloud]
+      parameters:
+        - name: id
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The API key ID.
+      responses:
+        "204":
+          description: API key deleted
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Lists pending invites (GET) and creates a new invite (POST) for the current workspace.
+  /api/workspace/invites:
+    get:
+      operationId: listWorkspaceInvites
+      tags: [workspace]
+      summary: List pending workspace invites
+      description: "[cloud-only] Returns the list of pending invitations for the current workspace."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Invite list
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/WorkspaceInvite"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+    post:
+      operationId: createWorkspaceInvite
+      tags: [workspace]
+      summary: Invite a user to the workspace
+      description: "[cloud-only] Creates an invitation for a user to join the current workspace."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - email
+              properties:
+                email:
+                  type: string
+                  format: email
+                  description: Email address to invite
+                role:
+                  type: string
+                  enum: [admin, member]
+                  description: Role to assign
+      responses:
+        "201":
+          description: Invite created
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PendingInvite"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Workspace not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "409":
+          description: Conflict
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "422":
+          description: Validation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Cancels one pending invite by ID; success is a body-less 204.
+  /api/workspace/invites/{inviteId}:
+    delete:
+      operationId: revokeWorkspaceInvite
+      tags: [workspace]
+      summary: Cancel a workspace invite
+      description: "[cloud-only] Cancels a pending workspace invitation."
+      x-runtime: [cloud]
+      parameters:
+        - name: inviteId
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The invite ID.
+      responses:
+        "204":
+          description: Invite cancelled
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Removes the caller from the current workspace; success is a body-less 204.
+  /api/workspace/leave:
+    post:
+      operationId: leaveWorkspace
+      tags: [workspace]
+      summary: Leave the current workspace
+      description: "[cloud-only] Removes the authenticated user from the current workspace."
+      x-runtime: [cloud]
+      responses:
+        "204":
+          description: Left workspace
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Workspace not found or not a member
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  /api/workspace/members:
+    get:
+      operationId: listWorkspaceMembers
+      tags: [workspace]
+      summary: List workspace members
+      description: "[cloud-only] Returns the list of members in the current workspace."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Member list
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/WorkspaceMember"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Workspace not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        # NOTE(review): this GET declares no parameters or request body, yet lists a validation error - confirm 422 is reachable.
+        "422":
+          description: Validation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # NOTE(review): this path names its parameter {user_id}, while /api/workspace/members/{userId} uses camelCase - confirm the difference is intentional.
+  /api/workspace/members/{user_id}/api-keys:
+    get:
+      operationId: listMemberApiKeys
+      tags: [workspace]
+      summary: List API keys for a workspace member
+      description: "[cloud-only] Returns the API keys belonging to a specific workspace member. Requires admin role."
+      x-runtime: [cloud]
+      parameters:
+        - name: user_id
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The member's user ID.
+      responses:
+        "200":
+          description: API key list
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/WorkspaceApiKey"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+    delete:
+      operationId: bulkRevokeWorkspaceMemberAPIKeys
+      tags: [workspace]
+      summary: Bulk revoke a member's API keys
+      description: "[cloud-only] Revokes all active API keys for a specific workspace member. Only workspace owners can perform this action."
+      x-runtime: [cloud]
+      parameters:
+        - name: user_id
+          in: path
+          required: true
+          schema:
+            type: string
+            minLength: 1
+          description: The member's user ID.
+      responses:
+        "200":
+          description: Keys revoked
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/BulkRevokeAPIKeysResponse"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden — must be workspace owner
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "422":
+          description: Validation error (e.g. empty user_id)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  /api/workspace/members/{userId}:
+    patch:
+      operationId: updateWorkspaceMember
+      tags: [workspace]
+      summary: Update a workspace member's role
+      description: "[cloud-only] Updates the role of a workspace member. Requires admin role."
+      x-runtime: [cloud]
+      parameters:
+        - name: userId
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The member's user ID.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - role
+              properties:
+                role:
+                  type: string
+                  enum: [admin, member]
+                  description: New role to assign
+      responses:
+        "200":
+          description: Member updated
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/WorkspaceMember"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        # NOTE(review): PATCH declares no "500" response, unlike the DELETE on this same path - confirm whether that is intentional.
+    delete:
+      operationId: removeWorkspaceMember
+      tags: [workspace]
+      summary: Remove a member from the workspace
+      description: "[cloud-only] Removes a member from the current workspace. Requires admin role."
+      x-runtime: [cloud]
+      parameters:
+        - name: userId
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The member's user ID.
+      responses:
+        "204":
+          description: Member removed
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Lists the caller's workspaces (GET) and creates a new one (POST).
+  /api/workspaces:
+    get:
+      operationId: listWorkspaces
+      tags: [workspace]
+      summary: List workspaces the user belongs to
+      description: "[cloud-only] Returns the list of workspaces the authenticated user is a member of."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Workspace list
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/Workspace"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Feature not enabled for user
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+    post:
+      operationId: createWorkspace
+      tags: [workspace]
+      summary: Create a new workspace
+      description: "[cloud-only] Creates a new workspace. The authenticated user becomes the owner."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - name
+              properties:
+                name:
+                  type: string
+                  description: Workspace name
+      responses:
+        "201":
+          description: Workspace created
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Workspace"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Feature not enabled for user
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "422":
+          description: Validation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Reads (GET), updates (PATCH) or soft-deletes (DELETE) one workspace by ID.
+  /api/workspaces/{id}:
+    get:
+      operationId: getWorkspace
+      tags: [workspace]
+      summary: Get a workspace by ID
+      description: "[cloud-only] Returns details of a workspace the user is a member of."
+      x-runtime: [cloud]
+      parameters:
+        - name: id
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The workspace ID.
+      responses:
+        "200":
+          description: Workspace detail
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Workspace"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+    patch:
+      operationId: updateWorkspace
+      tags: [workspace]
+      summary: Update workspace settings
+      description: "[cloud-only] Updates the name or settings of a workspace. Requires admin role."
+      x-runtime: [cloud]
+      parameters:
+        - name: id
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The workspace ID.
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              properties:
+                name:
+                  type: string
+                  description: New workspace name
+      responses:
+        "200":
+          description: Workspace updated
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Workspace"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "422":
+          description: Validation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+    delete:
+      operationId: deleteWorkspace
+      tags: [workspace]
+      summary: Delete a workspace
+      description: "[cloud-only] Soft-deletes a workspace. Requires owner role. Personal workspaces cannot be deleted."
+      x-runtime: [cloud]
+      parameters:
+        - name: id
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The workspace ID.
+      responses:
+        "204":
+          description: Workspace deleted
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Forbidden — must be workspace owner
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+ # ---------------------------------------------------------------------------
+ # User / settings / misc (cloud)
+ # ---------------------------------------------------------------------------
+  # Accepts a FeedbackRequest body and answers 201 with an inline {id, status} object.
+  /api/feedback:
+    post:
+      operationId: submitFeedback
+      tags: [user]
+      summary: Submit user feedback
+      description: "[cloud-only] Submits feedback from the user about their experience with the cloud runtime."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/FeedbackRequest"
+      responses:
+        "201":
+          description: Feedback submitted
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  id:
+                    type: string
+                  status:
+                    type: string
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  /api/files/mask-layers:
+    get:
+      operationId: getMaskLayers
+      tags: [assets]
+      summary: Get related mask layer filenames
+      description: "[cloud-only] Given a mask file (any of the 4 layers), returns all related mask layer filenames. Used by the mask editor to load the paint, mask, and painted layers when reopening a previously edited mask."
+      x-runtime: [cloud]
+      parameters:
+        - name: filename
+          in: query
+          required: true
+          schema:
+            type: string
+          description: Hash filename of any mask layer file
+      responses:
+        "200":
+          description: Related mask layers
+          content:
+            application/json:
+              schema:
+                type: object
+                properties:
+                  mask:
+                    type: string
+                    description: Filename of the mask layer
+                    nullable: true
+                  paint:
+                    type: string
+                    description: Filename of the paint strokes layer
+                    nullable: true
+                  painted:
+                    type: string
+                    description: Filename of the painted image layer
+                    nullable: true
+                  painted_masked:
+                    type: string
+                    description: Filename of the final composite layer
+                    nullable: true
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "404":
+          description: File not found or not a mask file
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        # NOTE(review): no "500" response is declared here, unlike most other operations in this file - confirm whether that is intentional.
+  /api/internal/cloud_analytics:
+    post:
+      operationId: postCloudAnalytics
+      tags: [internal]
+      summary: Post client analytics events
+      description: "[cloud-only] Receives analytics events from the frontend for processing by the cloud analytics pipeline."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - events
+              properties:
+                events:
+                  type: array
+                  items:
+                    type: object
+                    required:
+                      - event_name
+                    properties:
+                      event_name:
+                        type: string
+                      timestamp:
+                        type: string
+                        format: date-time
+                      # The key below is an event field literally named "properties" (a free-form object), not a schema keyword.
+                      properties:
+                        type: object
+                        additionalProperties: true
+      responses:
+        "200":
+          description: Events accepted
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Accepts a workspace invitation identified by the {token} path parameter.
+  /api/invites/{token}/accept:
+    post:
+      operationId: acceptWorkspaceInvite
+      tags: [workspace]
+      summary: Accept a workspace invitation
+      description: "[cloud-only] Accepts a workspace invitation using the invite token. The authenticated user is added to the workspace."
+      x-runtime: [cloud]
+      parameters:
+        - name: token
+          in: path
+          required: true
+          schema:
+            type: string
+          description: The invitation token.
+      responses:
+        "200":
+          description: Invite accepted
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AcceptInviteResponse"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "403":
+          description: Email does not match invite
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "404":
+          description: Not found
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "409":
+          description: Already a member of this workspace
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+  # Lists stored secrets (GET, values redacted) and creates or updates one (POST).
+  /api/secrets:
+    get:
+      operationId: listSecrets
+      tags: [settings]
+      summary: List user secrets
+      description: "[cloud-only] Returns the list of secrets (API keys for third-party services) stored for the authenticated user. Secret values are redacted."
+      x-runtime: [cloud]
+      responses:
+        "200":
+          description: Secret list
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/SecretMeta"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "503":
+          description: Service unavailable - feature is disabled
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+    post:
+      operationId: createSecret
+      tags: [settings]
+      summary: Create or update a secret
+      description: "[cloud-only] Stores a new secret or updates an existing one. Secrets are encrypted at rest."
+      x-runtime: [cloud]
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              type: object
+              required:
+                - name
+                - value
+              properties:
+                name:
+                  type: string
+                  description: Secret name (unique per user)
+                value:
+                  type: string
+                  description: Secret value
+      responses:
+        "201":
+          description: Secret created
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/SecretResponse"
+        "400":
+          description: Bad request
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "401":
+          description: Unauthorized
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/CloudError"
+        "409":
+          description: Conflict - secret with this name or provider already exists
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "422":
+          description: Validation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "500":
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+        "503":
+          description: Service unavailable - secrets feature disabled
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ErrorResponse"
+ /api/secrets/{id}:
+ get:
+ operationId: getSecret
+ tags: [settings]
+ summary: Get secret metadata
+ description: "[cloud-only] Returns metadata for a specific secret. Does not return the plaintext secret value."
+ x-runtime: [cloud]
+ parameters:
+ - name: id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The secret ID.
+ responses:
+ "200":
+ description: Secret metadata
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/SecretResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ '403':
+ description: Forbidden - user does not own this secret
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '503':
+ description: Service unavailable - secrets feature disabled
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ patch:
+ operationId: updateSecret
+ tags: [settings]
+ summary: Update a secret
+ description: "[cloud-only] Updates an existing secret's name and/or value. Both fields are optional; only provided fields are updated."
+ x-runtime: [cloud]
+ parameters:
+ - name: id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: The secret ID.
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/UpdateSecretRequest"
+ responses:
+ "200":
+ description: Secret updated
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/SecretResponse"
+ "400":
+ description: Bad request
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "409":
+ description: Conflict — a secret with this name already exists
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ '403':
+ description: Forbidden - user does not own this secret
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '503':
+ description: Service unavailable - secrets feature disabled
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ delete:
+ operationId: deleteSecret
+ tags: [settings]
+ summary: Delete a secret
+ description: "[cloud-only] Permanently deletes a stored secret."
+ x-runtime: [cloud]
+ parameters:
+ - name: id
+ in: path
+ required: true
+ schema:
+ type: string
+ description: The secret ID.
+ responses:
+ "204":
+ description: Secret deleted
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ '403':
+ description: Forbidden - user does not own this secret
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '503':
+ description: Service unavailable - secrets feature disabled
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/user:
+ get:
+ operationId: getUser
+ tags: [user]
+ summary: Get the authenticated cloud user
+ description: "[cloud-only] Returns the profile and account information for the currently authenticated user."
+ x-runtime: [cloud]
+ responses:
+ "200":
+ description: User profile
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/UserResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ put:
+ operationId: updateCloudUser
+ tags: [user]
+ summary: Update the authenticated cloud user profile
+ description: "[cloud-only] Updates the profile information for the currently authenticated user."
+ x-runtime: [cloud]
+ requestBody:
+ required: true
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ display_name:
+ type: string
+ avatar_url:
+ type: string
+ format: uri
+ responses:
+ "200":
+ description: Updated profile
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudUser"
+ "400":
+ description: Bad request
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ /api/userdata/{file}/publish:
+ get:
+ operationId: getUserdataFilePublish
+ tags: [userdata]
+ summary: Get publish info for a userdata file
+ description: "[cloud-only] Returns the publish status and share info for a userdata workflow file."
+ x-runtime: [cloud]
+ parameters:
+ - name: file
+ in: path
+ required: true
+ schema:
+ type: string
+ description: File path relative to user data directory
+ responses:
+ "200":
+ description: Publish info (publish_time is null if never published)
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/WorkflowPublishInfo"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Workflow not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ post:
+ operationId: postUserdataFilePublish
+ tags: [userdata]
+ summary: Publish a userdata file to the cloud
+ description: "[cloud-only] Makes a userdata file available via a public URL for sharing or embedding."
+ x-runtime: [cloud]
+ parameters:
+ - name: file
+ in: path
+ required: true
+ schema:
+ type: string
+ description: File path relative to user data directory
+ responses:
+ "200":
+ description: Published file URL
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ url:
+ type: string
+ format: uri
+ description: Public URL of the published file
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ '400':
+ description: Bad request
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/vhs/queryvideo:
+ get:
+ operationId: getVhsQueryVideo
+ tags: [view]
+ summary: Query VHS video metadata
+ description: "[cloud-only] Returns metadata about a video file processed by the VHS (Video Helper Suite) integration."
+ x-runtime: [cloud]
+ parameters:
+ - name: filename
+ in: query
+ required: true
+ schema:
+ type: string
+ description: Video filename
+ - name: type
+ in: query
+ schema:
+ type: string
+ enum: [input, output, temp]
+ description: Directory type
+ - name: subfolder
+ in: query
+ schema:
+ type: string
+ description: Subfolder within the directory
+ responses:
+ "200":
+ description: Video metadata
+ content:
+ application/json:
+ schema:
+ type: object
+ additionalProperties: true
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ '400':
+ description: 'Missing required query parameter. Produced by the oapi-codegen
+ wrapper via echo.NewHTTPError, so the body shape matches Echo''s
+ default HTTPError serialization rather than ErrorResponse.
+ '
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/BindingErrorResponse'
+ /api/vhs/viewaudio:
+ get:
+ operationId: viewVhsAudio
+ tags: [view]
+ summary: View or download VHS audio
+ description: "[cloud-only] Returns audio content from a VHS-processed file."
+ x-runtime: [cloud]
+ parameters:
+ - name: filename
+ in: query
+ required: true
+ schema:
+ type: string
+ description: Audio filename
+ - name: type
+ in: query
+ schema:
+ type: string
+ enum: [input, output, temp]
+ description: Directory type
+ - name: subfolder
+ in: query
+ schema:
+ type: string
+ description: Subfolder within the directory
+ responses:
+ "200":
+ description: Audio content
+ content:
+ audio/*:
+ schema:
+ type: string
+ format: binary
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ /api/vhs/viewvideo:
+ get:
+ operationId: viewVhsVideo
+ tags: [view]
+ summary: View or download VHS video
+ description: "[cloud-only] Returns video content from a VHS-processed file."
+ x-runtime: [cloud]
+ parameters:
+ - name: filename
+ in: query
+ required: true
+ schema:
+ type: string
+ description: Video filename
+ - name: type
+ in: query
+ schema:
+ type: string
+ enum: [input, output, temp]
+ description: Directory type
+ - name: subfolder
+ in: query
+ schema:
+ type: string
+ description: Subfolder within the directory
+ responses:
+ "200":
+ description: Video content
+ content:
+ video/*:
+ schema:
+ type: string
+ format: binary
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ /api/viewvideo:
+ get:
+ operationId: viewVideo
+ tags: [view]
+ summary: View or download a video file
+ deprecated: true
+ description: |
+ **Deprecated.** This endpoint is an alias of `GET /api/view` added for
+ legacy history-queue video playback. Callers should use `/api/view`
+ directly; the endpoint is retained for backward compatibility but will
+ be removed in a future release.
+ x-runtime: [cloud]
+ parameters:
+ - name: filename
+ in: query
+ required: true
+ schema:
+ type: string
+ description: Video filename
+ - name: type
+ in: query
+ schema:
+ type: string
+ enum: [input, output, temp]
+ description: Directory type
+ - name: subfolder
+ in: query
+ schema:
+ type: string
+ description: Subfolder within the directory
+ responses:
+ "200":
+ description: Video content
+ content:
+ video/*:
+ schema:
+ type: string
+ format: binary
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ /api/tasks:
+ get:
+ operationId: listTasks
+ tags: [task]
+ summary: List background tasks
+ description: "[cloud-only] Retrieve a paginated list of background tasks for the authenticated user. Supports filtering by task type, status, and creation time."
+ x-runtime: [cloud]
+ parameters:
+ - name: task_name
+ in: query
+ schema:
+ type: string
+ description: Filter by task type name (exact match).
+ - name: idempotency_key
+ in: query
+ schema:
+ type: string
+ description: Filter by idempotency key (exact match).
+ - name: status
+ in: query
+ schema:
+ type: string
+ description: Filter by one or more statuses (comma-separated).
+ - name: created_after
+ in: query
+ schema:
+ type: string
+ format: date-time
+ description: Filter tasks created after this timestamp.
+ - name: created_before
+ in: query
+ schema:
+ type: string
+ format: date-time
+ description: Filter tasks created before this timestamp.
+ - name: sort_order
+ in: query
+ schema:
+ type: string
+ enum: [asc, desc]
+ default: desc
+ description: Sort direction by create_time.
+ - name: offset
+ in: query
+ schema:
+ type: integer
+ minimum: 0
+ default: 0
+ description: Pagination offset (0-based).
+ - name: limit
+ in: query
+ schema:
+ type: integer
+ minimum: 1
+ maximum: 100
+ default: 20
+ description: Maximum items per page (1-100).
+ responses:
+ "200":
+ description: Tasks retrieved
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/TasksListResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "422":
+ description: Validation error
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /api/tasks/{task_id}:
+ get:
+ operationId: getTask
+ tags: [task]
+ summary: Get task details
+ description: "[cloud-only] Retrieve full details for a specific background task."
+ x-runtime: [cloud]
+ parameters:
+ - name: task_id
+ in: path
+ required: true
+ schema:
+ type: string
+ format: uuid
+ description: Task identifier (UUID).
+ responses:
+ "200":
+ description: Task details
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/TaskResponse"
+ "401":
+ description: Unauthorized
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+ "404":
+ description: Task not found
+ content:
+ application/json:
+ schema:
+ $ref: "#/components/schemas/CloudError"
+
+ '500':
+ description: Internal server error
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+components:
+ parameters:
+ ComfyUserHeader:
+ name: Comfy-User
+ in: header
+ required: false
+ schema:
+ type: string
+ description: |
+ Identifies the active user in multi-user mode. Used for settings,
+ userdata, and history isolation. This is not a security mechanism —
+ it is an organisational convenience with no authentication behind it.
+
+ schemas:
+ # -------------------------------------------------------------------
+ # Prompt
+ # -------------------------------------------------------------------
+ PromptRequest:
+ type: object
+ description: A workflow submission. Wraps the prompt graph plus optional client identifier and extra per-request data.
+ required:
+ - prompt
+ properties:
+ prompt:
+ type: object
+ description: |
+ The workflow graph to execute. Keys are node IDs (strings);
+ values are objects with class_type and inputs.
+ additionalProperties: true
+ number:
+ type: number
+ description: Priority number for the queue (lower numbers have higher priority)
+ front:
+ type: boolean
+ description: If true, adds the prompt to the front of the queue
+ extra_data:
+ type: object
+ description: Extra data associated with the prompt (e.g. extra_pnginfo)
+ additionalProperties: true
+ client_id:
+ type: string
+ description: WebSocket client ID to receive progress updates
+ prompt_id:
+ type: string
+ format: uuid
+ description: "Client-supplied prompt ID. Server generates a UUID if omitted."
+ partial_execution_targets:
+ type: array
+ items:
+ type: string
+ description: List of node IDs to execute (partial graph execution)
+ workflow_id:
+ type: string
+ format: uuid
+ nullable: true
+ x-runtime: [cloud]
+ description: "[cloud-only] Cloud workflow entity ID for tracking and gallery association. Ignored by local ComfyUI."
+ workflow_version_id:
+ type: string
+ format: uuid
+ nullable: true
+ x-runtime: [cloud]
+ description: "[cloud-only] Cloud workflow version ID for pinning execution to a specific version. Ignored by local ComfyUI."
+
+ PromptResponse:
+ type: object
+ description: Server acknowledgement of a workflow submission. Includes the assigned `prompt_id`, its queue priority `number`, and any validation errors (`node_errors`, `error`).
+ properties:
+ prompt_id:
+ type: string
+ format: uuid
+ description: Unique identifier for the prompt execution
+ number:
+ type: number
+ description: Priority number in the queue
+ node_errors:
+ type: object
+ description: Validation errors keyed by node ID
+ additionalProperties:
+ $ref: "#/components/schemas/NodeError"
+ error:
+ description: Top-level prompt error (string message or structured error)
+ oneOf:
+ - type: string
+ - $ref: "#/components/schemas/PromptError"
+
+ PromptErrorResponse:
+ type: object
+ description: Error response when prompt validation fails
+ additionalProperties: true
+
+ PromptError:
+ type: object
+ description: Structured prompt validation error
+ properties:
+ type:
+ type: string
+ message:
+ type: string
+ details:
+ type: string
+
+ Error:
+ type: object
+ description: Detailed node-level error
+ properties:
+ type:
+ type: string
+ message:
+ type: string
+ details:
+ type: string
+ extra_info:
+ type: object
+ properties:
+ input_name:
+ type: string
+ additionalProperties: true
+
+ NodeError:
+ type: object
+ description: Error details for a single node
+ properties:
+ errors:
+ type: array
+ items:
+ $ref: "#/components/schemas/Error"
+ class_type:
+ type: string
+ description: The node's class type
+ dependent_outputs:
+ type: array
+ items: {}
+
+ PromptInfo:
+ type: object
+ description: Summary of a queued or recently-executed prompt, as returned by the queue and history endpoints.
+ properties:
+ exec_info:
+ type: object
+ properties:
+ queue_remaining:
+ type: integer
+ description: Number of items remaining in the queue
+
+ # -------------------------------------------------------------------
+ # Queue
+ # -------------------------------------------------------------------
+ QueueInfo:
+ type: object
+ description: Queue information with pending and running items
+ properties:
+ queue_running:
+ type: array
+ description: Currently running queue items
+ items:
+ type: array
+ description: |
+ Queue item tuple: [number, prompt_id, prompt, extra_data, outputs_to_execute, sensitive]
+ items: {}
+ prefixItems:
+ - type: number
+ description: Priority number
+ - type: string
+ format: uuid
+ description: prompt_id
+ - type: object
+ description: prompt graph
+ additionalProperties: true
+ - type: object
+ description: extra_data
+ additionalProperties: true
+ - type: array
+ description: outputs_to_execute (list of output node IDs)
+ items:
+ type: string
+ - type: object
+ description: sensitive data (may be omitted)
+ additionalProperties: true
+ queue_pending:
+ type: array
+ description: Pending queue items (oldest first)
+ items:
+ type: array
+ description: |
+ Queue item tuple: [number, prompt_id, prompt, extra_data, outputs_to_execute, sensitive]
+ items: {}
+ prefixItems:
+ - type: number
+ description: Priority number
+ - type: string
+ format: uuid
+ description: prompt_id
+ - type: object
+ description: prompt graph
+ additionalProperties: true
+ - type: object
+ description: extra_data
+ additionalProperties: true
+ - type: array
+ description: outputs_to_execute (list of output node IDs)
+ items:
+ type: string
+ - type: object
+ description: sensitive data (may be omitted)
+ additionalProperties: true
+
+ QueueManageRequest:
+ type: object
+ description: Request to clear or delete from queue
+ properties:
+ clear:
+ type: boolean
+ description: If true, clear all pending items
+ delete:
+ type: array
+ items:
+ type: string
+ description: Array of prompt IDs to delete from queue
+
+ QueueManageResponse:
+ type: object
+ x-runtime: [cloud]
+ description: >-
+ [cloud-only] Result of a queue mutation. The Cloud runtime returns which
+ items were deleted and whether the queue was cleared; local ComfyUI
+ returns an empty 200 body.
+ properties:
+ deleted:
+ type: array
+ nullable: true
+ items:
+ type: string
+ description: Prompt IDs that were deleted from the queue.
+ cleared:
+ type: boolean
+ nullable: true
+ description: Whether the queue was cleared.
+
+ # -------------------------------------------------------------------
+ # History
+ # -------------------------------------------------------------------
+ HistoryEntry:
+ type: object
+ description: A single execution history entry
+ properties:
+ prompt:
+ type: array
+ description: |
+ Prompt tuple: [number, prompt_id, prompt_graph, extra_data, output_node_ids]
+ items: {}
+ outputs:
+ type: object
+ description: Output data from execution keyed by node ID
+ additionalProperties: true
+ status:
+ type: object
+ description: Execution status (status_str, completed, messages, etc.)
+ additionalProperties: true
+ meta:
+ type: object
+ description: Metadata about the execution and nodes
+ additionalProperties: true
+
+ HistoryManageRequest:
+ type: object
+ description: Request to clear or delete history entries
+ properties:
+ clear:
+ type: boolean
+ description: If true, clear all history
+ delete:
+ type: array
+ items:
+ type: string
+ description: Array of prompt IDs to delete from history
+
+ # -------------------------------------------------------------------
+ # Jobs
+ # -------------------------------------------------------------------
+ JobEntry:
+ type: object
+ description: Lightweight job data for list views
+ required:
+ - id
+ - status
+ properties:
+ id:
+ type: string
+ format: uuid
+ description: Unique job identifier (same as prompt_id)
+ status:
+ type: string
+ enum:
+ - pending
+ - in_progress
+ - completed
+ - failed
+ - cancelled
+ description: Current job status
+ create_time:
+ type: integer
+ format: int64
+ description: Job creation timestamp (Unix milliseconds).
+ execution_start_time:
+ type: integer
+ format: int64
+ description: Workflow execution start timestamp (Unix milliseconds, terminal states only).
+ execution_end_time:
+ type: integer
+ format: int64
+ description: Workflow execution end timestamp (Unix milliseconds, terminal states only).
+ preview_output:
+ type: object
+ additionalProperties: true
+ description: Primary preview output
+ outputs_count:
+ type: integer
+ description: Total number of output files
+ workflow_id:
+ type: string
+ nullable: true
+ x-runtime: [cloud]
+ description: "[cloud-only] UUID of the Cloud workflow entity this job is associated with. Local ComfyUI returns null."
+ execution_error:
+ x-runtime: [cloud]
+ description: "[cloud-only] Detailed execution error from ComfyUI for failed jobs. Absent on local ComfyUI."
+ allOf:
+ - $ref: "#/components/schemas/ExecutionError"
+
+ JobDetailResponse:
+ type: object
+ description: Full job details including workflow and outputs
+ required:
+ - id
+ - status
+ properties:
+ id:
+ type: string
+ format: uuid
+ status:
+ type: string
+ enum:
+ - pending
+ - in_progress
+ - completed
+ - failed
+ - cancelled
+ workflow:
+ type: object
+ additionalProperties: true
+ description: Full ComfyUI workflow
+ outputs:
+ type: object
+ additionalProperties: true
+ description: Full outputs object from execution
+ execution_error:
+ $ref: "#/components/schemas/ExecutionError"
+ create_time:
+ type: integer
+ format: int64
+ description: Job creation timestamp (Unix milliseconds).
+ update_time:
+ type: integer
+ format: int64
+ description: Last state-change timestamp (Unix milliseconds).
+ execution_start_time:
+ type: integer
+ format: int64
+ description: Workflow execution start timestamp (Unix milliseconds, terminal states only).
+ execution_end_time:
+ type: integer
+ format: int64
+ description: Workflow execution end timestamp (Unix milliseconds, terminal states only).
+ preview_output:
+ type: object
+ additionalProperties: true
+ outputs_count:
+ type: integer
+ execution_status:
+ type: object
+ additionalProperties: true
+ execution_meta:
+ type: object
+ additionalProperties: true
+
+ ExecutionError:
+ type: object
+ description: Detailed execution error from ComfyUI
+ properties:
+ node_id:
+ type: string
+ description: ID of the node that failed
+ node_type:
+ type: string
+ description: Type name of the node
+ exception_message:
+ type: string
+ description: Human-readable error message
+ exception_type:
+ type: string
+ description: Python exception type
+ traceback:
+ type: array
+ items:
+ type: string
+ description: Traceback lines
+ current_inputs:
+ type: object
+ additionalProperties: true
+ current_outputs:
+ type: object
+ additionalProperties: true
+
+ PaginationInfo:
+ type: object
+ description: Pagination metadata returned alongside list responses.
+ properties:
+ offset:
+ type: integer
+ limit:
+ type: integer
+ total:
+ type: integer
+ has_more:
+ type: boolean
+
+ # -------------------------------------------------------------------
+ # Upload / View
+ # -------------------------------------------------------------------
+ UploadResult:
+ type: object
+ description: Response body returned by the image/mask upload endpoints, describing where the uploaded file now lives.
+ properties:
+ name:
+ type: string
+ description: Saved filename (may be renamed to avoid collisions)
+ subfolder:
+ type: string
+ description: Subfolder the file was saved to
+ type:
+ type: string
+ description: Directory type (input, temp)
+
+ # -------------------------------------------------------------------
+ # System
+ # -------------------------------------------------------------------
+ DeviceStats:
+ type: object
+ description: GPU/compute device statistics
+ required:
+ - name
+ - type
+ - index
+ properties:
+ name:
+ type: string
+ description: Device name
+ type:
+ type: string
+ description: Device type (cuda, mps, cpu, etc.)
+ index:
+ type: number
+ nullable: true
+ description: |
+ Device index within its type (e.g. CUDA ordinal for `cuda:0`,
+ `cuda:1`). `null` for devices with no index, including the CPU
+ device returned in `--cpu` mode (PyTorch's `torch.device('cpu').index`
+ is `None`).
+ vram_total:
+ type: number
+ description: Total VRAM in bytes
+ vram_free:
+ type: number
+ description: Free VRAM in bytes
+ torch_vram_total:
+ type: number
+ description: Total PyTorch-managed VRAM in bytes
+ torch_vram_free:
+ type: number
+ description: Free PyTorch-managed VRAM in bytes
+
+ SystemStatsResponse:
+ type: object
+ description: Hardware, VRAM, Python, and ComfyUI version information for the running process.
+ required:
+ - system
+ - devices
+ properties:
+ system:
+ type: object
+ required:
+ - os
+ - python_version
+ - embedded_python
+ - comfyui_version
+ - pytorch_version
+ - argv
+ - ram_total
+ - ram_free
+ properties:
+ os:
+ type: string
+ description: Operating system
+ python_version:
+ type: string
+ description: Python version
+ embedded_python:
+ type: boolean
+ description: Whether using embedded Python
+ comfyui_version:
+ type: string
+ description: ComfyUI version string
+ pytorch_version:
+ type: string
+ description: PyTorch version
+ required_frontend_version:
+ type: string
+ description: Required frontend version
+ argv:
+ type: array
+ items:
+ type: string
+ description: Command line arguments
+ ram_total:
+ type: number
+ description: Total RAM in bytes
+ ram_free:
+ type: number
+ description: Free RAM in bytes
+ installed_templates_version:
+ type: string
+ nullable: true
+ description: Version of the currently installed workflow templates
+ required_templates_version:
+ type: string
+ nullable: true
+ description: Minimum required workflow templates version for this ComfyUI build
+ comfy_package_versions:
+ type: array
+ description: Installed and required versions for every comfy* package pinned in requirements.txt
+ items:
+ type: object
+ required:
+ - name
+ - installed
+ - required
+ properties:
+ name:
+ type: string
+ installed:
+ type: string
+ nullable: true
+ required:
+ type: string
+ nullable: true
+ devices:
+ type: array
+ items:
+ $ref: "#/components/schemas/DeviceStats"
+
+ # -------------------------------------------------------------------
+ # Node / Object Info
+ # -------------------------------------------------------------------
+ NodeInfo:
+ type: object
+ description: 'Definition of a registered node class: its inputs, outputs, category, and display metadata.'
+ properties:
+ input:
+ type: object
+ description: Input specifications (required and optional groups)
+ additionalProperties: true
+ input_order:
+ type: object
+ description: Ordered input names per group
+ additionalProperties:
+ type: array
+ items:
+ type: string
+ output:
+ type: array
+ items:
+ type: string
+ description: Output type names
+ output_is_list:
+ type: array
+ items:
+ type: boolean
+ description: Whether each output is a list
+ output_name:
+ type: array
+ items:
+ type: string
+ description: Display names of outputs
+ name:
+ type: string
+ description: Internal class name
+ display_name:
+ type: string
+ description: Human-readable display name
+ description:
+ type: string
+ description: Node description
+ python_module:
+ type: string
+ description: Python module implementing the node
+ category:
+ type: string
+ description: Node category path
+ output_node:
+ type: boolean
+ description: Whether this is an output node
+ output_tooltips:
+ type: array
+ items:
+ type: string
+ description: Tooltips for each output
+ deprecated:
+ type: boolean
+ description: Whether the node is deprecated
+ experimental:
+ type: boolean
+ description: Whether the node is experimental
+ api_node:
+ type: boolean
+ description: Whether this is an API node
+ is_input_list:
+ type: boolean
+ description: Whether the node accepts list inputs
+ dev_only:
+ type: boolean
+ description: Whether the node is developer-only (hidden in production UI)
+ has_intermediate_output:
+ type: boolean
+ description: Whether the node emits intermediate output during execution
+ search_aliases:
+ type: array
+ items:
+ type: string
+ description: Alternative search terms for finding this node
+ essentials_category:
+ type: string
+ nullable: true
+ description: |
+ Category override used by the essentials pack. The
+ `essentials_category` key may be present with a string value,
+ present and `null`, or absent entirely:
+
+ - V1 nodes: `essentials_category` is **omitted** when the node
+ class doesn't define an `ESSENTIALS_CATEGORY` attribute, and
+ **`null`** if the attribute is explicitly set to `None`.
+ - V3 nodes (`comfy_api.latest.io`): `essentials_category` is
+ **always present**, and **`null`** for nodes whose `Schema`
+ doesn't populate it.
+
+ # -------------------------------------------------------------------
+ # Models
+ # -------------------------------------------------------------------
+ ModelFolder:
+ type: object
+ description: A configured model folder and the list of disk paths it resolves to.
+ required:
+ - name
+ - folders
+ properties:
+ name:
+ type: string
+ description: Model folder type name (e.g. "checkpoints")
+ folders:
+ type: array
+ items:
+ type: string
+ description: Filesystem paths for this model type
+
+ ModelFile:
+ type: object
+ description: A single model file in a folder, with filesystem metadata.
+ required:
+ - name
+ - pathIndex
+ properties:
+ name:
+ type: string
+ description: Model filename
+ pathIndex:
+ type: integer
+ description: Index into the folder's paths array
+ modified:
+ type: number
+ description: File modification timestamp
+ created:
+ type: number
+ description: File creation timestamp
+ size:
+ type: integer
+ format: int64
+ description: File size in bytes
+
+ # -------------------------------------------------------------------
+ # Subgraphs
+ # -------------------------------------------------------------------
+ # Listing form of a subgraph blueprint: `data` is optional here, unlike in
+ # GlobalSubgraphData where it is required.
+ GlobalSubgraphInfo:
+   type: object
+   description: Metadata for a global subgraph blueprint (without full data)
+   required: [source, name, info]
+   properties:
+     source:
+       type: string
+       description: Source type ("templates" or "custom_node")
+     name:
+       type: string
+       description: Display name of the subgraph blueprint
+     info:
+       type: object
+       description: Additional information about the subgraph
+       required: [node_pack]
+       properties:
+         node_pack:
+           type: string
+           description: The node pack/module providing this subgraph
+     data:
+       type: string
+       description: The full subgraph JSON data (may be empty in list view)
+
+ # Detail form of a subgraph blueprint: same fields as GlobalSubgraphInfo,
+ # but the serialized `data` string is mandatory.
+ GlobalSubgraphData:
+   type: object
+   description: Full data for a global subgraph blueprint
+   required: [source, name, info, data]
+   properties:
+     source:
+       type: string
+       description: Source type ("templates" or "custom_node")
+     name:
+       type: string
+       description: Display name of the subgraph blueprint
+     info:
+       type: object
+       description: Additional information about the subgraph
+       required: [node_pack]
+       properties:
+         node_pack:
+           type: string
+           description: The node pack/module providing this subgraph
+     data:
+       type: string
+       description: The full subgraph JSON data as a string
+
+ # -------------------------------------------------------------------
+ # Userdata
+ # -------------------------------------------------------------------
+ # Single-item response: either a file-info object or a bare relative path.
+ # The two branches are disjoint (object vs. string), so `oneOf` is exact.
+ UserDataResponse:
+   description: |
+     Response body for the POST endpoints `/api/userdata/{file}` and
+     `/api/userdata/{file}/move/{dest}`. Returns a single item whose
+     shape depends on the `full_info` query parameter.
+   x-variant-selector:
+     full_info=true: file-info object (`GetUserDataResponseFullFile`)
+     default: relative path string
+   oneOf:
+     - $ref: "#/components/schemas/GetUserDataResponseFullFile"
+     - type: string
+       description: Relative path of the written or moved file. Returned when `full_info` is absent or false.
+
+ # Listing response whose array item shape is picked by the `full_info` and
+ # `split` query parameters.
+ #
+ # `anyOf` is used rather than `oneOf`: an empty array `[]` satisfies all three
+ # branches at once (item constraints are vacuous on an empty array), and
+ # `oneOf` requires exactly one match, so it would reject an empty listing.
+ ListUserdataResponse:
+   description: |
+     Response body for `GET /api/userdata`. The array item shape is
+     determined by the `full_info` and `split` query parameters.
+   x-variant-selector:
+     full_info=true: array of file-info objects (`GetUserDataResponseFullFile`)
+     split=true: array of `[relative_path, ...path_components]` arrays
+     default: array of relative path strings
+   anyOf:
+     - type: array
+       items:
+         $ref: "#/components/schemas/GetUserDataResponseFullFile"
+       description: Returned when `full_info=true`.
+     - type: array
+       items:
+         type: array
+         items:
+           type: string
+         # Each inner array carries the relative path plus at least one component.
+         minItems: 2
+       description: |
+         Returned when `split=true` and `full_info=false`. Each inner
+         array is `[relative_path, ...path_components]`.
+     - type: array
+       items:
+         type: string
+       description: Default shape — array of file paths relative to the user data root.
+
+ # File-info entry returned by the userdata endpoints when `full_info=true`.
+ # No property is marked required.
+ # NOTE(review): `created` is a bare `number` with no stated unit, while
+ # `modified` is documented as integer milliseconds — confirm both match what
+ # the server actually emits.
+ GetUserDataResponseFullFile:
+   type: object
+   description: A single entry in a full-info user data listing.
+   properties:
+     path:
+       type: string
+       description: File name or path relative to the user directory
+     created:
+       type: number
+       description: Unix timestamp of file creation
+     size:
+       type: integer
+       # 64-bit, matching the other byte-size fields in this spec; files can
+       # exceed the 32-bit range.
+       format: int64
+       description: File size in bytes
+     modified:
+       type: integer
+       format: int64
+       description: Unix timestamp of last modification in milliseconds
+
+ # -------------------------------------------------------------------
+ # Assets
+ # -------------------------------------------------------------------
+ # Full asset record. `hash` supersedes the deprecated `asset_hash`, and
+ # `job_id` supersedes the deprecated `prompt_id`; both deprecated fields are
+ # still declared.
+ Asset:
+   type: object
+   description: A registered asset — an input/output file tracked in the asset database with content hash and metadata.
+   required: [id, name, size, created_at, updated_at]
+   properties:
+     id:
+       type: string
+       format: uuid
+       description: Unique identifier for the asset
+     name:
+       type: string
+       description: Name of the asset file
+     hash:
+       type: string
+       nullable: true
+       description: Blake3 content hash of the asset (preferred over asset_hash)
+       pattern: "^blake3:[a-f0-9]{64}$"
+     asset_hash:
+       type: string
+       nullable: true
+       deprecated: true
+       description: "Deprecated: use `hash` instead. Blake3 hash of the asset content."
+       pattern: "^blake3:[a-f0-9]{64}$"
+     size:
+       type: integer
+       format: int64
+       description: Size of the asset in bytes
+     mime_type:
+       type: string
+       description: MIME type of the asset
+     tags:
+       type: array
+       items:
+         type: string
+       description: Tags associated with the asset
+     user_metadata:
+       type: object
+       description: Custom user metadata
+       additionalProperties: true
+     metadata:
+       type: object
+       description: System-managed metadata (read-only)
+       additionalProperties: true
+       readOnly: true
+     preview_url:
+       type: string
+       format: uri
+       description: URL for asset preview/thumbnail
+     preview_id:
+       type: string
+       format: uuid
+       description: ID of the preview asset if available
+     prompt_id:
+       type: string
+       format: uuid
+       nullable: true
+       deprecated: true
+       description: "Deprecated: use job_id instead. ID of the prompt that created this asset."
+     job_id:
+       type: string
+       format: uuid
+       nullable: true
+       description: ID of the job that created this asset
+     created_at:
+       type: string
+       format: date-time
+     updated_at:
+       type: string
+       format: date-time
+     last_access_time:
+       type: string
+       format: date-time
+     is_immutable:
+       type: boolean
+       description: Whether this asset is immutable
+
+ # Asset plus a mandatory `created_new` flag, composed with `allOf`.
+ AssetCreated:
+   description: Response body returned after successfully registering a new asset.
+   allOf:
+     - $ref: "#/components/schemas/Asset"
+     - type: object
+       required: [created_new]
+       properties:
+         created_new:
+           type: boolean
+           description: Whether this was a new creation (true) or returned existing (false)
+
+ # Update response: only `id` and `updated_at` are required; every other
+ # field is optional.
+ AssetUpdated:
+   type: object
+   description: Response body returned after updating an asset's metadata.
+   required: [id, updated_at]
+   properties:
+     id:
+       type: string
+       format: uuid
+     name:
+       type: string
+     hash:
+       type: string
+       nullable: true
+       description: Blake3 content hash of the asset (preferred over asset_hash)
+       pattern: "^blake3:[a-f0-9]{64}$"
+     asset_hash:
+       type: string
+       nullable: true
+       deprecated: true
+       description: "Deprecated: use `hash` instead. Blake3 hash of the asset content."
+       pattern: "^blake3:[a-f0-9]{64}$"
+     tags:
+       type: array
+       items:
+         type: string
+     mime_type:
+       type: string
+     user_metadata:
+       type: object
+       additionalProperties: true
+     prompt_id:
+       type: string
+       format: uuid
+       nullable: true
+       deprecated: true
+       description: "Deprecated: use job_id instead. ID of the prompt that created this asset."
+     job_id:
+       type: string
+       format: uuid
+       nullable: true
+       description: ID of the job that created this asset
+     updated_at:
+       type: string
+       format: date-time
+
+ # Page envelope for assets: the items, a total count, and a has-more flag,
+ # all required.
+ ListAssetsResponse:
+   type: object
+   description: Paginated list of assets.
+   required: [assets, total, has_more]
+   properties:
+     assets:
+       type: array
+       items:
+         $ref: "#/components/schemas/Asset"
+     total:
+       type: integer
+     has_more:
+       type: boolean
+
+ # A tag name paired with its asset count; both are required.
+ TagInfo:
+   type: object
+   description: A tag known to the asset database, with the number of assets bearing it.
+   required: [name, count]
+   properties:
+     name:
+       type: string
+     count:
+       type: integer
+
+ # Page envelope for tags, same shape as the asset listing envelope.
+ ListTagsResponse:
+   type: object
+   description: Flat list of all tags, with counts.
+   required: [tags, total, has_more]
+   properties:
+     tags:
+       type: array
+       items:
+         $ref: "#/components/schemas/TagInfo"
+     total:
+       type: integer
+     has_more:
+       type: boolean
+
+ # Histogram as a free-form object: keys are tag names, values are integers.
+ AssetTagHistogramResponse:
+   type: object
+   description: Tags that would refine a filtered asset query, with the count of assets each tag would additionally select.
+   required: [tag_counts]
+   properties:
+     tag_counts:
+       type: object
+       additionalProperties:
+         type: integer
+       description: Map of tag names to occurrence counts
+
+ # Result of a tag add/remove call. Only `total_tags` is required; the other
+ # lists are optional.
+ TagsModificationResponse:
+   type: object
+   description: Response body returned after adding or removing tags on an asset.
+   required: [total_tags]
+   properties:
+     added:
+       type: array
+       items:
+         type: string
+       description: Tags successfully added
+     removed:
+       type: array
+       items:
+         type: string
+       description: Tags successfully removed
+     already_present:
+       type: array
+       items:
+         type: string
+       description: Tags already present (for add)
+     not_present:
+       type: array
+       items:
+         type: string
+       description: Tags not present (for remove)
+     total_tags:
+       type: array
+       items:
+         type: string
+       description: All tags on the asset after the operation
+
+ # -------------------------------------------------------------------
+ # Result / Output types
+ # -------------------------------------------------------------------
+ # Output file reference; `type` is restricted to input, output or temp.
+ # No field is required.
+ ResultItem:
+   type: object
+   description: A single output file reference
+   properties:
+     filename:
+       type: string
+     subfolder:
+       type: string
+     type:
+       type: string
+       enum:
+         - input
+         - output
+         - temp
+     display_name:
+       type: string
+
+ # Per-node output bag. The listed keys are the known ones; the schema stays
+ # open (`additionalProperties: true`) so extra keys validate.
+ NodeOutputs:
+   type: object
+   description: |
+     Outputs from a single node execution. Known keys are listed below,
+     but custom nodes may add arbitrary keys (additionalProperties).
+   properties:
+     images:
+       type: array
+       items:
+         $ref: "#/components/schemas/ResultItem"
+     audio:
+       type: array
+       items:
+         $ref: "#/components/schemas/ResultItem"
+     video:
+       type: array
+       items:
+         $ref: "#/components/schemas/ResultItem"
+     animated:
+       type: array
+       items:
+         type: boolean
+     # `text` is either one string or a list of strings.
+     text:
+       oneOf:
+         - type: string
+         - type: array
+           items:
+             type: string
+   additionalProperties: true
+
+ # Terminal dimensions as two optional numeric fields.
+ # NOTE(review): `row` is singular while `cols` is plural — confirm the key
+ # matches what the server actually emits.
+ TerminalSize:
+   type: object
+   description: Terminal dimensions
+   properties:
+     cols:
+       type: number
+     row:
+       type: number
+
+ # Log record with single-letter keys: `t` is the timestamp, `m` the message.
+ LogEntry:
+   type: object
+   description: A single log entry
+   properties:
+     t:
+       type: string
+       description: Timestamp
+     m:
+       type: string
+       description: Log message
+
+ # `exec_info` is optional, but when present it must carry `queue_remaining`.
+ StatusWsMessageStatus:
+   type: object
+   description: Inner payload of a `status` WebSocket message, describing the execution queue state.
+   properties:
+     exec_info:
+       type: object
+       required: [queue_remaining]
+       properties:
+         queue_remaining:
+           type: integer
+
+ # Status envelope: the queue status payload plus an optional session id.
+ StatusWsMessage:
+   type: object
+   description: Initial status message sent on connect + queue status updates
+   properties:
+     status:
+       $ref: "#/components/schemas/StatusWsMessageStatus"
+     sid:
+       type: string
+       description: Session ID assigned by the server
+
+ # Step counter for one node: `value` out of `max`, for a given prompt and
+ # node. All four fields are required.
+ ProgressWsMessage:
+   type: object
+   description: Node execution progress (step N of M)
+   required: [value, max, prompt_id, node]
+   properties:
+     value:
+       type: integer
+       description: Current step
+     max:
+       type: integer
+       description: Total steps
+     prompt_id:
+       type: string
+     node:
+       type: string
+       description: Node ID currently executing
+
+ # Free-text progress message. Note the camelCase `nodeId` next to the
+ # snake_case `prompt_id`; no field is required.
+ ProgressTextWsMessage:
+   type: object
+   description: Text-based progress update from a node
+   properties:
+     nodeId:
+       type: string
+     text:
+       type: string
+     prompt_id:
+       type: string
+
+ # Progress snapshot for one node; `state` is one of pending, running,
+ # finished or error. No field is required.
+ NodeProgressState:
+   type: object
+   description: Progress state for a single node
+   properties:
+     value:
+       type: number
+     max:
+       type: number
+     state:
+       type: string
+       enum:
+         - pending
+         - running
+         - finished
+         - error
+     node_id:
+       type: string
+     prompt_id:
+       type: string
+     display_node_id:
+       type: string
+     parent_node_id:
+       type: string
+     real_node_id:
+       type: string
+
+ # Bulk progress: `nodes` is a map whose values are NodeProgressState objects.
+ ProgressStateWsMessage:
+   type: object
+   description: Bulk progress state for all nodes in a prompt
+   required: [prompt_id, nodes]
+   properties:
+     prompt_id:
+       type: string
+     nodes:
+       type: object
+       description: Map of node ID to progress state
+       additionalProperties:
+         $ref: "#/components/schemas/NodeProgressState"
+
+ # Node-start event; all three identifiers are required.
+ ExecutingWsMessage:
+   type: object
+   description: Fired when a node begins execution
+   required: [node, display_node, prompt_id]
+   properties:
+     node:
+       type: string
+       description: Node ID
+     display_node:
+       type: string
+       description: Display node ID (may differ for subgraphs)
+     prompt_id:
+       type: string
+
+ # Node-finished event carrying the node's outputs; `merge` is the only
+ # optional field.
+ ExecutedWsMessage:
+   type: object
+   description: Fired when a node completes execution with output
+   required: [node, display_node, prompt_id, output]
+   properties:
+     node:
+       type: string
+     display_node:
+       type: string
+     prompt_id:
+       type: string
+     output:
+       $ref: "#/components/schemas/NodeOutputs"
+     merge:
+       type: boolean
+       description: Whether to merge with existing output
+
+ # Shared base for the execution lifecycle messages, which extend it through
+ # `allOf`. Both fields are required.
+ ExecutionWsMessageBase:
+   type: object
+   description: Base fields for execution lifecycle messages
+   required: [prompt_id, timestamp]
+   properties:
+     prompt_id:
+       type: string
+     timestamp:
+       type: integer
+       # Millisecond Unix timestamps exceed the 32-bit integer range, so the
+       # 64-bit format is declared explicitly for code generators.
+       format: int64
+       description: Unix timestamp in milliseconds
+
+ # Adds no fields of its own; the payload is exactly the base message.
+ ExecutionStartWsMessage:
+   allOf:
+     - $ref: "#/components/schemas/ExecutionWsMessageBase"
+   description: Fired when prompt execution begins
+
+ # Adds no fields of its own; the payload is exactly the base message.
+ ExecutionSuccessWsMessage:
+   allOf:
+     - $ref: "#/components/schemas/ExecutionWsMessageBase"
+   description: Fired when prompt execution completes successfully
+
+ # Base message extended with an optional list of cached node IDs.
+ ExecutionCachedWsMessage:
+   allOf:
+     - $ref: "#/components/schemas/ExecutionWsMessageBase"
+     - type: object
+       properties:
+         nodes:
+           type: array
+           items:
+             type: string
+           description: List of node IDs that were cached
+   description: Fired when nodes are served from cache
+
+ # Base message extended with the node referenced by the interruption and
+ # the list of nodes already executed; the extra fields are all optional.
+ ExecutionInterruptedWsMessage:
+   allOf:
+     - $ref: "#/components/schemas/ExecutionWsMessageBase"
+     - type: object
+       properties:
+         node_id:
+           type: string
+         node_type:
+           type: string
+         executed:
+           type: array
+           items:
+             type: string
+           description: Node IDs that completed before interruption
+   description: Fired when execution is interrupted by user
+
+ # Base message extended with failure details; the extra fields are all
+ # optional.
+ ExecutionErrorWsMessage:
+   allOf:
+     - $ref: "#/components/schemas/ExecutionWsMessageBase"
+     - type: object
+       properties:
+         node_id:
+           type: string
+         node_type:
+           type: string
+         executed:
+           type: array
+           items:
+             type: string
+         exception_message:
+           type: string
+         exception_type:
+           type: string
+         traceback:
+           type: array
+           items:
+             type: string
+         # Empty schemas: any JSON value is accepted for these two fields.
+         current_inputs: {}
+         current_outputs: {}
+   description: Fired when a node throws an exception during execution
+
+ # Log stream frame: a terminal size plus a batch of log entries, both optional.
+ LogsWsMessage:
+   type: object
+   description: Streaming log entries from the server
+   properties:
+     size:
+       $ref: "#/components/schemas/TerminalSize"
+     entries:
+       type: array
+       items:
+         $ref: "#/components/schemas/LogEntry"
+
+ # Notification payload with two optional string fields.
+ NotificationWsMessage:
+   type: object
+   description: Server notification (e.g. model download complete)
+   properties:
+     value:
+       type: string
+     id:
+       type: string
+
+ # Open object: no fixed keys are declared and any property is accepted.
+ FeatureFlagsWsMessage:
+   type: object
+   description: Feature flags sent on connect
+   additionalProperties: true
+
+ # Download progress event. `asset_id` and `error` are the only optional
+ # fields. The 0.0–1.0 range of `progress` is stated in prose only; no
+ # minimum/maximum is declared.
+ AssetDownloadWsMessage:
+   type: object
+   description: Asset download progress
+   required:
+     - task_id
+     - asset_name
+     - bytes_total
+     - bytes_downloaded
+     - progress
+     - status
+   properties:
+     task_id:
+       type: string
+     asset_name:
+       type: string
+     bytes_total:
+       type: number
+     bytes_downloaded:
+       type: number
+     progress:
+       type: number
+       description: 0.0 to 1.0
+     status:
+       type: string
+       enum: [created, running, completed, failed]
+     asset_id:
+       type: string
+     error:
+       type: string
+
+ # Bulk export progress event. `export_name` and `error` are the only
+ # optional fields. The 0.0–1.0 range of `progress` is stated in prose only.
+ AssetExportWsMessage:
+   type: object
+   description: Bulk asset export progress
+   required:
+     - task_id
+     - assets_total
+     - assets_attempted
+     - assets_failed
+     - bytes_total
+     - bytes_processed
+     - progress
+     - status
+   properties:
+     task_id:
+       type: string
+     export_name:
+       type: string
+     assets_total:
+       type: number
+     assets_attempted:
+       type: number
+     assets_failed:
+       type: number
+     bytes_total:
+       type: number
+     bytes_processed:
+       type: number
+     progress:
+       type: number
+       description: 0.0 to 1.0
+     status:
+       type: string
+       enum: [created, running, completed, failed]
+     error:
+       type: string
+
+ # -------------------------------------------------------------------
+ # Cloud-runtime schemas
+ #
+ # These schemas are exclusively referenced by cloud-runtime operations.
+ # Tagged x-runtime: [cloud].
+ # -------------------------------------------------------------------
+ # Cloud error envelope: a required message, with an optional code and an
+ # open `details` object.
+ CloudError:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Standard error response from cloud endpoints."
+   required: [error]
+   properties:
+     error:
+       type: string
+       description: Error message
+     code:
+       type: string
+       description: Machine-readable error code
+     details:
+       type: object
+       additionalProperties: true
+       description: Additional error context
+
+ # Job status record. `progress` is bounded to [0, 1] by the schema itself,
+ # and both timestamps are nullable.
+ CloudJobStatus:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Status of a cloud job."
+   required: [id, status]
+   properties:
+     id:
+       type: string
+       format: uuid
+     status:
+       type: string
+       enum:
+         - pending
+         - running
+         - completed
+         - failed
+         - cancelled
+     progress:
+       type: number
+       minimum: 0
+       maximum: 1
+       description: "Execution progress (0.0 to 1.0)"
+     started_at:
+       type: string
+       format: date-time
+       nullable: true
+     completed_at:
+       type: string
+       format: date-time
+       nullable: true
+
+ # Prompt record. `status` is an unconstrained string here, and `workflow`
+ # and `outputs` are open objects.
+ CloudPrompt:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] A cloud-executed prompt record."
+   required: [id, status]
+   properties:
+     id:
+       type: string
+       format: uuid
+     status:
+       type: string
+     workflow:
+       type: object
+       additionalProperties: true
+     outputs:
+       type: object
+       additionalProperties: true
+     created_at:
+       type: string
+       format: date-time
+     completed_at:
+       type: string
+       format: date-time
+       nullable: true
+
+ # Page envelope for v2 history entries; all three fields are required.
+ HistoryV2Response:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Paginated execution history in v2 format."
+   required: [items, total, has_more]
+   properties:
+     items:
+       type: array
+       items:
+         $ref: "#/components/schemas/HistoryV2Entry"
+     total:
+       type: integer
+     has_more:
+       type: boolean
+
+ # History entry: the CloudPrompt fields plus `started_at` and an open
+ # `preview_output` object.
+ HistoryV2Entry:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] A single execution history entry in v2 format."
+   required: [id, status]
+   properties:
+     id:
+       type: string
+       format: uuid
+     status:
+       type: string
+     workflow:
+       type: object
+       additionalProperties: true
+     outputs:
+       type: object
+       additionalProperties: true
+     created_at:
+       type: string
+       format: date-time
+     started_at:
+       type: string
+       format: date-time
+       nullable: true
+     completed_at:
+       type: string
+       format: date-time
+       nullable: true
+     preview_output:
+       type: object
+       additionalProperties: true
+
+ # Log listing for the cloud runtime. Only `entries` is required, although the
+ # description calls the response paginated.
+ CloudLogsResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Paginated cloud execution logs."
+ required:
+ - entries
+ properties:
+ entries:
+ type: array
+ items:
+ type: object
+ properties:
+ timestamp:
+ type: string
+ format: date-time
+ level:
+ type: string
+ enum: [debug, info, warn, error]
+ message:
+ type: string
+ # NOTE(review): nesting depth is not recoverable from this view — confirm
+ # whether `job_id` belongs to each log entry or to the response root.
+ job_id:
+ type: string
+ format: uuid
+ # Presumably response-level pagination fields, as in the other list responses — verify.
+ total:
+ type: integer
+ has_more:
+ type: boolean
+
+ # Download request item: the asset id is mandatory, the target path optional.
+ AssetDownloadRequest:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] A single asset to download to the cloud runtime."
+   required: [asset_id]
+   properties:
+     asset_id:
+       type: string
+       format: uuid
+       description: ID of the asset to download
+     target_path:
+       type: string
+       description: Target path on the runtime filesystem
+
+ # Import request: a required list of published asset IDs, optionally scoped
+ # to one share through the nullable `share_id`.
+ ImportPublishedAssetsRequest:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Request body for importing published assets into the caller's library."
+   required: [published_asset_ids]
+   properties:
+     published_asset_ids:
+       type: array
+       description: IDs of published assets (inputs and models) to import.
+       items:
+         type: string
+     share_id:
+       type: string
+       nullable: true
+       description: |
+         Optional. Share ID of the published workflow these assets belong to. When provided (non-null, non-empty): all `published_asset_ids` must belong to this share's workflow version; returns 400 if the share is not found or any asset does not belong to it. When omitted, null, or empty string: no share-scoped validation is performed and the assets are validated only against global rules (preserved for clients that have not yet adopted `share_id`).
+
+ # Import result: the list of assets created for the caller.
+ # NOTE(review): `AssetInfo` is not defined in this part of the file —
+ # confirm the referenced schema exists elsewhere in the spec.
+ ImportPublishedAssetsResponse:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Response after importing published assets. Each returned `AssetInfo.id` is the caller's newly-created private asset ID, not the published asset ID supplied in the request."
+   required: [assets]
+   properties:
+     assets:
+       type: array
+       items:
+         $ref: "#/components/schemas/AssetInfo"
+
+ # Remote file metadata; all three fields are optional.
+ RemoteAssetMetadata:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Metadata fetched from a remote asset URL."
+   properties:
+     content_type:
+       type: string
+       description: MIME type of the remote file
+     content_length:
+       type: integer
+       format: int64
+       description: Size in bytes
+     filename:
+       type: string
+       description: Suggested filename from Content-Disposition or URL
+
+ # Installed node package record; only `id` and `name` are required.
+ CloudNode:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] An installed custom node package in the cloud runtime."
+   required: [id, name]
+   properties:
+     id:
+       type: string
+     name:
+       type: string
+     version:
+       type: string
+     # Here `description` is a property name of the node, not a schema keyword.
+     description:
+       type: string
+     author:
+       type: string
+     repository:
+       type: string
+       format: uri
+     installed_at:
+       type: string
+       format: date-time
+     enabled:
+       type: boolean
+
+ # Hub label: required id and name, optional description and color.
+ HubLabel:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] A label/category used for tagging hub content."
+   required: [id, name]
+   properties:
+     id:
+       type: string
+     name:
+       type: string
+     # Here `description` is a property name of the label, not a schema keyword.
+     description:
+       type: string
+     color:
+       type: string
+       description: Hex color code for the label
+
+ # Public profile; `username` is the only required field.
+ HubProfile:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] A public user profile on the ComfyUI Hub."
+   required: [username]
+   properties:
+     username:
+       type: string
+     display_name:
+       type: string
+     bio:
+       type: string
+     avatar_url:
+       type: string
+       format: uri
+     links:
+       type: array
+       items:
+         type: string
+         format: uri
+     workflow_count:
+       type: integer
+     created_at:
+       type: string
+       format: date-time
+
+ # Published workflow record with author, labels and counters. Only
+ # `share_id` and `name` are required.
+ HubWorkflow:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] A published workflow on the ComfyUI Hub."
+   required: [share_id, name]
+   properties:
+     share_id:
+       type: string
+     name:
+       type: string
+     # Here `description` is a property name of the workflow, not a schema keyword.
+     description:
+       type: string
+     author:
+       $ref: "#/components/schemas/HubProfile"
+     labels:
+       type: array
+       items:
+         $ref: "#/components/schemas/HubLabel"
+     thumbnail_url:
+       type: string
+       format: uri
+     content:
+       type: object
+       additionalProperties: true
+       description: Workflow graph JSON
+     likes:
+       type: integer
+     views:
+       type: integer
+     forks:
+       type: integer
+     created_at:
+       type: string
+       format: date-time
+     updated_at:
+       type: string
+       format: date-time
+
+ # Page envelope for hub workflows; all three fields are required.
+ HubWorkflowList:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Paginated list of hub workflows."
+   required: [workflows, total, has_more]
+   properties:
+     workflows:
+       type: array
+       items:
+         $ref: "#/components/schemas/HubWorkflow"
+     total:
+       type: integer
+     has_more:
+       type: boolean
+
+ # Slim index row. Here `labels` are plain strings, not HubLabel objects.
+ HubWorkflowIndexEntry:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Lightweight entry in the hub workflow index for client-side search."
+   required: [share_id, name]
+   properties:
+     share_id:
+       type: string
+     name:
+       type: string
+     author_username:
+       type: string
+     labels:
+       type: array
+       items:
+         type: string
+     likes:
+       type: integer
+     updated_at:
+       type: string
+       format: date-time
+
+ # Cloud workflow record. `share_id`, `latest_version_id` and
+ # `thumbnail_url` are nullable.
+ CloudWorkflow:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] A cloud-managed workflow with version history."
+   required: [id, name]
+   properties:
+     id:
+       type: string
+       format: uuid
+     name:
+       type: string
+     # Here `description` is a property name of the workflow, not a schema keyword.
+     description:
+       type: string
+     share_id:
+       type: string
+       nullable: true
+       description: Public share identifier if published
+     latest_version_id:
+       type: string
+       format: uuid
+       nullable: true
+     thumbnail_url:
+       type: string
+       format: uri
+       nullable: true
+     created_at:
+       type: string
+       format: date-time
+     updated_at:
+       type: string
+       format: date-time
+
+ # Page envelope for cloud workflows; all three fields are required.
+ CloudWorkflowList:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Paginated list of cloud workflows."
+   required: [workflows, total, has_more]
+   properties:
+     workflows:
+       type: array
+       items:
+         $ref: "#/components/schemas/CloudWorkflow"
+     total:
+       type: integer
+     has_more:
+       type: boolean
+
+ # Version record tying a version id to its parent workflow id.
+ CloudWorkflowVersion:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] A version of a cloud workflow."
+   required: [id, workflow_id]
+   properties:
+     id:
+       type: string
+       format: uuid
+     workflow_id:
+       type: string
+       format: uuid
+     version_number:
+       type: integer
+     created_at:
+       type: string
+       format: date-time
+
+ # Session state: a required user, with an optional workspace and expiry.
+ AuthSession:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Current authentication session state."
+   required: [user]
+   properties:
+     user:
+       $ref: "#/components/schemas/CloudUser"
+     workspace:
+       $ref: "#/components/schemas/Workspace"
+     expires_at:
+       type: string
+       format: date-time
+
+ # Token response. `token_type` is documented as always "Bearer" but is not
+ # constrained by an enum here; `refresh_token` is nullable.
+ AuthTokenResponse:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] OAuth2 token response."
+   required: [access_token, token_type]
+   properties:
+     access_token:
+       type: string
+     token_type:
+       type: string
+       description: Always "Bearer"
+     expires_in:
+       type: integer
+       description: Token lifetime in seconds
+     refresh_token:
+       type: string
+       nullable: true
+     scope:
+       type: string
+
+ # JWKS document: a required `keys` array whose entries each require `kty`,
+ # `kid` and `use`; the remaining key members are optional.
+ JwksResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] JSON Web Key Set for JWT verification."
+ required:
+ - keys
+ properties:
+ keys:
+ type: array
+ items:
+ type: object
+ required:
+ - kty
+ - kid
+ - use
+ properties:
+ kty:
+ type: string
+ description: Key type (e.g. RSA)
+ kid:
+ type: string
+ description: Key ID
+ use:
+ type: string
+ description: Key use (e.g. sig)
+ alg:
+ type: string
+ description: Algorithm (e.g. RS256)
+ n:
+ type: string
+ description: RSA modulus (base64url)
+ e:
+ type: string
+ description: RSA exponent (base64url)
+ # NOTE(review): nesting depth is not recoverable from this view — presumably
+ # this opens each key object to extra JWK members; confirm it is not set on
+ # the response root instead.
+ additionalProperties: true
+
+ # Authorization-server discovery document. `registration_endpoint` and
+ # `scopes_supported` are the only optional fields.
+ OAuthAuthorizationServerMetadata:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] OAuth 2.1 authorization-server metadata (RFC 8414)."
+   required:
+     - issuer
+     - authorization_endpoint
+     - token_endpoint
+     - jwks_uri
+     - response_types_supported
+     - grant_types_supported
+     - code_challenge_methods_supported
+     - token_endpoint_auth_methods_supported
+   properties:
+     issuer:
+       type: string
+       format: uri
+     authorization_endpoint:
+       type: string
+       format: uri
+     token_endpoint:
+       type: string
+       format: uri
+     jwks_uri:
+       type: string
+       format: uri
+     registration_endpoint:
+       type: string
+       format: uri
+       description: "[cloud-only] RFC 7591 §3.1 Dynamic Client Registration endpoint. Advertised so MCP-spec-compliant clients can auto-discover and self-register without operator involvement. Present only when DCR is enabled."
+     response_types_supported:
+       type: array
+       items:
+         type: string
+     grant_types_supported:
+       type: array
+       items:
+         type: string
+     code_challenge_methods_supported:
+       type: array
+       items:
+         type: string
+     token_endpoint_auth_methods_supported:
+       type: array
+       items:
+         type: string
+     scopes_supported:
+       type: array
+       items:
+         type: string
+
+ # Protected-resource discovery document; `bearer_methods_supported` is the
+ # only optional field.
+ OAuthProtectedResourceMetadata:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] OAuth 2.1 protected-resource metadata (RFC 9728)."
+   required: [resource, authorization_servers, scopes_supported]
+   properties:
+     resource:
+       type: string
+       format: uri
+     authorization_servers:
+       type: array
+       items:
+         type: string
+         format: uri
+     scopes_supported:
+       type: array
+       items:
+         type: string
+     bearer_methods_supported:
+       type: array
+       items:
+         type: string
+
+ # Consent challenge payload; every declared property is required.
+ OAuthConsentChallenge:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Server-side state describing the OAuth consent decision the user is being asked to make. Returned by GET /oauth/authorize when a valid session exists; the frontend renders the consent UI from this payload and POSTs the decision back. Browser never sees the original OAuth params on resume."
+   required:
+     - oauth_request_id
+     - csrf_token
+     - client_display_name
+     - resource_display_name
+     - scopes
+     - workspaces
+   properties:
+     oauth_request_id:
+       type: string
+       format: uuid
+       description: Opaque server-side identifier for the authorization-request row. Carried back unchanged in the consent submission.
+     csrf_token:
+       type: string
+       description: Per-row CSRF token bound to this authorization request (not to the session). Must be echoed back on POST.
+     client_display_name:
+       type: string
+       description: Human-readable name of the OAuth client requesting authorization.
+     resource_display_name:
+       type: string
+       description: Human-readable name of the protected resource.
+     scopes:
+       type: array
+       description: Scopes the client is requesting for this resource. The frontend should present these for the user to approve.
+       items:
+         type: string
+     workspaces:
+       type: array
+       description: Workspaces the user can select from. Membership is re-checked on POST.
+       items:
+         $ref: "#/components/schemas/OAuthConsentChallengeWorkspace"
+
+ # Workspace option: `type` is personal or team, `role` is owner or member.
+ # All four fields are required.
+ OAuthConsentChallengeWorkspace:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] One workspace option presented in the OAuth consent challenge."
+   required:
+     - id
+     - name
+     - type
+     - role
+   properties:
+     id:
+       type: string
+     name:
+       type: string
+     type:
+       type: string
+       enum: [personal, team]
+     role:
+       type: string
+       enum: [owner, member]
+
+ # Single-field response carrying the URL to navigate to after consent.
+ OAuthAuthorizeRedirectResponse:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] Redirect target produced after a JSON consent submission. The frontend must navigate the browser to this URL so custom-scheme client callbacks work without relying on fetch-visible 302 headers."
+   required: [redirect_url]
+   properties:
+     redirect_url:
+       type: string
+       format: uri
+       description: OAuth client redirect URI with either code+state for allow, or error+state for deny.
+
+ # Successful token response; all five fields are required and `token_type`
+ # is pinned to `Bearer` by an enum.
+ OAuthTokenResponse:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] RFC 6749 §5.1 successful token response."
+   required:
+     - access_token
+     - token_type
+     - expires_in
+     - refresh_token
+     - scope
+   properties:
+     access_token:
+       type: string
+       description: Resource-bound access token (audience matches the protected resource).
+     token_type:
+       type: string
+       enum: [Bearer]
+     expires_in:
+       type: integer
+       description: Access token lifetime in seconds.
+     refresh_token:
+       type: string
+       description: Opaque refresh token. Rotates on every successful refresh; presenting an already-rotated token revokes the entire family.
+     scope:
+       type: string
+       description: Space-delimited scopes granted with this token.
+
+ # Token error response. The allowed error codes are listed in prose only;
+ # `error` is not constrained by an enum.
+ OAuthTokenError:
+   type: object
+   x-runtime: [cloud]
+   description: "[cloud-only] RFC 6749 §5.2 error response."
+   required:
+     - error
+   properties:
+     error:
+       type: string
+       description: 'RFC 6749 §5.2 error code: invalid_request, invalid_client, invalid_grant, unauthorized_client, unsupported_grant_type, invalid_scope.'
+     error_description:
+       type: string
+       description: Human-readable, no leak of internal storage state.
+
+ # Dynamic client registration request body. The schema is closed
+ # (`additionalProperties: false`) and declares two groups of properties:
+ # fields the server honors, and fields documented as rejected when present
+ # (each marked "REJECTED IF PRESENT" in its description).
+ OAuthRegisterRequest:
+   type: object
+   x-runtime: [cloud]
+   additionalProperties: false
+   description: "[cloud-only] RFC 7591 §2 client metadata document. Lists the fields the server honors plus the RFC 7591 fields it explicitly rejects (each marked **REJECTED IF PRESENT**); in particular, presence of `scope` or `resource_grants` in the request is rejected (`invalid_client_metadata`) because those are server-owned for dynamic clients."
+   required: [redirect_uris, application_type]
+   properties:
+     # ---- Honored fields ----
+     redirect_uris:
+       type: array
+       items:
+         type: string
+       minItems: 1
+       maxItems: 5
+       description: 1–5 redirect URIs. Validated against `application_type` policy.
+     client_name:
+       type: string
+       maxLength: 100
+       description: Human-readable name shown in the consent UI. Reserved-name list rejects impersonation of major clients.
+     application_type:
+       type: string
+       enum: [native, web]
+       description: |
+         RFC 7591 §2 application_type. **REQUIRED** — clients MUST declare intent; the server does not default this field. `native` for desktop / CLI / MCP-spec-strict clients (loopback redirects); `web` for hosted clients (HTTPS only, host must be allowlisted). A missing or explicitly empty `application_type` rejects with `invalid_client_metadata`.
+     token_endpoint_auth_method:
+       type: string
+       enum: [none]
+       description: 'Public clients only this phase — must be `none` if present. The server forces `none` regardless.'
+     grant_types:
+       type: array
+       items:
+         type: string
+         enum: [authorization_code, refresh_token]
+       description: Optional. Defaults to `["authorization_code","refresh_token"]`.
+     response_types:
+       type: array
+       items:
+         type: string
+         enum: [code]
+       description: Optional. Defaults to `["code"]`.
+     # ---- Fields rejected when present ----
+     scope:
+       type: string
+       nullable: true
+       description: "**REJECTED IF PRESENT.** Dynamic clients do not pick scopes — the server assigns scopes from the active resource's published list. Sending `scope` in the registration body is treated as a privilege-escalation attempt and returns `invalid_client_metadata`."
+     resource_grants:
+       type: object
+       nullable: true
+       additionalProperties:
+         type: array
+         items:
+           type: string
+       description: "**REJECTED IF PRESENT.** Same reason as `scope`. The set of resources and scopes a dynamic client may request is server-policy, not request-driven."
+     client_uri:
+       type: string
+       nullable: true
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+     logo_uri:
+       type: string
+       nullable: true
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+     tos_uri:
+       type: string
+       nullable: true
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+     policy_uri:
+       type: string
+       nullable: true
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+     software_id:
+       type: string
+       nullable: true
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+     software_version:
+       type: string
+       nullable: true
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+     contacts:
+       type: array
+       nullable: true
+       items:
+         type: string
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+     jwks:
+       type: object
+       nullable: true
+       additionalProperties: true
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+     jwks_uri:
+       type: string
+       nullable: true
+       description: "**REJECTED IF PRESENT.** Unsupported RFC 7591 metadata for this public-client phase."
+
+ OAuthRegisterResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] RFC 7591 §3.2.1 successful registration response."
+ required:
+ - client_id
+ - client_id_issued_at
+ - redirect_uris
+ - grant_types
+ - response_types
+ - token_endpoint_auth_method
+ - application_type
+ properties:
+ client_id:
+ type: string
+ description: Server-generated client_id.
+ client_id_issued_at:
+ type: integer
+ format: int64
+ description: Unix timestamp (seconds) when the client was registered.
+ client_name:
+ type: string
+ redirect_uris:
+ type: array
+ items:
+ type: string
+ grant_types:
+ type: array
+ items:
+ type: string
+ response_types:
+ type: array
+ items:
+ type: string
+ token_endpoint_auth_method:
+ type: string
+ enum: [none]
+ application_type:
+ type: string
+ enum: [native, web]
+
+ OAuthRegisterError:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] RFC 7591 §3.2.2 error response."
+ required:
+ - error
+ properties:
+ error:
+ type: string
+ enum: [invalid_redirect_uri, invalid_client_metadata]
+ error_description:
+ type: string
+ nullable: true
+
+ BillingBalance:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Current credit balance and usage summary."
+ required:
+ - credits_remaining
+ properties:
+ credits_remaining:
+ type: integer
+ description: Available credits
+ credits_used:
+ type: integer
+ description: Credits used in current billing period
+ credits_total:
+ type: integer
+ description: Total credits allocated in current period
+
+    BillingEvent:
+      type: object
+      x-runtime: [cloud]
+      description: "[cloud-only] A billing event (charge, credit, refund, top-up, or subscription)."
+      required:
+        - id
+        - type
+        - amount
+        - created_at
+      properties:
+        id:
+          type: string
+        type:  # event kind; the schema description above enumerates the same five values
+          type: string
+          enum: [charge, credit, refund, topup, subscription]
+        amount:
+          type: integer
+          description: Amount in credits
+        description:
+          type: string
+        job_id:
+          type: string
+          format: uuid
+          nullable: true
+        created_at:
+          type: string
+          format: date-time
+
+ BillingEventList:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Paginated list of billing events."
+ required:
+ - events
+ - total
+ - has_more
+ properties:
+ events:
+ type: array
+ items:
+ $ref: "#/components/schemas/BillingEvent"
+ total:
+ type: integer
+ has_more:
+ type: boolean
+
+ BillingOp:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] A billing operation record."
+ required:
+ - id
+ - status
+ properties:
+ id:
+ type: string
+ status:
+ type: string
+ enum: [pending, completed, failed]
+ type:
+ type: string
+ amount:
+ type: integer
+ created_at:
+ type: string
+ format: date-time
+ completed_at:
+ type: string
+ format: date-time
+ nullable: true
+
+ BillingPlan:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] A subscription plan with pricing details."
+ required:
+ - id
+ - name
+ properties:
+ id:
+ type: string
+ name:
+ type: string
+ description:
+ type: string
+ credits_per_month:
+ type: integer
+ price_cents:
+ type: integer
+ description: Monthly price in cents (USD)
+ currency:
+ type: string
+ default: usd
+ features:
+ type: array
+ items:
+ type: string
+ description: List of plan features
+
+ BillingStatus:
+ type: string
+ x-runtime: [cloud]
+ description: "[cloud-only] Overall billing/payment lifecycle status."
+ enum:
+ - awaiting_payment_method
+ - pending_payment
+ - paid
+ - payment_failed
+ - inactive
+
+ BillingSubscription:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Active subscription details."
+ required:
+ - id
+ - status
+ - plan_id
+ properties:
+ id:
+ type: string
+ status:
+ type: string
+ enum: [active, cancelled, past_due, trialing]
+ plan_id:
+ type: string
+ plan_name:
+ type: string
+ current_period_start:
+ type: string
+ format: date-time
+ current_period_end:
+ type: string
+ format: date-time
+ cancel_at_period_end:
+ type: boolean
+
+ SubscriptionPreview:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Preview of a subscription change including prorations."
+ properties:
+ plan_id:
+ type: string
+ plan_name:
+ type: string
+ amount_due:
+ type: integer
+ description: Amount due in cents
+ proration_amount:
+ type: integer
+ description: Proration adjustment in cents
+ currency:
+ type: string
+ next_billing_date:
+ type: string
+ format: date-time
+
+ Workspace:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] A cloud workspace for team collaboration."
+ required:
+ - id
+ - name
+ properties:
+ id:
+ type: string
+ name:
+ type: string
+ type:
+ type: string
+ enum:
+ - personal
+ - team
+ description: Workspace type (personal vs. team).
+ owner_id:
+ type: string
+ member_count:
+ type: integer
+ created_at:
+ type: string
+ format: date-time
+ updated_at:
+ type: string
+ format: date-time
+
+ WorkspaceMember:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] A member of a cloud workspace."
+ required:
+ - user_id
+ - role
+ properties:
+ user_id:
+ type: string
+ email:
+ type: string
+ format: email
+ display_name:
+ type: string
+ avatar_url:
+ type: string
+ format: uri
+ role:
+ type: string
+ enum: [owner, admin, member]
+ joined_at:
+ type: string
+ format: date-time
+
+ WorkspaceInvite:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] A pending workspace invitation."
+ required:
+ - id
+ - email
+ - role
+ properties:
+ id:
+ type: string
+ email:
+ type: string
+ format: email
+ role:
+ type: string
+ enum: [admin, member]
+ invited_by:
+ type: string
+ created_at:
+ type: string
+ format: date-time
+ expires_at:
+ type: string
+ format: date-time
+
+ WorkspaceApiKey:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] A workspace API key (secret value redacted)."
+ required:
+ - id
+ - name
+ - description
+ properties:
+ id:
+ type: string
+ name:
+ type: string
+ description:
+ type: string
+ maxLength: 5000
+ description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create.
+ prefix:
+ type: string
+ description: First few characters of the key for identification
+ created_at:
+ type: string
+ format: date-time
+ last_used_at:
+ type: string
+ format: date-time
+ nullable: true
+ created_by:
+ type: string
+
+ WorkspaceApiKeyCreated:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] A newly created workspace API key, including the full secret value (shown only once)."
+ required:
+ - id
+ - name
+ - description
+ - key
+ properties:
+ id:
+ type: string
+ name:
+ type: string
+ description:
+ type: string
+ maxLength: 5000
+ description: User-provided description of the key's purpose. Always present in responses; empty string when no description was supplied on create.
+ key:
+ type: string
+ description: Full API key value (only returned on creation)
+ prefix:
+ type: string
+ created_at:
+ type: string
+ format: date-time
+
+ CloudUser:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] A cloud-authenticated user profile."
+ required:
+ - id
+ - email
+ properties:
+ id:
+ type: string
+ email:
+ type: string
+ format: email
+ display_name:
+ type: string
+ avatar_url:
+ type: string
+ format: uri
+ created_at:
+ type: string
+ format: date-time
+
+ SecretMeta:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Metadata for a stored secret (value is never returned)."
+ required:
+ - id
+ - name
+ properties:
+ id:
+ type: string
+ name:
+ type: string
+ provider:
+ type: string
+ description: "[cloud-only] Provider identifier (e.g., huggingface, civitai)."
+ x-runtime: [cloud]
+ last_used_at:
+ type: string
+ format: date-time
+ description: "[cloud-only] When the secret was last used for decryption."
+ x-runtime: [cloud]
+ created_at:
+ type: string
+ format: date-time
+ updated_at:
+ type: string
+ format: date-time
+
+ UpdateSecretRequest:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Request body for updating an existing user secret."
+ properties:
+ name:
+ type: string
+ description: New name for the secret
+ secret_value:
+ type: string
+ description: New secret value (API key, token, etc.)
+
+ CreateSessionResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Response after creating a session cookie."
+ required:
+ - success
+ properties:
+ success:
+ type: boolean
+ expiresIn:
+ type: integer
+ description: Session expiration time in seconds.
+
+ DeleteSessionResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Response after deleting a session cookie."
+ required:
+ - success
+ properties:
+ success:
+ type: boolean
+
+ CreateHubProfileRequest:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Request body for creating a new Hub profile."
+ required:
+ - workspace_id
+ - username
+ properties:
+ workspace_id:
+ type: string
+ username:
+ type: string
+ description: Unique URL-safe slug. Immutable after creation.
+ display_name:
+ type: string
+ description:
+ type: string
+ avatar_token:
+ type: string
+ website_urls:
+ type: array
+ items:
+ type: string
+
+ PublishHubWorkflowRequest:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Request body for publishing or updating a workflow on the Hub."
+ required:
+ - username
+ - name
+ - workflow_filename
+ - asset_ids
+ properties:
+ username:
+ type: string
+ name:
+ type: string
+ workflow_filename:
+ type: string
+ asset_ids:
+ type: array
+ items:
+ type: string
+ description:
+ type: string
+ tags:
+ type: array
+ items:
+ type: string
+ models:
+ type: array
+ items:
+ type: string
+ custom_nodes:
+ type: array
+ items:
+ type: string
+ tutorial_url:
+ type: string
+ metadata:
+ type: object
+ additionalProperties: true
+ thumbnail_type:
+ type: string
+ enum: [image, video, image_comparison]
+ thumbnail_token_or_url:
+ type: string
+ thumbnail_comparison_token_or_url:
+ type: string
+ sample_image_tokens_or_urls:
+ type: array
+ items:
+ type: string
+
+    HubWorkflowDetail:
+      type: object
+      x-runtime: [cloud]
+      description: "[cloud-only] Full Hub workflow detail including the workflow JSON, its referenced assets, and the Hub profile it is published under."
+      required:
+        - share_id
+        - workflow_id
+        - name
+        - workflow_json
+        - assets
+        - profile
+        - status
+      properties:
+        share_id:
+          type: string
+        workflow_id:
+          type: string
+        name:
+          type: string
+        status:
+          type: string
+          enum: [pending, approved, rejected, deprecated]
+        description:
+          type: string
+        thumbnail_type:
+          type: string
+          enum: [image, video, image_comparison]
+        thumbnail_url:
+          type: string
+        thumbnail_comparison_url:
+          type: string
+        tutorial_url:
+          type: string
+        metadata:
+          type: object
+          additionalProperties: true
+        sample_image_urls:
+          type: array
+          items:
+            type: string
+        publish_time:
+          type: string
+          format: date-time
+          nullable: true
+        workflow_json:
+          type: object
+          additionalProperties: true
+        assets:
+          type: array
+          items:
+            $ref: "#/components/schemas/AssetInfo"
+        profile:
+          $ref: "#/components/schemas/HubProfile"
+
+ AssetInfo:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Lightweight asset reference used in workflow publishing payloads."
+ required:
+ - id
+ - filename
+ properties:
+ id:
+ type: string
+ filename:
+ type: string
+ mime_type:
+ type: string
+ size_bytes:
+ type: integer
+ format: int64
+
+ BulkRevokeAPIKeysResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Response after bulk-revoking API keys for a workspace member."
+ required:
+ - revoked_count
+ properties:
+ revoked_count:
+ type: integer
+ minimum: 0
+
+ CreateWorkflowVersionRequest:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Request body for creating a new version of a saved workflow."
+ required:
+ - base_version
+ - workflow_json
+ properties:
+ base_version:
+ type: integer
+ description: Version number this change is based on (for optimistic concurrency).
+ workflow_json:
+ type: object
+ additionalProperties: true
+
+ WorkflowVersionResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Metadata for a single workflow version."
+ required:
+ - id
+ - version
+ - latest_version
+ - created_by
+ - created_at
+ properties:
+ id:
+ type: string
+ version:
+ type: integer
+ latest_version:
+ type: integer
+ created_by:
+ type: string
+ created_at:
+ type: string
+ format: date-time
+
+ WorkflowPublishInfo:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Publishing metadata for a workflow shared to the Hub."
+ required:
+ - workflow_id
+ - share_id
+ - listed
+ - assets
+ properties:
+ workflow_id:
+ type: string
+ share_id:
+ type: string
+ publish_time:
+ type: string
+ format: date-time
+ nullable: true
+ listed:
+ type: boolean
+ assets:
+ type: array
+ items:
+ $ref: "#/components/schemas/AssetInfo"
+
+ TaskEntry:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Task data for list views."
+ required:
+ - id
+ - task_name
+ - status
+ - create_time
+ properties:
+ id:
+ type: string
+ format: uuid
+ task_name:
+ type: string
+ status:
+ type: string
+ enum: [created, running, completed, failed]
+ create_time:
+ type: string
+ format: date-time
+ started_at:
+ type: string
+ format: date-time
+ completed_at:
+ type: string
+ format: date-time
+
+ TaskResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Full task details including payload and result."
+ required:
+ - id
+ - idempotency_key
+ - task_name
+ - payload
+ - status
+ - create_time
+ - update_time
+ properties:
+ id:
+ type: string
+ format: uuid
+ idempotency_key:
+ type: string
+ task_name:
+ type: string
+ payload:
+ type: object
+ additionalProperties: true
+ status:
+ type: string
+ enum: [created, running, completed, failed]
+ result:
+ type: object
+ additionalProperties: true
+ create_time:
+ type: string
+ format: date-time
+ update_time:
+ type: string
+ format: date-time
+ started_at:
+ type: string
+ format: date-time
+ completed_at:
+ type: string
+ format: date-time
+ error:
+ type: string
+
+ TasksListResponse:
+ type: object
+ x-runtime: [cloud]
+ description: "[cloud-only] Paginated list of background tasks for the authenticated user."
+ required:
+ - tasks
+ - pagination
+ properties:
+ tasks:
+ type: array
+ items:
+ $ref: "#/components/schemas/TaskEntry"
+ pagination:
+ $ref: "#/components/schemas/PaginationInfo"
+
+ # ===== Cloud-only schemas (Comfy-Org/cloud runtime, BE-1106) =====
+ AssetDownloadResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Acknowledgement of an async asset download task; clients poll GET /api/tasks/{task_id} for status.'
+ required:
+ - task_id
+ - status
+ properties:
+ task_id:
+ type: string
+ format: uuid
+ description: Task ID for tracking download progress via GET /api/tasks/{task_id}
+ status:
+ type: string
+ enum:
+ - created
+ - running
+ - completed
+ - failed
+ description: Current task status
+ message:
+ type: string
+ description: Human-readable message
+ example: Download task created. Use task_id to track progress.
+
+ AssetMetadataResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Metadata for a remotely hosted asset resolved by URL.'
+ required:
+ - content_length
+ properties:
+ content_length:
+ type: integer
+ format: int64
+ description: Size of the asset in bytes (-1 if unknown)
+ example: 4294967296
+ content_type:
+ type: string
+ description: MIME type of the asset
+ example: application/octet-stream
+ filename:
+ type: string
+ description: Suggested filename for the asset from source
+ example: realistic-vision-v5.safetensors
+ name:
+ type: string
+ description: Display name or title for the asset from source
+ example: Realistic Vision v5.0
+ tags:
+ type: array
+ items:
+ type: string
+ description: Tags for categorization from source
+ example:
+ - models
+ - checkpoint
+ preview_image:
+ type: string
+ description: Preview image as base64-encoded data URL
+ example: data:image/jpeg;base64,/9j/4AAQSkZJRg...
+ validation:
+ description: Validation results for the file
+ allOf:
+ - $ref: '#/components/schemas/ValidationResult'
+
+    BillingBalanceResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Current credit balance and usage details for a workspace.'
+      required:
+        - amount_micros
+        - currency
+      properties:
+        amount_micros:
+          type: number
+          format: double
+          description: The total remaining balance in microamount (1/1,000,000 of the currency unit)
+        prepaid_balance_micros:
+          type: number
+          format: double
+          description: The remaining balance from prepaid commits in microamount
+        cloud_credit_balance_micros:
+          type: number
+          format: double
+          description: The remaining balance from cloud credits in microamount
+        pending_charges_micros:
+          type: number
+          format: double
+          description: The total amount of pending/unbilled charges from draft invoices in microamount
+        effective_balance_micros:
+          type: number
+          format: double
+          description: 'The effective balance (total balance minus pending charges). Can be negative if pending charges exceed
+            the balance.'
+        currency:
+          type: string
+          description: Currency code
+          example: usd
+
+ BillingPlansResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] List of available billing plans for subscription.'
+ required:
+ - plans
+ properties:
+ current_plan_slug:
+ type: string
+ description: Current plan slug if subscribed
+ plans:
+ type: array
+ items:
+ $ref: '#/components/schemas/Plan'
+
+ BillingStatusResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Current billing and subscription status for a workspace.'
+ required:
+ - is_active
+ - has_funds
+ properties:
+ is_active:
+ type: boolean
+ description: Whether the workspace has an active subscription
+ subscription_status:
+ type: string
+ enum:
+ - active
+ - ended
+ - canceled
+ description: Subscription activity status (scheduled subscriptions are not returned)
+ subscription_tier:
+ $ref: '#/components/schemas/SubscriptionTier'
+ subscription_duration:
+ $ref: '#/components/schemas/SubscriptionDuration'
+ plan_slug:
+ type: string
+ description: Plan identifier (e.g., standard-monthly, team-pro-annual)
+ billing_status:
+ $ref: '#/components/schemas/BillingStatus'
+ has_funds:
+ type: boolean
+ description: Whether the workspace has available credits
+ cancel_at:
+ type: string
+ format: date-time
+ description: When the subscription will become inactive (if canceled)
+ renewal_date:
+ type: string
+ format: date-time
+ description: When the current billing period ends and the next one begins
+
+ GetUserDataResponseFull:
+ type: array
+ x-runtime: [cloud]
+ description: '[cloud-only] List of user data file entries (each with path, size, and modification time) returned when full_info=true.'
+ items:
+ $ref: '#/components/schemas/GetUserDataResponseFullFile'
+
+ HistoryDetailEntry:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] History entry with full prompt data'
+ properties:
+ prompt:
+ type: object
+ description: Full prompt execution data
+ properties:
+ priority:
+ type: number
+ format: double
+ description: Execution priority
+ prompt_id:
+ type: string
+ description: The prompt ID
+ prompt:
+ type: object
+ description: The workflow nodes
+ additionalProperties: true
+ extra_data:
+ type: object
+ description: Additional execution data
+ additionalProperties: true
+ outputs_to_execute:
+ type: array
+ items:
+ type: string
+ description: Output nodes to execute
+ outputs:
+ type: object
+ description: Output data from execution (generated images, files, etc.)
+ additionalProperties: true
+ status:
+ type: object
+ description: Execution status and timeline information
+ additionalProperties: true
+ meta:
+ type: object
+ description: Metadata about the execution and nodes
+ additionalProperties: true
+
+ HistoryDetailResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Detailed execution history response for a specific prompt.
+
+ Returns a dictionary with prompt_id as key and full history data as value.
+
+ '
+ additionalProperties:
+ $ref: '#/components/schemas/HistoryDetailEntry'
+
+ HistoryResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Execution history response with history array.
+
+ Returns an object with a "history" key containing an array of history entries.
+
+ Each entry includes prompt_id as a property along with execution data.
+
+ '
+ required:
+ - history
+ properties:
+ history:
+ type: array
+ description: Array of history entries ordered by creation time (newest first)
+ items:
+ $ref: '#/components/schemas/HistoryEntry'
+
+ HubLabelInfo:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Metadata for a single Hub label.'
+ required:
+ - name
+ - display_name
+ - type
+ properties:
+ name:
+ type: string
+ description: Slug identifier.
+ display_name:
+ type: string
+ description: Human-readable display name.
+ description:
+ type: string
+ description: Optional description of the label.
+ type:
+ type: string
+ enum:
+ - tag
+ - model
+ - custom_node
+ description: Label category.
+
+ HubLabelListResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Response wrapper for the available Hub label catalog.'
+ required:
+ - labels
+ properties:
+ labels:
+ type: array
+ items:
+ $ref: '#/components/schemas/HubLabelInfo'
+ description: Available labels, optionally filtered by type.
+
+ HubProfileSummary:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Abbreviated Hub profile used in workflow listings.'
+ required:
+ - username
+ properties:
+ username:
+ type: string
+ display_name:
+ type: string
+ avatar_url:
+ type: string
+ description: Public URL of the profile avatar image.
+
+ HubWorkflowListResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Paginated list of Hub workflows matching search criteria.'
+ required:
+ - workflows
+ properties:
+ workflows:
+ type: array
+ items:
+ anyOf:
+ - $ref: '#/components/schemas/HubWorkflowSummary'
+ - $ref: '#/components/schemas/HubWorkflowDetail'
+ description: Array of HubWorkflowSummary (default) or HubWorkflowDetail (when detail=true).
+ next_cursor:
+ type: string
+ description: Cursor for the next page, empty if no more results.
+
+ HubWorkflowStatus:
+ type: string
+ x-runtime: [cloud]
+ description: '[cloud-only] Public workflow status. NULL in the database is represented as pending in API responses.'
+ enum:
+ - pending
+ - approved
+ - rejected
+ - deprecated
+
+ HubWorkflowSummary:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Abbreviated Hub workflow metadata used in search and listing results.'
+ required:
+ - share_id
+ - name
+ - profile
+ - status
+ properties:
+ share_id:
+ type: string
+ name:
+ type: string
+ status:
+ $ref: '#/components/schemas/HubWorkflowStatus'
+ description:
+ type: string
+ tags:
+ type: array
+ items:
+ $ref: '#/components/schemas/LabelRef'
+ models:
+ type: array
+ items:
+ $ref: '#/components/schemas/LabelRef'
+ custom_nodes:
+ type: array
+ items:
+ $ref: '#/components/schemas/LabelRef'
+ thumbnail_type:
+ type: string
+ enum:
+ - image
+ - video
+ - image_comparison
+ thumbnail_url:
+ type: string
+ thumbnail_comparison_url:
+ type: string
+ publish_time:
+ type: string
+ format: date-time
+ nullable: true
+ profile:
+ $ref: '#/components/schemas/HubProfileSummary'
+ metadata:
+ type: object
+ additionalProperties: true
+ tutorial_url:
+ type: string
+ sample_image_urls:
+ type: array
+ items:
+ type: string
+
+ HubWorkflowTemplateEntry:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Entry in the curated workflow template gallery shown on the home page.'
+ required:
+ - name
+ - title
+ - status
+ properties:
+ name:
+ type: string
+ description: Slug identifier for the template
+ title:
+ type: string
+ status:
+ $ref: '#/components/schemas/HubWorkflowStatus'
+ description:
+ type: string
+ tags:
+ type: array
+ items:
+ type: string
+ models:
+ type: array
+ items:
+ type: string
+ requiresCustomNodes:
+ type: array
+ items:
+ type: string
+ thumbnailVariant:
+ type: string
+ mediaType:
+ type: string
+ mediaSubtype:
+ type: string
+ size:
+ type: integer
+ format: int64
+ description: Workflow asset size in bytes.
+ vram:
+ type: integer
+ format: int64
+ description: Approximate VRAM requirement in bytes.
+ usage:
+ type: integer
+ format: int64
+ description: Usage count reported upstream.
+ searchRank:
+ type: integer
+ format: int64
+ description: Search ranking score reported upstream.
+ isEssential:
+ type: boolean
+ description: Whether the template belongs to a module marked as essential.
+ openSource:
+ type: boolean
+ profile:
+ $ref: '#/components/schemas/HubProfileSummary'
+ tutorialUrl:
+ type: string
+ logos:
+ type: array
+ items:
+ type: object
+ additionalProperties: true
+ date:
+ type: string
+ description: Publication date in YYYY-MM-DD format
+ io:
+ type: object
+ properties:
+ inputs:
+ type: array
+ items:
+ type: object
+ additionalProperties: true
+ outputs:
+ type: array
+ items:
+ type: object
+ additionalProperties: true
+ includeOnDistributions:
+ type: array
+ items:
+ type: string
+ thumbnailUrl:
+ type: string
+ description: Public URL of the primary thumbnail
+ thumbnailComparisonUrl:
+ type: string
+ description: Public URL of the comparison thumbnail
+ shareId:
+ type: string
+ description: Share ID for linking to the hub workflow detail
+ extendedDescription:
+ type: string
+ description: AI-generated extended description of the workflow
+ metaDescription:
+ type: string
+ description: AI-generated SEO meta description (under 160 chars)
+ howToUse:
+ type: array
+ items:
+ type: string
+ description: AI-generated step-by-step usage instructions
+ suggestedUseCases:
+ type: array
+ items:
+ type: string
+ description: AI-generated suggested use cases
+ faqItems:
+ type: array
+ items:
+ type: object
+ required:
+ - question
+ - answer
+ properties:
+ question:
+ type: string
+ answer:
+ type: string
+ description: AI-generated FAQ items
+ contentTemplate:
+ type: string
+ description: Content template used for generation (tutorial, showcase, comparison, breakthrough)
+
+    JobStatusResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Job status information'
+      required:
+        - id
+        - status
+        - created_at
+        - updated_at
+      properties:
+        id:
+          type: string
+          format: uuid
+          description: The job ID
+        status:
+          type: string
+          enum:
+            - waiting_to_dispatch
+            - pending
+            - in_progress
+            - completed
+            - error
+            - cancelled
+          description: Current job status
+        created_at:
+          type: string
+          format: date-time
+          description: When the job was created
+        updated_at:
+          type: string
+          format: date-time
+          description: When the job was last updated
+        last_state_update:
+          type: string
+          format: date-time
+          description: When the job status was last changed
+        assigned_inference:
+          type: string
+          nullable: true
+          description: The inference instance assigned to this job (if any)
+        error_message:
+          type: string
+          nullable: true
+          description: Error message if the job failed
+
+ JobsListResponse:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Paginated list of jobs for the authenticated user.'
+ required:
+ - jobs
+ - pagination
+ properties:
+ jobs:
+ type: array
+ description: Array of jobs ordered by specified sort field
+ items:
+ $ref: '#/components/schemas/JobEntry'
+ pagination:
+ $ref: '#/components/schemas/PaginationInfo'
+
+    LabelRef:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Reference to a Hub label by its slug name, paired with the human-readable display name.'
+      required:
+        - name
+        - display_name
+      properties:
+        name:
+          type: string
+          description: Slug identifier (e.g. "video-generation", "flux").
+        display_name:
+          type: string
+          description: Human-readable display name (e.g. "Video Generation", "Flux").
+
+ LogsResponse:
+ type: array
+ x-runtime: [cloud]
+ description: '[cloud-only] System logs response'
+ items:
+ type: object
+ properties:
+ timestamp:
+ type: string
+ format: date-time
+ description: When the log entry was created
+ level:
+ type: string
+ enum:
+ - debug
+ - info
+ - warn
+ - error
+ description: Log level
+ message:
+ type: string
+ description: Log message
+ source:
+ type: string
+ description: Source of the log entry
+ metadata:
+ type: object
+ additionalProperties: true
+ description: Additional log metadata
+
+ Member:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] Workspace member with profile and role information.'
+ required:
+ - id
+ - name
+ - email
+ - role
+ - joined_at
+ properties:
+ id:
+ type: string
+ description: User ID
+ name:
+ type: string
+ description: User's display name
+ email:
+ type: string
+ format: email
+ description: User's email address
+ role:
+ type: string
+ enum:
+ - owner
+ - member
+ description: User's role in the workspace
+ joined_at:
+ type: string
+ format: date-time
+ description: When the user joined the workspace
+
+    OAuthRegisterBadRequestResponse:
+      x-runtime: [cloud]
+      description: "[cloud-only] Union of the two 400 shapes /oauth/register can emit. `OAuthRegisterError` is the handler-shaped
+        RFC 7591 §3.2.2 error; `BindingErrorResponse` is the strict-server binding-layer error fired when the request body
+        fails OpenAPI-schema validation before the handler runs.\n"
+      oneOf:
+        - $ref: '#/components/schemas/OAuthRegisterError'
+        - $ref: '#/components/schemas/BindingErrorResponse'
+
+ PendingInvite:
+ type: object
+ x-runtime: [cloud]
+ description: '[cloud-only] An outstanding workspace invitation that has not yet been accepted.'
+ required:
+ - id
+ - email
+ - invited_at
+ - expires_at
+ properties:
+ id:
+ type: string
+ description: Invite ID
+ email:
+ type: string
+ format: email
+ description: Email address of the invited user
+ token:
+ type: string
+ description: Invite token for constructing invite links. Empty for expired invites.
+ invited_at:
+ type: string
+ format: date-time
+ description: When the invite was created
+ expires_at:
+ type: string
+ format: date-time
+ description: When the invite expires
+
+    Plan:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Billing plan details including pricing, limits, and features.'
+      required:
+        - slug
+        - tier
+        - duration
+        - price_cents
+        - credits_cents
+        - max_seats
+        - availability
+        - seat_summary
+      properties:
+        slug:
+          type: string
+          description: Plan identifier (e.g., "pro-monthly", "team-standard-annual")
+          example: pro-monthly
+        tier:
+          $ref: '#/components/schemas/SubscriptionTier'
+        duration:
+          $ref: '#/components/schemas/SubscriptionDuration'
+        price_cents:
+          type: integer
+          format: int64
+          description: Per-member price in cents (base + one seat)
+          example: 10000
+        credits_cents:
+          type: integer
+          format: int64
+          description: Per-member credits in cents (base + one seat)
+          example: 10000
+        max_seats:
+          type: integer
+          format: int64
+          description: Maximum number of seats allowed for this plan
+          example: 20
+        availability:
+          $ref: '#/components/schemas/PlanAvailability'
+        seat_summary:
+          $ref: '#/components/schemas/PlanSeatSummary'
+
+    PlanAvailability:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Availability and eligibility information for a billing plan.'
+      required:
+        - available
+      properties:
+        available:
+          type: boolean
+          description: Whether the workspace can subscribe to this plan
+        reason:
+          $ref: '#/components/schemas/PlanAvailabilityReason'
+
+    PlanAvailabilityReason:
+      type: string
+      x-runtime: [cloud]
+      enum:
+        - same_plan
+        - incompatible_transition
+        - requires_team
+        - requires_personal
+        - exceeds_max_seats
+      description: '[cloud-only] Reason why a plan is unavailable'
+
+    PlanSeatSummary:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Summary of seat costs based on current workspace members'
+      required:
+        - seat_count
+        - total_cost_cents
+        - total_credits_cents
+      properties:
+        seat_count:
+          type: integer
+          description: Total number of seats (owner + members) that would be charged
+          example: 5
+        total_cost_cents:
+          type: integer
+          format: int64
+          description: Total cost for all seats in cents
+          example: 50000
+        total_credits_cents:
+          type: integer
+          format: int64
+          description: Total credits granted for all seats in cents
+          example: 50000
+
+    PreviewPlanInfo:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Plan information for preview display'
+      required:
+        - slug
+        - tier
+        - duration
+        - price_cents
+        - credits_cents
+        - seat_summary
+      properties:
+        slug:
+          type: string
+          description: Plan slug
+          example: team-pro-monthly
+        tier:
+          $ref: '#/components/schemas/SubscriptionTier'
+        duration:
+          $ref: '#/components/schemas/SubscriptionDuration'
+        price_cents:
+          type: integer
+          format: int64
+          description: Per-seat price in cents
+          example: 10000
+        credits_cents:
+          type: integer
+          format: int64
+          description: Per-seat credits in cents
+          example: 10000
+        seat_summary:
+          $ref: '#/components/schemas/PlanSeatSummary'
+        period_start:
+          type: string
+          format: date-time
+          description: Current billing period start (only for current_plan)
+        period_end:
+          type: string
+          format: date-time
+          description: Current billing period end (only for current_plan)
+
+    PreviewSubscribeResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Itemized cost preview for a pending subscription change.'
+      required:
+        - allowed
+        - transition_type
+        - effective_at
+        - is_immediate
+        - cost_today_cents
+        - cost_next_period_cents
+        - credits_today_cents
+        - credits_next_period_cents
+        - new_plan
+      properties:
+        allowed:
+          type: boolean
+          description: Whether this subscription change is allowed
+        reason:
+          type: string
+          description: Reason why the change is not allowed (only present if allowed=false)
+        transition_type:
+          type: string
+          enum:
+            - new_subscription
+            - upgrade
+            - downgrade
+            - duration_change
+          description: Type of subscription transition
+        effective_at:
+          type: string
+          format: date-time
+          description: When the change takes effect
+        is_immediate:
+          type: boolean
+          description: Whether the change takes effect immediately (true) or at period end (false)
+        cost_today_cents:
+          type: integer
+          format: int64
+          description: Amount to charge today in cents (0 for downgrades)
+          example: 5000
+        cost_next_period_cents:
+          type: integer
+          format: int64
+          description: Amount that will be charged at next billing period in cents
+          example: 10000
+        credits_today_cents:
+          type: integer
+          format: int64
+          description: Credits granted today in cents (prorated for mid-period upgrades)
+          example: 5000
+        credits_next_period_cents:
+          type: integer
+          format: int64
+          description: Credits that will be granted at next billing period in cents
+          example: 10000
+        current_plan:
+          $ref: '#/components/schemas/PreviewPlanInfo'
+        new_plan:
+          $ref: '#/components/schemas/PreviewPlanInfo'
+
+    PublishedWorkflowDetail:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Full detail of a publicly published workflow on the Hub.'
+      required:
+        - share_id
+        - workflow_id
+        - name
+        - listed
+        - workflow_json
+        - assets
+      properties:
+        share_id:
+          type: string
+        workflow_id:
+          type: string
+        name:
+          type: string
+          description: Human-readable workflow name.
+        listed:
+          type: boolean
+        publish_time:
+          type: string
+          format: date-time
+          nullable: true
+        workflow_json:
+          type: object
+          additionalProperties: true
+          description: The workflow JSON content at publish time.
+        assets:
+          type: array
+          description: Published assets with their library status for the caller.
+          items:
+            $ref: '#/components/schemas/AssetInfo'
+
+    SecretResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] User secret metadata (the secret value itself is never returned after creation).'
+      required:
+        - id
+        - name
+        - created_at
+        - updated_at
+      properties:
+        id:
+          type: string
+          format: uuid
+          description: Unique identifier for the secret
+        name:
+          type: string
+          description: User-provided label for the secret
+        provider:
+          type: string
+          description: Provider identifier (e.g., huggingface, civitai)
+        last_used_at:
+          type: string
+          format: date-time
+          description: When the secret was last used for decryption
+        created_at:
+          type: string
+          format: date-time
+          description: When the secret was created
+        updated_at:
+          type: string
+          format: date-time
+          description: When the secret was last updated
+
+    SubscriptionDuration:
+      type: string
+      x-runtime: [cloud]
+      enum:
+        - MONTHLY
+        - ANNUAL
+      description: '[cloud-only] Billing period (uppercase to match comfy-api)'
+
+    SubscriptionTier:
+      type: string
+      x-runtime: [cloud]
+      enum:
+        - FREE
+        - STANDARD
+        - CREATOR
+        - PRO
+        - FOUNDERS_EDITION
+      description: '[cloud-only] Subscription tier (uppercase to match comfy-api)'
+
+    UserDataResponseFull:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] User data listing entry with file metadata (path, size, modification time).'
+      properties:
+        path:
+          type: string
+        size:
+          type: integer
+        modified:
+          type: integer
+          format: int64
+          description: UNIX timestamp of the last modification in milliseconds.
+
+    ValidationError:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Details of a single validation error encountered during asset operations.'
+      required:
+        - code
+        - message
+        - field
+      properties:
+        code:
+          type: string
+          description: Machine-readable error code
+          example: FORMAT_NOT_ALLOWED
+        message:
+          type: string
+          description: Human-readable error message
+          example: 'File format "PickleTensor" is not allowed. Allowed formats: [SafeTensor]'
+        field:
+          type: string
+          description: Field that failed validation
+          example: format
+
+    ValidationResult:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Result of validating a set of asset operations.'
+      required:
+        - is_valid
+      properties:
+        is_valid:
+          type: boolean
+          description: Overall validation status (true if all checks passed)
+          example: true
+        errors:
+          type: array
+          items:
+            $ref: '#/components/schemas/ValidationError'
+          description: Blocking validation errors that prevent download
+        warnings:
+          type: array
+          items:
+            $ref: '#/components/schemas/ValidationError'
+          description: Non-blocking validation warnings (informational only)
+
+    WorkflowForkedFrom:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Reference to the parent workflow from which this workflow was forked.'
+      properties:
+        workflow_id:
+          type: string
+        workflow_version_id:
+          type: string
+
+    WorkflowResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Full workflow entity including metadata and version history.'
+      required:
+        - id
+        - latest_version
+        - created_by
+        - created_at
+        - updated_at
+      properties:
+        id:
+          type: string
+        name:
+          type: string
+        description:
+          type: string
+        default_view:
+          type: string
+          enum:
+            - workflow
+            - app
+        latest_version:
+          type: integer
+        forked_from:
+          $ref: '#/components/schemas/WorkflowForkedFrom'
+        created_by:
+          type: string
+        created_at:
+          type: string
+          format: date-time
+        updated_at:
+          type: string
+          format: date-time
+
+    WorkflowVersionContentResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Full workflow version including the serialized workflow JSON.'
+      required:
+        - id
+        - version
+        - workflow_json
+        - created_by
+        - created_at
+      properties:
+        id:
+          type: string
+        version:
+          type: integer
+        workflow_json:
+          type: object
+          additionalProperties: true
+        created_by:
+          type: string
+        created_at:
+          type: string
+          format: date-time
+        dependency_asset_ids:
+          type: array
+          items:
+            type: string
+
+    WorkspaceAPIKeyInfo:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Metadata for a workspace-scoped API key (secret is never returned).'
+      required:
+        - id
+        - workspace_id
+        - user_id
+        - name
+        - description
+        - key_prefix
+        - created_at
+      properties:
+        id:
+          type: string
+          format: uuid
+          description: API key ID
+        workspace_id:
+          type: string
+          description: Workspace this key belongs to
+        user_id:
+          type: string
+          description: User who created this key
+        name:
+          type: string
+          description: User-provided label
+        description:
+          type: string
+          description: User-provided description of the key's purpose. Limit is byte-based (UTF-8 encoding); 5000 bytes equals
+            5000 ASCII characters or fewer multi-byte characters.
+          maxLength: 5000
+        key_prefix:
+          type: string
+          description: First 8 chars after prefix for display
+        expires_at:
+          type: string
+          format: date-time
+          description: When the key expires (if set)
+        last_used_at:
+          type: string
+          format: date-time
+          description: Last time the key was used
+        revoked_at:
+          type: string
+          format: date-time
+          description: When the key was revoked (if revoked)
+        created_at:
+          type: string
+          format: date-time
+          description: When the key was created
+
+    WorkspaceSummary:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Abbreviated workspace metadata used in list responses.'
+      required:
+        - id
+        - name
+        - type
+      properties:
+        id:
+          type: string
+          example: w-a1b2c3d4-5678-90ab-cdef-1234567890ab
+        name:
+          type: string
+          example: My Team
+        type:
+          type: string
+          enum:
+            - personal
+            - team
+
+    WorkspaceWithRole:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Workspace entity annotated with the requesting user''s role.'
+      required:
+        - id
+        - name
+        - type
+        - role
+        - created_at
+        - joined_at
+      properties:
+        id:
+          type: string
+          example: w-a1b2c3d4-5678-90ab-cdef-1234567890ab
+        name:
+          type: string
+          example: My Team
+        type:
+          type: string
+          enum:
+            - personal
+            - team
+        role:
+          type: string
+          enum:
+            - owner
+            - member
+        created_at:
+          type: string
+          format: date-time
+          description: When the workspace was created
+        joined_at:
+          type: string
+          format: date-time
+          description: When the user joined the workspace (same as created_at for the workspace creator)
+        subscription_tier:
+          $ref: '#/components/schemas/SubscriptionTier'
+
+    BindingErrorResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Error shape returned when request binding or validation fails before the handler runs.'
+      required:
+        - message
+      properties:
+        message:
+          type: string
+
+    ErrorResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Standard error response from cloud endpoints with a machine-readable code and human-readable message.'
+      required:
+        - code
+        - message
+      properties:
+        code:
+          type: string
+          description: Machine-readable error code
+        message:
+          type: string
+          description: Human-readable error message
+
+    AcceptInviteResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Response returned after successfully accepting a workspace invitation.'
+      required:
+        - workspace_id
+        - workspace_name
+      properties:
+        workspace_id:
+          type: string
+          description: ID of the workspace joined
+        workspace_name:
+          type: string
+          description: Name of the workspace joined
+
+    BillingEventsResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Paginated list of billing events for a workspace.'
+      required:
+        - total
+        - events
+        - page
+        - limit
+        - totalPages
+      properties:
+        total:
+          type: integer
+          description: Total number of events
+        events:
+          type: array
+          items:
+            $ref: '#/components/schemas/BillingEvent'
+        page:
+          type: integer
+          description: Current page number (1-indexed)
+        limit:
+          type: integer
+          description: Items per page
+        totalPages:
+          type: integer
+          description: Total number of pages
+
+    BillingOpStatusResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Status of an asynchronous billing operation.'
+      required:
+        - id
+        - status
+        - started_at
+      properties:
+        id:
+          type: string
+          description: Unique identifier for the billing operation
+        status:
+          type: string
+          enum:
+            - pending
+            - succeeded
+            - failed
+          description: Current status of the operation
+        error_message:
+          type: string
+          description: Error message if status is failed
+        started_at:
+          type: string
+          format: date-time
+          description: When the operation was initiated
+        completed_at:
+          type: string
+          format: date-time
+          description: When the operation completed (success or failure)
+
+    CancelSubscriptionResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Response after successfully cancelling a subscription.'
+      required:
+        - cancel_at
+        - billing_op_id
+      properties:
+        billing_op_id:
+          type: string
+          description: Billing operation ID to poll for status via GET /api/billing/ops/{id}
+        cancel_at:
+          type: string
+          format: date-time
+          description: The date when the subscription will end (end of current billing period)
+
+    CreateTopupResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Response after successfully purchasing a credit top-up.'
+      required:
+        - topup_id
+        - status
+        - amount_cents
+        - billing_op_id
+      properties:
+        billing_op_id:
+          type: string
+          description: Billing operation ID to poll for status via GET /api/billing/ops/{id}
+        topup_id:
+          type: string
+          description: Unique identifier for the top-up request (same as billing_op_id, deprecated)
+        status:
+          type: string
+          enum:
+            - pending
+            - completed
+            - failed
+          description: Current status of the top-up
+        amount_cents:
+          type: integer
+          format: int64
+          description: Amount being charged in cents
+
+    CreateWorkspaceAPIKeyResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Response containing the newly created workspace API key.'
+      required:
+        - id
+        - name
+        - description
+        - key
+        - key_prefix
+        - created_at
+      properties:
+        id:
+          type: string
+          format: uuid
+          description: API key ID
+        name:
+          type: string
+          description: User-provided label
+        description:
+          type: string
+          description: User-provided description of the key's purpose. Limit is byte-based (UTF-8 encoding); 5000 bytes equals
+            5000 ASCII characters or fewer multi-byte characters.
+          maxLength: 5000
+        key:
+          type: string
+          description: The full plaintext API key (only shown once)
+        key_prefix:
+          type: string
+          description: First 8 chars after prefix for display
+        expires_at:
+          type: string
+          format: date-time
+          description: When the key expires (if set)
+        created_at:
+          type: string
+          format: date-time
+          description: When the key was created
+
+    ExchangeTokenResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Response containing the issued Cloud JWT and its expiry.'
+      required:
+        - token
+        - expires_at
+        - workspace
+        - role
+        - permissions
+      properties:
+        token:
+          type: string
+          description: Cloud JWT token
+        expires_at:
+          type: string
+          format: date-time
+          description: Token expiration time (RFC 3339)
+        workspace:
+          $ref: '#/components/schemas/WorkspaceSummary'
+        role:
+          type: string
+          enum:
+            - owner
+            - member
+          description: User's role in the workspace
+        permissions:
+          type: array
+          items:
+            type: string
+          description: Permission strings for the role
+          example:
+            - owner:*
+
+    JobCancelResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Response for POST /api/jobs/{job_id}/cancel. Returned on both fresh cancels and idempotent no-ops.'
+      required:
+        - cancelled
+      properties:
+        cancelled:
+          type: boolean
+          description: "True when a cancel event was successfully dispatched by this call.\nFalse when the job was already in\
+            \ a terminal or cancelling state,\nin which case the call is a no-op (still 200 \u2014 idempotent).\n"
+
+    ResubscribeResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Response after successfully resubscribing to a billing plan.'
+      required:
+        - status
+        - billing_op_id
+      properties:
+        billing_op_id:
+          type: string
+          description: Billing operation ID to poll for status via GET /api/billing/ops/{id}
+        status:
+          type: string
+          enum:
+            - active
+          description: The subscription status after resubscribing
+        message:
+          type: string
+          description: Human-readable confirmation message
+
+    SubscribeResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Response after successfully subscribing to a billing plan.'
+      required:
+        - status
+        - billing_op_id
+      properties:
+        billing_op_id:
+          type: string
+          description: Billing operation ID to poll for status via GET /api/billing/ops/{id}
+        status:
+          type: string
+          enum:
+            - subscribed
+            - needs_payment_method
+            - pending_payment
+          description: 'Status of the subscription operation:
+
+            - subscribed: Subscription is active immediately
+
+            - needs_payment_method: User must add payment method via payment_method_url
+
+            - pending_payment: Upgrade initiated, waiting for payment to complete
+
+            '
+        effective_at:
+          type: string
+          format: date-time
+          description: When the subscription became/becomes active (present when status=subscribed or pending_payment)
+        payment_method_url:
+          type: string
+          description: URL to redirect user to add payment method (present when status=needs_payment_method)
+
+    UserResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] User information response'
+      required:
+        - id
+        - status
+      properties:
+        id:
+          type: string
+          description: Firebase UID of the authenticated user
+        status:
+          type: string
+          description: User status (always "active" for authenticated users)
+
+    WorkflowListResponse:
+      type: object
+      x-runtime: [cloud]
+      description: '[cloud-only] Paginated list of saved workflows.'
+      required:
+        - data
+        - pagination
+      properties:
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/WorkflowResponse'
+        pagination:
+          $ref: '#/components/schemas/PaginationInfo'
+
+    FeedbackRequest:
+      type: object
+      x-runtime: [cloud]
+      description: "[cloud-only] User feedback submission body."
+      required:
+        - message
+      properties:
+        type:
+          type: string
+          enum:
+            - missing_nodes
+            - general
+            - missing_models
+          description: Feedback category
+        category:
+          type: string
+          description: Additional category metadata
+        message:
+          type: string
+          description: User-provided feedback message
diff --git a/pyproject.toml b/pyproject.toml
index 1fc9402a1..1e449b4a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ComfyUI"
-version = "0.18.1"
+version = "0.22.0"
 readme = "README.md"
 license = { file = "LICENSE" }
 requires-python = ">=3.10"
diff --git a/requirements.txt b/requirements.txt
index cfb4d4fb8..9308e29d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-comfyui-frontend-package==1.42.10
-comfyui-workflow-templates==0.9.45
-comfyui-embedded-docs==0.4.3
+comfyui-frontend-package==1.44.19
+comfyui-workflow-templates==0.9.85
+comfyui-embedded-docs==0.5.1
 torch
 torchsde
 torchvision
@@ -19,11 +19,11 @@ scipy
 tqdm
 psutil
 alembic
-SQLAlchemy
+SQLAlchemy>=2.0.0
 filelock
 av>=14.2.0
 comfy-kitchen>=0.2.8
-comfy-aimdo>=0.2.12
+comfy-aimdo==0.4.5
 requests
 simpleeval>=1.0.0
 blake3
diff --git a/server.py b/server.py
index 881da8e66..268441bd1 100644
--- a/server.py
+++ b/server.py
@@ -1,3 +1,4 @@
+import errno
 import os
 import sys
 import asyncio
@@ -645,16 +646,36 @@ class PromptServer():
 
         @routes.get("/system_stats")
         async def system_stats(request):
-            device = comfy.model_management.get_torch_device()
-            device_name = comfy.model_management.get_torch_device_name(device)
+            primary_device = comfy.model_management.get_torch_device()
             cpu_device = comfy.model_management.torch.device("cpu")
             ram_total = comfy.model_management.get_total_memory(cpu_device)
             ram_free = comfy.model_management.get_free_memory(cpu_device)
-            vram_total, torch_vram_total = comfy.model_management.get_total_memory(device, torch_total_too=True)
-            vram_free, torch_vram_free = comfy.model_management.get_free_memory(device, torch_free_too=True)
             required_frontend_version = FrontendManager.get_required_frontend_version()
             installed_templates_version = FrontendManager.get_installed_templates_version()
             required_templates_version = FrontendManager.get_required_templates_version()
+            comfy_package_versions = FrontendManager.get_comfy_package_versions()
+
+            # Report every torch device visible to multigpu, with the primary
+            # device first so existing clients that read devices[0] keep working.
+            torch_devices = comfy.model_management.get_all_torch_devices()
+            if primary_device in torch_devices:
+                torch_devices = [primary_device] + [d for d in torch_devices if d != primary_device]
+            else:
+                torch_devices = [primary_device] + list(torch_devices)
+
+            device_entries = []
+            for d in torch_devices:
+                vram_total, torch_vram_total = comfy.model_management.get_total_memory(d, torch_total_too=True)
+                vram_free, torch_vram_free = comfy.model_management.get_free_memory(d, torch_free_too=True)
+                device_entries.append({
+                    "name": comfy.model_management.get_torch_device_name(d),
+                    "type": d.type,
+                    "index": d.index,
+                    "vram_total": vram_total,
+                    "vram_free": vram_free,
+                    "torch_vram_total": torch_vram_total,
+                    "torch_vram_free": torch_vram_free,
+                })
 
             system_stats = {
                 "system": {
@@ -665,22 +686,13 @@ class PromptServer():
                     "required_frontend_version": required_frontend_version,
                     "installed_templates_version": installed_templates_version,
                     "required_templates_version": required_templates_version,
+                    "comfy_package_versions": comfy_package_versions,
                     "python_version": sys.version,
                     "pytorch_version": comfy.model_management.torch_version,
                     "embedded_python": os.path.split(os.path.split(sys.executable)[0])[1] == "python_embeded",
                     "argv": sys.argv
                 },
-                "devices": [
-                    {
-                        "name": device_name,
-                        "type": device.type,
-                        "index": device.index,
-                        "vram_total": vram_total,
-                        "vram_free": vram_free,
-                        "torch_vram_total": torch_vram_total,
-                        "torch_vram_free": torch_vram_free,
-                    }
-                ]
+                "devices": device_entries
             }
             return web.json_response(system_stats)
 
@@ -1245,7 +1257,13 @@ class PromptServer():
             address = addr[0]
             port = addr[1]
             site = web.TCPSite(runner, address, port, ssl_context=ssl_ctx)
-            await site.start()
+            try:
+                await site.start()
+            except OSError as e:
+                if e.errno == errno.EADDRINUSE:
+                    logging.error(f"Port {port} is already in use on address {address}. Please close the other application or use a different port with --port.")
+                    raise SystemExit(1)
+                raise
 
             if not hasattr(self, 'address'):
                 self.address = address #TODO: remove this
diff --git a/tests-unit/app_test/frontend_manager_test.py b/tests-unit/app_test/frontend_manager_test.py
index 1d5a84b47..8c8a2eb48 100644
--- a/tests-unit/app_test/frontend_manager_test.py
+++ b/tests-unit/app_test/frontend_manager_test.py
@@ -52,7 +52,10 @@ def mock_provider(mock_releases):
 @pytest.fixture(autouse=True)
 def clear_cache():
     import utils.install_util
+    import app.frontend_management
+
     utils.install_util.PACKAGE_VERSIONS = {}
+    app.frontend_management.COMFY_PACKAGE_VERSIONS = []
 
 
 def test_get_release(mock_provider, mock_releases):
@@ -147,7 +150,7 @@ def test_init_frontend_default_with_mocks():
 
     # Act
     with (
-        patch("app.frontend_management.check_frontend_version") as mock_check,
+        patch("app.frontend_management.check_comfy_packages_versions") as mock_check,
         patch.object(
             FrontendManager, "default_frontend_path", return_value="/mocked/path"
         ),
@@ -168,7 +171,7 @@ def test_init_frontend_fallback_on_error():
         patch.object(
             FrontendManager, "init_frontend_unsafe", side_effect=Exception("Test error")
         ),
-        patch("app.frontend_management.check_frontend_version") as mock_check,
+        patch("app.frontend_management.check_comfy_packages_versions") as mock_check,
         patch.object(
             FrontendManager, "default_frontend_path", return_value="/default/path"
         ),
@@ -277,7 +280,9 @@ def test_get_installed_templates_version():
 
 def test_get_installed_templates_version_not_installed():
     # Act
-    with patch("app.frontend_management.version", side_effect=Exception("Package not found")):
+    with patch(
+        "app.frontend_management.version", side_effect=Exception("Package not found")
+    ):
         version = FrontendManager.get_installed_templates_version()
 
     # Assert
diff --git a/tests-unit/app_test/node_replace_manager_test.py b/tests-unit/app_test/node_replace_manager_test.py
new file mode 100644
index 000000000..8a3fd18bb
--- /dev/null
+++ b/tests-unit/app_test/node_replace_manager_test.py
@@ -0,0 +1,90 @@
+"""Tests for NodeReplaceManager registration behavior."""
+import importlib
+import sys
+import types
+
+import pytest
+
+
+@pytest.fixture
+def NodeReplaceManager(monkeypatch):
+    """Provide NodeReplaceManager with `nodes` stubbed.
+
+    `app.node_replace_manager` does `import nodes` at module level, which pulls in
+    torch + the full ComfyUI graph. register() doesn't actually need it, so we
+    stub `nodes` per-test (via monkeypatch so it's torn down) and reload the
+    module so it picks up the stub instead of any cached real import.
+    """
+    fake_nodes = types.ModuleType("nodes")
+    fake_nodes.NODE_CLASS_MAPPINGS = {}
+    monkeypatch.setitem(sys.modules, "nodes", fake_nodes)
+    monkeypatch.delitem(sys.modules, "app.node_replace_manager", raising=False)
+    module = importlib.import_module("app.node_replace_manager")
+    yield module.NodeReplaceManager
+    # Drop the freshly-imported module so the next test (or a later real import
+    # of `nodes`) starts from a clean slate.
+    sys.modules.pop("app.node_replace_manager", None)
+
+
+class FakeNodeReplace:
+    """Lightweight stand-in for comfy_api.latest._io.NodeReplace."""
+    def __init__(self, new_node_id, old_node_id, old_widget_ids=None,
+                 input_mapping=None, output_mapping=None):
+        self.new_node_id = new_node_id
+        self.old_node_id = old_node_id
+        self.old_widget_ids = old_widget_ids
+        self.input_mapping = input_mapping
+        self.output_mapping = output_mapping
+
+
+def test_register_adds_replacement(NodeReplaceManager):
+    manager = NodeReplaceManager()
+    manager.register(FakeNodeReplace(new_node_id="NewNode", old_node_id="OldNode"))
+    assert manager.has_replacement("OldNode")
+    assert len(manager.get_replacement("OldNode")) == 1
+
+
+def test_register_allows_multiple_alternatives_for_same_old_node(NodeReplaceManager):
+    """Different new_node_ids for the same old_node_id should all be kept."""
+    manager = NodeReplaceManager()
+    manager.register(FakeNodeReplace(new_node_id="AltA", old_node_id="OldNode"))
+    manager.register(FakeNodeReplace(new_node_id="AltB", old_node_id="OldNode"))
+    replacements = manager.get_replacement("OldNode")
+    assert len(replacements) == 2
+    assert {r.new_node_id for r in replacements} == {"AltA", "AltB"}
+
+
+def test_register_is_idempotent_for_duplicate_pair(NodeReplaceManager):
+    """Re-registering the same (old_node_id, new_node_id) should be a no-op."""
+    manager = NodeReplaceManager()
+    manager.register(FakeNodeReplace(new_node_id="NewNode", old_node_id="OldNode"))
+    manager.register(FakeNodeReplace(new_node_id="NewNode", old_node_id="OldNode"))
+    manager.register(FakeNodeReplace(new_node_id="NewNode", old_node_id="OldNode"))
+    assert len(manager.get_replacement("OldNode")) == 1
+
+
+def test_register_idempotent_preserves_first_registration(NodeReplaceManager):
+    """First registration wins; later duplicates with different mappings are ignored."""
+    manager = NodeReplaceManager()
+    first = FakeNodeReplace(
+        new_node_id="NewNode", old_node_id="OldNode",
+        input_mapping=[{"new_id": "a", "old_id": "x"}],
+    )
+    second = FakeNodeReplace(
+        new_node_id="NewNode", old_node_id="OldNode",
+        input_mapping=[{"new_id": "b", "old_id": "y"}],
+    )
+    manager.register(first)
+    manager.register(second)
+    replacements = manager.get_replacement("OldNode")
+    assert len(replacements) == 1
+    assert replacements[0] is first
+
+
+def test_register_dedupe_does_not_affect_other_old_nodes(NodeReplaceManager):
+    manager = NodeReplaceManager()
+    manager.register(FakeNodeReplace(new_node_id="NewA", old_node_id="OldA"))
+    manager.register(FakeNodeReplace(new_node_id="NewA", old_node_id="OldA"))
+    manager.register(FakeNodeReplace(new_node_id="NewB", old_node_id="OldB"))
+    assert len(manager.get_replacement("OldA")) == 1
+    assert len(manager.get_replacement("OldB")) == 1
diff --git a/tests-unit/assets_test/conftest.py b/tests-unit/assets_test/conftest.py
index 6c5c56113..9867b4e14 100644
--- a/tests-unit/assets_test/conftest.py
+++ b/tests-unit/assets_test/conftest.py
@@ -236,6 +236,8 @@ def seeded_asset(request: pytest.FixtureRequest, http: requests.Session, api_bas
     r = http.post(api_base + "/api/assets", files=files, data=form_data, timeout=120)
     body = r.json()
     assert r.status_code == 201, body
+    from helpers import assert_hash_fields_consistent
+    assert_hash_fields_consistent(body)
     return body
 
 
diff --git a/tests-unit/assets_test/helpers.py b/tests-unit/assets_test/helpers.py
index 770e011f4..ae3de6dc3 100644
--- a/tests-unit/assets_test/helpers.py
+++ b/tests-unit/assets_test/helpers.py
@@ -26,3 +26,26 @@ def trigger_sync_seed_assets(session: requests.Session, base_url: str) -> None:
 
 def get_asset_filename(asset_hash: str, extension: str) -> str:
     return asset_hash.removeprefix("blake3:") + extension
+
+
+def assert_hash_fields_consistent(body: dict, expected_hash: str | None = None) -> None:
+    """Assert hash and asset_hash invariants on an Asset response.
+
+    Both must be present or both absent (so a regression that drops only one
+    is caught). When present, they must equal each other and, if expected_hash
+    is provided, must equal that value.
+    """
+    hash_present = "hash" in body
+    asset_hash_present = "asset_hash" in body
+    assert hash_present == asset_hash_present, (
+        f"hash and asset_hash must both be present or both absent: "
+        f"hash present={hash_present}, asset_hash present={asset_hash_present}"
+    )
+    if hash_present:
+        h = body["hash"]
+        ah = body["asset_hash"]
+        assert h == ah, f"hash and asset_hash must match: hash={h!r}, asset_hash={ah!r}"
+        if expected_hash is not None:
+            assert h == expected_hash, (
+                f"hash must equal expected: got {h!r}, expected {expected_hash!r}"
+            )
diff --git a/tests-unit/assets_test/test_assets_missing_sync.py b/tests-unit/assets_test/test_assets_missing_sync.py
index 47dc130cb..29ec1d09d 100644
--- a/tests-unit/assets_test/test_assets_missing_sync.py
+++ b/tests-unit/assets_test/test_assets_missing_sync.py
@@ -40,7 +40,9 @@ def test_seed_asset_removed_when_file_is_deleted(
# there should be exactly one with that name
matches = [a for a in body1.get("assets", []) if a.get("name") == name]
assert matches
- assert matches[0].get("asset_hash") is None
+ # Seed assets have no hash; exclude_none drops both keys from the response
+ assert "asset_hash" not in matches[0]
+ assert "hash" not in matches[0]
asset_info_id = matches[0]["id"]
# Remove the underlying file and sync again
diff --git a/tests-unit/assets_test/test_crud.py b/tests-unit/assets_test/test_crud.py
index 07310223e..fd2e9a098 100644
--- a/tests-unit/assets_test/test_crud.py
+++ b/tests-unit/assets_test/test_crud.py
@@ -21,6 +21,8 @@ def test_create_from_hash_success(
b1 = r1.json()
assert r1.status_code == 201, b1
assert b1["asset_hash"] == h
+ assert b1["hash"] == h
+ assert b1["hash"] == b1["asset_hash"]
assert b1["created_new"] is False
aid = b1["id"]
@@ -39,6 +41,7 @@ def test_get_and_delete_asset(http: requests.Session, api_base: str, seeded_asse
detail = rg.json()
assert rg.status_code == 200, detail
assert detail["id"] == aid
+ assert detail["hash"] == detail["asset_hash"]
assert "user_metadata" in detail
assert "filename" in detail["user_metadata"]
@@ -97,6 +100,7 @@ def test_delete_upon_reference_count(
copy = r2.json()
assert r2.status_code == 201, copy
assert copy["asset_hash"] == src_hash
+ assert copy["hash"] == src_hash
assert copy["created_new"] is False
# Soft-delete original reference (default) -> asset identity must remain
@@ -139,6 +143,7 @@ def test_update_asset_fields(http: requests.Session, api_base: str, seeded_asset
body = ru.json()
assert ru.status_code == 200, body
assert body["name"] == payload["name"]
+ assert body["hash"] == body["asset_hash"]
assert body["tags"] == original_tags # tags unchanged
assert body["user_metadata"]["purpose"] == "updated"
# filename should still be present and normalized by server
@@ -289,7 +294,9 @@ def test_metadata_filename_is_set_for_seed_asset_without_hash(
assert r1.status_code == 200, body
matches = [a for a in body.get("assets", []) if a.get("name") == name]
assert matches, "Seed asset should be visible after sync"
- assert matches[0].get("asset_hash") is None # still a seed
+ # Seed assets have no hash; exclude_none drops both keys from the response
+ assert "asset_hash" not in matches[0]
+ assert "hash" not in matches[0]
aid = matches[0]["id"]
r2 = http.get(f"{api_base}/api/assets/{aid}", timeout=120)
diff --git a/tests-unit/assets_test/test_list_filter.py b/tests-unit/assets_test/test_list_filter.py
index dcb7a73ca..17bbea5c6 100644
--- a/tests-unit/assets_test/test_list_filter.py
+++ b/tests-unit/assets_test/test_list_filter.py
@@ -3,6 +3,7 @@ import uuid
import pytest
import requests
+from helpers import assert_hash_fields_consistent
def test_list_assets_paging_and_sort(http: requests.Session, api_base: str, asset_factory, make_asset_bytes):
@@ -26,6 +27,10 @@ def test_list_assets_paging_and_sort(http: requests.Session, api_base: str, asse
got1 = [a["name"] for a in b1["assets"]]
assert got1 == sorted(names)[:2]
assert b1["has_more"] is True
+ # Populated assets in list responses must carry both `hash` and `asset_hash` consistently
+ for asset in b1["assets"]:
+ assert_hash_fields_consistent(asset)
+ assert "hash" in asset, "populated asset must emit hash on list endpoint"
r2 = http.get(
api_base + "/api/assets",
diff --git a/tests-unit/assets_test/test_uploads.py b/tests-unit/assets_test/test_uploads.py
index 0f2b124a3..427a417cc 100644
--- a/tests-unit/assets_test/test_uploads.py
+++ b/tests-unit/assets_test/test_uploads.py
@@ -5,6 +5,20 @@ from concurrent.futures import ThreadPoolExecutor
import requests
import pytest
+from app.assets.api.schemas_out import Asset, AssetCreated
+
+
+def test_asset_created_inherits_hash_field():
+    """Schema-level guard that `AssetCreated` still carries the `hash` field of `Asset`.
+
+    The integration tests check the wire format of POST /api/assets; this one
+    catches inheritance drift (for instance AssetCreated being redefined so it
+    no longer extends Asset), which would otherwise drop `hash` silently.
+    """
+    base_fields = Asset.model_fields
+    assert "hash" in base_fields
+    created_fields = AssetCreated.model_fields
+    assert "hash" in created_fields
+    # Same annotation on both models means the field was not redeclared with another type.
+    assert created_fields["hash"].annotation == base_fields["hash"].annotation
+
def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, make_asset_bytes):
name = "dup_a.safetensors"
@@ -17,6 +31,7 @@ def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, ma
a1 = r1.json()
assert r1.status_code == 201, a1
assert a1["created_new"] is True
+ assert a1["hash"] == a1["asset_hash"]
# Second upload with the same data and name creates a new AssetReference (duplicates allowed)
# Returns 200 because Asset already exists, but a new AssetReference is created
@@ -26,6 +41,7 @@ def test_upload_ok_duplicate_reference(http: requests.Session, api_base: str, ma
a2 = r2.json()
assert r2.status_code in (200, 201), a2
assert a2["asset_hash"] == a1["asset_hash"]
+ assert a2["hash"] == a1["hash"]
assert a2["id"] != a1["id"] # new reference with same content
# Third upload with the same data but different name also creates new AssetReference
@@ -50,6 +66,7 @@ def test_upload_fastpath_from_existing_hash_no_file(http: requests.Session, api_
b1 = r1.json()
assert r1.status_code == 201, b1
h = b1["asset_hash"]
+ assert b1["hash"] == h
# Now POST /api/assets with only hash and no file
files = [
@@ -63,6 +80,7 @@ def test_upload_fastpath_from_existing_hash_no_file(http: requests.Session, api_
assert r2.status_code == 200, b2 # fast path returns 200 with created_new == False
assert b2["created_new"] is False
assert b2["asset_hash"] == h
+ assert b2["hash"] == h
def test_upload_fastpath_with_known_hash_and_file(
@@ -75,6 +93,7 @@ def test_upload_fastpath_with_known_hash_and_file(
b1 = r1.json()
assert r1.status_code == 201, b1
h = b1["asset_hash"]
+ assert b1["hash"] == h
# Send both file and hash of existing content -> server must drain file and create from hash (200)
files = {"file": ("ignored.bin", b"ignored" * 10, "application/octet-stream")}
@@ -84,6 +103,7 @@ def test_upload_fastpath_with_known_hash_and_file(
assert r2.status_code == 200, b2
assert b2["created_new"] is False
assert b2["asset_hash"] == h
+ assert b2["hash"] == h
def test_upload_multiple_tags_fields_are_merged(http: requests.Session, api_base: str):
@@ -142,6 +162,8 @@ def test_concurrent_upload_identical_bytes_different_names(
assert r1.status_code in (200, 201), b1
assert r2.status_code in (200, 201), b2
assert b1["asset_hash"] == b2["asset_hash"]
+ assert b1["hash"] == b2["hash"]
+ assert b1["hash"] == b1["asset_hash"]
assert b1["id"] != b2["id"]
created_flags = sorted([bool(b1.get("created_new")), bool(b2.get("created_new"))])
diff --git a/tests-unit/comfy_api_test/multicombo_serialization_test.py b/tests-unit/comfy_api_test/multicombo_serialization_test.py
new file mode 100644
index 000000000..421c65a0d
--- /dev/null
+++ b/tests-unit/comfy_api_test/multicombo_serialization_test.py
@@ -0,0 +1,78 @@
+from comfy_api.latest._io import Combo, MultiCombo
+
+
+def test_multicombo_serializes_multi_select_as_object():
+    """Without placeholder/chip, MultiCombo emits multiselect=True and an empty multi_select object."""
+    serialized = MultiCombo.Input(
+        id="providers",
+        options=["a", "b", "c"],
+        default=["a"],
+    ).as_dict()
+
+    assert serialized["multiselect"] is True
+    # The key has to exist and hold an (empty) mapping.
+    assert "multi_select" in serialized
+    assert serialized["multi_select"] == {}
+
+
+def test_multicombo_serializes_multi_select_with_placeholder_and_chip():
+    """Placeholder and chip settings are carried inside the serialized multi_select object."""
+    expected_multi_select = {
+        "placeholder": "Select providers",
+        "chip": True,
+    }
+
+    serialized = MultiCombo.Input(
+        id="providers",
+        options=["a", "b", "c"],
+        default=["a"],
+        placeholder="Select providers",
+        chip=True,
+    ).as_dict()
+
+    assert serialized["multiselect"] is True
+    assert serialized["multi_select"] == expected_multi_select
+
+
+def test_combo_does_not_serialize_multiselect():
+    """A regular Combo serializes ``multiselect`` as False and emits no ``multi_select`` object."""
+    serialized = Combo.Input(
+        id="choice",
+        options=["a", "b", "c"],
+    ).as_dict()
+
+    # Per the original author's note: Combo sets multiselect=False and
+    # prune_dict only drops None, so the key is expected to be present
+    # with the value False rather than missing.
+    assert serialized.get("multiselect") is False
+    assert "multi_select" not in serialized
+
+
+def _validate_combo_values(val, combo_options, is_multiselect):
+    """Return the values in ``val`` that are not valid combo options.
+
+    Test-local copy of the combo validation rule (the original author notes it
+    reproduces the logic in execution.py). A list is checked item by item only
+    for multiselect inputs; anything else is treated as one single value.
+    An empty list means the input is valid.
+    """
+    if not (is_multiselect and isinstance(val, list)):
+        # Single-value path: a list passed to a non-multiselect combo is
+        # compared as a whole and therefore never matches a plain option.
+        if val in combo_options:
+            return []
+        return [val]
+
+    rejected = []
+    for item in val:
+        if item not in combo_options:
+            rejected.append(item)
+    return rejected
+
+
+def test_multicombo_validation_accepts_valid_list():
+    """Every selected value is a known option, so nothing is reported as invalid."""
+    rejected = _validate_combo_values(["a", "b"], ["a", "b", "c"], True)
+    assert rejected == []
+
+
+def test_multicombo_validation_rejects_invalid_values():
+    """Only the value missing from the options is reported back."""
+    rejected = _validate_combo_values(["a", "x"], ["a", "b", "c"], True)
+    assert rejected == ["x"]
+
+
+def test_multicombo_validation_accepts_empty_list():
+    """An empty multiselect selection has no invalid entries."""
+    rejected = _validate_combo_values([], ["a", "b", "c"], True)
+    assert rejected == []
+
+
+def test_combo_validation_rejects_list_even_with_valid_items():
+    """A regular (non-multiselect) Combo must not accept a list, even one made of valid options."""
+    rejected = _validate_combo_values(["a", "b"], ["a", "b", "c"], False)
+    assert len(rejected) > 0
diff --git a/tests-unit/comfy_extras_test/nodes_math_test.py b/tests-unit/comfy_extras_test/nodes_math_test.py
index fa4cdcac3..714e37c32 100644
--- a/tests-unit/comfy_extras_test/nodes_math_test.py
+++ b/tests-unit/comfy_extras_test/nodes_math_test.py
@@ -124,9 +124,11 @@ class TestMathExpressionExecute:
with pytest.raises(Exception, match="not defined"):
self._exec("str(a)", a=42)
- def test_boolean_result_raises(self):
- with pytest.raises(ValueError, match="got bool"):
- self._exec("a > b", a=5, b=3)
+    def test_boolean_result(self):
+        """A comparison expression produces a real bool at index 2 of the result."""
+        when_greater = self._exec("a > b", a=5, b=3)
+        assert when_greater[2] is True
+        when_smaller = self._exec("a > b", a=3, b=5)
+        assert when_smaller[2] is False
def test_empty_expression_raises(self):
with pytest.raises(ValueError, match="Expression cannot be empty"):
diff --git a/tests-unit/comfy_test/model_detection_test.py b/tests-unit/comfy_test/model_detection_test.py
index 2551a417b..4e9350602 100644
--- a/tests-unit/comfy_test/model_detection_test.py
+++ b/tests-unit/comfy_test/model_detection_test.py
@@ -1,9 +1,23 @@
+from collections import defaultdict
+
import torch
from comfy.model_detection import detect_unet_config, model_config_from_unet_config
import comfy.supported_models
+def _freeze(value):
+    """Turn ``value`` into a hashable equivalent, descending into containers.
+
+    Dicts become a frozenset of ``(key, frozen_value)`` pairs, lists and tuples
+    become tuples, sets become frozensets; any other value is returned as is.
+    The result can be used as a dict key or set member to compare configs.
+    """
+    if isinstance(value, dict):
+        frozen_pairs = set()
+        for key, item in value.items():
+            frozen_pairs.add((key, _freeze(item)))
+        return frozenset(frozen_pairs)
+    elif isinstance(value, (list, tuple)):
+        return tuple(map(_freeze, value))
+    elif isinstance(value, set):
+        return frozenset(map(_freeze, value))
+    else:
+        return value
+
+
def _make_longcat_comfyui_sd():
"""Minimal ComfyUI-format state dict for pre-converted LongCat-Image weights."""
sd = {}
@@ -110,3 +124,21 @@ class TestModelDetection:
model_config = model_config_from_unet_config(unet_config, sd)
assert model_config is not None
assert type(model_config).__name__ == "FluxSchnell"
+
+    def test_unet_config_and_required_keys_combination_is_unique(self):
+        """No two registered models may share the same ``unet_config`` /
+        ``required_keys`` pair. As the original author notes, ``BASE.matches``
+        could not tell such models apart and the earlier list entry would
+        always be picked."""
+        names_by_signature = defaultdict(list)
+        for model_cls in comfy.supported_models.models:
+            # Freeze both parts so the pair is hashable and usable as a dict key.
+            signature = (_freeze(model_cls.unet_config), _freeze(model_cls.required_keys))
+            names_by_signature[signature].append(model_cls.__name__)
+
+        clashes = [names for names in names_by_signature.values() if len(names) > 1]
+        assert not clashes, (
+            "Found models sharing the same (unet_config, required_keys) "
+            "combination, which makes detection ambiguous: "
+            + "; ".join(", ".join(names) for names in clashes)
+        )
diff --git a/tests-unit/deploy_environment_test.py b/tests-unit/deploy_environment_test.py
new file mode 100644
index 000000000..c3497fbb0
--- /dev/null
+++ b/tests-unit/deploy_environment_test.py
@@ -0,0 +1,109 @@
+"""Tests for comfy.deploy_environment."""
+
+import os
+
+import pytest
+
+from comfy import deploy_environment
+from comfy.deploy_environment import get_deploy_environment
+
+
+@pytest.fixture(autouse=True)
+def _reset_cache_and_install_dir(tmp_path, monkeypatch):
+    """Give every test a clean cache and a private, temporary ComfyUI install dir."""
+    # Drop any value cached by an earlier test before this one runs.
+    get_deploy_environment.cache_clear()
+    install_dir = str(tmp_path)
+    monkeypatch.setattr(deploy_environment, "_COMFY_INSTALL_DIR", install_dir)
+    yield
+    # Clear again so the value cached by this test does not leak to the next one.
+    get_deploy_environment.cache_clear()
+
+
+def _write_env_file(tmp_path, content: str) -> str:
+    """Create ``.comfy_environment`` under ``tmp_path`` holding exactly ``content``.
+
+    The file is opened with `newline=""`, which turns off text-mode newline
+    translation: the characters written are the ones passed in, whatever the
+    host OS. The CRLF and lone-CR tests depend on that.
+
+    Returns:
+        The path of the file that was written.
+    """
+    env_file = os.path.join(str(tmp_path), ".comfy_environment")
+    handle = open(env_file, mode="w", encoding="utf-8", newline="")
+    with handle:
+        handle.write(content)
+    return env_file
+
+
+class TestGetDeployEnvironment:
+    """Behavior of ``get_deploy_environment`` for missing, well-formed and malformed env files."""
+
+    def test_returns_local_git_when_file_missing(self):
+        """With no env file written, the default value is returned."""
+        result = get_deploy_environment()
+        assert result == "local-git"
+
+    def test_reads_value_from_file(self, tmp_path):
+        _write_env_file(tmp_path, "local-desktop2-standalone\n")
+        result = get_deploy_environment()
+        assert result == "local-desktop2-standalone"
+
+    def test_strips_trailing_whitespace_and_newline(self, tmp_path):
+        _write_env_file(tmp_path, " local-desktop2-standalone \n")
+        result = get_deploy_environment()
+        assert result == "local-desktop2-standalone"
+
+    def test_only_first_line_is_used(self, tmp_path):
+        _write_env_file(tmp_path, "first-line\nsecond-line\n")
+        result = get_deploy_environment()
+        assert result == "first-line"
+
+    def test_crlf_line_ending(self, tmp_path):
+        """Windows editors often save with CRLF; the CR must not leak into the value."""
+        _write_env_file(tmp_path, "local-desktop2-standalone\r\n")
+        result = get_deploy_environment()
+        assert result == "local-desktop2-standalone"
+
+    def test_crlf_multiline_only_first_line_used(self, tmp_path):
+        _write_env_file(tmp_path, "first-line\r\nsecond-line\r\n")
+        result = get_deploy_environment()
+        assert result == "first-line"
+
+    def test_crlf_with_surrounding_whitespace(self, tmp_path):
+        _write_env_file(tmp_path, " local-desktop2-standalone \r\n")
+        result = get_deploy_environment()
+        assert result == "local-desktop2-standalone"
+
+    def test_lone_cr_line_ending(self, tmp_path):
+        """A bare CR (classic-Mac / legacy editors) also terminates the line.
+
+        Universal-newlines decoding treats it as a line terminator too.
+        """
+        _write_env_file(tmp_path, "local-desktop2-standalone\r")
+        result = get_deploy_environment()
+        assert result == "local-desktop2-standalone"
+
+    def test_empty_file_falls_back_to_default(self, tmp_path):
+        _write_env_file(tmp_path, "")
+        result = get_deploy_environment()
+        assert result == "local-git"
+
+    def test_empty_after_whitespace_strip_falls_back_to_default(self, tmp_path):
+        _write_env_file(tmp_path, " \n")
+        result = get_deploy_environment()
+        assert result == "local-git"
+
+    def test_strips_control_chars_within_first_line(self, tmp_path):
+        """Embedded NUL/control chars are removed from the value
+        (header-injection / smuggling protection)."""
+        _write_env_file(tmp_path, "abc\x00\x07xyz\n")
+        result = get_deploy_environment()
+        assert result == "abcxyz"
+
+    def test_strips_non_ascii_characters(self, tmp_path):
+        _write_env_file(tmp_path, "café-é\n")
+        result = get_deploy_environment()
+        assert result == "caf-"
+
+    def test_caps_read_at_128_bytes(self, tmp_path):
+        """A single huge line with no newline must not be fully read into memory."""
+        oversized = "x" * 10_000
+        _write_env_file(tmp_path, oversized)
+        assert get_deploy_environment() == "x" * 128
+
+    def test_result_is_cached_across_calls(self, tmp_path):
+        env_file = _write_env_file(tmp_path, "first_value\n")
+        assert get_deploy_environment() == "first_value"
+        # Replace the file contents; the second call must still see the cached value.
+        with open(env_file, "w", encoding="utf-8") as handle:
+            handle.write("second_value\n")
+        assert get_deploy_environment() == "first_value"
+
+    def test_unreadable_file_falls_back_to_default(self, tmp_path, monkeypatch):
+        """If opening the file raises OSError, the default value is returned."""
+        _write_env_file(tmp_path, "should_not_be_used\n")
+
+        def _raise_os_error(*args, **kwargs):
+            raise OSError("simulated read failure")
+
+        # Patched only after the file was written, so the write above still works.
+        monkeypatch.setattr("builtins.open", _raise_os_error)
+        assert get_deploy_environment() == "local-git"
diff --git a/tests-unit/feature_flags_test.py b/tests-unit/feature_flags_test.py
index f2702cfc8..8ec52a124 100644
--- a/tests-unit/feature_flags_test.py
+++ b/tests-unit/feature_flags_test.py
@@ -1,10 +1,15 @@
"""Tests for feature flags functionality."""
+import pytest
+
from comfy_api.feature_flags import (
get_connection_feature,
supports_feature,
get_server_features,
+ CLI_FEATURE_FLAG_REGISTRY,
SERVER_FEATURE_FLAGS,
+ _coerce_flag_value,
+ _parse_cli_feature_flags,
)
@@ -96,3 +101,83 @@ class TestFeatureFlags:
result = get_connection_feature(sockets_metadata, "sid1", "any_feature")
assert result is False
assert supports_feature(sockets_metadata, "sid1", "any_feature") is False
+
+
+class TestCoerceFlagValue:
+    """Test suite for _coerce_flag_value."""
+
+    def test_registered_bool_true(self):
+        """'true' is coerced to True for a registered bool flag, in either tested casing."""
+        for raw in ("true", "True"):
+            assert _coerce_flag_value("show_signin_button", raw) is True
+
+    def test_registered_bool_false(self):
+        """'false' is coerced to False for a registered bool flag, in either tested casing."""
+        for raw in ("false", "FALSE"):
+            assert _coerce_flag_value("show_signin_button", raw) is False
+
+    def test_unregistered_key_stays_string(self):
+        """Values of flags missing from the registry are passed through as strings."""
+        for raw in ("true", "42"):
+            assert _coerce_flag_value("unknown_flag", raw) == raw
+
+    def test_bool_typo_raises(self):
+        """Strict bool: typos like 'ture' or 'yes' must raise so the flag can be dropped."""
+        for bad_value in ("ture", "yes", "1", ""):
+            with pytest.raises(ValueError):
+                _coerce_flag_value("show_signin_button", bad_value)
+
+    def test_failed_int_coercion_raises(self, monkeypatch):
+        """Malformed values for typed flags must raise; caller decides what to do."""
+        # Register a throwaway int flag for this test only; monkeypatch removes it afterwards.
+        int_flag_spec = {"type": "int", "default": 0, "description": "test"}
+        monkeypatch.setitem(CLI_FEATURE_FLAG_REGISTRY, "test_int_flag", int_flag_spec)
+        with pytest.raises(ValueError):
+            _coerce_flag_value("test_int_flag", "not_a_number")
+
+
+class TestParseCliFeatureFlags:
+    """Test suite for _parse_cli_feature_flags."""
+
+    def test_single_flag(self, monkeypatch):
+        """One `key=value` entry for a registered bool flag is parsed and coerced."""
+        fake_args = type("Args", (), {"feature_flag": ["show_signin_button=true"]})()
+        monkeypatch.setattr("comfy_api.feature_flags.args", fake_args)
+        parsed = _parse_cli_feature_flags()
+        assert parsed == {"show_signin_button": True}
+
+    def test_missing_equals_defaults_to_true(self, monkeypatch):
+        """Bare flag without '=' is treated as the string 'true' (and coerced if registered)."""
+        fake_args = type("Args", (), {"feature_flag": ["show_signin_button", "valid=1"]})()
+        monkeypatch.setattr("comfy_api.feature_flags.args", fake_args)
+        parsed = _parse_cli_feature_flags()
+        assert parsed == {"show_signin_button": True, "valid": "1"}
+
+    def test_empty_key_skipped(self, monkeypatch):
+        """An entry with nothing before '=' is ignored; the other entries are kept."""
+        fake_args = type("Args", (), {"feature_flag": ["=value", "valid=1"]})()
+        monkeypatch.setattr("comfy_api.feature_flags.args", fake_args)
+        parsed = _parse_cli_feature_flags()
+        assert parsed == {"valid": "1"}
+
+    def test_invalid_bool_value_dropped(self, monkeypatch, caplog):
+        """A typo'd bool value must be dropped entirely, not silently set to False
+        and not stored as a raw string. A warning must be logged."""
+        fake_args = type("Args", (), {"feature_flag": ["show_signin_button=ture", "valid=1"]})()
+        monkeypatch.setattr("comfy_api.feature_flags.args", fake_args)
+        with caplog.at_level("WARNING"):
+            parsed = _parse_cli_feature_flags()
+        assert parsed == {"valid": "1"}
+        assert "show_signin_button" not in parsed
+        # Some captured log record must name the flag and say it was dropped.
+        drop_warning_logged = any(
+            "show_signin_button" in record.message and "drop" in record.message.lower()
+            for record in caplog.records
+        )
+        assert drop_warning_logged
+
+
+class TestCliFeatureFlagRegistry:
+    """Test suite for the CLI feature flag registry."""
+
+    def test_registry_entries_have_required_fields(self):
+        """Every registry entry declares 'type', 'default' and 'description'."""
+        required_fields = ("type", "default", "description")
+        for key, info in CLI_FEATURE_FLAG_REGISTRY.items():
+            for field in required_fields:
+                assert field in info, f"{key} missing '{field}'"
diff --git a/tests-unit/prompt_server_test/user_manager_test.py b/tests-unit/prompt_server_test/user_manager_test.py
index b939d8e68..27118400f 100644
--- a/tests-unit/prompt_server_test/user_manager_test.py
+++ b/tests-unit/prompt_server_test/user_manager_test.py
@@ -69,7 +69,11 @@ async def test_listuserdata_full_info(aiohttp_client, app, tmp_path):
assert len(result) == 1
assert result[0]["path"] == "file1.txt"
assert "size" in result[0]
- assert "modified" in result[0]
+ assert isinstance(result[0]["modified"], int)
+ assert isinstance(result[0]["created"], int)
+ # Verify millisecond magnitude (timestamps after year 2000 in ms are > 946684800000)
+ assert result[0]["modified"] > 946684800000
+ assert result[0]["created"] > 946684800000
async def test_listuserdata_split_path(aiohttp_client, app, tmp_path):
diff --git a/tests/execution/test_async_nodes.py b/tests/execution/test_async_nodes.py
index c771b4b36..54660c112 100644
--- a/tests/execution/test_async_nodes.py
+++ b/tests/execution/test_async_nodes.py
@@ -14,7 +14,6 @@ from tests.execution.test_execution import ComfyClient, run_warmup
class TestAsyncNodes:
@fixture(scope="class", autouse=True, params=[
(False, 0),
- (True, 0),
(True, 100),
])
def _server(self, args_pytest, request):
@@ -29,6 +28,8 @@ class TestAsyncNodes:
use_lru, lru_size = request.param
if use_lru:
pargs += ['--cache-lru', str(lru_size)]
+ else:
+ pargs += ['--cache-classic']
# Running server with args: pargs
p = subprocess.Popen(pargs)
yield
diff --git a/tests/execution/test_execution.py b/tests/execution/test_execution.py
index f73ca7e3c..15e2304fc 100644
--- a/tests/execution/test_execution.py
+++ b/tests/execution/test_execution.py
@@ -183,8 +183,7 @@ class TestExecution:
# Initialize server and client
#
@fixture(scope="class", autouse=True, params=[
- { "extra_args" : [], "should_cache_results" : True },
- { "extra_args" : ["--cache-lru", 0], "should_cache_results" : True },
+ { "extra_args" : ["--cache-classic"], "should_cache_results" : True },
{ "extra_args" : ["--cache-lru", 100], "should_cache_results" : True },
{ "extra_args" : ["--cache-none"], "should_cache_results" : False },
])
diff --git a/tests/execution/testing_nodes/testing-pack/api_test_nodes.py b/tests/execution/testing_nodes/testing-pack/api_test_nodes.py
index b2eaae05e..70c2a9e95 100644
--- a/tests/execution/testing_nodes/testing-pack/api_test_nodes.py
+++ b/tests/execution/testing_nodes/testing-pack/api_test_nodes.py
@@ -21,7 +21,7 @@ class TestAsyncProgressUpdate(ComfyNodeABC):
RETURN_TYPES = (IO.ANY,)
FUNCTION = "execute"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
async def execute(self, value, sleep_seconds):
start = time.time()
@@ -51,7 +51,7 @@ class TestSyncProgressUpdate(ComfyNodeABC):
RETURN_TYPES = (IO.ANY,)
FUNCTION = "execute"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
def execute(self, value, sleep_seconds):
start = time.time()
diff --git a/tests/execution/testing_nodes/testing-pack/async_test_nodes.py b/tests/execution/testing_nodes/testing-pack/async_test_nodes.py
index 547eea6f4..589dabf17 100644
--- a/tests/execution/testing_nodes/testing-pack/async_test_nodes.py
+++ b/tests/execution/testing_nodes/testing-pack/async_test_nodes.py
@@ -21,7 +21,7 @@ class TestAsyncValidation(ComfyNodeABC):
RETURN_TYPES = ("IMAGE",)
FUNCTION = "process"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
@classmethod
async def VALIDATE_INPUTS(cls, value, threshold):
@@ -53,7 +53,7 @@ class TestAsyncError(ComfyNodeABC):
RETURN_TYPES = (IO.ANY,)
FUNCTION = "error_execution"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
async def error_execution(self, value, error_after):
await asyncio.sleep(error_after)
@@ -74,7 +74,7 @@ class TestAsyncValidationError(ComfyNodeABC):
RETURN_TYPES = ("IMAGE",)
FUNCTION = "process"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
@classmethod
async def VALIDATE_INPUTS(cls, value, max_value):
@@ -105,7 +105,7 @@ class TestAsyncTimeout(ComfyNodeABC):
RETURN_TYPES = (IO.ANY,)
FUNCTION = "timeout_execution"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
async def timeout_execution(self, value, timeout, operation_time):
try:
@@ -129,7 +129,7 @@ class TestSyncError(ComfyNodeABC):
RETURN_TYPES = (IO.ANY,)
FUNCTION = "sync_error"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
def sync_error(self, value):
raise RuntimeError("Intentional sync execution error for testing")
@@ -150,7 +150,7 @@ class TestAsyncLazyCheck(ComfyNodeABC):
RETURN_TYPES = ("IMAGE",)
FUNCTION = "process"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
async def check_lazy_status(self, condition, input1, input2):
# Simulate async checking (e.g., querying remote service)
@@ -184,7 +184,7 @@ class TestDynamicAsyncGeneration(ComfyNodeABC):
RETURN_TYPES = ("IMAGE",)
FUNCTION = "generate_async_workflow"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
def generate_async_workflow(self, image1, image2, num_async_nodes, sleep_duration):
g = GraphBuilder()
@@ -229,7 +229,7 @@ class TestAsyncResourceUser(ComfyNodeABC):
RETURN_TYPES = (IO.ANY,)
FUNCTION = "use_resource"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
async def use_resource(self, value, resource_id, duration):
# Check if resource is already in use
@@ -265,7 +265,7 @@ class TestAsyncBatchProcessing(ComfyNodeABC):
RETURN_TYPES = ("IMAGE",)
FUNCTION = "process_batch"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
async def process_batch(self, images, process_time_per_item, unique_id):
batch_size = images.shape[0]
@@ -305,7 +305,7 @@ class TestAsyncConcurrentLimit(ComfyNodeABC):
RETURN_TYPES = (IO.ANY,)
FUNCTION = "limited_execution"
- CATEGORY = "_for_testing/async"
+ CATEGORY = "experimental/async"
async def limited_execution(self, value, duration, node_id):
async with self._semaphore:
diff --git a/tests/execution/testing_nodes/testing-pack/specific_tests.py b/tests/execution/testing_nodes/testing-pack/specific_tests.py
index 4f8f01ae4..2eb5d520e 100644
--- a/tests/execution/testing_nodes/testing-pack/specific_tests.py
+++ b/tests/execution/testing_nodes/testing-pack/specific_tests.py
@@ -409,7 +409,7 @@ class TestSleep(ComfyNodeABC):
RETURN_TYPES = (IO.ANY,)
FUNCTION = "sleep"
- CATEGORY = "_for_testing"
+ CATEGORY = "experimental"
async def sleep(self, value, seconds, unique_id):
pbar = ProgressBar(seconds, node_id=unique_id)
@@ -440,7 +440,7 @@ class TestParallelSleep(ComfyNodeABC):
}
RETURN_TYPES = ("IMAGE",)
FUNCTION = "parallel_sleep"
- CATEGORY = "_for_testing"
+ CATEGORY = "experimental"
OUTPUT_NODE = True
def parallel_sleep(self, image1, image2, image3, sleep1, sleep2, sleep3, unique_id):
@@ -474,7 +474,7 @@ class TestOutputNodeWithSocketOutput:
}
RETURN_TYPES = ("IMAGE",)
FUNCTION = "process"
- CATEGORY = "_for_testing"
+ CATEGORY = "experimental"
OUTPUT_NODE = True
def process(self, image, value):
diff --git a/utils/install_util.py b/utils/install_util.py
index 34489aec5..fdba23a8f 100644
--- a/utils/install_util.py
+++ b/utils/install_util.py
@@ -39,7 +39,7 @@ def get_required_packages_versions():
if len(s) == 2:
version_str = s[-1]
if not is_valid_version(version_str):
- logging.error(f"Invalid version format in requirements.txt: {version_str}")
+ logging.debug(f"Invalid version format for {s[0]} in requirements.txt: {version_str}")
continue
out[s[0]] = version_str
return out.copy()